1use std::fmt::{Display, Formatter};
8
9use scraper::{ElementRef, Html, Selector};
10use serde::Deserialize;
11use url::Url;
12
/// Upper bound applied to the `before`/`after` spacer counts computed by `layout_spacing`.
const MAX_LAYOUT_SPACER_LINES: u8 = 3;
/// The only manifest `version` string accepted by `parse_index_manifest`.
const INDEX_MANIFEST_VERSION: &str = "index.idx/v1";
/// Largest manifest input, in bytes, that `parse_index_manifest` will accept.
const MAX_MANIFEST_BYTES: usize = 32 * 1024;
/// Maximum number of hints allowed per hint list (regions, fields, forms, dates).
const MAX_MANIFEST_HINTS: usize = 64;
/// Maximum byte length of any trimmed manifest string (hint text, selector, scope).
const MAX_MANIFEST_STRING_LEN: usize = 256;
/// Maximum number of combinator/attribute/pseudo characters counted in a manifest selector.
const MAX_MANIFEST_SELECTOR_COMPLEXITY: usize = 16;
19
/// A validated index manifest, as returned by `parse_index_manifest`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexManifest {
    /// Manifest format version; always the supported version string after parsing.
    pub version: String,
    /// URL the manifest was fetched from, copied verbatim from the caller.
    pub source_url: String,
    /// Path prefix (starting with `/`) that the page path must begin with.
    pub scope: String,
    /// Hint about the page's main content.
    pub content: IndexContentHint,
    /// Region hints, each pairing a role with a selector.
    pub regions: Vec<IndexRegionHint>,
    /// Field hints (name plus optional label).
    pub fields: Vec<IndexFieldHint>,
    /// Form hints (name plus optional selector and note).
    pub forms: Vec<IndexFormHint>,
    /// Date hints assigning a display style to a named field.
    pub dates: Vec<IndexDateHint>,
}
40
/// Content-level hint carried by an [`IndexManifest`].
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct IndexContentHint {
    /// Validated CSS selector from `content.main_selector`; `None` when absent or blank.
    pub main_selector: Option<String>,
}
47
/// A region hint: a role name attached to the elements matched by a selector.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexRegionHint {
    /// Trimmed, non-empty role text.
    pub role: String,
    /// Validated CSS selector for the region.
    pub selector: String,
    /// Collapsed flag from the manifest; `false` when the manifest omits it.
    pub collapsed: bool,
}
58
/// A field hint: a field name with an optional human-readable label.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexFieldHint {
    /// Trimmed, non-empty field name.
    pub name: String,
    /// Trimmed label; `None` when absent or blank in the manifest.
    pub label: Option<String>,
}
67
/// A form hint: a form name with an optional selector and free-text note.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexFormHint {
    /// Trimmed, non-empty form name.
    pub name: String,
    /// Validated CSS selector; `None` when absent or blank in the manifest.
    pub selector: Option<String>,
    /// Trimmed note text; `None` when absent or blank in the manifest.
    pub note: Option<String>,
}
78
/// Presentation style requested for a date field in a manifest.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndexDateStyle {
    /// Spelled `date` in the manifest.
    Date,
    /// Spelled `datetime` or `date-time` in the manifest.
    DateTime,
}

impl IndexDateStyle {
    /// Parses a manifest style name, ignoring surrounding whitespace and ASCII case.
    ///
    /// Returns `None` for any spelling other than `date`, `datetime` or `date-time`.
    fn parse(input: &str) -> Option<Self> {
        let token = input.trim();
        let is = |name: &str| token.eq_ignore_ascii_case(name);

        if is("date") {
            Some(Self::Date)
        } else if is("datetime") || is("date-time") {
            Some(Self::DateTime)
        } else {
            None
        }
    }
}
97
/// A date hint: the display style to use for a named field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexDateHint {
    /// Trimmed, non-empty field name.
    pub field: String,
    /// Style parsed from the manifest's `style` string.
    pub style: IndexDateStyle,
}
106
/// Reasons `parse_index_manifest` can reject a manifest.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IndexManifestError {
    /// The manifest input is longer than the byte limit.
    TooLarge {
        /// The enforced limit in bytes.
        max_bytes: usize,
        /// The length of the rejected input in bytes.
        actual_bytes: usize,
    },
    /// The input could not be deserialized; carries the deserializer's message.
    InvalidJson(String),
    /// The manifest declares a version other than the supported one.
    UnsupportedVersion(String),
    /// The manifest source URL could not be parsed.
    InvalidSourceUrl(String),
    /// The page URL could not be parsed.
    InvalidPageUrl(String),
    /// The manifest source and the page differ in scheme, host or port.
    CrossOrigin {
        /// The manifest source URL as supplied by the caller.
        source_url: String,
        /// The page URL as supplied by the caller.
        page_url: String,
    },
    /// The scope is empty, does not start with `/`, or is too long.
    InvalidScope(String),
    /// The page path does not start with the manifest scope.
    OutOfScope {
        /// The normalized manifest scope.
        scope: String,
        /// The path component of the page URL.
        page_path: String,
    },
    /// One hint list holds more entries than allowed.
    TooManyHints {
        /// Which hint list overflowed (`region`, `field`, `form` or `date`).
        kind: &'static str,
        /// The enforced per-list limit.
        max: usize,
    },
    /// A single hint value failed validation.
    InvalidHint {
        /// Dotted name of the offending value, e.g. `region.selector`.
        kind: &'static str,
        /// Human-readable explanation of the failure.
        reason: String,
    },
}
156
157impl Display for IndexManifestError {
158 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
159 match self {
160 Self::TooLarge {
161 max_bytes,
162 actual_bytes,
163 } => {
164 write!(
165 f,
166 "manifest exceeds limit: {actual_bytes} bytes (max {max_bytes})"
167 )
168 }
169 Self::InvalidJson(error) => write!(f, "manifest JSON is invalid: {error}"),
170 Self::UnsupportedVersion(version) => {
171 write!(f, "unsupported manifest version: {version}")
172 }
173 Self::InvalidSourceUrl(url) => write!(f, "manifest source URL is invalid: {url}"),
174 Self::InvalidPageUrl(url) => write!(f, "page URL is invalid: {url}"),
175 Self::CrossOrigin {
176 source_url,
177 page_url,
178 } => write!(
179 f,
180 "manifest source must be same-origin: {source_url} vs {page_url}"
181 ),
182 Self::InvalidScope(scope) => write!(f, "manifest scope is invalid: {scope}"),
183 Self::OutOfScope { scope, page_path } => {
184 write!(f, "page path {page_path} is outside manifest scope {scope}")
185 }
186 Self::TooManyHints { kind, max } => {
187 write!(f, "manifest has too many {kind} hints (max {max})")
188 }
189 Self::InvalidHint { kind, reason } => {
190 write!(f, "manifest {kind} hint is invalid: {reason}")
191 }
192 }
193 }
194}
195
// Marker impl so `IndexManifestError` can be used wherever a `std::error::Error` is expected;
// no source error is chained.
impl std::error::Error for IndexManifestError {}
197
/// Unvalidated manifest exactly as deserialized; `parse_index_manifest` turns it into
/// an [`IndexManifest`].
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Default)]
struct RawIndexManifest {
    /// Declared format version (required in the input).
    version: String,
    /// Optional scope path; a missing value is treated as `/` during normalization.
    #[serde(default)]
    scope: Option<String>,
    /// Optional content hint; defaults to an empty hint.
    #[serde(default)]
    content: RawIndexContentHint,
    /// Optional region hints; defaults to an empty list.
    #[serde(default)]
    regions: Vec<RawIndexRegionHint>,
    /// Optional field hints; defaults to an empty list.
    #[serde(default)]
    fields: Vec<RawIndexFieldHint>,
    /// Optional form hints; defaults to an empty list.
    #[serde(default)]
    forms: Vec<RawIndexFormHint>,
    /// Optional date hints; defaults to an empty list.
    #[serde(default)]
    dates: Vec<RawIndexDateHint>,
}
214
/// Unvalidated form of [`IndexContentHint`].
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Default)]
struct RawIndexContentHint {
    /// Optional selector string, not yet trimmed or validated.
    #[serde(default)]
    main_selector: Option<String>,
}
220
/// Unvalidated form of [`IndexRegionHint`].
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexRegionHint {
    /// Role text (required in the input).
    role: String,
    /// Selector string (required in the input).
    selector: String,
    /// Optional collapsed flag; defaults to `false`.
    #[serde(default)]
    collapsed: bool,
}
228
/// Unvalidated form of [`IndexFieldHint`].
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexFieldHint {
    /// Field name (required in the input).
    name: String,
    /// Optional label text.
    #[serde(default)]
    label: Option<String>,
}
235
/// Unvalidated form of [`IndexFormHint`].
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexFormHint {
    /// Form name (required in the input).
    name: String,
    /// Optional selector string.
    #[serde(default)]
    selector: Option<String>,
    /// Optional note text.
    #[serde(default)]
    note: Option<String>,
}
244
/// Unvalidated form of [`IndexDateHint`].
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexDateHint {
    /// Field name (required in the input).
    field: String,
    /// Style name (required in the input); interpreted by `IndexDateStyle::parse`.
    style: String,
}
250
251#[must_use]
253pub fn well_known_index_manifest_url(page_url: &str) -> Option<String> {
254 let mut url = Url::parse(page_url).ok()?;
255 url.set_path("/.well-known/index.idx");
256 url.set_query(None);
257 url.set_fragment(None);
258 Some(url.to_string())
259}
260
261#[must_use]
263pub fn discover_index_manifest_link_from_html(html: &str, page_url: &str) -> Option<String> {
264 let base = Url::parse(page_url).ok()?;
265 let document = Html::parse_document(html);
266 let selector = selector("link[rel][href]")?;
267 for link in document.select(&selector) {
268 let rel = link.value().attr("rel").unwrap_or_default();
269 let is_manifest = rel
270 .split(|c: char| c.is_whitespace() || c == ',')
271 .any(|token| token.eq_ignore_ascii_case("index-manifest"));
272 if !is_manifest {
273 continue;
274 }
275 let href = link.value().attr("href")?;
276 let joined = base.join(href).ok()?;
277 return Some(joined.to_string());
278 }
279 None
280}
281
282#[must_use]
284pub fn discover_index_manifest_link_from_http_link_header(
285 header_value: &str,
286 page_url: &str,
287) -> Option<String> {
288 let base = Url::parse(page_url).ok()?;
289 for chunk in header_value.split(',') {
290 let trimmed = chunk.trim();
291 let Some((target_part, params_part)) = trimmed.split_once('>') else {
292 continue;
293 };
294 let target = target_part.trim().strip_prefix('<')?;
295 let is_manifest = params_part
296 .split(';')
297 .map(str::trim)
298 .filter_map(|part| part.strip_prefix("rel="))
299 .map(|rel| rel.trim_matches('"'))
300 .any(|rel| {
301 rel.split_whitespace()
302 .any(|token| token.eq_ignore_ascii_case("index-manifest"))
303 });
304 if !is_manifest {
305 continue;
306 }
307 let joined = base.join(target).ok()?;
308 return Some(joined.to_string());
309 }
310 None
311}
312
313pub fn parse_index_manifest(
315 input: &str,
316 source_url: &str,
317 page_url: &str,
318) -> Result<IndexManifest, IndexManifestError> {
319 let actual_bytes = input.len();
320 if actual_bytes > MAX_MANIFEST_BYTES {
321 return Err(IndexManifestError::TooLarge {
322 max_bytes: MAX_MANIFEST_BYTES,
323 actual_bytes,
324 });
325 }
326 let source = Url::parse(source_url)
327 .map_err(|_| IndexManifestError::InvalidSourceUrl(source_url.to_owned()))?;
328 let page = Url::parse(page_url)
329 .map_err(|_| IndexManifestError::InvalidPageUrl(page_url.to_owned()))?;
330 if !same_origin(&source, &page) {
331 return Err(IndexManifestError::CrossOrigin {
332 source_url: source_url.to_owned(),
333 page_url: page_url.to_owned(),
334 });
335 }
336
337 let raw = serde_json::from_str::<RawIndexManifest>(input)
338 .map_err(|error| IndexManifestError::InvalidJson(error.to_string()))?;
339 if raw.version != INDEX_MANIFEST_VERSION {
340 return Err(IndexManifestError::UnsupportedVersion(raw.version));
341 }
342
343 let scope = normalize_scope(raw.scope.as_deref(), page.path())?;
344 if !page.path().starts_with(&scope) {
345 return Err(IndexManifestError::OutOfScope {
346 scope,
347 page_path: page.path().to_owned(),
348 });
349 }
350
351 if raw.regions.len() > MAX_MANIFEST_HINTS {
352 return Err(IndexManifestError::TooManyHints {
353 kind: "region",
354 max: MAX_MANIFEST_HINTS,
355 });
356 }
357 if raw.fields.len() > MAX_MANIFEST_HINTS {
358 return Err(IndexManifestError::TooManyHints {
359 kind: "field",
360 max: MAX_MANIFEST_HINTS,
361 });
362 }
363 if raw.forms.len() > MAX_MANIFEST_HINTS {
364 return Err(IndexManifestError::TooManyHints {
365 kind: "form",
366 max: MAX_MANIFEST_HINTS,
367 });
368 }
369 if raw.dates.len() > MAX_MANIFEST_HINTS {
370 return Err(IndexManifestError::TooManyHints {
371 kind: "date",
372 max: MAX_MANIFEST_HINTS,
373 });
374 }
375
376 let main_selector = raw
377 .content
378 .main_selector
379 .as_deref()
380 .map(str::trim)
381 .filter(|value| !value.is_empty())
382 .map(|value| validate_selector("content.main_selector", value))
383 .transpose()?;
384
385 let regions = raw
386 .regions
387 .into_iter()
388 .map(|raw| {
389 let role = validate_hint_text("region.role", &raw.role)?;
390 let selector = validate_selector("region.selector", &raw.selector)?;
391 Ok(IndexRegionHint {
392 role,
393 selector,
394 collapsed: raw.collapsed,
395 })
396 })
397 .collect::<Result<Vec<_>, IndexManifestError>>()?;
398
399 let fields = raw
400 .fields
401 .into_iter()
402 .map(|raw| {
403 let name = validate_hint_text("field.name", &raw.name)?;
404 let label = raw
405 .label
406 .as_deref()
407 .map(str::trim)
408 .filter(|value| !value.is_empty())
409 .map(|value| validate_hint_text("field.label", value))
410 .transpose()?;
411 Ok(IndexFieldHint { name, label })
412 })
413 .collect::<Result<Vec<_>, IndexManifestError>>()?;
414
415 let forms = raw
416 .forms
417 .into_iter()
418 .map(|raw| {
419 let name = validate_hint_text("form.name", &raw.name)?;
420 let selector = raw
421 .selector
422 .as_deref()
423 .map(str::trim)
424 .filter(|value| !value.is_empty())
425 .map(|value| validate_selector("form.selector", value))
426 .transpose()?;
427 let note = raw
428 .note
429 .as_deref()
430 .map(str::trim)
431 .filter(|value| !value.is_empty())
432 .map(|value| validate_hint_text("form.note", value))
433 .transpose()?;
434 Ok(IndexFormHint {
435 name,
436 selector,
437 note,
438 })
439 })
440 .collect::<Result<Vec<_>, IndexManifestError>>()?;
441
442 let dates = raw
443 .dates
444 .into_iter()
445 .map(|raw| {
446 let field = validate_hint_text("date.field", &raw.field)?;
447 let style = IndexDateStyle::parse(&raw.style).ok_or_else(|| {
448 IndexManifestError::InvalidHint {
449 kind: "date.style",
450 reason: format!("unsupported style: {}", raw.style),
451 }
452 })?;
453 Ok(IndexDateHint { field, style })
454 })
455 .collect::<Result<Vec<_>, IndexManifestError>>()?;
456
457 Ok(IndexManifest {
458 version: INDEX_MANIFEST_VERSION.to_owned(),
459 source_url: source_url.to_owned(),
460 scope,
461 content: IndexContentHint { main_selector },
462 regions,
463 fields,
464 forms,
465 dates,
466 })
467}
468
469fn same_origin(left: &Url, right: &Url) -> bool {
470 left.scheme() == right.scheme()
471 && left.host_str() == right.host_str()
472 && left.port_or_known_default() == right.port_or_known_default()
473}
474
475fn normalize_scope(scope: Option<&str>, page_path: &str) -> Result<String, IndexManifestError> {
476 let normalized = scope.unwrap_or("/").trim();
477 if normalized.is_empty() || !normalized.starts_with('/') {
478 return Err(IndexManifestError::InvalidScope(normalized.to_owned()));
479 }
480 if normalized.len() > MAX_MANIFEST_STRING_LEN {
481 return Err(IndexManifestError::InvalidScope(normalized.to_owned()));
482 }
483 if page_path.is_empty() {
484 return Ok(normalized.to_owned());
485 }
486 Ok(normalized.to_owned())
487}
488
489fn validate_hint_text(kind: &'static str, text: &str) -> Result<String, IndexManifestError> {
490 let value = text.trim();
491 if value.is_empty() || value.len() > MAX_MANIFEST_STRING_LEN {
492 return Err(IndexManifestError::InvalidHint {
493 kind,
494 reason: "text length is out of bounds".to_owned(),
495 });
496 }
497 Ok(value.to_owned())
498}
499
500fn validate_selector(kind: &'static str, selector: &str) -> Result<String, IndexManifestError> {
501 let value = validate_hint_text(kind, selector)?;
502 let complexity = value
503 .chars()
504 .filter(|ch| matches!(ch, '>' | '+' | '~' | '[' | ']' | ':' | '*' | '#'))
505 .count();
506 if complexity > MAX_MANIFEST_SELECTOR_COMPLEXITY {
507 return Err(IndexManifestError::InvalidHint {
508 kind,
509 reason: "selector complexity exceeds limit".to_owned(),
510 });
511 }
512 Selector::parse(&value)
513 .map_err(|_| IndexManifestError::InvalidHint {
514 kind,
515 reason: "selector syntax is invalid".to_owned(),
516 })
517 .map(|_| value)
518}
519
/// The result of `parse_html`: the original markup plus everything extracted from it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlDocument {
    /// The input markup, unchanged.
    pub raw: String,
    /// Title taken from `<title>`, else the Open Graph title, else the first heading.
    pub title: Option<String>,
    /// Headings found among the top-level entries of `nodes`.
    pub headings: Vec<HtmlHeading>,
    /// Links extracted from the main content root.
    pub links: Vec<HtmlLink>,
    /// Forms found among the top-level entries of `nodes`.
    pub forms: Vec<HtmlForm>,
    /// Extracted content nodes, followed by secondary sections when the main root
    /// was matched explicitly.
    pub nodes: Vec<HtmlNode>,
    /// Metadata read from the document head.
    pub metadata: HtmlMetadata,
    /// Text derived from `nodes`.
    pub body_text: String,
}
540
/// Document-level metadata gathered by `extract_metadata`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct HtmlMetadata {
    /// `href` of the first `link[rel~="canonical"]`, resolved against the base URL.
    pub canonical_url: Option<String>,
    /// Trimmed, non-empty `lang` attribute of the `<html>` element.
    pub language: Option<String>,
    /// Content of `meta[name="description"]`.
    pub description: Option<String>,
    /// Content of `meta[property="og:title"]`.
    pub open_graph_title: Option<String>,
    /// Content of `meta[property="og:description"]`.
    pub open_graph_description: Option<String>,
}
555
/// A heading collected from the extracted nodes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlHeading {
    /// Heading level derived from the tag name (`h1`–`h6`).
    pub level: u8,
    /// Text content of the heading.
    pub text: String,
}
564
/// A hyperlink extracted from the document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlLink {
    /// Link text.
    pub text: String,
    /// Link target.
    pub href: String,
}
573
/// Role assigned to an [`HtmlNode::Section`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HtmlSectionRole {
    /// Main content.
    Main,
    /// A `<nav>` element or `role="navigation"`.
    Navigation,
    /// An `<aside>` element, or an id/class containing `side` or `rail`.
    Aside,
    /// A `<footer>` element or `role="contentinfo"`.
    Footer,
    /// An id/class containing `comment`.
    Comments,
    /// An id/class containing `related`.
    Related,
    /// None of the other roles applied.
    Unknown,
}
592
/// A form extracted from the document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlForm {
    /// Form name.
    pub name: String,
    /// Submission method.
    pub method: String,
    /// Submission target.
    pub action: String,
    /// Input controls belonging to the form.
    pub inputs: Vec<HtmlInput>,
    /// Buttons belonging to the form.
    pub buttons: Vec<HtmlButton>,
}
607
/// An input control inside an [`HtmlForm`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlInput {
    /// Control name.
    pub name: String,
    /// Control kind (presumably the `type` attribute — confirm against the form extractor).
    pub kind: String,
    /// Initial value, when one is present.
    pub value: Option<String>,
    /// Whether the control is marked as required.
    pub required: bool,
}
620
/// A button inside an [`HtmlForm`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlButton {
    /// Button name, when one is present.
    pub name: Option<String>,
    /// Button value, when one is present.
    pub value: Option<String>,
    /// Visible label of the button.
    pub label: String,
}
631
/// One unit of extracted page content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HtmlNode {
    /// An `h1`–`h6` element with non-empty text.
    Heading {
        /// Heading level derived from the tag name.
        level: u8,
        /// Heading text.
        text: String,
    },
    /// A paragraph of text; block quotes are stored here prefixed with `> `.
    Paragraph(String),
    /// A standalone link.
    Link(HtmlLink),
    /// A `ul` or `ol` element with at least one item.
    List {
        /// `true` for `ol`, `false` for `ul`.
        ordered: bool,
        /// Text of each list item.
        items: Vec<String>,
    },
    /// A `pre` or `code` element with non-blank content.
    CodeBlock {
        /// Language of the code, when it could be determined.
        language: Option<String>,
        /// The code text.
        code: String,
    },
    /// A table with at least one row.
    Table {
        /// Cell text, row by row.
        rows: Vec<Vec<String>>,
    },
    /// Vertical spacing between nodes.
    Spacer {
        /// Number of blank lines.
        lines: u8,
    },
    /// A group of nodes extracted from a secondary page region.
    Section {
        /// Role of the region.
        role: HtmlSectionRole,
        /// Title taken from `aria-label`, `title`, or the first heading inside.
        title: Option<String>,
        /// Whether the section is marked collapsed.
        collapsed: bool,
        /// Content of the section.
        nodes: Vec<HtmlNode>,
    },
    /// An `img` element.
    Image {
        /// Alternative text.
        alt: String,
        /// Image source resolved against the base URL, when a `src` attribute exists.
        src: Option<String>,
    },
    /// A form.
    Form(HtmlForm),
}
691
692#[must_use]
694pub fn parse_html(input: impl Into<String>) -> HtmlDocument {
695 let raw = input.into();
696 let html = Html::parse_document(&raw);
697 let base_url = extract_base_url(&html);
698 let metadata = extract_metadata(&html, base_url.as_ref());
699 let layout_rules = extract_layout_rules(&html);
700 let root = main_content_root(&html);
701 let mut nodes = root
702 .as_ref()
703 .map(|root| extract_nodes(&root.element, base_url.as_ref(), &layout_rules))
704 .unwrap_or_default();
705 if let Some(root) = root.as_ref().filter(|root| root.explicit) {
706 nodes.extend(extract_secondary_sections(
707 &html,
708 &root.element,
709 base_url.as_ref(),
710 &layout_rules,
711 ));
712 }
713 let headings = nodes
714 .iter()
715 .filter_map(|node| match node {
716 HtmlNode::Heading { level, text } => Some(HtmlHeading {
717 level: *level,
718 text: text.clone(),
719 }),
720 _ => None,
721 })
722 .collect::<Vec<_>>();
723 let links = root
724 .as_ref()
725 .map(|root| extract_links(&root.element, base_url.as_ref()))
726 .unwrap_or_default();
727 let forms = nodes
728 .iter()
729 .filter_map(|node| match node {
730 HtmlNode::Form(form) => Some(form.clone()),
731 _ => None,
732 })
733 .collect();
734 let title = extract_title(&html, &metadata, &headings);
735 let body_text = body_text_from_nodes(&nodes);
736
737 HtmlDocument {
738 raw,
739 title,
740 headings,
741 links,
742 forms,
743 nodes,
744 metadata,
745 body_text,
746 }
747}
748
749fn selector(query: &str) -> Option<Selector> {
750 Selector::parse(query).ok()
751}
752
/// The element chosen as the root of a page's main content.
struct MainContentRoot<'a> {
    /// The chosen element.
    element: ElementRef<'a>,
    /// `true` when the element matched one of the known main-content selectors;
    /// `false` when it was picked by a fallback.
    explicit: bool,
}
757
758fn main_content_root(html: &Html) -> Option<MainContentRoot<'_>> {
759 let mut candidates = Vec::new();
760 for query in [
761 "main",
762 "article",
763 "[role=\"main\"]",
764 "[itemprop=\"articleBody\"]",
765 "#content",
766 ".content",
767 ".article",
768 ".post",
769 ".entry-content",
770 ".markdown-body",
771 ] {
772 if let Some(selector) = selector(query) {
773 for element in html.select(&selector) {
774 candidates.push(MainContentRoot {
775 element,
776 explicit: true,
777 });
778 }
779 }
780 }
781
782 if let Some(best) = best_main_root(candidates) {
783 return Some(best);
784 }
785
786 let body = selector("body").and_then(|selector| html.select(&selector).next());
787 if let Some(body) = body {
788 if let Some(dense_region) = densest_body_region(&body) {
789 return Some(MainContentRoot {
790 element: dense_region,
791 explicit: false,
792 });
793 }
794 return Some(MainContentRoot {
795 element: body,
796 explicit: false,
797 });
798 }
799
800 html.root_element()
801 .first_child()
802 .and_then(ElementRef::wrap)
803 .map(|element| MainContentRoot {
804 element,
805 explicit: false,
806 })
807}
808
809fn best_main_root<'a>(roots: Vec<MainContentRoot<'a>>) -> Option<MainContentRoot<'a>> {
810 roots
811 .into_iter()
812 .max_by_key(|root| main_root_score(&root.element))
813}
814
815fn densest_body_region<'a>(body: &ElementRef<'a>) -> Option<ElementRef<'a>> {
816 let selector = selector("main, article, section, div")?;
817 body.select(&selector)
818 .filter(|element| !is_boilerplate_container(element))
819 .map(|element| {
820 let score = main_root_score(&element);
821 (element, score)
822 })
823 .filter(|(_element, score)| *score >= 6)
824 .max_by_key(|(_element, score)| *score)
825 .map(|(element, _)| element)
826}
827
828fn main_root_score(root: &ElementRef<'_>) -> i32 {
829 let mut heading_count = 0i32;
830 let mut paragraph_count = 0i32;
831 let mut list_count = 0i32;
832 let mut code_count = 0i32;
833 let mut table_count = 0i32;
834 let mut link_count = 0i32;
835 let mut text_chars = 0i32;
836 let mut boilerplate_penalty = 0i32;
837
838 for node in root.descendants() {
839 let Some(element) = ElementRef::wrap(node) else {
840 continue;
841 };
842 let tag = element.value().name();
843 if is_boilerplate_container(&element) {
844 boilerplate_penalty += 3;
845 }
846 match tag {
847 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => heading_count += 1,
848 "p" => {
849 paragraph_count += 1;
850 text_chars += element_text(&element).chars().count() as i32;
851 }
852 "ul" | "ol" => list_count += 1,
853 "pre" | "code" => {
854 code_count += 1;
855 text_chars += code_text(&element).chars().count() as i32;
856 }
857 "table" => table_count += 1,
858 "a" => link_count += 1,
859 _ => {}
860 }
861 }
862
863 let dense_text_score = text_chars / 120;
864 let link_penalty = (link_count - paragraph_count * 5).clamp(0, 14);
865
866 heading_count * 4
867 + paragraph_count * 5
868 + list_count * 2
869 + code_count * 5
870 + table_count * 3
871 + dense_text_score
872 - link_penalty
873 - boilerplate_penalty
874}
875
876fn extract_title(html: &Html, metadata: &HtmlMetadata, headings: &[HtmlHeading]) -> Option<String> {
877 first_text(html, "title")
878 .or_else(|| metadata.open_graph_title.clone())
879 .or_else(|| headings.first().map(|heading| heading.text.clone()))
880}
881
882fn extract_base_url(html: &Html) -> Option<Url> {
883 let selector = selector("base[href]")?;
884 let href = html
885 .select(&selector)
886 .next()
887 .and_then(|node| node.value().attr("href"))?;
888 Url::parse(href).ok()
889}
890
891fn extract_metadata(html: &Html, base_url: Option<&Url>) -> HtmlMetadata {
892 HtmlMetadata {
893 canonical_url: extract_link_href(html, "link[rel~=\"canonical\"]", base_url),
894 language: extract_language(html),
895 description: extract_meta_content(html, "meta[name=\"description\"]"),
896 open_graph_title: extract_meta_content(html, "meta[property=\"og:title\"]"),
897 open_graph_description: extract_meta_content(html, "meta[property=\"og:description\"]"),
898 }
899}
900
901fn extract_language(html: &Html) -> Option<String> {
902 let selector = selector("html[lang]")?;
903 html.select(&selector)
904 .next()
905 .and_then(|node| node.value().attr("lang"))
906 .map(str::trim)
907 .filter(|value| !value.is_empty())
908 .map(ToOwned::to_owned)
909}
910
911fn extract_link_href(html: &Html, query: &str, base_url: Option<&Url>) -> Option<String> {
912 let selector = selector(query)?;
913 html.select(&selector)
914 .next()
915 .and_then(|node| node.value().attr("href"))
916 .map(|href| resolve_url(href, base_url))
917}
918
919fn extract_meta_content(html: &Html, query: &str) -> Option<String> {
920 let selector = selector(query)?;
921 html.select(&selector)
922 .next()
923 .and_then(|node| node.value().attr("content"))
924 .map(clean_text)
925 .filter(|text| !text.is_empty())
926}
927
928fn first_text(html: &Html, query: &str) -> Option<String> {
929 let selector = selector(query)?;
930 html.select(&selector)
931 .next()
932 .map(|node| element_text(&node))
933 .filter(|text| !text.is_empty())
934}
935
936fn extract_nodes(
937 root: &ElementRef<'_>,
938 base_url: Option<&Url>,
939 layout_rules: &LayoutRules,
940) -> Vec<HtmlNode> {
941 let mut nodes = Vec::new();
942 extract_element_nodes(root, base_url, layout_rules, &mut nodes);
943 trim_layout_spacers(&mut nodes);
944 nodes
945}
946
947fn extract_element_nodes(
948 element: &ElementRef<'_>,
949 base_url: Option<&Url>,
950 layout_rules: &LayoutRules,
951 nodes: &mut Vec<HtmlNode>,
952) {
953 let tag = element.value().name();
954 if tag == "br" {
955 push_spacer(nodes, 1);
956 return;
957 }
958
959 if is_boilerplate_container(element) && !is_primary_content_container(element) {
960 return;
961 }
962
963 if is_content_element(tag) {
964 let spacing = layout_spacing(element, layout_rules);
965 push_spacer(nodes, spacing.before);
966 if let Some(node) = html_node_from_element(element, base_url) {
967 nodes.push(node);
968 }
969 push_spacer(nodes, spacing.after);
970 return;
971 }
972
973 let spacing = is_layout_boundary(tag).then(|| layout_spacing(element, layout_rules));
974 if let Some(spacing) = spacing {
975 push_spacer(nodes, spacing.before);
976 }
977
978 for child in element.children() {
979 if let Some(child_element) = ElementRef::wrap(child) {
980 extract_element_nodes(&child_element, base_url, layout_rules, nodes);
981 }
982 }
983
984 if let Some(spacing) = spacing {
985 push_spacer(nodes, spacing.after);
986 }
987}
988
989fn html_node_from_element(element: &ElementRef<'_>, base_url: Option<&Url>) -> Option<HtmlNode> {
990 match element.value().name() {
991 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
992 let text = element_text(element);
993 if text.is_empty() {
994 None
995 } else {
996 Some(HtmlNode::Heading {
997 level: heading_level(element.value().name()),
998 text,
999 })
1000 }
1001 }
1002 "p" => {
1003 let text = element_text(element);
1004 (!text.is_empty()
1005 && !is_anchor_only_paragraph(element, &text)
1006 && !is_boilerplate_paragraph(&text))
1007 .then_some(HtmlNode::Paragraph(text))
1008 }
1009 "blockquote" => {
1010 let text = element_text(element);
1011 (!text.is_empty()).then_some(HtmlNode::Paragraph(format!("> {text}")))
1012 }
1013 "ul" | "ol" => {
1014 let items = list_items(element);
1015 (!items.is_empty()).then_some(HtmlNode::List {
1016 ordered: element.value().name() == "ol",
1017 items,
1018 })
1019 }
1020 "pre" | "code" => {
1021 let code = code_text(element);
1022 (!code.trim().is_empty()).then_some(HtmlNode::CodeBlock {
1023 language: code_language(element),
1024 code,
1025 })
1026 }
1027 "table" => {
1028 let rows = table_rows(element);
1029 (!rows.is_empty()).then_some(HtmlNode::Table { rows })
1030 }
1031 "img" => Some(HtmlNode::Image {
1032 alt: image_alt(element),
1033 src: element
1034 .value()
1035 .attr("src")
1036 .map(|src| resolve_url(src, base_url)),
1037 }),
1038 "form" => Some(HtmlNode::Form(extract_form(element, base_url))),
1039 _ => None,
1040 }
1041}
1042
1043fn extract_secondary_sections(
1044 html: &Html,
1045 main_root: &ElementRef<'_>,
1046 base_url: Option<&Url>,
1047 layout_rules: &LayoutRules,
1048) -> Vec<HtmlNode> {
1049 let Some(selector) = selector(
1050 "nav, aside, footer, [role=\"navigation\"], [role=\"contentinfo\"], .sidebar, .related, #related, .comments, #comments",
1051 ) else {
1052 return Vec::new();
1053 };
1054
1055 html.select(&selector)
1056 .filter(|element| {
1057 !is_descendant_of(element, main_root)
1058 && !is_descendant_of(main_root, element)
1059 && element.id() != main_root.id()
1060 })
1061 .filter_map(|element| secondary_section_from_element(&element, base_url, layout_rules))
1062 .collect()
1063}
1064
1065fn secondary_section_from_element(
1066 element: &ElementRef<'_>,
1067 base_url: Option<&Url>,
1068 layout_rules: &LayoutRules,
1069) -> Option<HtmlNode> {
1070 let mut nodes = extract_nodes(element, base_url, layout_rules);
1071 if nodes.is_empty() {
1072 nodes.extend(
1073 extract_links(element, base_url)
1074 .into_iter()
1075 .map(HtmlNode::Link),
1076 );
1077 }
1078 trim_layout_spacers(&mut nodes);
1079
1080 (!nodes.is_empty()).then(|| HtmlNode::Section {
1081 role: section_role(element),
1082 title: section_title(element),
1083 collapsed: true,
1084 nodes,
1085 })
1086}
1087
1088fn is_descendant_of(element: &ElementRef<'_>, ancestor: &ElementRef<'_>) -> bool {
1089 let mut parent = element.parent();
1090 while let Some(node) = parent {
1091 if node.id() == ancestor.id() {
1092 return true;
1093 }
1094 parent = node.parent();
1095 }
1096 false
1097}
1098
1099fn section_role(element: &ElementRef<'_>) -> HtmlSectionRole {
1100 let names = element
1101 .value()
1102 .attr("id")
1103 .into_iter()
1104 .chain(element.value().attr("class"))
1105 .flat_map(str::split_whitespace)
1106 .map(str::to_ascii_lowercase)
1107 .collect::<Vec<_>>();
1108
1109 if names.iter().any(|name| name.contains("comment")) {
1110 return HtmlSectionRole::Comments;
1111 }
1112 if names.iter().any(|name| name.contains("related")) {
1113 return HtmlSectionRole::Related;
1114 }
1115
1116 match element.value().name() {
1117 "nav" => return HtmlSectionRole::Navigation,
1118 "aside" => return HtmlSectionRole::Aside,
1119 "footer" => return HtmlSectionRole::Footer,
1120 _ => {}
1121 }
1122
1123 match element.value().attr("role") {
1124 Some("navigation") => return HtmlSectionRole::Navigation,
1125 Some("contentinfo") => return HtmlSectionRole::Footer,
1126 _ => {}
1127 }
1128
1129 if names
1130 .iter()
1131 .any(|name| name.contains("side") || name.contains("rail"))
1132 {
1133 HtmlSectionRole::Aside
1134 } else {
1135 HtmlSectionRole::Unknown
1136 }
1137}
1138
1139fn section_title(element: &ElementRef<'_>) -> Option<String> {
1140 element
1141 .value()
1142 .attr("aria-label")
1143 .or_else(|| element.value().attr("title"))
1144 .map(clean_text)
1145 .filter(|title| !title.is_empty())
1146 .or_else(|| {
1147 let selector = selector("h1, h2, h3, h4, h5, h6")?;
1148 element
1149 .select(&selector)
1150 .next()
1151 .map(|heading| element_text(&heading))
1152 .filter(|title| !title.is_empty())
1153 })
1154}
1155
/// Number of spacer lines to emit around an element.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
struct LayoutSpacing {
    /// Spacer lines emitted before the element.
    before: u8,
    /// Spacer lines emitted after the element.
    after: u8,
}
1161
/// Spacing rules collected from the document's `<style>` elements.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
struct LayoutRules {
    /// All rules, in the order they were parsed.
    rules: Vec<LayoutRule>,
}
1166
/// One spacing rule: the spacing to apply to elements matched by a simple selector.
#[derive(Debug, Clone, PartialEq, Eq)]
struct LayoutRule {
    /// Which elements the rule applies to.
    selector: LayoutSelector,
    /// Spacing contributed by the rule.
    spacing: LayoutSpacing,
}
1172
/// A simple selector understood by the layout-rule matcher.
#[derive(Debug, Clone, PartialEq, Eq)]
enum LayoutSelector {
    /// Matches elements by tag name.
    Tag(String),
    /// Matches elements whose `class` attribute contains this class.
    Class(String),
    /// Matches the element whose `id` attribute equals this value.
    Id(String),
}
1179
/// Reports whether `tag` names an element that is turned into a content node directly
/// (headings, paragraphs, block quotes, lists, code, tables, images and forms).
fn is_content_element(tag: &str) -> bool {
    const CONTENT_TAGS: [&str; 15] = [
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "p",
        "blockquote",
        "ul",
        "ol",
        "pre",
        "code",
        "table",
        "img",
        "form",
    ];
    CONTENT_TAGS.contains(&tag)
}
1199
/// Reports whether `tag` names a container that may contribute layout spacing around
/// its children.
fn is_layout_boundary(tag: &str) -> bool {
    const BOUNDARY_TAGS: [&str; 6] = ["article", "section", "header", "footer", "aside", "div"];
    BOUNDARY_TAGS.contains(&tag)
}
1206
1207fn layout_spacing(element: &ElementRef<'_>, rules: &LayoutRules) -> LayoutSpacing {
1208 let mut spacing = default_layout_spacing(element.value().name());
1209
1210 if has_layout_hint_name(element) {
1211 spacing.before = spacing.before.max(1);
1212 spacing.after = spacing.after.max(1);
1213 }
1214
1215 for rule in rules.rules_for(element) {
1216 spacing.before = spacing.before.max(rule.spacing.before);
1217 spacing.after = spacing.after.max(rule.spacing.after);
1218 }
1219
1220 if let Some(style) = element.value().attr("style") {
1221 let inline_spacing = spacing_from_declarations(style);
1222 spacing.before = spacing.before.max(inline_spacing.before);
1223 spacing.after = spacing.after.max(inline_spacing.after);
1224 }
1225
1226 LayoutSpacing {
1227 before: spacing.before.min(MAX_LAYOUT_SPACER_LINES),
1228 after: spacing.after.min(MAX_LAYOUT_SPACER_LINES),
1229 }
1230}
1231
1232fn default_layout_spacing(tag: &str) -> LayoutSpacing {
1233 match tag {
1234 "article" | "section" | "header" | "footer" | "aside" => LayoutSpacing {
1235 before: 1,
1236 after: 1,
1237 },
1238 _ => LayoutSpacing::default(),
1239 }
1240}
1241
1242fn has_layout_hint_name(element: &ElementRef<'_>) -> bool {
1243 element
1244 .value()
1245 .attr("id")
1246 .into_iter()
1247 .chain(element.value().attr("class"))
1248 .flat_map(str::split_whitespace)
1249 .any(|name| {
1250 let name = name.to_ascii_lowercase();
1251 [
1252 "section", "hero", "intro", "outro", "spacer", "block", "panel", "card",
1253 ]
1254 .iter()
1255 .any(|hint| name.contains(hint))
1256 })
1257}
1258
1259fn is_primary_content_container(element: &ElementRef<'_>) -> bool {
1260 let tag = element.value().name();
1261 if matches!(tag, "main" | "article") {
1262 return true;
1263 }
1264 if matches!(element.value().attr("role"), Some("main")) {
1265 return true;
1266 }
1267 element
1268 .value()
1269 .attr("id")
1270 .into_iter()
1271 .chain(element.value().attr("class"))
1272 .flat_map(str::split_whitespace)
1273 .any(|name| {
1274 let lowered = name.to_ascii_lowercase();
1275 lowered.contains("content")
1276 || lowered.contains("article")
1277 || lowered.contains("post")
1278 || lowered.contains("entry")
1279 || lowered.contains("markdown")
1280 })
1281}
1282
1283fn is_boilerplate_container(element: &ElementRef<'_>) -> bool {
1284 let tag = element.value().name();
1285 if matches!(tag, "nav" | "footer") {
1286 return true;
1287 }
1288
1289 element
1290 .value()
1291 .attr("id")
1292 .into_iter()
1293 .chain(element.value().attr("class"))
1294 .flat_map(str::split_whitespace)
1295 .any(|name| {
1296 let lowered = name.to_ascii_lowercase();
1297 lowered.contains("nav")
1298 || lowered.contains("menu")
1299 || lowered.contains("footer")
1300 || lowered.contains("header")
1301 || lowered.contains("sidebar")
1302 || lowered.contains("related")
1303 || lowered.contains("breadcrumb")
1304 || lowered.contains("cookie")
1305 || lowered.contains("newsletter")
1306 || lowered.contains("subscribe")
1307 || lowered.contains("promo")
1308 || lowered.contains("advert")
1309 || lowered.contains("social")
1310 || lowered.contains("share")
1311 })
1312}
1313
/// Reports whether a paragraph is short boilerplate.
///
/// True when the text is at most 180 characters long and contains one of the known
/// boilerplate phrases (ASCII case-insensitive).
fn is_boilerplate_paragraph(text: &str) -> bool {
    const BOILERPLATE_PHRASES: [&str; 7] = [
        "sign up for",
        "subscribe",
        "cookie policy",
        "all rights reserved",
        "follow us",
        "share this",
        "advertisement",
    ];

    // Longer paragraphs are kept even when they mention one of the phrases.
    if text.chars().count() > 180 {
        return false;
    }
    let lowered = text.to_ascii_lowercase();
    BOILERPLATE_PHRASES
        .iter()
        .any(|phrase| lowered.contains(*phrase))
}
1325
1326impl LayoutRules {
1327 fn rules_for<'a>(
1328 &'a self,
1329 element: &'a ElementRef<'_>,
1330 ) -> impl Iterator<Item = &'a LayoutRule> {
1331 self.rules
1332 .iter()
1333 .filter(move |rule| rule.selector.matches(element))
1334 }
1335}
1336
1337impl LayoutSelector {
1338 fn matches(&self, element: &ElementRef<'_>) -> bool {
1339 match self {
1340 Self::Tag(tag) => element.value().name() == tag,
1341 Self::Class(class) => element
1342 .value()
1343 .attr("class")
1344 .unwrap_or_default()
1345 .split_whitespace()
1346 .any(|candidate| candidate == class),
1347 Self::Id(id) => element.value().attr("id") == Some(id.as_str()),
1348 }
1349 }
1350}
1351
1352fn extract_layout_rules(html: &Html) -> LayoutRules {
1353 let Some(selector) = selector("style") else {
1354 return LayoutRules::default();
1355 };
1356
1357 let mut rules = Vec::new();
1358 for style in html.select(&selector) {
1359 rules.extend(parse_stylesheet_rules(
1360 &style.text().collect::<Vec<_>>().join(" "),
1361 ));
1362 }
1363
1364 LayoutRules { rules }
1365}
1366
1367fn parse_stylesheet_rules(stylesheet: &str) -> Vec<LayoutRule> {
1368 stylesheet
1369 .split('}')
1370 .filter_map(|rule| {
1371 let (selectors, declarations) = rule.split_once('{')?;
1372 let spacing = spacing_from_declarations(declarations);
1373 if spacing.before == 0 && spacing.after == 0 {
1374 return None;
1375 }
1376 Some(
1377 selectors
1378 .split(',')
1379 .filter_map(parse_layout_selector)
1380 .map(move |selector| LayoutRule { selector, spacing })
1381 .collect::<Vec<_>>(),
1382 )
1383 })
1384 .flatten()
1385 .collect()
1386}
1387
1388fn parse_layout_selector(selector: &str) -> Option<LayoutSelector> {
1389 let selector = selector.trim();
1390 if selector.is_empty()
1391 || selector
1392 .chars()
1393 .any(|ch| matches!(ch, ' ' | '\t' | '\n' | '\r' | '>' | '+' | '~' | '[' | ':'))
1394 {
1395 return None;
1396 }
1397
1398 if let Some(class) = selector.strip_prefix('.') {
1399 (!class.is_empty()).then(|| LayoutSelector::Class(class.to_owned()))
1400 } else if let Some(id) = selector.strip_prefix('#') {
1401 (!id.is_empty()).then(|| LayoutSelector::Id(id.to_owned()))
1402 } else {
1403 selector
1404 .chars()
1405 .all(|ch| ch.is_ascii_alphanumeric() || ch == '-')
1406 .then(|| LayoutSelector::Tag(selector.to_ascii_lowercase()))
1407 }
1408}
1409
1410fn spacing_from_declarations(declarations: &str) -> LayoutSpacing {
1411 let mut spacing = LayoutSpacing::default();
1412 for declaration in declarations.split(';') {
1413 let Some((property, value)) = declaration.split_once(':') else {
1414 continue;
1415 };
1416 let property = property.trim().to_ascii_lowercase();
1417 let lines = css_length_to_lines(value.trim());
1418 if lines == 0 {
1419 continue;
1420 }
1421
1422 match property.as_str() {
1423 "margin-top" | "padding-top" => spacing.before = spacing.before.max(lines),
1424 "margin-bottom" | "padding-bottom" | "gap" | "row-gap" => {
1425 spacing.after = spacing.after.max(lines);
1426 }
1427 "margin" | "padding" => {
1428 spacing.before = spacing.before.max(lines);
1429 spacing.after = spacing.after.max(lines);
1430 }
1431 _ => {}
1432 }
1433 }
1434 spacing
1435}
1436
1437fn css_length_to_lines(value: &str) -> u8 {
1438 let value = value.trim().to_ascii_lowercase();
1439 if value == "0"
1440 || value.starts_with("0px")
1441 || value.starts_with("0rem")
1442 || value.starts_with("0em")
1443 {
1444 return 0;
1445 }
1446
1447 let Some(number) = first_css_number(&value) else {
1448 return 0;
1449 };
1450 if number <= 0.0 {
1451 return 0;
1452 }
1453
1454 let lines = if value.contains("rem") || value.contains("em") {
1455 if number >= 4.0 {
1456 3
1457 } else if number >= 2.0 {
1458 2
1459 } else {
1460 1
1461 }
1462 } else if value.contains("px") {
1463 if number >= 48.0 {
1464 3
1465 } else if number >= 28.0 {
1466 2
1467 } else if number >= 12.0 {
1468 1
1469 } else {
1470 0
1471 }
1472 } else if value.contains("vh") || value.contains("vw") {
1473 if number >= 14.0 {
1474 3
1475 } else if number >= 8.0 {
1476 2
1477 } else {
1478 1
1479 }
1480 } else {
1481 1
1482 };
1483
1484 lines.min(MAX_LAYOUT_SPACER_LINES)
1485}
1486
/// Extracts the first decimal number that appears in a CSS value.
///
/// The number is the first run of ASCII digits and `.` characters. A `-`
/// directly in front of that run makes the result negative, unless the `-`
/// itself follows an identifier character (as in `var(--space-2)`), in which
/// case it is treated as part of a name rather than a sign.
///
/// Returns `None` when the value holds no digits or the run is not a valid
/// number (for example `"."` or `"1.2.3"`).
fn first_css_number(value: &str) -> Option<f32> {
    fn is_numeric(ch: char) -> bool {
        ch.is_ascii_digit() || ch == '.'
    }

    let start = value.find(is_numeric)?;
    let rest = &value[start..];
    let end = rest.find(|ch: char| !is_numeric(ch)).unwrap_or(rest.len());
    let magnitude: f32 = rest[..end].parse().ok()?;

    // Honor a leading minus sign so negative lengths are not reported as
    // positive ones.
    let mut preceding = value[..start].chars().rev();
    let negative = preceding.next() == Some('-')
        && !preceding
            .next()
            .is_some_and(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_'));

    Some(if negative { -magnitude } else { magnitude })
}
1504
1505fn push_spacer(nodes: &mut Vec<HtmlNode>, lines: u8) {
1506 if lines == 0 {
1507 return;
1508 }
1509 let lines = lines.min(MAX_LAYOUT_SPACER_LINES);
1510 if let Some(HtmlNode::Spacer { lines: existing }) = nodes.last_mut() {
1511 *existing = (*existing).max(lines).min(MAX_LAYOUT_SPACER_LINES);
1512 } else {
1513 nodes.push(HtmlNode::Spacer { lines });
1514 }
1515}
1516
1517fn trim_layout_spacers(nodes: &mut Vec<HtmlNode>) {
1518 while matches!(nodes.first(), Some(HtmlNode::Spacer { .. })) {
1519 nodes.remove(0);
1520 }
1521 while matches!(nodes.last(), Some(HtmlNode::Spacer { .. })) {
1522 nodes.pop();
1523 }
1524}
1525
1526fn extract_form(form: &ElementRef<'_>, base_url: Option<&Url>) -> HtmlForm {
1527 let method = form
1528 .value()
1529 .attr("method")
1530 .map(clean_text)
1531 .filter(|method| !method.is_empty())
1532 .unwrap_or_else(|| "GET".to_owned())
1533 .to_ascii_uppercase();
1534 let action = form
1535 .value()
1536 .attr("action")
1537 .map(|action| resolve_url(action, base_url))
1538 .or_else(|| base_url.map(Url::to_string))
1539 .unwrap_or_default();
1540 let name = form
1541 .value()
1542 .attr("name")
1543 .or_else(|| form.value().attr("id"))
1544 .map(clean_text)
1545 .filter(|name| !name.is_empty())
1546 .unwrap_or_else(|| "form".to_owned());
1547
1548 HtmlForm {
1549 name,
1550 method,
1551 action,
1552 inputs: extract_inputs(form),
1553 buttons: extract_buttons(form),
1554 }
1555}
1556
1557fn extract_inputs(form: &ElementRef<'_>) -> Vec<HtmlInput> {
1558 let Some(selector) = selector("input[name], textarea[name], select[name]") else {
1559 return Vec::new();
1560 };
1561
1562 form.select(&selector)
1563 .filter_map(|input| {
1564 let name = input.value().attr("name").map(clean_text)?;
1565 (!name.is_empty()).then_some(HtmlInput {
1566 name,
1567 kind: input_kind(&input),
1568 value: input_value(&input),
1569 required: input.value().attr("required").is_some(),
1570 })
1571 })
1572 .collect()
1573}
1574
1575fn extract_buttons(form: &ElementRef<'_>) -> Vec<HtmlButton> {
1576 let Some(selector) = selector("button, input[type=\"submit\"], input[type=\"button\"]") else {
1577 return Vec::new();
1578 };
1579
1580 form.select(&selector)
1581 .map(|button| HtmlButton {
1582 name: button.value().attr("name").map(clean_text),
1583 value: button.value().attr("value").map(clean_text),
1584 label: button_label(&button),
1585 })
1586 .filter(|button| !button.label.is_empty() || button.name.is_some())
1587 .collect()
1588}
1589
1590fn input_kind(input: &ElementRef<'_>) -> String {
1591 match input.value().name() {
1592 "textarea" => "textarea".to_owned(),
1593 "select" => "select".to_owned(),
1594 _ => input
1595 .value()
1596 .attr("type")
1597 .map(clean_text)
1598 .filter(|kind| !kind.is_empty())
1599 .unwrap_or_else(|| "text".to_owned()),
1600 }
1601}
1602
1603fn input_value(input: &ElementRef<'_>) -> Option<String> {
1604 match input.value().name() {
1605 "textarea" => Some(element_text(input)).filter(|value| !value.is_empty()),
1606 "select" => {
1607 let selected = selector("option[selected]")
1608 .and_then(|selector| input.select(&selector).next())
1609 .and_then(|option| option_value(&option));
1610 selected.or_else(|| {
1611 selector("option")
1612 .and_then(|selector| input.select(&selector).next())
1613 .and_then(|option| option_value(&option))
1614 })
1615 }
1616 _ => input.value().attr("value").map(clean_text),
1617 }
1618}
1619
1620fn option_value(option: &ElementRef<'_>) -> Option<String> {
1621 option
1622 .value()
1623 .attr("value")
1624 .map(clean_text)
1625 .or_else(|| Some(element_text(option)))
1626 .filter(|value| !value.is_empty())
1627}
1628
1629fn button_label(button: &ElementRef<'_>) -> String {
1630 let text = element_text(button);
1631 if !text.is_empty() {
1632 return text;
1633 }
1634
1635 button
1636 .value()
1637 .attr("value")
1638 .map(clean_text)
1639 .filter(|value| !value.is_empty())
1640 .unwrap_or_else(|| "submit".to_owned())
1641}
1642
1643fn is_anchor_only_paragraph(element: &ElementRef<'_>, paragraph_text: &str) -> bool {
1644 let Some(selector) = selector("a[href]") else {
1645 return false;
1646 };
1647 let link_text = element
1648 .select(&selector)
1649 .map(|link| element_text(&link))
1650 .filter(|text| !text.is_empty())
1651 .collect::<Vec<_>>()
1652 .join(" ");
1653
1654 !link_text.is_empty() && link_text == paragraph_text
1655}
1656
/// Converts a heading tag name such as `"h3"` into its level.
///
/// Anything that is not `h` followed by a number from 1 to 6 maps to
/// level 1.
fn heading_level(tag: &str) -> u8 {
    match tag.strip_prefix('h').map(str::parse::<u8>) {
        Some(Ok(level @ 1..=6)) => level,
        _ => 1,
    }
}
1663
1664fn extract_links(root: &ElementRef<'_>, base_url: Option<&Url>) -> Vec<HtmlLink> {
1665 let Some(selector) = selector("a[href]") else {
1666 return Vec::new();
1667 };
1668
1669 root.select(&selector)
1670 .filter_map(|element| {
1671 let text = element_text(&element);
1672 let href = element.value().attr("href")?;
1673 (!text.is_empty()).then_some(HtmlLink {
1674 text,
1675 href: resolve_url(href, base_url),
1676 })
1677 })
1678 .collect()
1679}
1680
1681fn table_rows(table: &ElementRef<'_>) -> Vec<Vec<String>> {
1682 let Some(row_selector) = selector("tr") else {
1683 return Vec::new();
1684 };
1685 let Some(cell_selector) = selector("th, td") else {
1686 return Vec::new();
1687 };
1688
1689 table
1690 .select(&row_selector)
1691 .map(|row| {
1692 row.select(&cell_selector)
1693 .map(|cell| element_text(&cell))
1694 .filter(|text| !text.is_empty())
1695 .collect::<Vec<_>>()
1696 })
1697 .filter(|row| !row.is_empty())
1698 .collect()
1699}
1700
1701fn list_items(list: &ElementRef<'_>) -> Vec<String> {
1702 let Some(item_selector) = selector("li") else {
1703 return Vec::new();
1704 };
1705
1706 list.select(&item_selector)
1707 .filter(|item| {
1708 item.parent()
1709 .and_then(ElementRef::wrap)
1710 .is_some_and(|parent| parent.id() == list.id())
1711 })
1712 .map(|item| element_text(&item))
1713 .filter(|text| !text.is_empty())
1714 .collect()
1715}
1716
1717fn image_alt(image: &ElementRef<'_>) -> String {
1718 image
1719 .value()
1720 .attr("alt")
1721 .or_else(|| image.value().attr("title"))
1722 .map(clean_text)
1723 .filter(|text| !text.is_empty())
1724 .unwrap_or_else(|| "image".to_owned())
1725}
1726
1727fn code_language(element: &ElementRef<'_>) -> Option<String> {
1728 let class = element.value().attr("class").unwrap_or_default();
1729 class
1730 .split_whitespace()
1731 .find_map(language_from_class)
1732 .or_else(|| {
1733 let selector = selector("code")?;
1734 element.select(&selector).find_map(|code| {
1735 code.value()
1736 .attr("class")
1737 .unwrap_or_default()
1738 .split_whitespace()
1739 .find_map(language_from_class)
1740 })
1741 })
1742}
1743
/// Extracts a language name from a `language-*` or `lang-*` class token.
///
/// The prefixes are matched case-sensitively. Returns `None` when the token
/// has neither prefix or when nothing follows the prefix.
fn language_from_class(class: &str) -> Option<String> {
    let language = match class.strip_prefix("language-") {
        Some(rest) => rest,
        None => class.strip_prefix("lang-")?,
    };

    if language.is_empty() {
        None
    } else {
        Some(language.to_owned())
    }
}
1751
1752fn resolve_url(input: &str, base_url: Option<&Url>) -> String {
1753 let trimmed = input.trim();
1754 if let Ok(url) = Url::parse(trimmed) {
1755 return url.to_string();
1756 }
1757
1758 base_url
1759 .and_then(|base| base.join(trimmed).ok())
1760 .map(|url| url.to_string())
1761 .unwrap_or_else(|| trimmed.to_owned())
1762}
1763
1764fn body_text_from_nodes(nodes: &[HtmlNode]) -> String {
1765 let parts = nodes
1766 .iter()
1767 .filter_map(|node| match node {
1768 HtmlNode::Heading { text, .. }
1769 | HtmlNode::Paragraph(text)
1770 | HtmlNode::CodeBlock { code: text, .. } => Some(text.clone()),
1771 HtmlNode::List { items, .. } => Some(items.join(" ")).filter(|text| !text.is_empty()),
1772 HtmlNode::Link(link) => Some(link.text.clone()).filter(|text| !text.is_empty()),
1773 HtmlNode::Form(_form) => None,
1774 HtmlNode::Spacer { .. } => None,
1775 HtmlNode::Section { nodes, .. } => {
1776 Some(body_text_from_nodes(nodes)).filter(|text| !text.is_empty())
1777 }
1778 _ => None,
1779 })
1780 .collect::<Vec<_>>();
1781
1782 clean_text(parts.join(" "))
1783}
1784
1785fn element_text(element: &ElementRef<'_>) -> String {
1786 clean_text(element.text().collect::<Vec<_>>().join(" "))
1787}
1788
1789fn code_text(element: &ElementRef<'_>) -> String {
1790 element.text().collect::<String>()
1791}
1792
/// Normalizes whitespace: leading and trailing whitespace is removed and
/// every internal run of whitespace becomes a single space.
fn clean_text(input: impl AsRef<str>) -> String {
    let mut cleaned = String::new();
    for word in input.as_ref().split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}
1800
// Unit tests for HTML parsing (`parse_html`) and for index-manifest discovery
// and validation. Every test feeds a small inline document or manifest and
// asserts on the parsed result.
#[cfg(test)]
mod tests {
    use super::{
        HtmlNode, HtmlSectionRole, IndexDateStyle, discover_index_manifest_link_from_html,
        discover_index_manifest_link_from_http_link_header, parse_html, parse_index_manifest,
        well_known_index_manifest_url,
    };

    // The `<title>` element becomes the document title.
    #[test]
    fn extracts_title() {
        let doc = parse_html("<html><title>Hello</title><body></body></html>");
        assert_eq!(doc.title.as_deref(), Some("Hello"));
    }

    // Headings are listed in document order with their numeric level.
    #[test]
    fn extracts_headings_with_levels() {
        let doc = parse_html("<main><h1>Main</h1><h2>Sub</h2></main>");
        assert_eq!(doc.headings.len(), 2);
        assert_eq!(doc.headings[0].level, 1);
        assert_eq!(doc.headings[0].text, "Main");
        assert_eq!(doc.headings[1].level, 2);
        assert_eq!(doc.headings[1].text, "Sub");
    }

    // Only the link inside `<main>` is reported; the `<nav>` link is not.
    // The expected href carries a trailing slash after URL normalization.
    #[test]
    fn extracts_links_from_main_content() {
        let doc = parse_html(
            r#"<nav><a href="/noise">Noise</a></nav><main><a href="https://example.com">Example</a></main>"#,
        );
        assert_eq!(doc.links.len(), 1);
        assert_eq!(doc.links[0].text, "Example");
        assert_eq!(doc.links[0].href, "https://example.com/");
    }

    // `<main>` content comes first; nav, aside and footer appear as collapsed
    // sections with their respective roles.
    #[test]
    fn prioritizes_explicit_main_and_collapses_secondary_regions() {
        let doc = parse_html(
            r#"
            <body>
              <nav aria-label="Site"><a href="/docs">Docs</a><a href="/about">About</a></nav>
              <main><h1>Main Article</h1><p>Readable body.</p></main>
              <aside class="related"><h2>Related</h2><a href="/next">Next</a></aside>
              <footer><a href="/license">License</a></footer>
            </body>
            "#,
        );

        assert!(matches!(
            doc.nodes.first(),
            Some(HtmlNode::Heading { text, .. }) if text == "Main Article"
        ));
        // The navigation section takes its title from `aria-label`.
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::Section {
                role: HtmlSectionRole::Navigation,
                title: Some(title),
                collapsed: true,
                nodes
            } if title == "Site" && nodes.len() == 2
        )));
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::Section {
                role: HtmlSectionRole::Related,
                collapsed: true,
                ..
            }
        )));
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::Section {
                role: HtmlSectionRole::Footer,
                collapsed: true,
                ..
            }
        )));
    }

    // A `<main>` that only holds navigation-like links loses against a
    // text-dense `<article>` when the content root is chosen.
    #[test]
    fn main_root_scoring_prefers_dense_article_over_chrome_like_main() {
        let doc = parse_html(
            r#"
            <body>
              <main class="top-nav">
                <a href="/home">Home</a>
                <a href="/pricing">Pricing</a>
                <a href="/docs">Docs</a>
              </main>
              <article id="story">
                <h1>Deep Story</h1>
                <p>This paragraph carries the actual article payload for readers.</p>
                <p>Second paragraph keeps the dense main-content region obvious.</p>
              </article>
            </body>
            "#,
        );

        assert!(matches!(
            doc.nodes.first(),
            Some(HtmlNode::Heading { text, .. }) if text == "Deep Story"
        ));
        assert!(!doc.body_text.contains("Pricing"));
    }

    // Newsletter and related-link containers inside `<main>` are kept out of
    // the body text while the regular paragraph survives.
    #[test]
    fn suppresses_boilerplate_containers_and_paragraphs_inside_main() {
        let doc = parse_html(
            r#"
            <main>
              <p>Primary body remains.</p>
              <div class="newsletter signup">
                <p>Sign up for updates and subscribe.</p>
              </div>
              <div class="related">
                <a href="/related">Related link</a>
              </div>
            </main>
            "#,
        );

        assert!(doc.nodes.iter().any(
            |node| matches!(node, HtmlNode::Paragraph(text) if text == "Primary body remains.")
        ));
        assert!(!doc.body_text.contains("Sign up for updates"));
        assert!(!doc.body_text.contains("Related link"));
    }

    // A `<br>` between two paragraphs produces a one-line spacer node.
    #[test]
    fn preserves_br_boundaries_as_spacing_hints() {
        let doc = parse_html("<main><p>First line.</p><br><p>Second line.</p></main>");
        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::Spacer { lines: 1 }))
        );
    }

    // Code blocks, lists, tables and images each map to a dedicated node.
    #[test]
    fn extracts_structured_nodes() {
        let doc = parse_html(
            r#"
            <main>
              <pre><code class="language-rust">fn main() {}</code></pre>
              <ul><li>Read</li><li>Search</li></ul>
              <table><tr><th>Name</th></tr><tr><td>Index</td></tr></table>
              <img src="/logo.png" alt="Index logo">
            </main>
            "#,
        );

        // The language is taken from the nested `<code>` element's class.
        assert!(matches!(
            doc.nodes.first(),
            Some(HtmlNode::CodeBlock {
                language: Some(language),
                ..
            }) if language == "rust"
        ));
        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::List { ordered: false, items } if items == &vec!["Read".to_owned(), "Search".to_owned()]))
        );
        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::Table { rows } if rows.len() == 2))
        );
        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::Image { alt, .. } if alt == "Index logo"))
        );
    }

    // Whitespace and newlines inside `<pre>`/`<code>` are kept verbatim.
    #[test]
    fn preserves_pre_and_code_text_whitespace() {
        let doc = parse_html(
            "<main><pre><code class=\"language-rust\">fn main() {\n println!(\"hi\");\n}</code></pre><code> raw\n block</code></main>",
        );

        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::CodeBlock { language: Some(language), code }
                if language == "rust" && code == "fn main() {\n println!(\"hi\");\n}"
        )));
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::CodeBlock { language: None, code } if code == " raw\n block"
        )));
    }

    // A blockquote becomes a paragraph prefixed with "> ".
    #[test]
    fn preserves_blockquote_as_quoted_paragraph() {
        let doc = parse_html("<main><blockquote>Quoted reply text.</blockquote></main>");
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::Paragraph(text) if text == "> Quoted reply text."
        )));
    }

    // Spacing declared in `<style>` rules yields spacer nodes of 1 to 3
    // lines, and no spacer is left at the start or end of the node list.
    #[test]
    fn extracts_bounded_layout_spacing_from_css_and_block_boundaries() {
        let doc = parse_html(
            r#"
            <html lang="en-US">
              <head>
                <style>
                  .hero { margin-bottom: 60px; }
                  .chapter { padding-top: 2rem; padding-bottom: 6rem; }
                  p { margin-bottom: 1rem; }
                </style>
              </head>
              <main>
                <section class="hero"><h1>Landing</h1><p>Intro.</p></section>
                <section class="chapter"><p>Chapter body.</p></section>
              </main>
            </html>
            "#,
        );

        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::Spacer { lines } if (1..=3).contains(lines)))
        );
        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::Spacer { lines: 3 }))
        );
        assert!(matches!(
            doc.nodes.first(),
            Some(HtmlNode::Heading { text, .. }) if text == "Landing"
        ));
        assert!(matches!(
            doc.nodes.last(),
            Some(HtmlNode::Paragraph(text)) if text == "Chapter body."
        ));
    }

    // Metadata is read from `<head>`; the canonical URL, link hrefs and image
    // sources are all resolved against `<base href>`.
    #[test]
    fn extracts_metadata_and_resolves_relative_urls_against_base() {
        let doc = parse_html(
            r#"
            <html lang="en-US">
              <head>
                <base href="https://example.com/docs/">
                <link rel="canonical" href="guide.html">
                <meta name="description" content="A calm reader">
                <meta property="og:title" content="OG Title">
                <meta property="og:description" content="OG Description">
              </head>
              <main><a href="chapter.html">Chapter</a><img src="img.png" alt="Image"></main>
            </html>
            "#,
        );

        assert_eq!(
            doc.metadata.canonical_url.as_deref(),
            Some("https://example.com/docs/guide.html")
        );
        assert_eq!(doc.metadata.language.as_deref(), Some("en-US"));
        assert_eq!(doc.metadata.description.as_deref(), Some("A calm reader"));
        assert_eq!(doc.metadata.open_graph_title.as_deref(), Some("OG Title"));
        assert_eq!(
            doc.metadata.open_graph_description.as_deref(),
            Some("OG Description")
        );
        assert_eq!(
            doc.links.first().map(|link| link.href.as_str()),
            Some("https://example.com/docs/chapter.html")
        );
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::Image { src: Some(src), .. } if src == "https://example.com/docs/img.png"
        )));
    }

    // A form is exposed both in `doc.forms` and as a `Form` node: the name
    // comes from `id`, the method is upper-cased and the action is resolved
    // against `<base href>`.
    #[test]
    fn extracts_forms_inputs_buttons_and_resolves_actions() {
        let doc = parse_html(
            r#"
            <html>
              <head><base href="https://example.com/docs/"></head>
              <main>
                <form id="search" method="get" action="../search">
                  <input type="search" name="q" required>
                  <input type="hidden" name="source" value="docs">
                  <button type="submit" name="go" value="1">Search</button>
                </form>
              </main>
            </html>
            "#,
        );

        assert_eq!(doc.forms.len(), 1);
        let form = &doc.forms[0];
        assert_eq!(form.name, "search");
        assert_eq!(form.method, "GET");
        assert_eq!(form.action, "https://example.com/search");
        assert_eq!(form.inputs.len(), 2);
        assert!(
            form.inputs
                .iter()
                .any(|input| input.name == "q" && input.required)
        );
        assert!(
            form.buttons
                .iter()
                .any(|button| { button.name.as_deref() == Some("go") && button.label == "Search" })
        );
        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::Form(form) if form.name == "search"))
        );
    }

    // A select reports its `selected` option; without one it reports the
    // first option, using the option text when no `value` attribute exists.
    #[test]
    fn extracts_select_values_for_form_inputs() {
        let doc = parse_html(
            r#"
            <main>
              <form id="filters" action="/search">
                <select name="sort">
                  <option value="relevance">Relevance</option>
                  <option value="recent" selected>Most recent</option>
                </select>
                <select name="view">
                  <option>compact</option>
                  <option value="expanded">expanded</option>
                </select>
              </form>
            </main>
            "#,
        );

        let form = &doc.forms[0];
        assert!(
            form.inputs
                .iter()
                .any(|input| input.name == "sort" && input.value.as_deref() == Some("recent"))
        );
        assert!(
            form.inputs
                .iter()
                .any(|input| input.name == "view" && input.value.as_deref() == Some("compact"))
        );
    }

    // A paragraph consisting solely of a link yields no paragraph node, but
    // the link itself is still collected.
    #[test]
    fn drops_anchor_only_paragraph_nodes_but_keeps_links() {
        let doc = parse_html(r#"<main><p><a href="https://example.com">Read more</a></p></main>"#);
        assert!(!doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::Paragraph(text) if text == "Read more"
        )));
        assert_eq!(doc.links.len(), 1);
    }

    // Unclosed tags still parse. With no `<title>` present, the title equals
    // the full text of the unclosed `<h1>`.
    #[test]
    fn malformed_html_does_not_panic() {
        let doc = parse_html("<main><h1>Broken<p>Still readable<a href=\"/x\">link");
        assert_eq!(doc.title.as_deref(), Some("Broken Still readable link"));
        assert!(!doc.body_text.is_empty());
    }

    // The manifest URL can be derived from the well-known path, from a
    // `<link rel="index-manifest">` element, or from an HTTP `Link` header;
    // relative references are resolved against the page URL.
    #[test]
    fn index_manifest_discovery_supports_well_known_html_and_header_links() {
        let page_url = "https://example.org/docs/guide";
        assert_eq!(
            well_known_index_manifest_url(page_url).as_deref(),
            Some("https://example.org/.well-known/index.idx")
        );
        assert_eq!(
            discover_index_manifest_link_from_html(
                r#"<html><head><link rel="index-manifest preload" href="/manifests/site.idx"></head></html>"#,
                page_url
            )
            .as_deref(),
            Some("https://example.org/manifests/site.idx")
        );
        assert_eq!(
            discover_index_manifest_link_from_http_link_header(
                r#"</meta/index.idx>; rel="index-manifest"; type="application/json""#,
                page_url
            )
            .as_deref(),
            Some("https://example.org/meta/index.idx")
        );
    }

    // A same-origin manifest whose scope covers the page parses into typed
    // hints, including the date style.
    #[test]
    fn parse_index_manifest_validates_same_origin_scope_and_date_styles()
    -> Result<(), Box<dyn std::error::Error>> {
        let manifest = parse_index_manifest(
            r#"{
                "version": "index.idx/v1",
                "scope": "/docs",
                "content": { "main_selector": "main article" },
                "regions": [{ "role": "related", "selector": "aside.related", "collapsed": true }],
                "fields": [{ "name": "updated", "label": "Updated" }],
                "forms": [{ "name": "search", "selector": "form.search", "note": "Public search" }],
                "dates": [{ "field": "updated", "style": "date" }]
            }"#,
            "https://example.org/.well-known/index.idx",
            "https://example.org/docs/guide",
        )?;

        assert_eq!(manifest.version, "index.idx/v1");
        assert_eq!(manifest.scope, "/docs");
        assert_eq!(
            manifest.content.main_selector.as_deref(),
            Some("main article")
        );
        assert_eq!(manifest.regions.len(), 1);
        assert_eq!(manifest.fields.len(), 1);
        assert_eq!(manifest.forms.len(), 1);
        assert_eq!(manifest.dates.len(), 1);
        assert_eq!(manifest.dates[0].style, IndexDateStyle::Date);
        Ok(())
    }

    // Three rejection paths: a manifest on a different origin, a scope
    // without a leading slash, and a scope that does not cover the page.
    #[test]
    fn parse_index_manifest_rejects_cross_origin_and_invalid_scope() {
        let cross_origin = parse_index_manifest(
            r#"{"version":"index.idx/v1"}"#,
            "https://cdn.example.org/index.idx",
            "https://example.org/docs/guide",
        );
        assert!(matches!(
            cross_origin,
            Err(super::IndexManifestError::CrossOrigin { .. })
        ));

        let invalid_scope = parse_index_manifest(
            r#"{"version":"index.idx/v1","scope":"docs"}"#,
            "https://example.org/index.idx",
            "https://example.org/docs/guide",
        );
        assert!(matches!(
            invalid_scope,
            Err(super::IndexManifestError::InvalidScope(_))
        ));

        let out_of_scope = parse_index_manifest(
            r#"{"version":"index.idx/v1","scope":"/blog"}"#,
            "https://example.org/index.idx",
            "https://example.org/docs/guide",
        );
        assert!(matches!(
            out_of_scope,
            Err(super::IndexManifestError::OutOfScope { .. })
        ));
    }
}