Skip to main content

index_dom/
lib.rs

1//! HTML parsing boundary for Index.
2//!
3//! Milestone 1 uses `scraper` to parse hostile, malformed static HTML into a
4//! small semantic representation. Terminal rendering concerns stay out of this
5//! crate.
6
7use std::fmt::{Display, Formatter};
8
9use scraper::{ElementRef, Html, Selector};
10use serde::Deserialize;
11use url::Url;
12
/// Upper bound for the `lines` value of a layout spacer.
// NOTE(review): not referenced in the visible part of this file; presumably
// enforced wherever spacers are pushed — confirm.
const MAX_LAYOUT_SPACER_LINES: u8 = 3;

/// The only manifest `version` string that `parse_index_manifest` accepts.
const INDEX_MANIFEST_VERSION: &str = "index.idx/v1";

/// Largest manifest body, in bytes, that will be parsed (32 KiB).
const MAX_MANIFEST_BYTES: usize = 32 * 1024;

/// Maximum number of entries allowed in each hint list of a manifest.
const MAX_MANIFEST_HINTS: usize = 64;

/// Maximum byte length of a trimmed hint string or scope value.
const MAX_MANIFEST_STRING_LEN: usize = 256;

/// Maximum number of `>`, `+`, `~`, `[`, `]`, `:`, `*` and `#` characters
/// tolerated in a manifest-supplied selector.
const MAX_MANIFEST_SELECTOR_COMPLEXITY: usize = 16;
19
20/// Parsed `index.idx` manifest.
21#[derive(Debug, Clone, PartialEq, Eq)]
22pub struct IndexManifest {
23    /// Protocol version.
24    pub version: String,
25    /// Manifest source URL.
26    pub source_url: String,
27    /// Scope path prefix.
28    pub scope: String,
29    /// Content presentation hints.
30    pub content: IndexContentHint,
31    /// Region hints.
32    pub regions: Vec<IndexRegionHint>,
33    /// Field hints.
34    pub fields: Vec<IndexFieldHint>,
35    /// Form hints.
36    pub forms: Vec<IndexFormHint>,
37    /// Date hints.
38    pub dates: Vec<IndexDateHint>,
39}
40
/// Hints that apply to the page content as a whole.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct IndexContentHint {
    /// CSS selector the manifest prefers for the main content, if it gave one.
    pub main_selector: Option<String>,
}
47
/// One region hint from a manifest.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct IndexRegionHint {
    /// Stable role label, e.g. `main`, `navigation` or `related`.
    pub role: String,
    /// CSS selector that identifies the region.
    pub selector: String,
    /// `true` when the region should start out collapsed.
    pub collapsed: bool,
}
58
/// One field hint from a manifest.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct IndexFieldHint {
    /// Stable name of the field.
    pub name: String,
    /// Label for the field, when the manifest provides one.
    pub label: Option<String>,
}
67
/// One form hint from a manifest.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct IndexFormHint {
    /// Stable name of the form.
    pub name: String,
    /// CSS selector for the form, when the manifest provides one.
    pub selector: Option<String>,
    /// Short free-text note, when the manifest provides one.
    pub note: Option<String>,
}
78
/// How a date field should be presented.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum IndexDateStyle {
    /// Show the date only.
    Date,
    /// Show both the date and the time.
    DateTime,
}

impl IndexDateStyle {
    /// Parses a manifest style keyword.
    ///
    /// Surrounding whitespace is ignored and matching is ASCII
    /// case-insensitive. `date` maps to [`IndexDateStyle::Date`]; `datetime`
    /// and `date-time` map to [`IndexDateStyle::DateTime`]. Anything else
    /// yields `None`.
    fn parse(input: &str) -> Option<Self> {
        let keyword = input.trim();
        let is = |candidate: &str| keyword.eq_ignore_ascii_case(candidate);
        if is("date") {
            Some(Self::Date)
        } else if is("datetime") || is("date-time") {
            Some(Self::DateTime)
        } else {
            None
        }
    }
}
97
98/// Date hint entry.
99#[derive(Debug, Clone, PartialEq, Eq)]
100pub struct IndexDateHint {
101    /// Field name this hint applies to.
102    pub field: String,
103    /// Requested style.
104    pub style: IndexDateStyle,
105}
106
/// Reasons a manifest can be rejected while parsing or validating it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum IndexManifestError {
    /// The manifest body is larger than the size limit.
    TooLarge {
        /// Largest accepted body size, in bytes.
        max_bytes: usize,
        /// Size of the rejected body, in bytes.
        actual_bytes: usize,
    },
    /// The body is not valid manifest JSON; carries the parser's message.
    InvalidJson(String),
    /// The manifest declares a version this crate does not support.
    UnsupportedVersion(String),
    /// The manifest source URL could not be parsed.
    InvalidSourceUrl(String),
    /// The page URL could not be parsed.
    InvalidPageUrl(String),
    /// The manifest was not served from the page's origin.
    CrossOrigin {
        /// URL the manifest came from.
        source_url: String,
        /// URL of the page.
        page_url: String,
    },
    /// The scope value is malformed.
    InvalidScope(String),
    /// The page path is not covered by the manifest scope.
    OutOfScope {
        /// Scope declared by the manifest.
        scope: String,
        /// Path of the requested page.
        page_path: String,
    },
    /// A hint list has more entries than allowed.
    TooManyHints {
        /// Category of the offending hint list.
        kind: &'static str,
        /// Largest accepted number of entries for that category.
        max: usize,
    },
    /// A hint value did not pass validation.
    InvalidHint {
        /// Category of the offending hint.
        kind: &'static str,
        /// Why the value was rejected.
        reason: String,
    },
}

impl Display for IndexManifestError {
    /// Writes a one-line, human-readable description of the error.
    fn fmt(&self, out: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::TooLarge { max_bytes, actual_bytes } => write!(
                out,
                "manifest exceeds limit: {actual_bytes} bytes (max {max_bytes})"
            ),
            Self::InvalidJson(error) => write!(out, "manifest JSON is invalid: {error}"),
            Self::UnsupportedVersion(version) => {
                write!(out, "unsupported manifest version: {version}")
            }
            Self::InvalidSourceUrl(url) => write!(out, "manifest source URL is invalid: {url}"),
            Self::InvalidPageUrl(url) => write!(out, "page URL is invalid: {url}"),
            Self::CrossOrigin { source_url, page_url } => write!(
                out,
                "manifest source must be same-origin: {source_url} vs {page_url}"
            ),
            Self::InvalidScope(scope) => write!(out, "manifest scope is invalid: {scope}"),
            Self::OutOfScope { scope, page_path } => {
                write!(out, "page path {page_path} is outside manifest scope {scope}")
            }
            Self::TooManyHints { kind, max } => {
                write!(out, "manifest has too many {kind} hints (max {max})")
            }
            Self::InvalidHint { kind, reason } => {
                write!(out, "manifest {kind} hint is invalid: {reason}")
            }
        }
    }
}

impl std::error::Error for IndexManifestError {}
197
198#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Default)]
199struct RawIndexManifest {
200    version: String,
201    #[serde(default)]
202    scope: Option<String>,
203    #[serde(default)]
204    content: RawIndexContentHint,
205    #[serde(default)]
206    regions: Vec<RawIndexRegionHint>,
207    #[serde(default)]
208    fields: Vec<RawIndexFieldHint>,
209    #[serde(default)]
210    forms: Vec<RawIndexFormHint>,
211    #[serde(default)]
212    dates: Vec<RawIndexDateHint>,
213}
214
215#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Default)]
216struct RawIndexContentHint {
217    #[serde(default)]
218    main_selector: Option<String>,
219}
220
221#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
222struct RawIndexRegionHint {
223    role: String,
224    selector: String,
225    #[serde(default)]
226    collapsed: bool,
227}
228
229#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
230struct RawIndexFieldHint {
231    name: String,
232    #[serde(default)]
233    label: Option<String>,
234}
235
236#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
237struct RawIndexFormHint {
238    name: String,
239    #[serde(default)]
240    selector: Option<String>,
241    #[serde(default)]
242    note: Option<String>,
243}
244
245#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
246struct RawIndexDateHint {
247    field: String,
248    style: String,
249}
250
251/// Returns the canonical same-origin `/.well-known/index.idx` candidate URL.
252#[must_use]
253pub fn well_known_index_manifest_url(page_url: &str) -> Option<String> {
254    let mut url = Url::parse(page_url).ok()?;
255    url.set_path("/.well-known/index.idx");
256    url.set_query(None);
257    url.set_fragment(None);
258    Some(url.to_string())
259}
260
261/// Discovers a manifest URL from HTML `<link rel=\"index-manifest\">`.
262#[must_use]
263pub fn discover_index_manifest_link_from_html(html: &str, page_url: &str) -> Option<String> {
264    let base = Url::parse(page_url).ok()?;
265    let document = Html::parse_document(html);
266    let selector = selector("link[rel][href]")?;
267    for link in document.select(&selector) {
268        let rel = link.value().attr("rel").unwrap_or_default();
269        let is_manifest = rel
270            .split(|c: char| c.is_whitespace() || c == ',')
271            .any(|token| token.eq_ignore_ascii_case("index-manifest"));
272        if !is_manifest {
273            continue;
274        }
275        let href = link.value().attr("href")?;
276        let joined = base.join(href).ok()?;
277        return Some(joined.to_string());
278    }
279    None
280}
281
282/// Discovers a manifest URL from an HTTP `Link` header value.
283#[must_use]
284pub fn discover_index_manifest_link_from_http_link_header(
285    header_value: &str,
286    page_url: &str,
287) -> Option<String> {
288    let base = Url::parse(page_url).ok()?;
289    for chunk in header_value.split(',') {
290        let trimmed = chunk.trim();
291        let Some((target_part, params_part)) = trimmed.split_once('>') else {
292            continue;
293        };
294        let target = target_part.trim().strip_prefix('<')?;
295        let is_manifest = params_part
296            .split(';')
297            .map(str::trim)
298            .filter_map(|part| part.strip_prefix("rel="))
299            .map(|rel| rel.trim_matches('"'))
300            .any(|rel| {
301                rel.split_whitespace()
302                    .any(|token| token.eq_ignore_ascii_case("index-manifest"))
303            });
304        if !is_manifest {
305            continue;
306        }
307        let joined = base.join(target).ok()?;
308        return Some(joined.to_string());
309    }
310    None
311}
312
313/// Parses and validates an `index.idx` manifest body.
314pub fn parse_index_manifest(
315    input: &str,
316    source_url: &str,
317    page_url: &str,
318) -> Result<IndexManifest, IndexManifestError> {
319    let actual_bytes = input.len();
320    if actual_bytes > MAX_MANIFEST_BYTES {
321        return Err(IndexManifestError::TooLarge {
322            max_bytes: MAX_MANIFEST_BYTES,
323            actual_bytes,
324        });
325    }
326    let source = Url::parse(source_url)
327        .map_err(|_| IndexManifestError::InvalidSourceUrl(source_url.to_owned()))?;
328    let page = Url::parse(page_url)
329        .map_err(|_| IndexManifestError::InvalidPageUrl(page_url.to_owned()))?;
330    if !same_origin(&source, &page) {
331        return Err(IndexManifestError::CrossOrigin {
332            source_url: source_url.to_owned(),
333            page_url: page_url.to_owned(),
334        });
335    }
336
337    let raw = serde_json::from_str::<RawIndexManifest>(input)
338        .map_err(|error| IndexManifestError::InvalidJson(error.to_string()))?;
339    if raw.version != INDEX_MANIFEST_VERSION {
340        return Err(IndexManifestError::UnsupportedVersion(raw.version));
341    }
342
343    let scope = normalize_scope(raw.scope.as_deref(), page.path())?;
344    if !page.path().starts_with(&scope) {
345        return Err(IndexManifestError::OutOfScope {
346            scope,
347            page_path: page.path().to_owned(),
348        });
349    }
350
351    if raw.regions.len() > MAX_MANIFEST_HINTS {
352        return Err(IndexManifestError::TooManyHints {
353            kind: "region",
354            max: MAX_MANIFEST_HINTS,
355        });
356    }
357    if raw.fields.len() > MAX_MANIFEST_HINTS {
358        return Err(IndexManifestError::TooManyHints {
359            kind: "field",
360            max: MAX_MANIFEST_HINTS,
361        });
362    }
363    if raw.forms.len() > MAX_MANIFEST_HINTS {
364        return Err(IndexManifestError::TooManyHints {
365            kind: "form",
366            max: MAX_MANIFEST_HINTS,
367        });
368    }
369    if raw.dates.len() > MAX_MANIFEST_HINTS {
370        return Err(IndexManifestError::TooManyHints {
371            kind: "date",
372            max: MAX_MANIFEST_HINTS,
373        });
374    }
375
376    let main_selector = raw
377        .content
378        .main_selector
379        .as_deref()
380        .map(str::trim)
381        .filter(|value| !value.is_empty())
382        .map(|value| validate_selector("content.main_selector", value))
383        .transpose()?;
384
385    let regions = raw
386        .regions
387        .into_iter()
388        .map(|raw| {
389            let role = validate_hint_text("region.role", &raw.role)?;
390            let selector = validate_selector("region.selector", &raw.selector)?;
391            Ok(IndexRegionHint {
392                role,
393                selector,
394                collapsed: raw.collapsed,
395            })
396        })
397        .collect::<Result<Vec<_>, IndexManifestError>>()?;
398
399    let fields = raw
400        .fields
401        .into_iter()
402        .map(|raw| {
403            let name = validate_hint_text("field.name", &raw.name)?;
404            let label = raw
405                .label
406                .as_deref()
407                .map(str::trim)
408                .filter(|value| !value.is_empty())
409                .map(|value| validate_hint_text("field.label", value))
410                .transpose()?;
411            Ok(IndexFieldHint { name, label })
412        })
413        .collect::<Result<Vec<_>, IndexManifestError>>()?;
414
415    let forms = raw
416        .forms
417        .into_iter()
418        .map(|raw| {
419            let name = validate_hint_text("form.name", &raw.name)?;
420            let selector = raw
421                .selector
422                .as_deref()
423                .map(str::trim)
424                .filter(|value| !value.is_empty())
425                .map(|value| validate_selector("form.selector", value))
426                .transpose()?;
427            let note = raw
428                .note
429                .as_deref()
430                .map(str::trim)
431                .filter(|value| !value.is_empty())
432                .map(|value| validate_hint_text("form.note", value))
433                .transpose()?;
434            Ok(IndexFormHint {
435                name,
436                selector,
437                note,
438            })
439        })
440        .collect::<Result<Vec<_>, IndexManifestError>>()?;
441
442    let dates = raw
443        .dates
444        .into_iter()
445        .map(|raw| {
446            let field = validate_hint_text("date.field", &raw.field)?;
447            let style = IndexDateStyle::parse(&raw.style).ok_or_else(|| {
448                IndexManifestError::InvalidHint {
449                    kind: "date.style",
450                    reason: format!("unsupported style: {}", raw.style),
451                }
452            })?;
453            Ok(IndexDateHint { field, style })
454        })
455        .collect::<Result<Vec<_>, IndexManifestError>>()?;
456
457    Ok(IndexManifest {
458        version: INDEX_MANIFEST_VERSION.to_owned(),
459        source_url: source_url.to_owned(),
460        scope,
461        content: IndexContentHint { main_selector },
462        regions,
463        fields,
464        forms,
465        dates,
466    })
467}
468
469fn same_origin(left: &Url, right: &Url) -> bool {
470    left.scheme() == right.scheme()
471        && left.host_str() == right.host_str()
472        && left.port_or_known_default() == right.port_or_known_default()
473}
474
475fn normalize_scope(scope: Option<&str>, page_path: &str) -> Result<String, IndexManifestError> {
476    let normalized = scope.unwrap_or("/").trim();
477    if normalized.is_empty() || !normalized.starts_with('/') {
478        return Err(IndexManifestError::InvalidScope(normalized.to_owned()));
479    }
480    if normalized.len() > MAX_MANIFEST_STRING_LEN {
481        return Err(IndexManifestError::InvalidScope(normalized.to_owned()));
482    }
483    if page_path.is_empty() {
484        return Ok(normalized.to_owned());
485    }
486    Ok(normalized.to_owned())
487}
488
489fn validate_hint_text(kind: &'static str, text: &str) -> Result<String, IndexManifestError> {
490    let value = text.trim();
491    if value.is_empty() || value.len() > MAX_MANIFEST_STRING_LEN {
492        return Err(IndexManifestError::InvalidHint {
493            kind,
494            reason: "text length is out of bounds".to_owned(),
495        });
496    }
497    Ok(value.to_owned())
498}
499
500fn validate_selector(kind: &'static str, selector: &str) -> Result<String, IndexManifestError> {
501    let value = validate_hint_text(kind, selector)?;
502    let complexity = value
503        .chars()
504        .filter(|ch| matches!(ch, '>' | '+' | '~' | '[' | ']' | ':' | '*' | '#'))
505        .count();
506    if complexity > MAX_MANIFEST_SELECTOR_COMPLEXITY {
507        return Err(IndexManifestError::InvalidHint {
508            kind,
509            reason: "selector complexity exceeds limit".to_owned(),
510        });
511    }
512    Selector::parse(&value)
513        .map_err(|_| IndexManifestError::InvalidHint {
514            kind,
515            reason: "selector syntax is invalid".to_owned(),
516        })
517        .map(|_| value)
518}
519
520/// Parsed HTML document.
521#[derive(Debug, Clone, PartialEq, Eq)]
522pub struct HtmlDocument {
523    /// Original input.
524    pub raw: String,
525    /// Best extracted title.
526    pub title: Option<String>,
527    /// Extracted headings in document order.
528    pub headings: Vec<HtmlHeading>,
529    /// Extracted links from the main content region.
530    pub links: Vec<HtmlLink>,
531    /// Forms extracted from the main content region.
532    pub forms: Vec<HtmlForm>,
533    /// Semantic nodes extracted from the main content region.
534    pub nodes: Vec<HtmlNode>,
535    /// Extracted document metadata.
536    pub metadata: HtmlMetadata,
537    /// Text content from extracted user-visible nodes.
538    pub body_text: String,
539}
540
/// Document-level metadata taken from the HTML head.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct HtmlMetadata {
    /// Canonical URL, when the document declares one.
    pub canonical_url: Option<String>,
    /// Document language, when declared.
    pub language: Option<String>,
    /// Value of the standard description metadata.
    pub description: Option<String>,
    /// OpenGraph title.
    pub open_graph_title: Option<String>,
    /// OpenGraph description.
    pub open_graph_description: Option<String>,
}
555
/// A heading found in the document.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct HtmlHeading {
    /// Heading level, starting at one.
    pub level: u8,
    /// Text of the heading.
    pub text: String,
}
564
/// A hyperlink found in the document.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct HtmlLink {
    /// Text of the link.
    pub text: String,
    /// Target the link points to.
    pub href: String,
}
573
/// Role a page region plays.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum HtmlSectionRole {
    /// The primary content of the page.
    Main,
    /// A navigation area.
    Navigation,
    /// A sidebar or other complementary content.
    Aside,
    /// A footer or content-information area.
    Footer,
    /// A comments or discussion area.
    Comments,
    /// Related links or related content.
    Related,
    /// A secondary region whose role is not known.
    Unknown,
}
592
593/// Form extracted from HTML.
594#[derive(Debug, Clone, PartialEq, Eq)]
595pub struct HtmlForm {
596    /// Form name or inferred label.
597    pub name: String,
598    /// Submission method.
599    pub method: String,
600    /// Action target.
601    pub action: String,
602    /// Input fields.
603    pub inputs: Vec<HtmlInput>,
604    /// Button actions.
605    pub buttons: Vec<HtmlButton>,
606}
607
/// An input field belonging to a form.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct HtmlInput {
    /// Name of the input.
    pub name: String,
    /// Kind of the input.
    pub kind: String,
    /// Value of the input, if it has one.
    pub value: Option<String>,
    /// `true` when the field must be filled in.
    pub required: bool,
}
620
/// A button belonging to a form.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct HtmlButton {
    /// Name of the button, if it has one.
    pub name: Option<String>,
    /// Value of the button, if it has one.
    pub value: Option<String>,
    /// Label shown for the button.
    pub label: String,
}
631
632/// Semantic HTML content node.
633#[derive(Debug, Clone, PartialEq, Eq)]
634pub enum HtmlNode {
635    /// Heading with one-based level.
636    Heading {
637        /// One-based heading level.
638        level: u8,
639        /// Heading text.
640        text: String,
641    },
642    /// Paragraph text.
643    Paragraph(String),
644    /// Link node used inside semantic sections.
645    Link(HtmlLink),
646    /// Ordered or unordered list.
647    List {
648        /// Whether the list is ordered.
649        ordered: bool,
650        /// List item text in source order.
651        items: Vec<String>,
652    },
653    /// Code block.
654    CodeBlock {
655        /// Optional declared language.
656        language: Option<String>,
657        /// Code text.
658        code: String,
659    },
660    /// Table rows.
661    Table {
662        /// Rows in source order.
663        rows: Vec<Vec<String>>,
664    },
665    /// Bounded vertical spacing hint.
666    Spacer {
667        /// Extra terminal lines suggested by semantic block rhythm.
668        lines: u8,
669    },
670    /// Semantic page region.
671    Section {
672        /// Inferred region role.
673        role: HtmlSectionRole,
674        /// Optional region title.
675        title: Option<String>,
676        /// Whether renderers should initially summarize this region.
677        collapsed: bool,
678        /// Region contents.
679        nodes: Vec<HtmlNode>,
680    },
681    /// Image proxy metadata.
682    Image {
683        /// Alternate text or fallback label.
684        alt: String,
685        /// Optional normalized source URL.
686        src: Option<String>,
687    },
688    /// Web form.
689    Form(HtmlForm),
690}
691
692/// Parses HTML into a semantic document representation.
693#[must_use]
694pub fn parse_html(input: impl Into<String>) -> HtmlDocument {
695    let raw = input.into();
696    let html = Html::parse_document(&raw);
697    let base_url = extract_base_url(&html);
698    let metadata = extract_metadata(&html, base_url.as_ref());
699    let layout_rules = extract_layout_rules(&html);
700    let root = main_content_root(&html);
701    let mut nodes = root
702        .as_ref()
703        .map(|root| extract_nodes(&root.element, base_url.as_ref(), &layout_rules))
704        .unwrap_or_default();
705    if let Some(root) = root.as_ref().filter(|root| root.explicit) {
706        nodes.extend(extract_secondary_sections(
707            &html,
708            &root.element,
709            base_url.as_ref(),
710            &layout_rules,
711        ));
712    }
713    let headings = nodes
714        .iter()
715        .filter_map(|node| match node {
716            HtmlNode::Heading { level, text } => Some(HtmlHeading {
717                level: *level,
718                text: text.clone(),
719            }),
720            _ => None,
721        })
722        .collect::<Vec<_>>();
723    let links = root
724        .as_ref()
725        .map(|root| extract_links(&root.element, base_url.as_ref()))
726        .unwrap_or_default();
727    let forms = nodes
728        .iter()
729        .filter_map(|node| match node {
730            HtmlNode::Form(form) => Some(form.clone()),
731            _ => None,
732        })
733        .collect();
734    let title = extract_title(&html, &metadata, &headings);
735    let body_text = body_text_from_nodes(&nodes);
736
737    HtmlDocument {
738        raw,
739        title,
740        headings,
741        links,
742        forms,
743        nodes,
744        metadata,
745        body_text,
746    }
747}
748
749fn selector(query: &str) -> Option<Selector> {
750    Selector::parse(query).ok()
751}
752
753struct MainContentRoot<'a> {
754    element: ElementRef<'a>,
755    explicit: bool,
756}
757
758fn main_content_root(html: &Html) -> Option<MainContentRoot<'_>> {
759    let mut candidates = Vec::new();
760    for query in [
761        "main",
762        "article",
763        "[role=\"main\"]",
764        "[itemprop=\"articleBody\"]",
765        "#content",
766        ".content",
767        ".article",
768        ".post",
769        ".entry-content",
770        ".markdown-body",
771    ] {
772        if let Some(selector) = selector(query) {
773            for element in html.select(&selector) {
774                candidates.push(MainContentRoot {
775                    element,
776                    explicit: true,
777                });
778            }
779        }
780    }
781
782    if let Some(best) = best_main_root(candidates) {
783        return Some(best);
784    }
785
786    let body = selector("body").and_then(|selector| html.select(&selector).next());
787    if let Some(body) = body {
788        if let Some(dense_region) = densest_body_region(&body) {
789            return Some(MainContentRoot {
790                element: dense_region,
791                explicit: false,
792            });
793        }
794        return Some(MainContentRoot {
795            element: body,
796            explicit: false,
797        });
798    }
799
800    html.root_element()
801        .first_child()
802        .and_then(ElementRef::wrap)
803        .map(|element| MainContentRoot {
804            element,
805            explicit: false,
806        })
807}
808
809fn best_main_root<'a>(roots: Vec<MainContentRoot<'a>>) -> Option<MainContentRoot<'a>> {
810    roots
811        .into_iter()
812        .max_by_key(|root| main_root_score(&root.element))
813}
814
815fn densest_body_region<'a>(body: &ElementRef<'a>) -> Option<ElementRef<'a>> {
816    let selector = selector("main, article, section, div")?;
817    body.select(&selector)
818        .filter(|element| !is_boilerplate_container(element))
819        .map(|element| {
820            let score = main_root_score(&element);
821            (element, score)
822        })
823        .filter(|(_element, score)| *score >= 6)
824        .max_by_key(|(_element, score)| *score)
825        .map(|(element, _)| element)
826}
827
828fn main_root_score(root: &ElementRef<'_>) -> i32 {
829    let mut heading_count = 0i32;
830    let mut paragraph_count = 0i32;
831    let mut list_count = 0i32;
832    let mut code_count = 0i32;
833    let mut table_count = 0i32;
834    let mut link_count = 0i32;
835    let mut text_chars = 0i32;
836    let mut boilerplate_penalty = 0i32;
837
838    for node in root.descendants() {
839        let Some(element) = ElementRef::wrap(node) else {
840            continue;
841        };
842        let tag = element.value().name();
843        if is_boilerplate_container(&element) {
844            boilerplate_penalty += 3;
845        }
846        match tag {
847            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => heading_count += 1,
848            "p" => {
849                paragraph_count += 1;
850                text_chars += element_text(&element).chars().count() as i32;
851            }
852            "ul" | "ol" => list_count += 1,
853            "pre" | "code" => {
854                code_count += 1;
855                text_chars += code_text(&element).chars().count() as i32;
856            }
857            "table" => table_count += 1,
858            "a" => link_count += 1,
859            _ => {}
860        }
861    }
862
863    let dense_text_score = text_chars / 120;
864    let link_penalty = (link_count - paragraph_count * 5).clamp(0, 14);
865
866    heading_count * 4
867        + paragraph_count * 5
868        + list_count * 2
869        + code_count * 5
870        + table_count * 3
871        + dense_text_score
872        - link_penalty
873        - boilerplate_penalty
874}
875
876fn extract_title(html: &Html, metadata: &HtmlMetadata, headings: &[HtmlHeading]) -> Option<String> {
877    first_text(html, "title")
878        .or_else(|| metadata.open_graph_title.clone())
879        .or_else(|| headings.first().map(|heading| heading.text.clone()))
880}
881
882fn extract_base_url(html: &Html) -> Option<Url> {
883    let selector = selector("base[href]")?;
884    let href = html
885        .select(&selector)
886        .next()
887        .and_then(|node| node.value().attr("href"))?;
888    Url::parse(href).ok()
889}
890
891fn extract_metadata(html: &Html, base_url: Option<&Url>) -> HtmlMetadata {
892    HtmlMetadata {
893        canonical_url: extract_link_href(html, "link[rel~=\"canonical\"]", base_url),
894        language: extract_language(html),
895        description: extract_meta_content(html, "meta[name=\"description\"]"),
896        open_graph_title: extract_meta_content(html, "meta[property=\"og:title\"]"),
897        open_graph_description: extract_meta_content(html, "meta[property=\"og:description\"]"),
898    }
899}
900
901fn extract_language(html: &Html) -> Option<String> {
902    let selector = selector("html[lang]")?;
903    html.select(&selector)
904        .next()
905        .and_then(|node| node.value().attr("lang"))
906        .map(str::trim)
907        .filter(|value| !value.is_empty())
908        .map(ToOwned::to_owned)
909}
910
911fn extract_link_href(html: &Html, query: &str, base_url: Option<&Url>) -> Option<String> {
912    let selector = selector(query)?;
913    html.select(&selector)
914        .next()
915        .and_then(|node| node.value().attr("href"))
916        .map(|href| resolve_url(href, base_url))
917}
918
919fn extract_meta_content(html: &Html, query: &str) -> Option<String> {
920    let selector = selector(query)?;
921    html.select(&selector)
922        .next()
923        .and_then(|node| node.value().attr("content"))
924        .map(clean_text)
925        .filter(|text| !text.is_empty())
926}
927
928fn first_text(html: &Html, query: &str) -> Option<String> {
929    let selector = selector(query)?;
930    html.select(&selector)
931        .next()
932        .map(|node| element_text(&node))
933        .filter(|text| !text.is_empty())
934}
935
936fn extract_nodes(
937    root: &ElementRef<'_>,
938    base_url: Option<&Url>,
939    layout_rules: &LayoutRules,
940) -> Vec<HtmlNode> {
941    let mut nodes = Vec::new();
942    extract_element_nodes(root, base_url, layout_rules, &mut nodes);
943    trim_layout_spacers(&mut nodes);
944    nodes
945}
946
947fn extract_element_nodes(
948    element: &ElementRef<'_>,
949    base_url: Option<&Url>,
950    layout_rules: &LayoutRules,
951    nodes: &mut Vec<HtmlNode>,
952) {
953    let tag = element.value().name();
954    if tag == "br" {
955        push_spacer(nodes, 1);
956        return;
957    }
958
959    if is_boilerplate_container(element) && !is_primary_content_container(element) {
960        return;
961    }
962
963    if is_content_element(tag) {
964        let spacing = layout_spacing(element, layout_rules);
965        push_spacer(nodes, spacing.before);
966        if let Some(node) = html_node_from_element(element, base_url) {
967            nodes.push(node);
968        }
969        push_spacer(nodes, spacing.after);
970        return;
971    }
972
973    let spacing = is_layout_boundary(tag).then(|| layout_spacing(element, layout_rules));
974    if let Some(spacing) = spacing {
975        push_spacer(nodes, spacing.before);
976    }
977
978    for child in element.children() {
979        if let Some(child_element) = ElementRef::wrap(child) {
980            extract_element_nodes(&child_element, base_url, layout_rules, nodes);
981        }
982    }
983
984    if let Some(spacing) = spacing {
985        push_spacer(nodes, spacing.after);
986    }
987}
988
989fn html_node_from_element(element: &ElementRef<'_>, base_url: Option<&Url>) -> Option<HtmlNode> {
990    match element.value().name() {
991        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
992            let text = element_text(element);
993            if text.is_empty() {
994                None
995            } else {
996                Some(HtmlNode::Heading {
997                    level: heading_level(element.value().name()),
998                    text,
999                })
1000            }
1001        }
1002        "p" => {
1003            let text = element_text(element);
1004            (!text.is_empty()
1005                && !is_anchor_only_paragraph(element, &text)
1006                && !is_boilerplate_paragraph(&text))
1007            .then_some(HtmlNode::Paragraph(text))
1008        }
1009        "blockquote" => {
1010            let text = element_text(element);
1011            (!text.is_empty()).then_some(HtmlNode::Paragraph(format!("> {text}")))
1012        }
1013        "ul" | "ol" => {
1014            let items = list_items(element);
1015            (!items.is_empty()).then_some(HtmlNode::List {
1016                ordered: element.value().name() == "ol",
1017                items,
1018            })
1019        }
1020        "pre" | "code" => {
1021            let code = code_text(element);
1022            (!code.trim().is_empty()).then_some(HtmlNode::CodeBlock {
1023                language: code_language(element),
1024                code,
1025            })
1026        }
1027        "table" => {
1028            let rows = table_rows(element);
1029            (!rows.is_empty()).then_some(HtmlNode::Table { rows })
1030        }
1031        "img" => Some(HtmlNode::Image {
1032            alt: image_alt(element),
1033            src: element
1034                .value()
1035                .attr("src")
1036                .map(|src| resolve_url(src, base_url)),
1037        }),
1038        "form" => Some(HtmlNode::Form(extract_form(element, base_url))),
1039        _ => None,
1040    }
1041}
1042
1043fn extract_secondary_sections(
1044    html: &Html,
1045    main_root: &ElementRef<'_>,
1046    base_url: Option<&Url>,
1047    layout_rules: &LayoutRules,
1048) -> Vec<HtmlNode> {
1049    let Some(selector) = selector(
1050        "nav, aside, footer, [role=\"navigation\"], [role=\"contentinfo\"], .sidebar, .related, #related, .comments, #comments",
1051    ) else {
1052        return Vec::new();
1053    };
1054
1055    html.select(&selector)
1056        .filter(|element| {
1057            !is_descendant_of(element, main_root)
1058                && !is_descendant_of(main_root, element)
1059                && element.id() != main_root.id()
1060        })
1061        .filter_map(|element| secondary_section_from_element(&element, base_url, layout_rules))
1062        .collect()
1063}
1064
1065fn secondary_section_from_element(
1066    element: &ElementRef<'_>,
1067    base_url: Option<&Url>,
1068    layout_rules: &LayoutRules,
1069) -> Option<HtmlNode> {
1070    let mut nodes = extract_nodes(element, base_url, layout_rules);
1071    if nodes.is_empty() {
1072        nodes.extend(
1073            extract_links(element, base_url)
1074                .into_iter()
1075                .map(HtmlNode::Link),
1076        );
1077    }
1078    trim_layout_spacers(&mut nodes);
1079
1080    (!nodes.is_empty()).then(|| HtmlNode::Section {
1081        role: section_role(element),
1082        title: section_title(element),
1083        collapsed: true,
1084        nodes,
1085    })
1086}
1087
1088fn is_descendant_of(element: &ElementRef<'_>, ancestor: &ElementRef<'_>) -> bool {
1089    let mut parent = element.parent();
1090    while let Some(node) = parent {
1091        if node.id() == ancestor.id() {
1092            return true;
1093        }
1094        parent = node.parent();
1095    }
1096    false
1097}
1098
1099fn section_role(element: &ElementRef<'_>) -> HtmlSectionRole {
1100    let names = element
1101        .value()
1102        .attr("id")
1103        .into_iter()
1104        .chain(element.value().attr("class"))
1105        .flat_map(str::split_whitespace)
1106        .map(str::to_ascii_lowercase)
1107        .collect::<Vec<_>>();
1108
1109    if names.iter().any(|name| name.contains("comment")) {
1110        return HtmlSectionRole::Comments;
1111    }
1112    if names.iter().any(|name| name.contains("related")) {
1113        return HtmlSectionRole::Related;
1114    }
1115
1116    match element.value().name() {
1117        "nav" => return HtmlSectionRole::Navigation,
1118        "aside" => return HtmlSectionRole::Aside,
1119        "footer" => return HtmlSectionRole::Footer,
1120        _ => {}
1121    }
1122
1123    match element.value().attr("role") {
1124        Some("navigation") => return HtmlSectionRole::Navigation,
1125        Some("contentinfo") => return HtmlSectionRole::Footer,
1126        _ => {}
1127    }
1128
1129    if names
1130        .iter()
1131        .any(|name| name.contains("side") || name.contains("rail"))
1132    {
1133        HtmlSectionRole::Aside
1134    } else {
1135        HtmlSectionRole::Unknown
1136    }
1137}
1138
1139fn section_title(element: &ElementRef<'_>) -> Option<String> {
1140    element
1141        .value()
1142        .attr("aria-label")
1143        .or_else(|| element.value().attr("title"))
1144        .map(clean_text)
1145        .filter(|title| !title.is_empty())
1146        .or_else(|| {
1147            let selector = selector("h1, h2, h3, h4, h5, h6")?;
1148            element
1149                .select(&selector)
1150                .next()
1151                .map(|heading| element_text(&heading))
1152                .filter(|title| !title.is_empty())
1153        })
1154}
1155
/// Vertical spacing requested around an element, measured in spacer lines.
///
/// The default value requests no spacing on either side.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
struct LayoutSpacing {
    /// Spacer lines to emit before the element's content.
    before: u8,
    /// Spacer lines to emit after the element's content.
    after: u8,
}
1161
/// Spacing rules gathered for a document.
///
/// `extract_layout_rules` fills this from the document's `<style>` elements;
/// the default value holds no rules.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
struct LayoutRules {
    /// Rules in the order they were parsed.
    rules: Vec<LayoutRule>,
}
1166
/// One spacing rule: a simple selector paired with the spacing it requests.
#[derive(Debug, Clone, PartialEq, Eq)]
struct LayoutRule {
    /// Which elements the rule applies to.
    selector: LayoutSelector,
    /// Spacing contributed to matching elements.
    spacing: LayoutSpacing,
}
1172
/// The simple selector forms understood by the layout-rule matcher.
#[derive(Debug, Clone, PartialEq, Eq)]
enum LayoutSelector {
    /// Matches elements whose tag name equals the stored string.
    Tag(String),
    /// Matches elements having the stored string as one whitespace-separated
    /// token of their `class` attribute.
    Class(String),
    /// Matches elements whose `id` attribute equals the stored string.
    Id(String),
}
1179
/// Returns `true` for tags that are converted directly into a content node
/// rather than being descended into.
///
/// The comparison is exact, so `tag` must match the listed lowercase names.
fn is_content_element(tag: &str) -> bool {
    const CONTENT_TAGS: &[&str] = &[
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "p",
        "blockquote",
        "ul",
        "ol",
        "pre",
        "code",
        "table",
        "img",
        "form",
    ];
    CONTENT_TAGS.contains(&tag)
}
1199
/// Returns `true` for container tags that contribute layout spacing around
/// their children.
fn is_layout_boundary(tag: &str) -> bool {
    const BOUNDARY_TAGS: &[&str] = &["article", "section", "header", "footer", "aside", "div"];
    BOUNDARY_TAGS.contains(&tag)
}
1206
1207fn layout_spacing(element: &ElementRef<'_>, rules: &LayoutRules) -> LayoutSpacing {
1208    let mut spacing = default_layout_spacing(element.value().name());
1209
1210    if has_layout_hint_name(element) {
1211        spacing.before = spacing.before.max(1);
1212        spacing.after = spacing.after.max(1);
1213    }
1214
1215    for rule in rules.rules_for(element) {
1216        spacing.before = spacing.before.max(rule.spacing.before);
1217        spacing.after = spacing.after.max(rule.spacing.after);
1218    }
1219
1220    if let Some(style) = element.value().attr("style") {
1221        let inline_spacing = spacing_from_declarations(style);
1222        spacing.before = spacing.before.max(inline_spacing.before);
1223        spacing.after = spacing.after.max(inline_spacing.after);
1224    }
1225
1226    LayoutSpacing {
1227        before: spacing.before.min(MAX_LAYOUT_SPACER_LINES),
1228        after: spacing.after.min(MAX_LAYOUT_SPACER_LINES),
1229    }
1230}
1231
1232fn default_layout_spacing(tag: &str) -> LayoutSpacing {
1233    match tag {
1234        "article" | "section" | "header" | "footer" | "aside" => LayoutSpacing {
1235            before: 1,
1236            after: 1,
1237        },
1238        _ => LayoutSpacing::default(),
1239    }
1240}
1241
1242fn has_layout_hint_name(element: &ElementRef<'_>) -> bool {
1243    element
1244        .value()
1245        .attr("id")
1246        .into_iter()
1247        .chain(element.value().attr("class"))
1248        .flat_map(str::split_whitespace)
1249        .any(|name| {
1250            let name = name.to_ascii_lowercase();
1251            [
1252                "section", "hero", "intro", "outro", "spacer", "block", "panel", "card",
1253            ]
1254            .iter()
1255            .any(|hint| name.contains(hint))
1256        })
1257}
1258
1259fn is_primary_content_container(element: &ElementRef<'_>) -> bool {
1260    let tag = element.value().name();
1261    if matches!(tag, "main" | "article") {
1262        return true;
1263    }
1264    if matches!(element.value().attr("role"), Some("main")) {
1265        return true;
1266    }
1267    element
1268        .value()
1269        .attr("id")
1270        .into_iter()
1271        .chain(element.value().attr("class"))
1272        .flat_map(str::split_whitespace)
1273        .any(|name| {
1274            let lowered = name.to_ascii_lowercase();
1275            lowered.contains("content")
1276                || lowered.contains("article")
1277                || lowered.contains("post")
1278                || lowered.contains("entry")
1279                || lowered.contains("markdown")
1280        })
1281}
1282
1283fn is_boilerplate_container(element: &ElementRef<'_>) -> bool {
1284    let tag = element.value().name();
1285    if matches!(tag, "nav" | "footer") {
1286        return true;
1287    }
1288
1289    element
1290        .value()
1291        .attr("id")
1292        .into_iter()
1293        .chain(element.value().attr("class"))
1294        .flat_map(str::split_whitespace)
1295        .any(|name| {
1296            let lowered = name.to_ascii_lowercase();
1297            lowered.contains("nav")
1298                || lowered.contains("menu")
1299                || lowered.contains("footer")
1300                || lowered.contains("header")
1301                || lowered.contains("sidebar")
1302                || lowered.contains("related")
1303                || lowered.contains("breadcrumb")
1304                || lowered.contains("cookie")
1305                || lowered.contains("newsletter")
1306                || lowered.contains("subscribe")
1307                || lowered.contains("promo")
1308                || lowered.contains("advert")
1309                || lowered.contains("social")
1310                || lowered.contains("share")
1311        })
1312}
1313
/// Reports whether a paragraph is short boilerplate.
///
/// A paragraph qualifies when it is at most 180 characters long and contains
/// one of the known boilerplate phrases (ASCII case-insensitive).
fn is_boilerplate_paragraph(text: &str) -> bool {
    const BOILERPLATE_PHRASES: [&str; 7] = [
        "sign up for",
        "subscribe",
        "cookie policy",
        "all rights reserved",
        "follow us",
        "share this",
        "advertisement",
    ];

    // Longer paragraphs are kept even if they mention one of the phrases.
    if text.chars().count() > 180 {
        return false;
    }

    let lowered = text.to_ascii_lowercase();
    BOILERPLATE_PHRASES
        .iter()
        .any(|phrase| lowered.contains(*phrase))
}
1325
1326impl LayoutRules {
1327    fn rules_for<'a>(
1328        &'a self,
1329        element: &'a ElementRef<'_>,
1330    ) -> impl Iterator<Item = &'a LayoutRule> {
1331        self.rules
1332            .iter()
1333            .filter(move |rule| rule.selector.matches(element))
1334    }
1335}
1336
1337impl LayoutSelector {
1338    fn matches(&self, element: &ElementRef<'_>) -> bool {
1339        match self {
1340            Self::Tag(tag) => element.value().name() == tag,
1341            Self::Class(class) => element
1342                .value()
1343                .attr("class")
1344                .unwrap_or_default()
1345                .split_whitespace()
1346                .any(|candidate| candidate == class),
1347            Self::Id(id) => element.value().attr("id") == Some(id.as_str()),
1348        }
1349    }
1350}
1351
1352fn extract_layout_rules(html: &Html) -> LayoutRules {
1353    let Some(selector) = selector("style") else {
1354        return LayoutRules::default();
1355    };
1356
1357    let mut rules = Vec::new();
1358    for style in html.select(&selector) {
1359        rules.extend(parse_stylesheet_rules(
1360            &style.text().collect::<Vec<_>>().join(" "),
1361        ));
1362    }
1363
1364    LayoutRules { rules }
1365}
1366
1367fn parse_stylesheet_rules(stylesheet: &str) -> Vec<LayoutRule> {
1368    stylesheet
1369        .split('}')
1370        .filter_map(|rule| {
1371            let (selectors, declarations) = rule.split_once('{')?;
1372            let spacing = spacing_from_declarations(declarations);
1373            if spacing.before == 0 && spacing.after == 0 {
1374                return None;
1375            }
1376            Some(
1377                selectors
1378                    .split(',')
1379                    .filter_map(parse_layout_selector)
1380                    .map(move |selector| LayoutRule { selector, spacing })
1381                    .collect::<Vec<_>>(),
1382            )
1383        })
1384        .flatten()
1385        .collect()
1386}
1387
1388fn parse_layout_selector(selector: &str) -> Option<LayoutSelector> {
1389    let selector = selector.trim();
1390    if selector.is_empty()
1391        || selector
1392            .chars()
1393            .any(|ch| matches!(ch, ' ' | '\t' | '\n' | '\r' | '>' | '+' | '~' | '[' | ':'))
1394    {
1395        return None;
1396    }
1397
1398    if let Some(class) = selector.strip_prefix('.') {
1399        (!class.is_empty()).then(|| LayoutSelector::Class(class.to_owned()))
1400    } else if let Some(id) = selector.strip_prefix('#') {
1401        (!id.is_empty()).then(|| LayoutSelector::Id(id.to_owned()))
1402    } else {
1403        selector
1404            .chars()
1405            .all(|ch| ch.is_ascii_alphanumeric() || ch == '-')
1406            .then(|| LayoutSelector::Tag(selector.to_ascii_lowercase()))
1407    }
1408}
1409
1410fn spacing_from_declarations(declarations: &str) -> LayoutSpacing {
1411    let mut spacing = LayoutSpacing::default();
1412    for declaration in declarations.split(';') {
1413        let Some((property, value)) = declaration.split_once(':') else {
1414            continue;
1415        };
1416        let property = property.trim().to_ascii_lowercase();
1417        let lines = css_length_to_lines(value.trim());
1418        if lines == 0 {
1419            continue;
1420        }
1421
1422        match property.as_str() {
1423            "margin-top" | "padding-top" => spacing.before = spacing.before.max(lines),
1424            "margin-bottom" | "padding-bottom" | "gap" | "row-gap" => {
1425                spacing.after = spacing.after.max(lines);
1426            }
1427            "margin" | "padding" => {
1428                spacing.before = spacing.before.max(lines);
1429                spacing.after = spacing.after.max(lines);
1430            }
1431            _ => {}
1432        }
1433    }
1434    spacing
1435}
1436
1437fn css_length_to_lines(value: &str) -> u8 {
1438    let value = value.trim().to_ascii_lowercase();
1439    if value == "0"
1440        || value.starts_with("0px")
1441        || value.starts_with("0rem")
1442        || value.starts_with("0em")
1443    {
1444        return 0;
1445    }
1446
1447    let Some(number) = first_css_number(&value) else {
1448        return 0;
1449    };
1450    if number <= 0.0 {
1451        return 0;
1452    }
1453
1454    let lines = if value.contains("rem") || value.contains("em") {
1455        if number >= 4.0 {
1456            3
1457        } else if number >= 2.0 {
1458            2
1459        } else {
1460            1
1461        }
1462    } else if value.contains("px") {
1463        if number >= 48.0 {
1464            3
1465        } else if number >= 28.0 {
1466            2
1467        } else if number >= 12.0 {
1468            1
1469        } else {
1470            0
1471        }
1472    } else if value.contains("vh") || value.contains("vw") {
1473        if number >= 14.0 {
1474            3
1475        } else if number >= 8.0 {
1476            2
1477        } else {
1478            1
1479        }
1480    } else {
1481        1
1482    };
1483
1484    lines.min(MAX_LAYOUT_SPACER_LINES)
1485}
1486
/// Extracts the first decimal number appearing in a CSS value.
///
/// A number starts at the first ASCII digit, or at a `.` that is immediately
/// followed by a digit, and extends over digits and at most one `.`. A `-`
/// directly in front of the number is honoured as a sign when it stands
/// alone: at the start of the value, or after whitespace, `(` or `,`. This
/// makes `-20px` yield `-20.0` (so callers can reject negative lengths) while
/// hyphens inside identifiers such as `var(--space-2)` are not mistaken for a
/// sign.
///
/// Returns `None` when the value contains no number.
fn first_css_number(value: &str) -> Option<f32> {
    // Scanning bytes is safe: ASCII bytes never occur inside a multi-byte
    // UTF-8 sequence, so every index used below is a char boundary.
    let bytes = value.as_bytes();

    let start = bytes.iter().enumerate().find_map(|(index, byte)| {
        let begins_number = byte.is_ascii_digit()
            || (*byte == b'.'
                && bytes
                    .get(index + 1)
                    .is_some_and(|next| next.is_ascii_digit()));
        begins_number.then_some(index)
    })?;

    let mut end = start;
    let mut seen_dot = false;
    while let Some(&byte) = bytes.get(end) {
        match byte {
            b'0'..=b'9' => end += 1,
            b'.' if !seen_dot => {
                seen_dot = true;
                end += 1;
            }
            _ => break,
        }
    }

    let preceded_by_minus = start.checked_sub(1).and_then(|index| bytes.get(index)) == Some(&b'-');
    // The minus is a sign only when it is not glued to an identifier.
    let minus_stands_alone = match start.checked_sub(2).and_then(|index| bytes.get(index)) {
        None => true,
        Some(before) => before.is_ascii_whitespace() || matches!(before, b'(' | b','),
    };

    let magnitude: f32 = value.get(start..end)?.parse().ok()?;
    Some(if preceded_by_minus && minus_stands_alone {
        -magnitude
    } else {
        magnitude
    })
}
1504
1505fn push_spacer(nodes: &mut Vec<HtmlNode>, lines: u8) {
1506    if lines == 0 {
1507        return;
1508    }
1509    let lines = lines.min(MAX_LAYOUT_SPACER_LINES);
1510    if let Some(HtmlNode::Spacer { lines: existing }) = nodes.last_mut() {
1511        *existing = (*existing).max(lines).min(MAX_LAYOUT_SPACER_LINES);
1512    } else {
1513        nodes.push(HtmlNode::Spacer { lines });
1514    }
1515}
1516
1517fn trim_layout_spacers(nodes: &mut Vec<HtmlNode>) {
1518    while matches!(nodes.first(), Some(HtmlNode::Spacer { .. })) {
1519        nodes.remove(0);
1520    }
1521    while matches!(nodes.last(), Some(HtmlNode::Spacer { .. })) {
1522        nodes.pop();
1523    }
1524}
1525
1526fn extract_form(form: &ElementRef<'_>, base_url: Option<&Url>) -> HtmlForm {
1527    let method = form
1528        .value()
1529        .attr("method")
1530        .map(clean_text)
1531        .filter(|method| !method.is_empty())
1532        .unwrap_or_else(|| "GET".to_owned())
1533        .to_ascii_uppercase();
1534    let action = form
1535        .value()
1536        .attr("action")
1537        .map(|action| resolve_url(action, base_url))
1538        .or_else(|| base_url.map(Url::to_string))
1539        .unwrap_or_default();
1540    let name = form
1541        .value()
1542        .attr("name")
1543        .or_else(|| form.value().attr("id"))
1544        .map(clean_text)
1545        .filter(|name| !name.is_empty())
1546        .unwrap_or_else(|| "form".to_owned());
1547
1548    HtmlForm {
1549        name,
1550        method,
1551        action,
1552        inputs: extract_inputs(form),
1553        buttons: extract_buttons(form),
1554    }
1555}
1556
1557fn extract_inputs(form: &ElementRef<'_>) -> Vec<HtmlInput> {
1558    let Some(selector) = selector("input[name], textarea[name], select[name]") else {
1559        return Vec::new();
1560    };
1561
1562    form.select(&selector)
1563        .filter_map(|input| {
1564            let name = input.value().attr("name").map(clean_text)?;
1565            (!name.is_empty()).then_some(HtmlInput {
1566                name,
1567                kind: input_kind(&input),
1568                value: input_value(&input),
1569                required: input.value().attr("required").is_some(),
1570            })
1571        })
1572        .collect()
1573}
1574
1575fn extract_buttons(form: &ElementRef<'_>) -> Vec<HtmlButton> {
1576    let Some(selector) = selector("button, input[type=\"submit\"], input[type=\"button\"]") else {
1577        return Vec::new();
1578    };
1579
1580    form.select(&selector)
1581        .map(|button| HtmlButton {
1582            name: button.value().attr("name").map(clean_text),
1583            value: button.value().attr("value").map(clean_text),
1584            label: button_label(&button),
1585        })
1586        .filter(|button| !button.label.is_empty() || button.name.is_some())
1587        .collect()
1588}
1589
1590fn input_kind(input: &ElementRef<'_>) -> String {
1591    match input.value().name() {
1592        "textarea" => "textarea".to_owned(),
1593        "select" => "select".to_owned(),
1594        _ => input
1595            .value()
1596            .attr("type")
1597            .map(clean_text)
1598            .filter(|kind| !kind.is_empty())
1599            .unwrap_or_else(|| "text".to_owned()),
1600    }
1601}
1602
1603fn input_value(input: &ElementRef<'_>) -> Option<String> {
1604    match input.value().name() {
1605        "textarea" => Some(element_text(input)).filter(|value| !value.is_empty()),
1606        "select" => {
1607            let selected = selector("option[selected]")
1608                .and_then(|selector| input.select(&selector).next())
1609                .and_then(|option| option_value(&option));
1610            selected.or_else(|| {
1611                selector("option")
1612                    .and_then(|selector| input.select(&selector).next())
1613                    .and_then(|option| option_value(&option))
1614            })
1615        }
1616        _ => input.value().attr("value").map(clean_text),
1617    }
1618}
1619
1620fn option_value(option: &ElementRef<'_>) -> Option<String> {
1621    option
1622        .value()
1623        .attr("value")
1624        .map(clean_text)
1625        .or_else(|| Some(element_text(option)))
1626        .filter(|value| !value.is_empty())
1627}
1628
1629fn button_label(button: &ElementRef<'_>) -> String {
1630    let text = element_text(button);
1631    if !text.is_empty() {
1632        return text;
1633    }
1634
1635    button
1636        .value()
1637        .attr("value")
1638        .map(clean_text)
1639        .filter(|value| !value.is_empty())
1640        .unwrap_or_else(|| "submit".to_owned())
1641}
1642
1643fn is_anchor_only_paragraph(element: &ElementRef<'_>, paragraph_text: &str) -> bool {
1644    let Some(selector) = selector("a[href]") else {
1645        return false;
1646    };
1647    let link_text = element
1648        .select(&selector)
1649        .map(|link| element_text(&link))
1650        .filter(|text| !text.is_empty())
1651        .collect::<Vec<_>>()
1652        .join(" ");
1653
1654    !link_text.is_empty() && link_text == paragraph_text
1655}
1656
/// Maps a heading tag name (`h1`–`h6`) to its level.
///
/// Anything that is not `h` followed by a number from 1 to 6 yields level 1.
fn heading_level(tag: &str) -> u8 {
    match tag.strip_prefix('h').map(str::parse::<u8>) {
        Some(Ok(level @ 1..=6)) => level,
        _ => 1,
    }
}
1663
1664fn extract_links(root: &ElementRef<'_>, base_url: Option<&Url>) -> Vec<HtmlLink> {
1665    let Some(selector) = selector("a[href]") else {
1666        return Vec::new();
1667    };
1668
1669    root.select(&selector)
1670        .filter_map(|element| {
1671            let text = element_text(&element);
1672            let href = element.value().attr("href")?;
1673            (!text.is_empty()).then_some(HtmlLink {
1674                text,
1675                href: resolve_url(href, base_url),
1676            })
1677        })
1678        .collect()
1679}
1680
1681fn table_rows(table: &ElementRef<'_>) -> Vec<Vec<String>> {
1682    let Some(row_selector) = selector("tr") else {
1683        return Vec::new();
1684    };
1685    let Some(cell_selector) = selector("th, td") else {
1686        return Vec::new();
1687    };
1688
1689    table
1690        .select(&row_selector)
1691        .map(|row| {
1692            row.select(&cell_selector)
1693                .map(|cell| element_text(&cell))
1694                .filter(|text| !text.is_empty())
1695                .collect::<Vec<_>>()
1696        })
1697        .filter(|row| !row.is_empty())
1698        .collect()
1699}
1700
1701fn list_items(list: &ElementRef<'_>) -> Vec<String> {
1702    let Some(item_selector) = selector("li") else {
1703        return Vec::new();
1704    };
1705
1706    list.select(&item_selector)
1707        .filter(|item| {
1708            item.parent()
1709                .and_then(ElementRef::wrap)
1710                .is_some_and(|parent| parent.id() == list.id())
1711        })
1712        .map(|item| element_text(&item))
1713        .filter(|text| !text.is_empty())
1714        .collect()
1715}
1716
1717fn image_alt(image: &ElementRef<'_>) -> String {
1718    image
1719        .value()
1720        .attr("alt")
1721        .or_else(|| image.value().attr("title"))
1722        .map(clean_text)
1723        .filter(|text| !text.is_empty())
1724        .unwrap_or_else(|| "image".to_owned())
1725}
1726
1727fn code_language(element: &ElementRef<'_>) -> Option<String> {
1728    let class = element.value().attr("class").unwrap_or_default();
1729    class
1730        .split_whitespace()
1731        .find_map(language_from_class)
1732        .or_else(|| {
1733            let selector = selector("code")?;
1734            element.select(&selector).find_map(|code| {
1735                code.value()
1736                    .attr("class")
1737                    .unwrap_or_default()
1738                    .split_whitespace()
1739                    .find_map(language_from_class)
1740            })
1741        })
1742}
1743
/// Extracts the language from a `language-<name>` or `lang-<name>` class.
///
/// Returns `None` for classes without either prefix and for a prefix with
/// nothing after it.
fn language_from_class(class: &str) -> Option<String> {
    let language = ["language-", "lang-"]
        .iter()
        .find_map(|prefix| class.strip_prefix(*prefix))?;
    (!language.is_empty()).then(|| language.to_owned())
}
1751
1752fn resolve_url(input: &str, base_url: Option<&Url>) -> String {
1753    let trimmed = input.trim();
1754    if let Ok(url) = Url::parse(trimmed) {
1755        return url.to_string();
1756    }
1757
1758    base_url
1759        .and_then(|base| base.join(trimmed).ok())
1760        .map(|url| url.to_string())
1761        .unwrap_or_else(|| trimmed.to_owned())
1762}
1763
1764fn body_text_from_nodes(nodes: &[HtmlNode]) -> String {
1765    let parts = nodes
1766        .iter()
1767        .filter_map(|node| match node {
1768            HtmlNode::Heading { text, .. }
1769            | HtmlNode::Paragraph(text)
1770            | HtmlNode::CodeBlock { code: text, .. } => Some(text.clone()),
1771            HtmlNode::List { items, .. } => Some(items.join(" ")).filter(|text| !text.is_empty()),
1772            HtmlNode::Link(link) => Some(link.text.clone()).filter(|text| !text.is_empty()),
1773            HtmlNode::Form(_form) => None,
1774            HtmlNode::Spacer { .. } => None,
1775            HtmlNode::Section { nodes, .. } => {
1776                Some(body_text_from_nodes(nodes)).filter(|text| !text.is_empty())
1777            }
1778            _ => None,
1779        })
1780        .collect::<Vec<_>>();
1781
1782    clean_text(parts.join(" "))
1783}
1784
1785fn element_text(element: &ElementRef<'_>) -> String {
1786    clean_text(element.text().collect::<Vec<_>>().join(" "))
1787}
1788
1789fn code_text(element: &ElementRef<'_>) -> String {
1790    element.text().collect::<String>()
1791}
1792
/// Collapses every run of whitespace to a single space and trims both ends.
fn clean_text(input: impl AsRef<str>) -> String {
    let mut cleaned = String::new();
    for word in input.as_ref().split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}
1800
1801#[cfg(test)]
1802mod tests {
1803    use super::{
1804        HtmlNode, HtmlSectionRole, IndexDateStyle, discover_index_manifest_link_from_html,
1805        discover_index_manifest_link_from_http_link_header, parse_html, parse_index_manifest,
1806        well_known_index_manifest_url,
1807    };
1808
1809    #[test]
1810    fn extracts_title() {
1811        let doc = parse_html("<html><title>Hello</title><body></body></html>");
1812        assert_eq!(doc.title.as_deref(), Some("Hello"));
1813    }
1814
1815    #[test]
1816    fn extracts_headings_with_levels() {
1817        let doc = parse_html("<main><h1>Main</h1><h2>Sub</h2></main>");
1818        assert_eq!(doc.headings.len(), 2);
1819        assert_eq!(doc.headings[0].level, 1);
1820        assert_eq!(doc.headings[0].text, "Main");
1821        assert_eq!(doc.headings[1].level, 2);
1822        assert_eq!(doc.headings[1].text, "Sub");
1823    }
1824
1825    #[test]
1826    fn extracts_links_from_main_content() {
1827        let doc = parse_html(
1828            r#"<nav><a href="/noise">Noise</a></nav><main><a href="https://example.com">Example</a></main>"#,
1829        );
1830        assert_eq!(doc.links.len(), 1);
1831        assert_eq!(doc.links[0].text, "Example");
1832        assert_eq!(doc.links[0].href, "https://example.com/");
1833    }
1834
1835    #[test]
1836    fn prioritizes_explicit_main_and_collapses_secondary_regions() {
1837        let doc = parse_html(
1838            r#"
1839            <body>
1840              <nav aria-label="Site"><a href="/docs">Docs</a><a href="/about">About</a></nav>
1841              <main><h1>Main Article</h1><p>Readable body.</p></main>
1842              <aside class="related"><h2>Related</h2><a href="/next">Next</a></aside>
1843              <footer><a href="/license">License</a></footer>
1844            </body>
1845            "#,
1846        );
1847
1848        assert!(matches!(
1849            doc.nodes.first(),
1850            Some(HtmlNode::Heading { text, .. }) if text == "Main Article"
1851        ));
1852        assert!(doc.nodes.iter().any(|node| matches!(
1853            node,
1854            HtmlNode::Section {
1855                role: HtmlSectionRole::Navigation,
1856                title: Some(title),
1857                collapsed: true,
1858                nodes
1859            } if title == "Site" && nodes.len() == 2
1860        )));
1861        assert!(doc.nodes.iter().any(|node| matches!(
1862            node,
1863            HtmlNode::Section {
1864                role: HtmlSectionRole::Related,
1865                collapsed: true,
1866                ..
1867            }
1868        )));
1869        assert!(doc.nodes.iter().any(|node| matches!(
1870            node,
1871            HtmlNode::Section {
1872                role: HtmlSectionRole::Footer,
1873                collapsed: true,
1874                ..
1875            }
1876        )));
1877    }
1878
1879    #[test]
1880    fn main_root_scoring_prefers_dense_article_over_chrome_like_main() {
1881        let doc = parse_html(
1882            r#"
1883            <body>
1884              <main class="top-nav">
1885                <a href="/home">Home</a>
1886                <a href="/pricing">Pricing</a>
1887                <a href="/docs">Docs</a>
1888              </main>
1889              <article id="story">
1890                <h1>Deep Story</h1>
1891                <p>This paragraph carries the actual article payload for readers.</p>
1892                <p>Second paragraph keeps the dense main-content region obvious.</p>
1893              </article>
1894            </body>
1895            "#,
1896        );
1897
1898        assert!(matches!(
1899            doc.nodes.first(),
1900            Some(HtmlNode::Heading { text, .. }) if text == "Deep Story"
1901        ));
1902        assert!(!doc.body_text.contains("Pricing"));
1903    }
1904
1905    #[test]
1906    fn suppresses_boilerplate_containers_and_paragraphs_inside_main() {
1907        let doc = parse_html(
1908            r#"
1909            <main>
1910              <p>Primary body remains.</p>
1911              <div class="newsletter signup">
1912                <p>Sign up for updates and subscribe.</p>
1913              </div>
1914              <div class="related">
1915                <a href="/related">Related link</a>
1916              </div>
1917            </main>
1918            "#,
1919        );
1920
1921        assert!(doc.nodes.iter().any(
1922            |node| matches!(node, HtmlNode::Paragraph(text) if text == "Primary body remains.")
1923        ));
1924        assert!(!doc.body_text.contains("Sign up for updates"));
1925        assert!(!doc.body_text.contains("Related link"));
1926    }
1927
/// Checks that a `<br>` placed between two paragraphs results in at least one
/// `Spacer` node with `lines: 1`.
#[test]
fn preserves_br_boundaries_as_spacing_hints() {
    let parsed = parse_html("<main><p>First line.</p><br><p>Second line.</p></main>");

    let is_single_line_spacer = |node: &HtmlNode| matches!(node, HtmlNode::Spacer { lines: 1 });
    assert!(parsed.nodes.iter().any(is_single_line_spacer));
}
1937
/// Parses a `<main>` containing a fenced code block, an unordered list, a
/// two-row table and an image, and checks that each shows up as its
/// structured `HtmlNode` variant (with the code block first).
#[test]
fn extracts_structured_nodes() {
    let html = r#"
            <main>
              <pre><code class="language-rust">fn main() {}</code></pre>
              <ul><li>Read</li><li>Search</li></ul>
              <table><tr><th>Name</th></tr><tr><td>Index</td></tr></table>
              <img src="/logo.png" alt="Index logo">
            </main>
            "#;
    let parsed = parse_html(html);

    let leads_with_rust_code = matches!(
        parsed.nodes.first(),
        Some(HtmlNode::CodeBlock {
            language: Some(language),
            ..
        }) if language == "rust"
    );
    assert!(leads_with_rust_code);

    // Built once so the closure below only borrows it.
    let expected_items = vec!["Read".to_owned(), "Search".to_owned()];
    let has_unordered_list = parsed.nodes.iter().any(|node| {
        matches!(node, HtmlNode::List { ordered: false, items } if items == &expected_items)
    });
    assert!(has_unordered_list);

    let has_two_row_table = parsed
        .nodes
        .iter()
        .any(|node| matches!(node, HtmlNode::Table { rows } if rows.len() == 2));
    assert!(has_two_row_table);

    let has_logo_image = parsed
        .nodes
        .iter()
        .any(|node| matches!(node, HtmlNode::Image { alt, .. } if alt == "Index logo"));
    assert!(has_logo_image);
}
1974
/// Checks that newlines and leading spaces inside `<pre><code>` and a bare
/// `<code>` element are carried verbatim into `CodeBlock::code`, and that the
/// `language-rust` class is reported as the language only for the first one.
#[test]
fn preserves_pre_and_code_text_whitespace() {
    let parsed = parse_html(
        "<main><pre><code class=\"language-rust\">fn main() {\n    println!(\"hi\");\n}</code></pre><code>  raw\n  block</code></main>",
    );

    let rust_source = "fn main() {\n    println!(\"hi\");\n}";
    let has_rust_block = parsed.nodes.iter().any(|node| {
        matches!(
            node,
            HtmlNode::CodeBlock { language: Some(language), code }
                if language == "rust" && code == rust_source
        )
    });
    assert!(has_rust_block);

    let raw_source = "  raw\n  block";
    let has_untagged_block = parsed.nodes.iter().any(|node| {
        matches!(
            node,
            HtmlNode::CodeBlock { language: None, code } if code == raw_source
        )
    });
    assert!(has_untagged_block);
}
1991
/// Checks that a `<blockquote>` is emitted as a `Paragraph` whose text is
/// prefixed with `> `.
#[test]
fn preserves_blockquote_as_quoted_paragraph() {
    let parsed = parse_html("<main><blockquote>Quoted reply text.</blockquote></main>");

    let has_quoted_paragraph = parsed.nodes.iter().any(|node| {
        matches!(node, HtmlNode::Paragraph(text) if text == "> Quoted reply text.")
    });
    assert!(has_quoted_paragraph);
}
2000
/// Checks layout spacing derived from `<style>` rules and block boundaries.
///
/// The stylesheet requests large gaps (`60px`, `6rem`). The test expects every
/// emitted `Spacer` to stay within `1..=3` lines, at least one spacer to reach
/// 3 lines, and the node list to start with the `Landing` heading and end with
/// the `Chapter body.` paragraph.
#[test]
fn extracts_bounded_layout_spacing_from_css_and_block_boundaries() {
    let doc = parse_html(
        r#"
            <html lang="en-US">
              <head>
                <style>
                  .hero { margin-bottom: 60px; }
                  .chapter { padding-top: 2rem; padding-bottom: 6rem; }
                  p { margin-bottom: 1rem; }
                </style>
              </head>
              <main>
                <section class="hero"><h1>Landing</h1><p>Intro.</p></section>
                <section class="chapter"><p>Chapter body.</p></section>
              </main>
            </html>
            "#,
    );

    let spacer_lines: Vec<_> = doc
        .nodes
        .iter()
        .filter_map(|node| match node {
            HtmlNode::Spacer { lines } => Some(*lines),
            _ => None,
        })
        .collect();

    // The bound must hold for every spacer. An `any` check here would already
    // be implied by the `lines: 3` assertion below and would verify nothing.
    assert!(spacer_lines.iter().all(|lines| (1..=3).contains(lines)));
    // Oversized CSS gaps are expected to produce a spacer of exactly 3 lines.
    assert!(spacer_lines.contains(&3));

    assert!(matches!(
        doc.nodes.first(),
        Some(HtmlNode::Heading { text, .. }) if text == "Landing"
    ));
    assert!(matches!(
        doc.nodes.last(),
        Some(HtmlNode::Paragraph(text)) if text == "Chapter body."
    ));
}
2040
/// Checks metadata extraction (canonical URL, language, description, Open
/// Graph title/description) and that the relative canonical link, anchor
/// `href` and image `src` are resolved against the `<base href>` value.
#[test]
fn extracts_metadata_and_resolves_relative_urls_against_base() {
    let html = r#"
            <html lang="en-US">
              <head>
                <base href="https://example.com/docs/">
                <link rel="canonical" href="guide.html">
                <meta name="description" content="A calm reader">
                <meta property="og:title" content="OG Title">
                <meta property="og:description" content="OG Description">
              </head>
              <main><a href="chapter.html">Chapter</a><img src="img.png" alt="Image"></main>
            </html>
            "#;
    let parsed = parse_html(html);

    // Head metadata.
    let meta = &parsed.metadata;
    assert_eq!(
        meta.canonical_url.as_deref(),
        Some("https://example.com/docs/guide.html")
    );
    assert_eq!(meta.language.as_deref(), Some("en-US"));
    assert_eq!(meta.description.as_deref(), Some("A calm reader"));
    assert_eq!(meta.open_graph_title.as_deref(), Some("OG Title"));
    assert_eq!(
        meta.open_graph_description.as_deref(),
        Some("OG Description")
    );

    // Relative URLs in the body are made absolute.
    let first_href = parsed.links.first().map(|link| link.href.as_str());
    assert_eq!(first_href, Some("https://example.com/docs/chapter.html"));

    let has_resolved_image = parsed.nodes.iter().any(|node| {
        matches!(
            node,
            HtmlNode::Image { src: Some(src), .. } if src == "https://example.com/docs/img.png"
        )
    });
    assert!(has_resolved_image);
}
2078
/// Checks that a single `<form>` is extracted with its name, upper-cased
/// method, action resolved against `<base href>`, two inputs and a submit
/// button, and that the form also appears as an `HtmlNode::Form` node.
#[test]
fn extracts_forms_inputs_buttons_and_resolves_actions() {
    let html = r#"
            <html>
              <head><base href="https://example.com/docs/"></head>
              <main>
                <form id="search" method="get" action="../search">
                  <input type="search" name="q" required>
                  <input type="hidden" name="source" value="docs">
                  <button type="submit" name="go" value="1">Search</button>
                </form>
              </main>
            </html>
            "#;
    let parsed = parse_html(html);

    // Exactly one form, so indexing below is safe.
    assert_eq!(parsed.forms.len(), 1);
    let search_form = &parsed.forms[0];

    assert_eq!(search_form.name, "search");
    assert_eq!(search_form.method, "GET");
    assert_eq!(search_form.action, "https://example.com/search");
    assert_eq!(search_form.inputs.len(), 2);

    let has_required_query = search_form
        .inputs
        .iter()
        .any(|input| input.name == "q" && input.required);
    assert!(has_required_query);

    let has_submit_button = search_form
        .buttons
        .iter()
        .any(|button| button.name.as_deref() == Some("go") && button.label == "Search");
    assert!(has_submit_button);

    let form_in_node_list = parsed
        .nodes
        .iter()
        .any(|node| matches!(node, HtmlNode::Form(embedded) if embedded.name == "search"));
    assert!(form_in_node_list);
}
2118
/// Checks the value reported for `<select>` inputs.
///
/// `sort` has an explicitly `selected` option and is expected to report its
/// `value` (`recent`). `view` has no `selected` option and its first option
/// has no `value` attribute; it is expected to report that option's text
/// (`compact`).
#[test]
fn extracts_select_values_for_form_inputs() {
    let doc = parse_html(
        r#"
            <main>
              <form id="filters" action="/search">
                <select name="sort">
                  <option value="relevance">Relevance</option>
                  <option value="recent" selected>Most recent</option>
                </select>
                <select name="view">
                  <option>compact</option>
                  <option value="expanded">expanded</option>
                </select>
              </form>
            </main>
            "#,
    );

    // Fail with a clear assertion (not an index-out-of-bounds panic) when the
    // form was not extracted.
    assert_eq!(doc.forms.len(), 1);
    let form = &doc.forms[0];
    assert!(
        form.inputs
            .iter()
            .any(|input| input.name == "sort" && input.value.as_deref() == Some("recent"))
    );
    assert!(
        form.inputs
            .iter()
            .any(|input| input.name == "view" && input.value.as_deref() == Some("compact"))
    );
}
2150
/// Checks that a paragraph consisting solely of an anchor produces no
/// `Paragraph` node with the anchor text, while the link itself is still
/// recorded in `links`.
#[test]
fn drops_anchor_only_paragraph_nodes_but_keeps_links() {
    let parsed = parse_html(r#"<main><p><a href="https://example.com">Read more</a></p></main>"#);

    let has_anchor_text_paragraph = parsed
        .nodes
        .iter()
        .any(|node| matches!(node, HtmlNode::Paragraph(text) if text == "Read more"));
    assert!(!has_anchor_text_paragraph);

    assert_eq!(parsed.links.len(), 1);
}
2160
/// Parses HTML with unclosed `<h1>`, `<p>` and `<a>` tags and checks that a
/// title and non-empty body text are still produced.
#[test]
fn malformed_html_does_not_panic() {
    let parsed = parse_html("<main><h1>Broken<p>Still readable<a href=\"/x\">link");

    let title = parsed.title.as_deref();
    assert_eq!(title, Some("Broken Still readable link"));

    let body_is_empty = parsed.body_text.is_empty();
    assert!(!body_is_empty);
}
2167
/// Checks the three manifest discovery paths for one page URL: the
/// `.well-known` location, a `<link rel="index-manifest ...">` element in
/// HTML, and an HTTP `Link` header with `rel="index-manifest"`. Each is
/// expected to yield an absolute URL on the page's origin.
#[test]
fn index_manifest_discovery_supports_well_known_html_and_header_links() {
    let page_url = "https://example.org/docs/guide";

    // Well-known location derived from the page URL alone.
    let well_known = well_known_index_manifest_url(page_url);
    assert_eq!(
        well_known.as_deref(),
        Some("https://example.org/.well-known/index.idx")
    );

    // `<link>` element whose `rel` lists several tokens.
    let html = r#"<html><head><link rel="index-manifest preload" href="/manifests/site.idx"></head></html>"#;
    let from_html = discover_index_manifest_link_from_html(html, page_url);
    assert_eq!(
        from_html.as_deref(),
        Some("https://example.org/manifests/site.idx")
    );

    // HTTP `Link` header value.
    let link_header = r#"</meta/index.idx>; rel="index-manifest"; type="application/json""#;
    let from_header = discover_index_manifest_link_from_http_link_header(link_header, page_url);
    assert_eq!(
        from_header.as_deref(),
        Some("https://example.org/meta/index.idx")
    );
}
2192
/// Parses a fully populated manifest served from the page's own origin with
/// a scope covering the page, and checks the version, scope, main selector,
/// the number of hints in each section and the parsed date style.
///
/// Any parse error is propagated with `?` and fails the test.
#[test]
fn parse_index_manifest_validates_same_origin_scope_and_date_styles()
-> Result<(), Box<dyn std::error::Error>> {
    let manifest_json = r#"{
                "version": "index.idx/v1",
                "scope": "/docs",
                "content": { "main_selector": "main article" },
                "regions": [{ "role": "related", "selector": "aside.related", "collapsed": true }],
                "fields": [{ "name": "updated", "label": "Updated" }],
                "forms": [{ "name": "search", "selector": "form.search", "note": "Public search" }],
                "dates": [{ "field": "updated", "style": "date" }]
            }"#;
    let manifest_url = "https://example.org/.well-known/index.idx";
    let page_url = "https://example.org/docs/guide";

    let parsed = parse_index_manifest(manifest_json, manifest_url, page_url)?;

    assert_eq!(parsed.version, "index.idx/v1");
    assert_eq!(parsed.scope, "/docs");
    assert_eq!(parsed.content.main_selector.as_deref(), Some("main article"));

    // One hint was supplied per section.
    let hint_counts = [
        parsed.regions.len(),
        parsed.fields.len(),
        parsed.forms.len(),
        parsed.dates.len(),
    ];
    assert_eq!(hint_counts, [1, 1, 1, 1]);

    assert_eq!(parsed.dates[0].style, IndexDateStyle::Date);
    Ok(())
}
2223
/// Checks three rejection paths of `parse_index_manifest` for the same page
/// URL: a manifest on a different host (`CrossOrigin`), a scope without a
/// leading slash (`InvalidScope`), and a scope that does not cover the page
/// path (`OutOfScope`).
#[test]
fn parse_index_manifest_rejects_cross_origin_and_invalid_scope() {
    let page_url = "https://example.org/docs/guide";
    let same_origin_manifest_url = "https://example.org/index.idx";

    // Manifest hosted on another origin than the page.
    let cross_origin = parse_index_manifest(
        r#"{"version":"index.idx/v1"}"#,
        "https://cdn.example.org/index.idx",
        page_url,
    );
    let rejected_cross_origin = matches!(
        cross_origin,
        Err(super::IndexManifestError::CrossOrigin { .. })
    );
    assert!(rejected_cross_origin);

    // Scope value "docs" is rejected as invalid.
    let invalid_scope = parse_index_manifest(
        r#"{"version":"index.idx/v1","scope":"docs"}"#,
        same_origin_manifest_url,
        page_url,
    );
    let rejected_invalid_scope = matches!(
        invalid_scope,
        Err(super::IndexManifestError::InvalidScope(_))
    );
    assert!(rejected_invalid_scope);

    // Scope "/blog" does not cover the "/docs/guide" page.
    let out_of_scope = parse_index_manifest(
        r#"{"version":"index.idx/v1","scope":"/blog"}"#,
        same_origin_manifest_url,
        page_url,
    );
    let rejected_out_of_scope = matches!(
        out_of_scope,
        Err(super::IndexManifestError::OutOfScope { .. })
    );
    assert!(rejected_out_of_scope);
}
2256}