Skip to main content

servo_fetch/
layout.rs

1//! CSS layout heuristics — detects page structure (navbar, sidebar, footer, main)
2//! to improve content extraction accuracy.
3
4use serde::Deserialize;
5
/// Width of the viewport Servo renders into by default.
///
/// The sidebar and footer heuristics compare element widths against
/// fractions of this value.
pub const VIEWPORT_WIDTH: u32 = 1280;
/// Height of the viewport Servo renders into by default.
///
/// The navbar heuristic compares element heights against a fraction of
/// this value.
pub const VIEWPORT_HEIGHT: u32 = 800;
10
11/// ARIA roles that influence our layout heuristics.
12#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
13#[serde(rename_all = "lowercase")]
14pub(crate) enum Role {
15    Navigation,
16    Complementary,
17    Contentinfo,
18    #[serde(untagged)]
19    Other(String),
20}
21
22impl Role {
23    fn as_str(&self) -> &str {
24        match self {
25            Self::Navigation => "navigation",
26            Self::Complementary => "complementary",
27            Self::Contentinfo => "contentinfo",
28            Self::Other(s) => s,
29        }
30    }
31}
32
33/// CSS `position` values that we treat as stacking overlays (likely navbars).
34#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
35#[serde(rename_all = "lowercase")]
36enum Position {
37    Fixed,
38    Sticky,
39    #[serde(other)]
40    Other,
41}
42
43impl Position {
44    fn is_stacking(&self) -> bool {
45        matches!(self, Self::Fixed | Self::Sticky)
46    }
47}
48
49/// A page element with CSS layout data, deserialized from the injected JS.
50#[derive(Deserialize)]
51pub struct LayoutElement {
52    tag: String,
53    role: Option<Role>,
54    w: f64,
55    h: f64,
56    position: Position,
57}
58
59impl LayoutElement {
60    fn is_navbar(&self) -> bool {
61        self.position.is_stacking() && self.h < f64::from(VIEWPORT_HEIGHT) * 0.2
62    }
63
64    fn is_sidebar(&self) -> bool {
65        let is_narrow = self.w < f64::from(VIEWPORT_WIDTH) * 0.3;
66        let is_side_tag = matches!(self.tag.as_str(), "ASIDE" | "NAV");
67        let is_side_role = matches!(self.role, Some(Role::Navigation | Role::Complementary));
68        is_narrow && (is_side_tag || is_side_role)
69    }
70
71    fn is_footer(&self) -> bool {
72        let is_full_width = self.w >= f64::from(VIEWPORT_WIDTH) * 0.8;
73        (self.tag == "FOOTER" && is_full_width) || self.role == Some(Role::Contentinfo)
74    }
75
76    fn should_remove(&self) -> bool {
77        self.is_navbar() || self.is_sidebar() || self.is_footer()
78    }
79}
80
81/// CSS selectors for elements that should be stripped before passing to readability.
82#[must_use]
83pub fn selectors_to_strip(elements: &[LayoutElement]) -> Vec<String> {
84    let mut sels: Vec<String> = elements
85        .iter()
86        .filter(|el| el.should_remove())
87        .map(|el| {
88            let tag = el.tag.to_lowercase();
89            match &el.role {
90                Some(role) => format!("{tag}[role=\"{}\"]", role.as_str()),
91                None => tag,
92            }
93        })
94        .collect();
95    sels.sort_unstable();
96    sels.dedup();
97    sels
98}
99
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a [`LayoutElement`] directly, bypassing deserialization.
    fn fixture(tag: &str, w: f64, h: f64, position: Position, role: Option<Role>) -> LayoutElement {
        LayoutElement {
            tag: tag.to_owned(),
            role,
            w,
            h,
            position,
        }
    }

    /// Runs the heuristics over a single element.
    fn strip_one(element: LayoutElement) -> Vec<String> {
        selectors_to_strip(&[element])
    }

    /// Deserializes a [`LayoutElement`] from a JSON literal.
    fn parse(json: &str) -> LayoutElement {
        serde_json::from_str(json).unwrap()
    }

    // --- navbar -----------------------------------------------------------

    #[test]
    fn detects_fixed_navbar() {
        let stripped = strip_one(fixture("HEADER", 1280.0, 60.0, Position::Fixed, None));
        assert_eq!(stripped, ["header"]);
    }

    #[test]
    fn detects_sticky_navbar() {
        let stripped = strip_one(fixture("NAV", 1280.0, 50.0, Position::Sticky, None));
        assert_eq!(stripped, ["nav"]);
    }

    #[test]
    fn ignores_tall_fixed_element() {
        let stripped = strip_one(fixture("DIV", 1280.0, 400.0, Position::Fixed, None));
        assert!(stripped.is_empty());
    }

    // --- sidebar ----------------------------------------------------------

    #[test]
    fn detects_narrow_aside_as_sidebar() {
        let stripped = strip_one(fixture("ASIDE", 300.0, 800.0, Position::Other, None));
        assert_eq!(stripped, ["aside"]);
    }

    #[test]
    fn detects_complementary_role_as_sidebar() {
        let stripped = strip_one(fixture(
            "DIV",
            250.0,
            800.0,
            Position::Other,
            Some(Role::Complementary),
        ));
        assert_eq!(stripped, ["div[role=\"complementary\"]"]);
    }

    // --- footer -----------------------------------------------------------

    #[test]
    fn detects_footer() {
        let stripped = strip_one(fixture("FOOTER", 1280.0, 100.0, Position::Other, None));
        assert_eq!(stripped, ["footer"]);
    }

    #[test]
    fn ignores_narrow_footer() {
        // A <footer> nested in an <article> tends to be narrow and must survive.
        let stripped = strip_one(fixture("FOOTER", 600.0, 50.0, Position::Other, None));
        assert!(stripped.is_empty());
    }

    #[test]
    fn detects_contentinfo_role_as_footer() {
        let stripped = strip_one(fixture(
            "DIV",
            1280.0,
            100.0,
            Position::Other,
            Some(Role::Contentinfo),
        ));
        assert_eq!(stripped, ["div[role=\"contentinfo\"]"]);
    }

    // --- output shaping ---------------------------------------------------

    #[test]
    fn deduplicates_selectors() {
        let navs = [
            fixture("NAV", 200.0, 50.0, Position::Fixed, None),
            fixture("NAV", 250.0, 40.0, Position::Sticky, None),
        ];
        assert_eq!(selectors_to_strip(&navs), ["nav"]);
    }

    // --- deserialization --------------------------------------------------

    #[test]
    fn deserializes_role_from_json() {
        let parsed = parse(r#"{"tag":"DIV","role":"navigation","w":100.0,"h":50.0,"position":"static"}"#);
        assert_eq!(parsed.role, Some(Role::Navigation));
    }

    #[test]
    fn deserializes_unknown_role_as_other() {
        let parsed = parse(r#"{"tag":"DIV","role":"banner","w":100.0,"h":50.0,"position":"static"}"#);
        assert_eq!(parsed.role, Some(Role::Other("banner".to_string())));
    }

    #[test]
    fn deserializes_unknown_position_as_other() {
        let parsed = parse(r#"{"tag":"DIV","role":null,"w":100.0,"h":50.0,"position":"absolute"}"#);
        assert_eq!(parsed.position, Position::Other);
    }
}
194}