1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
use std::collections::HashMap;

use scraper::ElementRef;

use crate::Result;

/// The raw role values extracted from the WAI-ARIA spec
///
/// One value is produced for every `.role` element that contains a `.role-name code` element.
/// `Option<String>` fields hold the trimmed text of the first element matching their selector,
/// `bool` fields are `true` only when the first text node of the matched element is exactly
/// `True`, and `Vec<String>` fields hold the (untrimmed) text of every matching element.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ScrapedAriaRole {
    /// Role name (text of `.role-name code`)
    pub name: String,

    /// Implicit value for role (text of `.implicit-values`)
    pub implicit_values: Option<String>,
    /// Whether the role is marked abstract (`.role-abstract`)
    pub is_abstract: bool,
    /// Base concept (text of `.role-base`)
    pub base: Option<String>,
    /// Whether children are marked presentational (`.role-childpresentational`)
    pub are_children_presentational: bool,
    /// Subclass roles (`.role-children code`)
    pub children: Vec<String>,
    /// Prohibited states and properties (`.role-disallowed code`)
    pub disallowed: Vec<String>,
    /// Inherited states and properties (`.role-inherited code`)
    pub inherited: Vec<String>,
    /// Allowed accessibility child roles (`.role-mustcontain code`)
    pub must_contain: Vec<String>,
    /// Raw text of the `.role-namefrom` element
    pub name_from: Option<String>,
    /// Whether a name is marked as required (`.role-namerequired`)
    pub is_name_required: bool,
    /// Superclass role (`.role-parent code`)
    pub parent: Vec<String>,
    /// Supported states and properties (`.role-properties code`)
    pub properties: Vec<String>,
    /// Related concepts (text of `.role-related`)
    pub related: Option<String>,
    /// Required states and properties (`.required-properties code`)
    pub required: Vec<String>,
    /// Required accessibility parent roles (`.role-scope code`)
    pub scope: Vec<String>,
}

/// The raw property and state values extracted from the WAI-ARIA spec
///
/// One value is produced for every `.property` or `.state` element that contains a
/// `.property-name code` or `.state-name code` element.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ScrapedAriaProperty {
    /// Whether the definition element carries the `property` class or not
    pub kind: PropertyKind,
    /// Attribute name (trimmed text of `.property-name code` / `.state-name code`)
    pub name: String,
    /// Text of the `[data-idl="attribute"]` element found in the same row of the
    /// `#accessibilityroleandproperties-correspondence` table as this name, if any
    pub idl_name: Option<String>,
    /// Text of the `dd` paired with this name in the `dl#index_state_prop` list, if any
    pub description: Option<String>,
    /// Whether the name is referenced by a link in the `#global_states` list
    pub is_global: bool,

    /// Used in roles (`.property-applicability code` / `.state-applicability code`)
    pub applicability: Vec<String>,
    /// Inherits into roles (`.property-descendants code` / `.state-descendants code`)
    pub descendants: Vec<String>,
    /// Related concepts (text of `.property-related` / `.state-related`)
    pub related: Option<String>,
    /// Raw text of the `.property-value` / `.state-value` element
    pub value_kind: String,
    /// Text of every `.value-name` element inside the definition
    pub values: Vec<String>,
}

/// Whether a [`ScrapedAriaProperty`] is an ARIA property or state
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
pub enum PropertyKind {
    /// The definition element carries the `property` class
    Property,
    /// The definition element does not carry the `property` class (it matched `.state`)
    State,
}

/// The raw element values extracted from the ARIA in HTML spec
///
/// One value is produced per `tbody tr` row of the first table found in the section that
/// contains the `#docconformance` element. "Third cell" below refers to the row's
/// `td:nth-child(3)` element.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ScrapedAriaElement {
    /// `id` attribute of the row's `th`
    pub id: String,
    /// Trimmed text of the row's `th`
    pub name: String,
    /// Text of the `a[href^="#index-aria-"]` links in the row's `td:nth-child(2)`
    pub implicit_roles: Vec<String>,
    /// Text of the `a[href^="#index-aria-"]` links in the third cell
    pub allowed_roles: Vec<String>,
    /// Text of the `a[data-cite^="wai-aria-1.2#aria-"]` links in the third cell
    pub allowed_properties: Vec<String>,
    /// Text of the `a[data-cite="wai-aria-1.2#global_states"]` link in the third cell, if any
    pub global: Option<String>,
    /// Text of the `a[href="#att-checked"]` link in the third cell, if any
    pub checked: Option<String>,
    /// Text of every `strong` element in the third cell
    pub strong: Vec<String>,
    /// Text of every `a` element without `href` and `data-cite` attributes in the third cell
    pub links: Vec<String>,
}

/// Parse the W3C WAI-ARIA standards document.
///
/// `spec` is the HTML source of the document. On success the scraped role definitions are
/// returned together with the scraped property and state definitions.
///
/// # Errors
///
/// Forwards any error reported while scraping the roles or the properties and states.
pub fn scrape_aria(spec: String) -> Result<(Vec<ScrapedAriaRole>, Vec<ScrapedAriaProperty>)> {
    let html = scraper::Html::parse_document(spec.as_str());
    // Tuple fields are evaluated left to right, so roles are scraped first.
    Ok((
        scrape_aria_roles(&html)?,
        scrape_aria_properties_and_states(&html)?,
    ))
}

/// Parse the W3C ARIA in HTML standards document.
///
/// Finds the element with id `docconformance`, takes the first `table` inside that element's
/// parent and turns every `tbody tr` row of the table into a [`ScrapedAriaElement`].
///
/// # Panics
///
/// Panics if the `#docconformance` element, its parent element or the table cannot be found, or
/// if a row has no `th` with an `id` attribute or no `td:nth-child(3)` cell.
pub fn scrape_html_aria(spec: String) -> Result<Vec<ScrapedAriaElement>> {
    let document = scraper::Html::parse_document(spec.as_str());

    // All selectors are constant, so parsing them cannot fail.
    let anchor_selector = scraper::Selector::parse("#docconformance").unwrap();
    let table_selector = scraper::Selector::parse("table").unwrap();
    let row_selector = scraper::Selector::parse("tbody tr").unwrap();
    let allowances_selector = scraper::Selector::parse("td:nth-child(3)").unwrap();

    let anchor = document.select(&anchor_selector).next().unwrap();
    let enclosing_section = ElementRef::wrap(anchor.parent().unwrap()).unwrap();
    let conformance_table = enclosing_section.select(&table_selector).next().unwrap();

    let elements: Vec<ScrapedAriaElement> = conformance_table
        .select(&row_selector)
        .map(|row| {
            let id = extract_id("th", row).unwrap().to_owned();
            let name = extract_str("th", row).unwrap();
            let implicit_roles = extract_vec("td:nth-child(2) a[href^=\"#index-aria-\"]", row);

            // Everything else is read from the third cell of the row.
            let cell = row.select(&allowances_selector).next().unwrap();

            ScrapedAriaElement {
                id,
                name,
                implicit_roles,
                allowed_roles: extract_vec("a[href^=\"#index-aria-\"]", cell),
                allowed_properties: extract_vec("a[data-cite^=\"wai-aria-1.2#aria-\"]", cell),
                global: extract_str("a[data-cite=\"wai-aria-1.2#global_states\"]", cell),
                checked: extract_str("a[href=\"#att-checked\"]", cell),
                strong: extract_vec("strong", cell),
                links: extract_vec("a:not([href]):not([data-cite])", cell),
            }
        })
        .collect();

    Ok(elements)
}

/// Scrape the ARIA role definitions
///
/// Every `.role` element of `document` that contains a `.role-name code` element yields one
/// [`ScrapedAriaRole`]; `.role` elements without a name are skipped.
fn scrape_aria_roles(document: &scraper::Html) -> Result<Vec<ScrapedAriaRole>> {
    let role_selector = scraper::Selector::parse(".role").unwrap();

    let roles: Vec<ScrapedAriaRole> = document
        .select(&role_selector)
        .filter_map(|node| {
            // A `.role` element without a name is not a role definition.
            let name = extract_str(".role-name code", node)?;

            Some(ScrapedAriaRole {
                name,
                implicit_values: extract_str(".implicit-values", node),
                is_abstract: extract_bool(".role-abstract", node),
                base: extract_str(".role-base", node),
                are_children_presentational: extract_bool(".role-childpresentational", node),
                children: extract_vec(".role-children code", node),
                disallowed: extract_vec(".role-disallowed code", node),
                inherited: extract_vec(".role-inherited code", node),
                must_contain: extract_vec(".role-mustcontain code", node),
                name_from: extract_str(".role-namefrom", node),
                is_name_required: extract_bool(".role-namerequired", node),
                parent: extract_vec(".role-parent code", node),
                properties: extract_vec(".role-properties code", node),
                related: extract_str(".role-related", node),
                required: extract_vec(".required-properties code", node),
                scope: extract_vec(".role-scope code", node),
            })
        })
        .collect();

    Ok(roles)
}

/// Scrape the ARIA property and state definitions
///
/// Three lookup tables are built first:
/// - the names referenced by the links of the `#global_states` list,
/// - the descriptions of the `dl#index_state_prop` list, keyed by the trimmed `dt` text,
/// - the IDL attribute names of the `#accessibilityroleandproperties-correspondence` table.
///
/// Then every `.property` / `.state` element that contains a `.property-name code` or
/// `.state-name code` element yields one [`ScrapedAriaProperty`]; elements without a name are
/// skipped.
///
/// # Panics
///
/// Panics if a link of the `#global_states` list has no `href` attribute (or an `href` that
/// cannot be sliced after its first byte), or if a definition has no `.property-value` /
/// `.state-value` element.
fn scrape_aria_properties_and_states(document: &scraper::Html) -> Result<Vec<ScrapedAriaProperty>> {
    let mut global_properties = vec![];
    let selector = scraper::Selector::parse("#global_states li a").unwrap();
    for element in document.select(&selector) {
        // Drop the first character of the href (presumably the `#` of a fragment link) so that
        // the remainder can be compared with attribute names.
        global_properties.push(element.value().attr("href").unwrap()[1..].to_string());
    }

    let mut descriptions = HashMap::new();
    let dt_selector = scraper::Selector::parse("dl#index_state_prop dt").unwrap();
    let dd_selector = scraper::Selector::parse("dl#index_state_prop dd").unwrap();
    // Terms and descriptions are paired by position.
    for (dt, dd) in document
        .select(&dt_selector)
        .zip(document.select(&dd_selector))
    {
        // The lookup below uses a name produced by `extract_str`, which is trimmed. Trim the key
        // as well so that whitespace around the term cannot make the lookup miss.
        let term = dt.text().collect::<String>().trim().to_owned();
        descriptions.insert(term, dd.text().collect::<String>());
    }

    let mut idl_attribute_names = HashMap::new();
    let selector =
        scraper::Selector::parse("#accessibilityroleandproperties-correspondence tr").unwrap();
    for row in document.select(&selector) {
        // Only rows that hold both an IDL attribute and a property/state reference are relevant.
        if let Some(idl) = extract_str("[data-idl=\"attribute\"]", row) {
            if let Some(property) = extract_str(".property-reference, .state-reference", row) {
                idl_attribute_names.insert(property, idl);
            }
        }
    }

    let mut specs = vec![];

    let selector = scraper::Selector::parse(".property, .state").unwrap();
    for element in document.select(&selector) {
        let Some(name) = extract_str(".property-name code, .state-name code", element) else {
            continue;
        };
        let idl_name = idl_attribute_names.get(&name).cloned();
        // The description is moved out of the table: a later definition with the same name gets
        // no description.
        let description = descriptions.remove(&name);

        // Anything matched by the selector that is not a `.property` is a `.state`.
        let kind = if element.value().classes().any(|x| x == "property") {
            PropertyKind::Property
        } else {
            PropertyKind::State
        };

        let is_global = global_properties.contains(&name);
        let applicability = extract_vec(
            ".property-applicability code, .state-applicability code",
            element,
        );
        let descendants = extract_vec(
            ".property-descendants code, .state-descendants code",
            element,
        );
        let related = extract_str(".property-related, .state-related", element);
        let value_kind = extract_str(".property-value, .state-value", element).unwrap();
        let values = extract_vec(".value-name", element);

        specs.push(ScrapedAriaProperty {
            kind,
            name,
            idl_name,
            description,
            is_global,
            applicability,
            descendants,
            related,
            value_kind,
            values,
        });
    }

    Ok(specs)
}

/// Return the `id` attribute of the first element that `selector` matches within `element`.
///
/// Yields `None` when nothing matches or when the matched element has no `id` attribute.
///
/// # Panics
///
/// Panics if `selector` cannot be parsed.
fn extract_id<'a>(selector: &str, element: scraper::ElementRef<'a>) -> Option<&'a str> {
    let parsed = scraper::Selector::parse(selector).unwrap();
    let first_match = element.select(&parsed).next()?;
    first_match.value().attr("id")
}

/// Return the text of the first element that `selector` matches within `element`.
///
/// All text nodes of the matched element are concatenated and the result is stripped of leading
/// and trailing whitespace. Yields `None` when nothing matches.
///
/// # Panics
///
/// Panics if `selector` cannot be parsed.
fn extract_str(selector: &str, element: scraper::ElementRef) -> Option<String> {
    let parsed = scraper::Selector::parse(selector).unwrap();
    let first_match = element.select(&parsed).next()?;
    let text: String = first_match.text().collect();
    Some(text.trim().to_owned())
}

/// Read a boolean flag from the first element that `selector` matches within `element`.
///
/// The result is `true` only when such an element exists and its first text node is exactly
/// `True` (the text is not trimmed and later text nodes are not inspected). In every other case
/// the result is `false`.
///
/// # Panics
///
/// Panics if `selector` cannot be parsed.
fn extract_bool(selector: &str, element: scraper::ElementRef) -> bool {
    let parsed = scraper::Selector::parse(selector).unwrap();
    let first_text = element
        .select(&parsed)
        .next()
        .and_then(|node| node.text().next());

    first_text == Some("True")
}

/// Collect the text of every element that `selector` matches within `element`.
///
/// Each entry is the concatenation of all text nodes of one matched element, in match order.
/// Unlike `extract_str`, the text is not trimmed.
///
/// # Panics
///
/// Panics if `selector` cannot be parsed.
fn extract_vec(selector: &str, element: scraper::ElementRef) -> Vec<String> {
    let parsed = scraper::Selector::parse(selector).unwrap();

    let mut texts = Vec::new();
    for node in element.select(&parsed) {
        texts.push(node.text().collect::<String>());
    }
    texts
}