url_cleaner_engine/glue/parse/html/
get_attribute.rs

1//! Gets attributes from HTML elements.
2
3use thiserror::Error;
4use serde::{Serialize, Deserialize};
5
6use super::*;
7use crate::util::*;
8
9/// The enum of errors that can be encountered when failing to parse an HTML element.
10#[derive(Debug, Error, Clone, Copy)]
11pub enum GAVSyntaxErrorKind {
12    /// The [input-doesnt-start-with-html-element](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
13    #[error("Input doesn't start with an HTML element.")]
14    InputDoesntStartWithHtmlElement,
15    /// The [unexpected-question-mark-instead-of-tagname](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
16    #[error("Unexpected question mark instead of tag name.")]
17    UnexpectedQuestionMarkInsteadOfTagName,
18    /// The [invalid-start-of-tag-name](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
19    #[error("Invalid start of tag name.")]
20    InvalidStartOfTagName,
21    /// The [unexpected-null-character](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
22    #[error("Unexpected null character.")]
23    UnexpectedNullCharacter,
24    /// The [unexpected-solidus-in-tag](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
25    #[error("Unexpected solidus in tag.")]
26    UnexpectedSolidusInTag,
27    /// The [unexpected-equals-sign-before-attribute-name](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
28    #[error("Unexpected equals sign before attribute name.")]
29    UnexpectedEqualsSignBeforeAttributeName,
30    /// The [unexpected-character-in-attribute-name](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
31    #[error("Unexpected character in attribute name.")]
32    UnexpectedCharacterInAttributeName,
33    /// The [missing-attribute-value](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
34    #[error("Missing attribute value.")]
35    MissingAttributeValue,
36    /// The [missing-whitespace-between-attributes](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
37    #[error("Missing whitespace between attributes.")]
38    MissingWhitespaceBetweenAttributes
39}
40
41/// The enum of errors [`get_attribute_value`] can return.
42#[derive(Debug, Error)]
43pub enum GAVError {
44    /// A syntax error.
45    #[error("Syntax error: {index}, {last_bite:?}, {kind:?}")]
46    Syntax {
47        /// The index of the input string the error happened.
48        index: usize,
49        /// The state the previous character put the DFA in.
50        last_bite: GAVLastBite,
51        /// The error kind.
52        kind: GAVSyntaxErrorKind
53    },
54    /// Returned when an [`UnescapeTextError`] is encountered.
55    #[error(transparent)]
56    UnescapeTextError(#[from] UnescapeTextError),
57    /// Returned when the HTML tag isn't finished.
58    #[error("The HTML tag wasn't finished.")]
59    UnfinishedTag
60}
61
62/// The states the DFA in [`get_attribute_value`] can be in.
63#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
64#[serde(deny_unknown_fields)]
65pub enum GAVLastBite {
66    /// The [Data](https://html.spec.whatwg.org/multipage/parsing.html#data-state) state.
67    Data,
68    /// The [Tag open](https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state) state.
69    TagOpen,
70    /// The [Tag name](https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state) state.
71    TagName,
72    /// The [Self-closing start tag](https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state) state.
73    SelfClosingStartTag,
74    /// The [Before attribute name](https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state) state.
75    BeforeAttributeName,
76    /// The [Attribute name](https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state) state.
77    AttributeName,
78    /// The [After attribute name](https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state) state.
79    AfterAttributeName,
80    /// The [Before attribute value](https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state) state.
81    BeforeAttributeValue,
82    /// The [Attribute value (double-quoted)](https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state) state.
83    AttributeValueDoubleQuoted,
84    /// The [Attribute value (single-quoted)](https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state) state.
85    AttributeValueSingleQuoted,
86    /// The [Attribute value (unquoted)](https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state) state.
87    AttributeValueUnquoted,
88    /// The [After attribute value (quoted)](https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state) state.
89    AfterAttributeValueQuoted,
90    /// The done state.
91    Done
92}
93
94/// The current state of the [`get_attribute_value`] DFA.
95#[derive(Debug)]
96struct GAVState<'a> {
97    /// The input.
98    input: &'a str,
99    /// The name of the attribute to search for.
100    name: &'a str,
101    /// The state the last bite put the DFA in.
102    last_bite: GAVLastBite,
103    /// The return value.
104    ret: Option<Option<&'a str>>,
105    /// The location of the start of the most recent attribute's name.
106    attr_name_start: usize,
107    /// The location of the end of the most recent attribute's name.
108    attr_name_end: usize,
109    /// The location of the start of the most recent attribute's value.
110    attr_value_start: usize
111}
112
113/// Shorthand.
114type LB = GAVLastBite;
115/// Shorthand.
116type EK = GAVSyntaxErrorKind;
117
118/// Take a string that starts with an HTML element and get the value of the last attribute with the specified name.
119///
120/// Currently recoverable errors (as defined by the spec) aren't recovered from. I should eventually make this do that.
121/// # Errors
122/// It's complicated, but TL;DR if the spec says an error happens (even if it can recover), that error is returned.
123///
124/// If the call to [`unescape_text`] returns an error, that error is returned.
125///
126/// If the input doesn't start with a complete HTML start tag, returns the error [`GAVError::UnfinishedTag`].
127/// # Examples
128/// ```
129/// use url_cleaner_engine::glue::*;
130///
131/// assert_eq!(parse::html::get_attribute_value("<a href='aaa'>"       , "href").unwrap(), Some(Some("aaa" .to_string())));
132/// assert_eq!(parse::html::get_attribute_value("<a href='a&quot;a'>"  , "href").unwrap(), Some(Some("a\"a".to_string())));
133/// assert_eq!(parse::html::get_attribute_value("<a href=\"aaa\">"     , "href").unwrap(), Some(Some("aaa" .to_string())));
134/// assert_eq!(parse::html::get_attribute_value("<a href=\"a&quot;a\">", "href").unwrap(), Some(Some("a\"a".to_string())));
135/// assert_eq!(parse::html::get_attribute_value("<a href=aaa>"         , "href").unwrap(), Some(Some("aaa" .to_string())));
136/// assert_eq!(parse::html::get_attribute_value("<a href=a&quot;a>"    , "href").unwrap(), Some(Some("a\"a".to_string())));
137///
138/// assert_eq!(parse::html::get_attribute_value("<a href='aaa'        >", "href").unwrap(), Some(Some("aaa" .to_string())));
139/// assert_eq!(parse::html::get_attribute_value("<a href='a&quot;a'   >", "href").unwrap(), Some(Some("a\"a".to_string())));
140/// assert_eq!(parse::html::get_attribute_value("<a href=\"aaa\"      >", "href").unwrap(), Some(Some("aaa" .to_string())));
141/// assert_eq!(parse::html::get_attribute_value("<a href=\"a&quot;a\" >", "href").unwrap(), Some(Some("a\"a".to_string())));
142/// assert_eq!(parse::html::get_attribute_value("<a href=aaa          >", "href").unwrap(), Some(Some("aaa" .to_string())));
143/// assert_eq!(parse::html::get_attribute_value("<a href=a&quot;a     >", "href").unwrap(), Some(Some("a\"a".to_string())));
144///
145/// assert_eq!(parse::html::get_attribute_value("<a href=b href='aaa'        >", "href").unwrap(), Some(Some("aaa" .to_string())));
146/// assert_eq!(parse::html::get_attribute_value("<a href=b href='a&quot;a'   >", "href").unwrap(), Some(Some("a\"a".to_string())));
147/// assert_eq!(parse::html::get_attribute_value("<a href=b href=\"aaa\"      >", "href").unwrap(), Some(Some("aaa" .to_string())));
148/// assert_eq!(parse::html::get_attribute_value("<a href=b href=\"a&quot;a\" >", "href").unwrap(), Some(Some("a\"a".to_string())));
149/// assert_eq!(parse::html::get_attribute_value("<a href=b href=aaa          >", "href").unwrap(), Some(Some("aaa" .to_string())));
150/// assert_eq!(parse::html::get_attribute_value("<a href=b href=a&quot;a     >", "href").unwrap(), Some(Some("a\"a".to_string())));
151///
152/// assert_eq!(parse::html::get_attribute_value("<a>", "href").unwrap(), None, "1");
153///
154/// assert_eq!(parse::html::get_attribute_value("<a href>"                           , "href").unwrap(), Some(None));
155///
156/// assert_eq!(parse::html::get_attribute_value("<a href href=\"1\">"                , "href").unwrap(), Some(Some("1".to_string())));
157/// assert_eq!(parse::html::get_attribute_value("<a href href=\"1\" href>"           , "href").unwrap(), Some(None));
158/// assert_eq!(parse::html::get_attribute_value("<a href href=\"1\" href href=\"2\">", "href").unwrap(), Some(Some("2".to_string())));
159///
160/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\" href>"                , "href").unwrap(), Some(None));
161/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\" href href=\"2\">"     , "href").unwrap(), Some(Some("2".to_string())));
162/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\" href href=\"2\" href>", "href").unwrap(), Some(None));
163///
164/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\">stuff"              , "href").unwrap(), Some(Some("1".to_string())));
165/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\"><a href=\"2\">"     , "href").unwrap(), Some(Some("1".to_string())));
166/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\">stuff<a href=\"2\">", "href").unwrap(), Some(Some("1".to_string())));
167/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\">href=\"2\""         , "href").unwrap(), Some(Some("1".to_string())));
168/// ```
169pub fn get_attribute_value<'a>(input: &'a str, name: &'a str) -> Result<Option<Option<String>>, GAVError> {
170    debug!(parse::html::get_attribute_value, &(), input, name);
171
172    let mut state = GAVState {
173        input,
174        name,
175        last_bite: LB::Data,
176        ret: None,
177        attr_name_start: 0,
178        attr_name_end: 0,
179        attr_value_start: 0
180    };
181
182    for (i, c) in input.chars().enumerate() {
183        if let Err(e) = munch(&mut state, i, c) {
184            return Err(GAVError::Syntax {
185                index: i,
186                last_bite: state.last_bite,
187                kind: e
188            });
189        }
190        if matches!(state.last_bite, LB::Done) {
191            return Ok(match state.ret {
192                Some(Some(value)) => Some(Some(unescape_text(value)?)),
193                Some(None) => Some(None),
194                None => None
195            });
196        }
197    }
198
199    Err(GAVError::UnfinishedTag)
200}
201
202/// Advance the state of the [`get_attribute_value`] DFA.
203fn munch(state: &mut GAVState, i: usize, c: char) -> Result<(), GAVSyntaxErrorKind> {
204    debug!(parse::html::get_attribute::munch, &(), state, i, c);
205    match (state.last_bite, c) {
206        (LB::Data, '<') => {state.last_bite = LB::TagOpen;},
207        (LB::Data, _  ) => Err(EK::InputDoesntStartWithHtmlElement)?,
208
209
210        (LB::TagOpen, 'a'..='z' | 'A'..='Z') => {state.last_bite = LB::TagName;},
211        (LB::TagOpen, '?'                  ) => Err(EK::UnexpectedQuestionMarkInsteadOfTagName)?,
212        (LB::TagOpen, _                    ) => Err(EK::InvalidStartOfTagName)?,
213
214
215        (LB::TagName, '\t' | '\r' | '\n' | ' ') => {state.last_bite = LB::BeforeAttributeName;},
216        (LB::TagName, '/'                     ) => {state.last_bite = LB::SelfClosingStartTag;},
217        (LB::TagName, '\0'                    ) => Err(EK::UnexpectedNullCharacter)?,
218        (LB::TagName, '>'                     ) => {state.last_bite = LB::Done;},
219        (LB::TagName, _                       ) => {},
220
221
222        (LB::SelfClosingStartTag, '>') => {state.last_bite = LB::Done;},
223        (LB::SelfClosingStartTag, _  ) => Err(EK::UnexpectedSolidusInTag)?,
224
225
226        (LB::BeforeAttributeName, '\t' | '\r' | '\n' | ' ') => {},
227        (LB::BeforeAttributeName, '/' | '>'               ) => {state.last_bite = LB::AfterAttributeName; munch(state, i, c)?;},
228        (LB::BeforeAttributeName, '='                     ) => Err(EK::UnexpectedEqualsSignBeforeAttributeName)?,
229        (LB::BeforeAttributeName, _                       ) => {state.last_bite = LB::AttributeName; state.attr_name_start = i; munch(state, i, c)?;},
230
231
232        (LB::AttributeName, '\t' | '\r' | '\n' | ' ' | '/' | '>' ) => {state.last_bite = LB::AfterAttributeName  ; state.attr_name_end = i; if &state.input[state.attr_name_start..state.attr_name_end] == state.name {state.ret = Some(None);} else {state.ret = None;} munch(state, i, c)?;},
233        (LB::AttributeName, '='                                  ) => {state.last_bite = LB::BeforeAttributeValue; state.attr_name_end = i;},
234        (LB::AttributeName, '\0' | '"' | '\'' | '<'              ) => Err(EK::UnexpectedCharacterInAttributeName)?,
235        (LB::AttributeName, _                                    ) => {},
236
237
238        (LB::AfterAttributeName, '\t' | '\r' | '\n' | ' ') => {},
239        (LB::AfterAttributeName, '/'                     ) => {state.last_bite = LB::SelfClosingStartTag ;},
240        (LB::AfterAttributeName, '='                     ) => {state.last_bite = LB::BeforeAttributeValue;},
241        (LB::AfterAttributeName, '>'                     ) => {state.last_bite = LB::Done; if &state.input[state.attr_name_start..i] == state.name {state.ret = Some(None);}},
242        (LB::AfterAttributeName, _                       ) => {state.last_bite = LB::AttributeName; state.attr_name_start = i; munch(state, i, c)?;},
243
244
245        (LB::BeforeAttributeValue, '\t' | '\r' | '\n' | ' ') => {},
246        #[allow(clippy::arithmetic_side_effects, reason = "Can't happen.")]
247        (LB::BeforeAttributeValue, '"'                     ) => {state.last_bite = LB::AttributeValueDoubleQuoted; state.attr_value_start = i+1;},
248        #[allow(clippy::arithmetic_side_effects, reason = "Can't happen.")]
249        (LB::BeforeAttributeValue, '\''                    ) => {state.last_bite = LB::AttributeValueSingleQuoted; state.attr_value_start = i+1;},
250        (LB::BeforeAttributeValue, '>'                     ) => Err(EK::MissingAttributeValue)?,
251        (LB::BeforeAttributeValue, _                       ) => {state.last_bite = LB::AttributeValueUnquoted; state.attr_value_start = i; munch(state, i, c)?;},
252
253
254        (LB::AttributeValueDoubleQuoted, '"' ) => {state.last_bite = LB::AfterAttributeValueQuoted; if &state.input[state.attr_name_start..state.attr_name_end] == state.name {state.ret = Some(Some(&state.input[state.attr_value_start..i]));}},
255        (LB::AttributeValueDoubleQuoted, '&' ) => {}, // Processed later.
256        (LB::AttributeValueDoubleQuoted, '\0') => Err(EK::UnexpectedNullCharacter)?,
257        (LB::AttributeValueDoubleQuoted, _   ) => {},
258
259
260        (LB::AttributeValueSingleQuoted, '\'') => {state.last_bite = LB::AfterAttributeValueQuoted; if &state.input[state.attr_name_start..state.attr_name_end] == state.name {state.ret = Some(Some(&state.input[state.attr_value_start..i]));}},
261        (LB::AttributeValueSingleQuoted, '&' ) => {}, // Processed later.
262        (LB::AttributeValueSingleQuoted, '\0') => Err(EK::UnexpectedNullCharacter)?,
263        (LB::AttributeValueSingleQuoted, _   ) => {},
264
265
266        (LB::AttributeValueUnquoted, '\t' | '\r' | '\n' | ' ') => {state.last_bite = LB::BeforeAttributeName; if &state.input[state.attr_name_start..state.attr_name_end] == state.name {state.ret = Some(Some(&state.input[state.attr_value_start..i]));}},
267        (LB::AttributeValueUnquoted, '&'                     ) => {}, // Processed later.
268        (LB::AttributeValueUnquoted, '>'                     ) => {state.last_bite = LB::Done; if &state.input[state.attr_name_start..state.attr_name_end] == state.name {state.ret = Some(Some(&state.input[state.attr_value_start..i]));}},
269        (LB::AttributeValueUnquoted, '\0'                    ) => Err(EK::UnexpectedNullCharacter)?,
270        (LB::AttributeValueUnquoted, _                       ) => {},
271
272
273        (LB::AfterAttributeValueQuoted, '\t' | '\r' | '\n' | ' ') => {state.last_bite = LB::BeforeAttributeName;},
274        (LB::AfterAttributeValueQuoted, '/'                     ) => {state.last_bite = LB::SelfClosingStartTag;},
275        (LB::AfterAttributeValueQuoted, '>'                     ) => {state.last_bite = LB::Done;},
276        (LB::AfterAttributeValueQuoted, _                       ) => Err(EK::MissingWhitespaceBetweenAttributes)?,
277
278
279        (LB::Done, _) => panic!("Logic error.")
280    }
281
282    Ok(())
283}