url_cleaner_engine/glue/parse/html/get_attribute.rs
//! Gets attributes from HTML elements.
2
3use thiserror::Error;
4use serde::{Serialize, Deserialize};
5
6use super::*;
7use crate::util::*;
8
9/// The enum of errors that can be encountered when failing to parse an HTML element.
10#[derive(Debug, Error, Clone, Copy)]
11pub enum GAVSyntaxErrorKind {
12 /// The [input-doesnt-start-with-html-element](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
13 #[error("Input doesn't start with an HTML element.")]
14 InputDoesntStartWithHtmlElement,
15 /// The [unexpected-question-mark-instead-of-tagname](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
16 #[error("Unexpected question mark instead of tag name.")]
17 UnexpectedQuestionMarkInsteadOfTagName,
18 /// The [invalid-start-of-tag-name](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
19 #[error("Invalid start of tag name.")]
20 InvalidStartOfTagName,
21 /// The [unexpected-null-character](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
22 #[error("Unexpected null character.")]
23 UnexpectedNullCharacter,
24 /// The [unexpected-solidus-in-tag](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
25 #[error("Unexpected solidus in tag.")]
26 UnexpectedSolidusInTag,
27 /// The [unexpected-equals-sign-before-attribute-name](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
28 #[error("Unexpected equals sign before attribute name.")]
29 UnexpectedEqualsSignBeforeAttributeName,
30 /// The [unexpected-character-in-attribute-name](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
31 #[error("Unexpected character in attribute name.")]
32 UnexpectedCharacterInAttributeName,
33 /// The [missing-attribute-value](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
34 #[error("Missing attribute value.")]
35 MissingAttributeValue,
36 /// The [missing-whitespace-between-attributes](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) error.
37 #[error("Missing whitespace between attributes.")]
38 MissingWhitespaceBetweenAttributes
39}
40
41/// The enum of errors [`get_attribute_value`] can return.
42#[derive(Debug, Error)]
43pub enum GAVError {
44 /// A syntax error.
45 #[error("Syntax error: {index}, {last_bite:?}, {kind:?}")]
46 Syntax {
47 /// The index of the input string the error happened.
48 index: usize,
49 /// The state the previous character put the DFA in.
50 last_bite: GAVLastBite,
51 /// The error kind.
52 kind: GAVSyntaxErrorKind
53 },
54 /// Returned when an [`UnescapeTextError`] is encountered.
55 #[error(transparent)]
56 UnescapeTextError(#[from] UnescapeTextError),
57 /// Returned when the HTML tag isn't finished.
58 #[error("The HTML tag wasn't finished.")]
59 UnfinishedTag
60}
61
62/// The states the DFA in [`get_attribute_value`] can be in.
63#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
64#[serde(deny_unknown_fields)]
65pub enum GAVLastBite {
66 /// The [Data](https://html.spec.whatwg.org/multipage/parsing.html#data-state) state.
67 Data,
68 /// The [Tag open](https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state) state.
69 TagOpen,
70 /// The [Tag name](https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state) state.
71 TagName,
72 /// The [Self-closing start tag](https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state) state.
73 SelfClosingStartTag,
74 /// The [Before attribute name](https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state) state.
75 BeforeAttributeName,
76 /// The [Attribute name](https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state) state.
77 AttributeName,
78 /// The [After attribute name](https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state) state.
79 AfterAttributeName,
80 /// The [Before attribute value](https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state) state.
81 BeforeAttributeValue,
82 /// The [Attribute value (double-quoted)](https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state) state.
83 AttributeValueDoubleQuoted,
84 /// The [Attribute value (single-quoted)](https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state) state.
85 AttributeValueSingleQuoted,
86 /// The [Attribute value (unquoted)](https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state) state.
87 AttributeValueUnquoted,
88 /// The [After attribute value (quoted)](https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state) state.
89 AfterAttributeValueQuoted,
90 /// The done state.
91 Done
92}
93
94/// The current state of the [`get_attribute_value`] DFA.
95#[derive(Debug)]
96struct GAVState<'a> {
97 /// The input.
98 input: &'a str,
99 /// The name of the attribute to search for.
100 name: &'a str,
101 /// The state the last bite put the DFA in.
102 last_bite: GAVLastBite,
103 /// The return value.
104 ret: Option<Option<&'a str>>,
105 /// The location of the start of the most recent attribute's name.
106 attr_name_start: usize,
107 /// The location of the end of the most recent attribute's name.
108 attr_name_end: usize,
109 /// The location of the start of the most recent attribute's value.
110 attr_value_start: usize
111}
112
113/// Shorthand.
114type LB = GAVLastBite;
115/// Shorthand.
116type EK = GAVSyntaxErrorKind;
117
118/// Take a string that starts with an HTML element and get the value of the last attribute with the specified name.
119///
120/// Currently recoverable errors (as defined by the spec) aren't recovered from. I should eventually make this do that.
121/// # Errors
122/// It's complicated, but TL;DR if the spec says an error happens (even if it can recover), that error is returned.
123///
124/// If the call to [`unescape_text`] returns an error, that error is returned.
125///
126/// If the input doesn't start with a complete HTML start tag, returns the error [`GAVError::UnfinishedTag`].
127/// # Examples
128/// ```
129/// use url_cleaner_engine::glue::*;
130///
131/// assert_eq!(parse::html::get_attribute_value("<a href='aaa'>" , "href").unwrap(), Some(Some("aaa" .to_string())));
132/// assert_eq!(parse::html::get_attribute_value("<a href='a"a'>" , "href").unwrap(), Some(Some("a\"a".to_string())));
133/// assert_eq!(parse::html::get_attribute_value("<a href=\"aaa\">" , "href").unwrap(), Some(Some("aaa" .to_string())));
134/// assert_eq!(parse::html::get_attribute_value("<a href=\"a"a\">", "href").unwrap(), Some(Some("a\"a".to_string())));
135/// assert_eq!(parse::html::get_attribute_value("<a href=aaa>" , "href").unwrap(), Some(Some("aaa" .to_string())));
136/// assert_eq!(parse::html::get_attribute_value("<a href=a"a>" , "href").unwrap(), Some(Some("a\"a".to_string())));
137///
138/// assert_eq!(parse::html::get_attribute_value("<a href='aaa' >", "href").unwrap(), Some(Some("aaa" .to_string())));
139/// assert_eq!(parse::html::get_attribute_value("<a href='a"a' >", "href").unwrap(), Some(Some("a\"a".to_string())));
140/// assert_eq!(parse::html::get_attribute_value("<a href=\"aaa\" >", "href").unwrap(), Some(Some("aaa" .to_string())));
141/// assert_eq!(parse::html::get_attribute_value("<a href=\"a"a\" >", "href").unwrap(), Some(Some("a\"a".to_string())));
142/// assert_eq!(parse::html::get_attribute_value("<a href=aaa >", "href").unwrap(), Some(Some("aaa" .to_string())));
143/// assert_eq!(parse::html::get_attribute_value("<a href=a"a >", "href").unwrap(), Some(Some("a\"a".to_string())));
144///
145/// assert_eq!(parse::html::get_attribute_value("<a href=b href='aaa' >", "href").unwrap(), Some(Some("aaa" .to_string())));
146/// assert_eq!(parse::html::get_attribute_value("<a href=b href='a"a' >", "href").unwrap(), Some(Some("a\"a".to_string())));
147/// assert_eq!(parse::html::get_attribute_value("<a href=b href=\"aaa\" >", "href").unwrap(), Some(Some("aaa" .to_string())));
148/// assert_eq!(parse::html::get_attribute_value("<a href=b href=\"a"a\" >", "href").unwrap(), Some(Some("a\"a".to_string())));
149/// assert_eq!(parse::html::get_attribute_value("<a href=b href=aaa >", "href").unwrap(), Some(Some("aaa" .to_string())));
150/// assert_eq!(parse::html::get_attribute_value("<a href=b href=a"a >", "href").unwrap(), Some(Some("a\"a".to_string())));
151///
152/// assert_eq!(parse::html::get_attribute_value("<a>", "href").unwrap(), None, "1");
153///
154/// assert_eq!(parse::html::get_attribute_value("<a href>" , "href").unwrap(), Some(None));
155///
156/// assert_eq!(parse::html::get_attribute_value("<a href href=\"1\">" , "href").unwrap(), Some(Some("1".to_string())));
157/// assert_eq!(parse::html::get_attribute_value("<a href href=\"1\" href>" , "href").unwrap(), Some(None));
158/// assert_eq!(parse::html::get_attribute_value("<a href href=\"1\" href href=\"2\">", "href").unwrap(), Some(Some("2".to_string())));
159///
160/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\" href>" , "href").unwrap(), Some(None));
161/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\" href href=\"2\">" , "href").unwrap(), Some(Some("2".to_string())));
162/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\" href href=\"2\" href>", "href").unwrap(), Some(None));
163///
164/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\">stuff" , "href").unwrap(), Some(Some("1".to_string())));
165/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\"><a href=\"2\">" , "href").unwrap(), Some(Some("1".to_string())));
166/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\">stuff<a href=\"2\">", "href").unwrap(), Some(Some("1".to_string())));
167/// assert_eq!(parse::html::get_attribute_value("<a href=\"1\">href=\"2\"" , "href").unwrap(), Some(Some("1".to_string())));
168/// ```
169pub fn get_attribute_value<'a>(input: &'a str, name: &'a str) -> Result<Option<Option<String>>, GAVError> {
170 debug!(parse::html::get_attribute_value, &(), input, name);
171
172 let mut state = GAVState {
173 input,
174 name,
175 last_bite: LB::Data,
176 ret: None,
177 attr_name_start: 0,
178 attr_name_end: 0,
179 attr_value_start: 0
180 };
181
182 for (i, c) in input.chars().enumerate() {
183 if let Err(e) = munch(&mut state, i, c) {
184 return Err(GAVError::Syntax {
185 index: i,
186 last_bite: state.last_bite,
187 kind: e
188 });
189 }
190 if matches!(state.last_bite, LB::Done) {
191 return Ok(match state.ret {
192 Some(Some(value)) => Some(Some(unescape_text(value)?)),
193 Some(None) => Some(None),
194 None => None
195 });
196 }
197 }
198
199 Err(GAVError::UnfinishedTag)
200}
201
202/// Advance the state of the [`get_attribute_value`] DFA.
203fn munch(state: &mut GAVState, i: usize, c: char) -> Result<(), GAVSyntaxErrorKind> {
204 debug!(parse::html::get_attribute::munch, &(), state, i, c);
205 match (state.last_bite, c) {
206 (LB::Data, '<') => {state.last_bite = LB::TagOpen;},
207 (LB::Data, _ ) => Err(EK::InputDoesntStartWithHtmlElement)?,
208
209
210 (LB::TagOpen, 'a'..='z' | 'A'..='Z') => {state.last_bite = LB::TagName;},
211 (LB::TagOpen, '?' ) => Err(EK::UnexpectedQuestionMarkInsteadOfTagName)?,
212 (LB::TagOpen, _ ) => Err(EK::InvalidStartOfTagName)?,
213
214
215 (LB::TagName, '\t' | '\r' | '\n' | ' ') => {state.last_bite = LB::BeforeAttributeName;},
216 (LB::TagName, '/' ) => {state.last_bite = LB::SelfClosingStartTag;},
217 (LB::TagName, '\0' ) => Err(EK::UnexpectedNullCharacter)?,
218 (LB::TagName, '>' ) => {state.last_bite = LB::Done;},
219 (LB::TagName, _ ) => {},
220
221
222 (LB::SelfClosingStartTag, '>') => {state.last_bite = LB::Done;},
223 (LB::SelfClosingStartTag, _ ) => Err(EK::UnexpectedSolidusInTag)?,
224
225
226 (LB::BeforeAttributeName, '\t' | '\r' | '\n' | ' ') => {},
227 (LB::BeforeAttributeName, '/' | '>' ) => {state.last_bite = LB::AfterAttributeName; munch(state, i, c)?;},
228 (LB::BeforeAttributeName, '=' ) => Err(EK::UnexpectedEqualsSignBeforeAttributeName)?,
229 (LB::BeforeAttributeName, _ ) => {state.last_bite = LB::AttributeName; state.attr_name_start = i; munch(state, i, c)?;},
230
231
232 (LB::AttributeName, '\t' | '\r' | '\n' | ' ' | '/' | '>' ) => {state.last_bite = LB::AfterAttributeName ; state.attr_name_end = i; if &state.input[state.attr_name_start..state.attr_name_end] == state.name {state.ret = Some(None);} else {state.ret = None;} munch(state, i, c)?;},
233 (LB::AttributeName, '=' ) => {state.last_bite = LB::BeforeAttributeValue; state.attr_name_end = i;},
234 (LB::AttributeName, '\0' | '"' | '\'' | '<' ) => Err(EK::UnexpectedCharacterInAttributeName)?,
235 (LB::AttributeName, _ ) => {},
236
237
238 (LB::AfterAttributeName, '\t' | '\r' | '\n' | ' ') => {},
239 (LB::AfterAttributeName, '/' ) => {state.last_bite = LB::SelfClosingStartTag ;},
240 (LB::AfterAttributeName, '=' ) => {state.last_bite = LB::BeforeAttributeValue;},
241 (LB::AfterAttributeName, '>' ) => {state.last_bite = LB::Done; if &state.input[state.attr_name_start..i] == state.name {state.ret = Some(None);}},
242 (LB::AfterAttributeName, _ ) => {state.last_bite = LB::AttributeName; state.attr_name_start = i; munch(state, i, c)?;},
243
244
245 (LB::BeforeAttributeValue, '\t' | '\r' | '\n' | ' ') => {},
246 #[allow(clippy::arithmetic_side_effects, reason = "Can't happen.")]
247 (LB::BeforeAttributeValue, '"' ) => {state.last_bite = LB::AttributeValueDoubleQuoted; state.attr_value_start = i+1;},
248 #[allow(clippy::arithmetic_side_effects, reason = "Can't happen.")]
249 (LB::BeforeAttributeValue, '\'' ) => {state.last_bite = LB::AttributeValueSingleQuoted; state.attr_value_start = i+1;},
250 (LB::BeforeAttributeValue, '>' ) => Err(EK::MissingAttributeValue)?,
251 (LB::BeforeAttributeValue, _ ) => {state.last_bite = LB::AttributeValueUnquoted; state.attr_value_start = i; munch(state, i, c)?;},
252
253
254 (LB::AttributeValueDoubleQuoted, '"' ) => {state.last_bite = LB::AfterAttributeValueQuoted; if &state.input[state.attr_name_start..state.attr_name_end] == state.name {state.ret = Some(Some(&state.input[state.attr_value_start..i]));}},
255 (LB::AttributeValueDoubleQuoted, '&' ) => {}, // Processed later.
256 (LB::AttributeValueDoubleQuoted, '\0') => Err(EK::UnexpectedNullCharacter)?,
257 (LB::AttributeValueDoubleQuoted, _ ) => {},
258
259
260 (LB::AttributeValueSingleQuoted, '\'') => {state.last_bite = LB::AfterAttributeValueQuoted; if &state.input[state.attr_name_start..state.attr_name_end] == state.name {state.ret = Some(Some(&state.input[state.attr_value_start..i]));}},
261 (LB::AttributeValueSingleQuoted, '&' ) => {}, // Processed later.
262 (LB::AttributeValueSingleQuoted, '\0') => Err(EK::UnexpectedNullCharacter)?,
263 (LB::AttributeValueSingleQuoted, _ ) => {},
264
265
266 (LB::AttributeValueUnquoted, '\t' | '\r' | '\n' | ' ') => {state.last_bite = LB::BeforeAttributeName; if &state.input[state.attr_name_start..state.attr_name_end] == state.name {state.ret = Some(Some(&state.input[state.attr_value_start..i]));}},
267 (LB::AttributeValueUnquoted, '&' ) => {}, // Processed later.
268 (LB::AttributeValueUnquoted, '>' ) => {state.last_bite = LB::Done; if &state.input[state.attr_name_start..state.attr_name_end] == state.name {state.ret = Some(Some(&state.input[state.attr_value_start..i]));}},
269 (LB::AttributeValueUnquoted, '\0' ) => Err(EK::UnexpectedNullCharacter)?,
270 (LB::AttributeValueUnquoted, _ ) => {},
271
272
273 (LB::AfterAttributeValueQuoted, '\t' | '\r' | '\n' | ' ') => {state.last_bite = LB::BeforeAttributeName;},
274 (LB::AfterAttributeValueQuoted, '/' ) => {state.last_bite = LB::SelfClosingStartTag;},
275 (LB::AfterAttributeValueQuoted, '>' ) => {state.last_bite = LB::Done;},
276 (LB::AfterAttributeValueQuoted, _ ) => Err(EK::MissingWhitespaceBetweenAttributes)?,
277
278
279 (LB::Done, _) => panic!("Logic error.")
280 }
281
282 Ok(())
283}