use html5ever::{
buffer_queue::BufferQueue,
tendril::StrTendril,
tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
};
use super::{is_verbatim_elem, plaintext::extract_plaintext};
use crate::types::uri::raw::RawUri;
#[derive(Clone, Default)]
struct LinkExtractor {
links: Vec<RawUri>,
include_verbatim: bool,
inside_excluded_element: bool,
}
impl TokenSink for LinkExtractor {
type Handle = ();
#[allow(clippy::match_same_arms)]
fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
match token {
Token::CharacterTokens(raw) => {
if self.inside_excluded_element {
return TokenSinkResult::Continue;
}
self.links.extend(extract_plaintext(&raw));
}
Token::TagToken(tag) => {
let Tag {
kind,
name,
self_closing: _self_closing,
attrs,
} = tag;
if !self.include_verbatim && is_verbatim_elem(&name) {
self.inside_excluded_element = matches!(kind, TagKind::StartTag);
return TokenSinkResult::Continue;
}
if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") {
if rel.value.contains("nofollow") {
return TokenSinkResult::Continue;
}
}
for attr in attrs {
let urls = LinkExtractor::extract_urls_from_elem_attr(
&attr.name.local,
&name,
&attr.value,
);
let new_urls = match urls {
None => extract_plaintext(&attr.value),
Some(urls) => urls
.into_iter()
.map(|url| RawUri {
text: url.to_string(),
element: Some(name.to_string()),
attribute: Some(attr.name.local.to_string()),
})
.collect::<Vec<_>>(),
};
self.links.extend(new_urls);
}
}
Token::ParseError(_err) => {
}
Token::CommentToken(_raw) => (),
Token::NullCharacterToken => (),
Token::DoctypeToken(_doctype) => (),
Token::EOFToken => (),
}
TokenSinkResult::Continue
}
}
impl LinkExtractor {
pub(crate) const fn new(include_verbatim: bool) -> Self {
Self {
links: vec![],
include_verbatim,
inside_excluded_element: false,
}
}
#[allow(clippy::unnested_or_patterns)]
pub(crate) fn extract_urls_from_elem_attr<'a>(
attr_name: &str,
elem_name: &str,
attr_value: &'a str,
) -> Option<impl Iterator<Item = &'a str>> {
match (elem_name, attr_name) {
(_, "href" | "src" | "cite" | "usemap")
| ("applet", "codebase")
| ("body", "background")
| ("button", "formaction")
| ("command", "icon")
| ("form", "action")
| ("frame", "longdesc")
| ("head", "profile")
| ("html", "manifest")
| ("iframe", "longdesc")
| ("img", "longdesc")
| ("input", "formaction")
| ("object", "classid")
| ("object", "codebase")
| ("object", "data")
| ("video", "poster") => {
Some(vec![attr_value].into_iter())
}
(_, "srcset") => {
let mut urls = Vec::new();
for image_candidate_string in attr_value.trim().split(',') {
for part in image_candidate_string.split_ascii_whitespace() {
if part.is_empty() {
continue;
}
urls.push(part);
break;
}
}
Some(urls.into_iter())
}
_ => None,
}
}
}
/// Tokenizes the HTML document in `buf` and returns every link candidate the
/// `LinkExtractor` sink collected.
///
/// `include_verbatim` is forwarded to the sink and controls whether text
/// inside verbatim elements is scanned for links.
pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
    let sink = LinkExtractor::new(include_verbatim);
    let mut tokenizer = Tokenizer::new(sink, TokenizerOpts::default());

    // The whole document is queued as a single chunk.
    let mut queue = BufferQueue::new();
    queue.push_back(StrTendril::from(buf));

    // The sink always asks the tokenizer to continue, so the result of
    // `feed` carries no information here.
    let _result = tokenizer.feed(&mut queue);
    tokenizer.end();

    tokenizer.sink.links
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture with one link in an `href`, one inside `<code>` and two inside
    /// `<pre>`.
    const HTML_INPUT: &str = r#"
<html>
<body>
<p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
<pre>
Some random text
https://foo.com and http://bar.com/some/path
Something else
</pre>
<p><b>bold</b></p>
</body>
</html>"#;

    /// Expected `RawUri` for a link found in character data.
    fn text_link(text: &str) -> RawUri {
        RawUri {
            text: text.to_string(),
            element: None,
            attribute: None,
        }
    }

    /// Expected `RawUri` for a link found in `attribute` of `element`.
    fn attr_link(text: &str, element: &str, attribute: &str) -> RawUri {
        RawUri {
            text: text.to_string(),
            element: Some(element.to_string()),
            attribute: Some(attribute.to_string()),
        }
    }

    /// With verbatim elements excluded, only the `href` link is reported.
    #[test]
    fn test_skip_verbatim() {
        let uris = extract_html(HTML_INPUT, false);
        let expected = vec![attr_link("https://example.org", "a", "href")];
        assert_eq!(uris, expected);
    }

    /// With verbatim elements included, links from `<code>` and `<pre>` are
    /// reported as well.
    #[test]
    fn test_include_verbatim() {
        let uris = extract_html(HTML_INPUT, true);
        let expected = vec![
            text_link("https://example.com"),
            attr_link("https://example.org", "a", "href"),
            text_link("https://foo.com"),
            text_link("http://bar.com/some/path"),
        ];
        assert_eq!(uris, expected);
    }

    /// Despite its name, this checks that anchors whose `rel` mentions
    /// `nofollow` are left out; only the anchor without `rel` is reported.
    #[test]
    fn test_include_nofollow() {
        let input = r#"
<a rel="nofollow" href="https://foo.com">do not follow me</a>
<a rel="canonical,nofollow,dns-prefetch" href="https://example.com">do not follow me</a>
<a href="https://example.org">do not follow me</a>
"#;
        let uris = extract_html(input, false);
        let expected = vec![attr_link("https://example.org", "a", "href")];
        assert_eq!(uris, expected);
    }
}