use crate::types::{
FileType, InputContent,
uri::raw::{RawUri, SourceSpanProvider},
};
pub mod css;
pub mod html;
pub mod markdown;
mod plaintext;
use css::extract_css;
use markdown::extract_markdown;
use plaintext::extract_raw_uri_from_plaintext;
/// Extracts raw URIs from input content, dispatching on the content's file type.
///
/// The three flags are fixed at construction time (see [`Extractor::new`]) and
/// are forwarded to the format-specific extractors by [`Extractor::extract`].
#[derive(Default, Debug, Clone, Copy)]
pub struct Extractor {
/// When `true`, HTML input is handled by `html::html5ever`; otherwise by
/// `html::html5gum`.
use_html5ever: bool,
/// Forwarded to the Markdown and HTML extractors. Presumably controls whether
/// links inside verbatim sections (e.g. `<pre>`) are reported — confirm in
/// those modules.
include_verbatim: bool,
/// Forwarded to the Markdown extractor only.
include_wikilinks: bool,
}
impl Extractor {
#[must_use]
pub const fn new(use_html5ever: bool, include_verbatim: bool, include_wikilinks: bool) -> Self {
Self {
use_html5ever,
include_verbatim,
include_wikilinks,
}
}
#[must_use]
pub fn extract(&self, input_content: &InputContent) -> Vec<RawUri> {
match input_content.file_type {
FileType::Markdown => extract_markdown(
&input_content.content,
self.include_verbatim,
self.include_wikilinks,
),
FileType::Html => {
if self.use_html5ever {
html::html5ever::extract_html(&input_content.content, self.include_verbatim)
} else {
html::html5gum::extract_html(&input_content.content, self.include_verbatim)
}
}
FileType::Css => extract_css(
&input_content.content,
&SourceSpanProvider::from_input(&input_content.content),
),
FileType::Plaintext => extract_raw_uri_from_plaintext(
&input_content.content,
&SourceSpanProvider::from_input(&input_content.content),
),
}
}
}
#[cfg(test)]
mod tests {
use pretty_assertions::assert_eq;
use reqwest::Url;
use std::{collections::HashSet, path::Path};
use test_utils::{fixtures_path, load_fixture, mail, website};
use super::*;
use crate::{
Uri,
types::{
FileType, InputContent, ResolvedInputSource,
uri::raw::{RawUriSpan, span},
},
utils::url::find_links,
};
/// Extracts the URIs of `input` once per HTML backend and checks that both
/// runs agree.
///
/// Raw URIs that cannot be converted with `Uri::try_from` are dropped. The
/// html5gum result is returned. For file types other than HTML the backend flag
/// is not consulted by `Extractor::extract`, so both runs use the same code.
///
/// # Panics
///
/// Panics when the html5gum and html5ever results differ.
fn extract_uris(input: &str, file_type: FileType) -> HashSet<Uri> {
    let input_content = InputContent::from_string(input, file_type);

    // One extraction pass with the requested HTML backend.
    let collect_uris = |use_html5ever: bool| -> HashSet<Uri> {
        Extractor::new(use_html5ever, false, false)
            .extract(&input_content)
            .into_iter()
            .filter_map(|raw_uri| Uri::try_from(raw_uri).ok())
            .collect()
    };
    // Sorted copy, so a mismatch is reported in a stable order.
    let sorted = |uris: &HashSet<Uri>| -> Vec<Uri> {
        let mut ordered: Vec<Uri> = uris.iter().cloned().collect();
        ordered.sort();
        ordered
    };

    let from_html5gum = collect_uris(false);
    let from_html5ever = collect_uris(true);
    assert_eq!(
        sorted(&from_html5gum),
        sorted(&from_html5ever),
        "Mismatch between html5gum and html5ever"
    );
    from_html5gum
}
/// A URL inside `<pre>` yields no URIs when `include_verbatim` is off, which is
/// how `extract_uris` configures the extractor.
#[test]
fn verbatim_elem() {
    let found = extract_uris("<pre>https://example.com</pre>", FileType::Html);
    assert!(found.is_empty());
}
/// `FileType::from` maps paths to file types; anything not recognised as
/// Markdown or HTML here falls back to plaintext.
#[test]
fn test_file_type() {
    // The only case that goes through a `Path` rather than a string.
    assert_eq!(FileType::from(Path::new("/")), FileType::Plaintext);

    let by_name = [
        ("test.md", FileType::Markdown),
        ("test.markdown", FileType::Markdown),
        ("test.html", FileType::Html),
        ("test.txt", FileType::Plaintext),
        ("test.something", FileType::Plaintext),
        ("/absolute/path/to/test.something", FileType::Plaintext),
    ];
    for (name, expected) in by_name {
        assert_eq!(FileType::from(name), expected, "input path: {}", name);
    }
}
/// A Markdown link that is only a fragment (`#lol`) produces no URI.
#[test]
fn test_skip_markdown_anchors() {
    let found = extract_uris("This is [a test](#lol).", FileType::Markdown);
    assert!(found.is_empty());
}
/// A Markdown link to a `./`-relative target produces no URI.
#[test]
fn test_skip_markdown_internal_urls() {
    let found = extract_uris("This is [a test](./internal).", FileType::Markdown);
    assert!(found.is_empty());
}
/// A `mailto:` link in Markdown is extracted as a mail URI.
///
/// Despite the "skip" in the name, the address is expected in the result.
#[test]
fn test_skip_markdown_email() {
    let markdown = "Get in touch - [Contact Us](mailto:test@test.com)";
    let found = extract_uris(markdown, FileType::Markdown);
    let expected: HashSet<Uri> = HashSet::from([mail!("test@test.com")]);
    assert_eq!(found, expected);
}
/// A Markdown link to a root-relative target (`/internal`) produces no URI.
#[test]
fn relative_urls() {
    let found = extract_uris("This is [a test](/internal).", FileType::Markdown);
    assert!(found.is_empty());
}
/// Plaintext input yields both bare web URLs and a bare e-mail address.
#[test]
fn test_non_markdown_links() {
    let text = "https://endler.dev and https://hello-rust.show/foo/bar?lol=1 at test@example.com";
    let found: HashSet<Uri> = extract_uris(text, FileType::Plaintext);
    let expected: HashSet<Uri> = HashSet::from([
        website!("https://endler.dev"),
        website!("https://hello-rust.show/foo/bar?lol=1"),
        mail!("test@example.com"),
    ]);
    assert_eq!(found, expected);
}
/// Runs `find_links` on a URL whose parentheses are backslash-escaped.
///
/// NOTE(review): the value produced by `matches!` below is discarded, so this
/// test currently asserts nothing; it can only fail if `find_links` or the
/// collection panics. Wrapping the expression in `assert!` would make the check
/// effective, but confirm `expected` first: it ends with a `)` that has no
/// counterpart at the end of `input`, and it omits the backslashes.
#[test]
fn test_md_escape() {
let input = r"http://msdn.microsoft.com/library/ie/ms535874\(v=vs.85\).aspx";
let links: Vec<_> = find_links(input).collect();
let expected = "http://msdn.microsoft.com/library/ie/ms535874(v=vs.85).aspx)";
// NOTE(review): result unused — see the note on the function.
matches!(&links[..], [link] if link.as_str() == expected);
}
/// Links are extracted from the `TEST_HTML5.html` fixture by both HTML
/// backends.
#[test]
fn test_extract_html5_not_valid_xml() {
    let html = load_fixture!("TEST_HTML5.html");
    let found = extract_uris(&html, FileType::Html);
    let expected: HashSet<Uri> = HashSet::from([
        website!("https://example.com/head/home"),
        website!("https://example.com/css/style_full_url.css"),
        website!("https://example.com/body/a"),
        website!("https://example.com/body/div_empty_a"),
    ]);
    assert_eq!(found, expected);
}
/// Both HTML backends report link targets exactly as written: the relative
/// `/about` comes back unchanged even though the input has a remote source URL.
#[test]
fn test_extract_relative_url() {
    let html = r#"<html>
<div class="row">
<a href="https://github.com/lycheeverse/lychee/">GitHub</a>
<a href="/about">About</a>
</div>
</html>"#;
    let remote = Url::parse("https://example.com/some-post").unwrap();
    let input_content = InputContent {
        source: ResolvedInputSource::RemoteUrl(Box::new(remote)),
        file_type: FileType::Html,
        content: html.to_string(),
    };
    // Same expectation for both backends, so build it once.
    let expected_urls = HashSet::from([
        String::from("https://github.com/lycheeverse/lychee/"),
        String::from("/about"),
    ]);

    for use_html5ever in [true, false] {
        let urls: HashSet<_> = Extractor::new(use_html5ever, false, false)
            .extract(&input_content)
            .into_iter()
            .map(|raw_uri| raw_uri.text)
            .collect();
        assert_eq!(urls, expected_urls);
    }
}
/// The `TEST_HTML5_LOWERCASE_DOCTYPE.html` fixture yields its single body link.
#[test]
fn test_extract_html5_lowercase_doctype() {
    let html = load_fixture!("TEST_HTML5_LOWERCASE_DOCTYPE.html");
    let found = extract_uris(&html, FileType::Html);
    let expected: HashSet<Uri> = HashSet::from([website!("https://example.com/body/a")]);
    assert_eq!(found, expected);
}
/// Links are extracted from the `TEST_HTML5_MINIFIED.html` fixture.
#[test]
fn test_extract_html5_minified() {
    let html = load_fixture!("TEST_HTML5_MINIFIED.html");
    let found = extract_uris(&html, FileType::Html);
    let expected: HashSet<Uri> = HashSet::from([
        website!("https://example.com/"),
        website!("https://example.com/favicon.ico"),
        website!("https://example.com/docs/"),
        website!("https://example.com/forum"),
    ]);
    assert_eq!(found, expected);
}
/// Only the valid link of the `TEST_HTML5_MALFORMED_LINKS.html` fixture
/// survives extraction and URI conversion.
#[test]
fn test_extract_html5_malformed() {
    let html = load_fixture!("TEST_HTML5_MALFORMED_LINKS.html");
    let found = extract_uris(&html, FileType::Html);
    let expected: HashSet<Uri> = HashSet::from([website!("https://example.com/valid")]);
    assert_eq!(found, expected);
}
/// Links are extracted from the `TEST_HTML5_CUSTOM_ELEMENTS.html` fixture.
#[test]
fn test_extract_html5_custom_elements() {
    let html = load_fixture!("TEST_HTML5_CUSTOM_ELEMENTS.html");
    let found = extract_uris(&html, FileType::Html);
    let expected: HashSet<Uri> = HashSet::from([
        website!("https://example.com/some-weird-element"),
        website!("https://example.com/even-weirder-src"),
        website!("https://example.com/even-weirder-href"),
        website!("https://example.com/citations"),
    ]);
    assert_eq!(found, expected);
}
/// An `@` inside a URL path does not split the URL or produce a mail address:
/// only the two web URLs are expected.
#[test]
fn test_extract_urls_with_at_sign_properly() {
    let text = "https://example.com/@test/test http://otherdomain.com/test/@test";
    let found = extract_uris(text, FileType::Plaintext);
    let expected: HashSet<Uri> = HashSet::from([
        website!("https://example.com/@test/test"),
        website!("http://otherdomain.com/test/@test"),
    ]);
    assert_eq!(found, expected);
}
/// A URL directly followed by a newline is extracted without the newline.
#[test]
fn test_extract_link_at_end_of_line() {
    let text = "https://www.apache.org/licenses/LICENSE-2.0\n";
    let found = extract_uris(text, FileType::Plaintext);
    let expected: HashSet<Uri> =
        HashSet::from([website!("https://www.apache.org/licenses/LICENSE-2.0")]);
    assert_eq!(found, expected);
}
/// A `url(...)` inside an HTML `<style>` element is reported by the html5gum
/// backend as a raw URI with element `style` and attribute `url`.
#[test]
fn test_extract_css_from_style_tag() {
    // NOTE: the expected span presumably encodes a position inside this
    // literal, so its whitespace is significant — verify before reformatting.
    let html = r#"<html>
<head>
<style>
div {
background-image: url("./lychee.png");
}
</style>
</head>
</html>"#;
    let found = Extractor::new(false, false, false)
        .extract(&InputContent::from_string(html, FileType::Html));
    let expected = vec![css_url("./lychee.png", span(5, 32))];
    assert_eq!(found, expected);
}
/// Quoted and unquoted `url(...)` references, including one in an `@import`,
/// are reported in document order for CSS input.
#[test]
fn test_extract_css_from_css_file() {
    // NOTE: the expected spans presumably encode positions inside this
    // literal, so its whitespace is significant — verify before reformatting.
    let stylesheet = r#"
.example {
background-image: url("./image.png");
background: url('/absolute/path.jpg');
}
@import url(https://example.com/style.css);
"#;
    let found = Extractor::new(false, false, false)
        .extract(&InputContent::from_string(stylesheet, FileType::Css));
    let expected = vec![
        css_url("./image.png", span(3, 23)),
        css_url("/absolute/path.jpg", span(4, 17)),
        css_url("https://example.com/style.css", span(6, 9)),
    ];
    assert_eq!(found, expected);
}
/// Several `url(...)` references inside one `<style>` element are all
/// reported, in document order, by the html5gum backend.
#[test]
fn test_extract_multiple_css_urls_from_style_tag() {
    // NOTE: the expected spans presumably encode positions inside this
    // literal, so its whitespace is significant — verify before reformatting.
    let html = r#"<html>
<head>
<style>
.background {
background-image: url("./bg.png");
}
@font-face {
src: url(../fonts/font.woff2);
}
</style>
</head>
</html>"#;
    let found = Extractor::new(false, false, false)
        .extract(&InputContent::from_string(html, FileType::Html));
    let expected = vec![
        css_url("./bg.png", span(5, 32)),
        css_url("../fonts/font.woff2", span(8, 19)),
    ];
    assert_eq!(found, expected);
}
/// Builds the `RawUri` the CSS tests expect: the given text and span, tagged
/// with element `style` and attribute `url`.
fn css_url(text: &str, span: RawUriSpan) -> RawUri {
    let element = Some("style".into());
    let attribute = Some("url".into());
    RawUri {
        text: text.into(),
        element,
        attribute,
        span,
    }
}
}