use regex::Regex;
use std::io::Read;
#[cfg(feature = "discover")]
pub fn html_search(
body: &[u8],
matching_pattern: &str,
base_url: &str,
) -> Box<dyn Iterator<Item = (String, String)>> {
let html = String::from_utf8_lossy(body);
let doc = scraper::Html::parse_document(&html);
let base_selector = scraper::Selector::parse("base").unwrap();
let effective_base_url = doc
.select(&base_selector)
.filter_map(|element| element.value().attr("href"))
.next()
.unwrap_or(base_url);
let base_url_parsed = match url::Url::parse(effective_base_url) {
Ok(u) => u,
Err(_) => return Box::new(std::iter::empty()),
};
let selector = scraper::Selector::parse("a").unwrap();
let re = match Regex::new(matching_pattern) {
Ok(r) => r,
Err(_) => return Box::new(std::iter::empty()),
};
let results: Vec<(String, String)> = doc
.select(&selector)
.filter_map(move |element| {
let href = element.value().attr("href")?;
if let Some(captures) = re.captures(href) {
if let Some(version_match) = captures.get(1) {
let version = version_match.as_str().to_string();
let full_url = match base_url_parsed.join(href) {
Ok(url) => url.to_string(),
Err(_) => return None,
};
Some((version, full_url))
} else {
None
}
} else {
None
}
})
.collect();
Box::new(results.into_iter())
}
pub fn plain_search(
body: &[u8],
matching_pattern: &str,
base_url: &str,
) -> Box<dyn Iterator<Item = (String, String)>> {
let re = match Regex::new(matching_pattern) {
Ok(r) => r,
Err(_) => return Box::new(std::iter::empty()),
};
let text = String::from_utf8_lossy(body);
let base_url_parsed = match url::Url::parse(base_url) {
Ok(u) => u,
Err(_) => return Box::new(std::iter::empty()),
};
let results: Vec<(String, String)> = re
.captures_iter(&text)
.filter_map(|captures| {
if let Some(version_match) = captures.get(1) {
let version = version_match.as_str().to_string();
let matched = captures.get(0).unwrap().as_str();
let full_url = if matched.starts_with("http://") || matched.starts_with("https://")
{
matched.to_string()
} else {
match base_url_parsed.join(matched) {
Ok(url) => url.to_string(),
Err(_) => return None,
}
};
Some((version, full_url))
} else {
None
}
})
.collect();
Box::new(results.into_iter())
}
pub fn search<R: Read>(
searchmode: &str,
mut resp: R,
matching_pattern: &str,
_package: &str,
url: &str,
) -> Result<Box<dyn Iterator<Item = (String, String)>>, std::io::Error> {
let mut body = Vec::new();
resp.read_to_end(&mut body)?;
let iter: Box<dyn Iterator<Item = (String, String)>> = match searchmode {
#[cfg(feature = "discover")]
"html" => html_search(&body, matching_pattern, url),
"plain" => plain_search(&body, matching_pattern, url),
_ => Box::new(std::iter::empty()),
};
Ok(iter)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
#[cfg(feature = "discover")]
fn test_html_search() {
let html = b"<html><body><a href='project-1.0.tar.gz'>v1.0</a><a href='project-2.0.tar.gz'>v2.0</a></body></html>";
let results: Vec<_> =
html_search(html, r"project-(\d+\.\d+)\.tar\.gz", "https://example.com/").collect();
assert_eq!(results.len(), 2);
assert!(results.iter().any(|(v, _)| v == "1.0"));
assert!(results.iter().any(|(v, _)| v == "2.0"));
}
#[test]
#[cfg(feature = "discover")]
fn test_html_search_absolute_urls() {
let html = b"<html><head><base href='https://curl.se'></head><body><a href='download/curl-8.14.0.tar.gz'>curl</a></body></html>";
let results: Vec<_> = html_search(
html,
r"download/curl-([\d.]+)\.tar\.gz",
"https://curl.se/download/",
)
.collect();
assert_eq!(results.len(), 1);
assert_eq!(results[0].0, "8.14.0");
assert_eq!(results[0].1, "https://curl.se/download/curl-8.14.0.tar.gz");
}
#[test]
#[cfg(feature = "discover")]
fn test_html_search_absolute_urls_with_slash_prefix() {
let html = b"<html><body><a href='/download/curl-8.14.0.tar.gz'>curl</a></body></html>";
let results: Vec<_> = html_search(
html,
r"download/curl-([\d.]+)\.tar\.gz",
"https://curl.se/download/",
)
.collect();
assert_eq!(results.len(), 1);
assert_eq!(results[0].0, "8.14.0");
assert_eq!(results[0].1, "https://curl.se/download/curl-8.14.0.tar.gz");
}
#[test]
#[cfg(feature = "discover")]
fn test_html_search_with_absolute_href() {
let html = b"<html><body><a href='https://example.org/files/project-3.5.0.tar.gz'>v3.5.0</a></body></html>";
let results: Vec<_> = html_search(
html,
r"https://example\.org/files/project-([\d.]+)\.tar\.gz",
"https://example.com/",
)
.collect();
assert_eq!(results.len(), 1);
assert_eq!(results[0].0, "3.5.0");
assert_eq!(
results[0].1,
"https://example.org/files/project-3.5.0.tar.gz"
);
}
#[test]
fn test_plain_search() {
let text = b"Available: project-1.0.tar.gz project-2.0.tar.gz";
let results: Vec<_> =
plain_search(text, r"project-(\d+\.\d+)\.tar\.gz", "https://example.com/").collect();
assert_eq!(results.len(), 2);
assert!(results.iter().any(|(v, _)| v == "1.0"));
assert!(results.iter().any(|(v, _)| v == "2.0"));
}
#[test]
fn test_plain_search_absolute_urls() {
let text = b"Available: curl-8.14.0.tar.gz";
let results: Vec<_> =
plain_search(text, r"curl-([\d.]+)\.tar\.gz", "https://curl.se/download/").collect();
assert_eq!(results.len(), 1);
assert_eq!(results[0].0, "8.14.0");
assert_eq!(results[0].1, "https://curl.se/download/curl-8.14.0.tar.gz");
}
#[test]
fn test_plain_search_with_absolute_urls() {
let text = b"Available: https://example.org/files/project-3.5.0.tar.gz";
let results: Vec<_> = plain_search(
text,
r"https://example\.org/files/project-([\d.]+)\.tar\.gz",
"https://example.com/",
)
.collect();
assert_eq!(results.len(), 1);
assert_eq!(results[0].0, "3.5.0");
assert_eq!(
results[0].1,
"https://example.org/files/project-3.5.0.tar.gz"
);
}
}