use std::env;
use std::error::Error;
use std::fmt;
use std::fs;
use std::io::{self, Write};
use std::path::{Path, PathBuf};
use std::time::Duration;
use chrono::{SecondsFormat, Utc};
use encoding_rs::Encoding;
use markup5ever_rcdom::RcDom;
use reqwest::blocking::Client;
use reqwest::header::{HeaderMap, HeaderValue, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use url::Url;
use crate::cache::Cache;
use crate::cookies::Cookie;
use crate::html::{
add_favicon, create_metadata_tag, get_base_url, get_charset, get_robots, get_title,
has_favicon, html_to_dom, serialize_document, set_base_url, set_charset, set_robots,
walk_and_embed_assets,
};
use crate::url::{clean_url, create_data_url, get_referer_url, parse_data_url, resolve_url};
/// Error type returned by the document-building functions in this module.
///
/// It carries nothing but a human-readable message, which is what both
/// `Display` and the legacy `Error::description` expose.
#[derive(Debug)]
pub struct MonolithError {
    details: String,
}

impl MonolithError {
    /// Creates an error holding an owned copy of `msg`.
    fn new(msg: &str) -> MonolithError {
        Self {
            details: String::from(msg),
        }
    }
}

impl fmt::Display for MonolithError {
    /// Writes the stored message verbatim.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.details)
    }
}

impl Error for MonolithError {
    /// Returns the stored message.
    fn description(&self) -> &str {
        self.details.as_str()
    }
}
/// Output formats a monolithic document can be produced in.
#[derive(Debug, PartialEq, Eq, Default)]
pub enum MonolithOutputFormat {
    /// Serialized HTML; currently the only variant, and the default.
    #[default]
    HTML,
}
/// Settings that control how a document is retrieved, processed and serialized.
///
/// Fields marked "not read in this file" are only passed along (as part of the
/// whole struct) to code defined elsewhere; their exact effect is not visible here.
#[derive(Default)]
pub struct Options {
    /// Custom base URL (or, for local documents, a filesystem path) used to resolve
    /// relative references; when set it is also written into the output document.
    pub base_url: Option<String>,
    /// When `true`, hosts matching `domains` are skipped; when `false`, only hosts
    /// matching `domains` are retrieved.
    pub blacklist_domains: bool,
    // `cookies`: sent with HTTP(S) requests when not expired and matching the request URL.
    // `domains`: host patterns used for allow/deny filtering (see `blacklist_domains`).
    pub cookies: Vec<Cookie>, pub domains: Option<Vec<String>>,
    /// Encoding label for the output document; rejected if `encoding_rs` does not
    /// recognize the label.
    pub encoding: Option<String>,
    /// Accept HTTP responses whose status is not `200 OK`.
    pub ignore_errors: bool,
    /// Accept invalid TLS certificates.
    pub insecure: bool,
    /// Not read in this file.
    pub isolate: bool,
    /// Not read in this file.
    pub no_audio: bool,
    /// Not read in this file.
    pub no_css: bool,
    /// Not read in this file.
    pub no_fonts: bool,
    /// Not read in this file.
    pub no_frames: bool,
    /// In this file: suppresses the automatic `/favicon.ico` retrieval.
    /// NOTE(review): presumably also disables image embedding elsewhere — confirm.
    pub no_images: bool,
    /// Not read in this file.
    pub no_js: bool,
    /// Do not prepend the metadata comment to the serialized document.
    pub no_metadata: bool,
    /// Not read in this file.
    pub no_video: bool,
    /// Format of the produced document.
    pub output_format: MonolithOutputFormat,
    /// Suppress informational and error messages on stderr.
    pub silent: bool,
    /// HTTP request timeout in seconds; `0` selects the built-in default of 600.
    pub timeout: u64,
    /// Not read in this file.
    pub unwrap_noscript: bool,
    /// Value for the default `User-Agent` request header, if any.
    pub user_agent: Option<String>,
}
/// ANSI escape sequence that switches the terminal foreground color to red.
const ANSI_COLOR_RED: &str = "\x1b[31m";
/// ANSI escape sequence that resets terminal text attributes.
const ANSI_COLOR_RESET: &str = "\x1b[0m";
/// Table of `[leading bytes, media type]` pairs used to recognize content by the
/// bytes it starts with.
///
/// NOTE(review): the `.` bytes in the `RIFF....`, `....ftyp` and `....moov` entries
/// look like placeholders for bytes that vary per file (e.g. a chunk size) rather
/// than literal dots — confirm that the code matching against this table treats
/// them as wildcards.
const FILE_SIGNATURES: [[&[u8]; 2]; 18] = [
    // Images
    [b"GIF87a", b"image/gif"],
    [b"GIF89a", b"image/gif"],
    [b"\xFF\xD8\xFF", b"image/jpeg"],
    [b"\x89PNG\x0D\x0A\x1A\x0A", b"image/png"],
    [b"<svg ", b"image/svg+xml"],
    [b"RIFF....WEBPVP8 ", b"image/webp"],
    [b"\x00\x00\x01\x00", b"image/x-icon"],
    // Audio
    [b"ID3", b"audio/mpeg"],
    [b"\xFF\x0E", b"audio/mpeg"],
    [b"\xFF\x0F", b"audio/mpeg"],
    [b"OggS", b"audio/ogg"],
    [b"RIFF....WAVEfmt ", b"audio/wav"],
    [b"fLaC", b"audio/x-flac"],
    // Video
    [b"RIFF....AVI LIST", b"video/avi"],
    [b"....ftyp", b"video/mp4"],
    [b"\x00\x00\x01\x0B", b"video/mpeg"],
    [b"....moov", b"video/quicktime"],
    [b"\x1A\x45\xDF\xA3", b"video/webm"],
];
/// Media types outside the `text/*` family that `is_plaintext_media_type`
/// also reports as plaintext.
const PLAINTEXT_MEDIA_TYPES: &[&str] = &[
    "application/javascript",
    "application/json",
    "application/ld+json",
    "application/x-sh",
    "application/xhtml+xml",
    "application/xml",
    "application/vnd.mozilla.xul+xml",
    "image/svg+xml",
];
/// Builds the blocking HTTP client used for every network request.
///
/// The client honors `options.timeout` (in seconds, with `0` meaning the default
/// of 600), `options.insecure` (accept invalid TLS certificates) and, when set,
/// `options.user_agent` as a default `User-Agent` header.
///
/// # Panics
///
/// Panics if the configured user agent is not a valid header value or if the
/// client cannot be constructed.
pub fn init_client(options: &Options) -> Client {
    const DEFAULT_TIMEOUT_SECS: u64 = 600;

    let timeout_secs = match options.timeout {
        0 => DEFAULT_TIMEOUT_SECS,
        seconds => seconds,
    };

    let mut default_headers = HeaderMap::new();
    if let Some(agent) = options.user_agent.as_ref() {
        let agent_value =
            HeaderValue::from_str(agent).expect("Invalid User-Agent header specified");
        default_headers.insert(USER_AGENT, agent_value);
    }

    let builder = Client::builder()
        .timeout(Duration::from_secs(timeout_secs))
        .danger_accept_invalid_certs(options.insecure)
        .default_headers(default_headers);

    builder.build().expect("Failed to initialize HTTP client")
}
pub fn create_monolithic_document_from_data(
input_data: Vec<u8>,
options: &Options,
cache: &mut Option<Cache>,
input_encoding: Option<String>,
input_target: Option<String>,
) -> Result<(Vec<u8>, Option<String>), MonolithError> {
{
if let Some(custom_output_encoding) = options.encoding.clone() {
if Encoding::for_label_no_replacement(custom_output_encoding.as_bytes()).is_none() {
return Err(MonolithError::new(&format!(
"unknown encoding \"{}\"",
&custom_output_encoding
)));
}
}
}
let client: Client = init_client(options);
let mut base_url: Url = if input_target.is_some() {
Url::parse(&input_target.clone().unwrap()).unwrap()
} else {
Url::parse("data:text/html,").unwrap()
};
let mut document_encoding: String = input_encoding.clone().unwrap_or("utf-8".to_string());
let mut dom: RcDom;
dom = html_to_dom(&input_data, document_encoding.clone());
if let Some(html_charset) = get_charset(&dom.document) {
if !html_charset.is_empty() {
if let Some(document_charset) =
Encoding::for_label_no_replacement(html_charset.as_bytes())
{
document_encoding = html_charset;
dom = html_to_dom(&input_data, document_charset.name().to_string());
}
}
}
let custom_base_url: String = options.base_url.clone().unwrap_or_default();
if custom_base_url.is_empty() {
if let Some(existing_base_url) = get_base_url(&dom.document) {
base_url = resolve_url(&base_url, &existing_base_url);
}
} else {
match Url::parse(&custom_base_url) {
Ok(parsed_url) => {
if parsed_url.scheme() == "file" {
if base_url.scheme() == "file" {
base_url = parsed_url;
}
} else {
base_url = parsed_url;
}
}
Err(_) => {
if base_url.scheme() == "file" {
let path: &Path = Path::new(&custom_base_url);
if path.exists() {
match Url::from_file_path(fs::canonicalize(path).unwrap()) {
Ok(file_url) => {
base_url = file_url;
}
Err(_) => {
return Err(MonolithError::new(&format!(
"could not map given path to base URL \"{}\"",
custom_base_url
)));
}
}
}
}
}
}
}
walk_and_embed_assets(cache, &client, &base_url, &dom.document, options);
if let Some(new_base_url) = options.base_url.clone() {
dom = set_base_url(&dom.document, new_base_url);
}
if !options.no_images
&& (base_url.scheme() == "http" || base_url.scheme() == "https")
&& (input_target.is_some()
&& (input_target.as_ref().unwrap().starts_with("http:")
|| input_target.as_ref().unwrap().starts_with("https:")))
&& !has_favicon(&dom.document)
{
let favicon_ico_url: Url = resolve_url(&base_url, "/favicon.ico");
match retrieve_asset(
cache,
&client,
&base_url,
&favicon_ico_url,
options,
) {
Ok((data, final_url, media_type, charset)) => {
let favicon_data_url: Url =
create_data_url(&media_type, &charset, &data, &final_url);
dom = add_favicon(&dom.document, favicon_data_url.to_string());
}
Err(_) => {
}
}
}
if let meta_robots_content_value = get_robots(&dom.document).unwrap_or_default() {
if meta_robots_content_value.trim().is_empty() || meta_robots_content_value != "none" {
dom = set_robots(dom, "none");
}
}
if let Some(custom_encoding) = options.encoding.clone() {
document_encoding = custom_encoding;
dom = set_charset(dom, document_encoding.clone());
}
let document_title: Option<String> = get_title(&dom.document);
if options.output_format == MonolithOutputFormat::HTML {
let mut result: Vec<u8> = serialize_document(dom, document_encoding, options);
if !options.no_metadata && !input_target.clone().unwrap_or_default().is_empty() {
let mut metadata_comment: String =
create_metadata_tag(&Url::parse(&input_target.unwrap_or_default()).unwrap());
metadata_comment += "\n";
result.splice(0..0, metadata_comment.as_bytes().to_vec());
}
Ok((result, document_title))
} else {
Ok((vec![], document_title))
}
}
pub fn create_monolithic_document(
target: String,
options: &mut Options,
cache: &mut Option<Cache>,
) -> Result<(Vec<u8>, Option<String>), MonolithError> {
if target.is_empty() {
return Err(MonolithError::new("no target specified"));
}
{
if let Some(custom_encoding) = options.encoding.clone() {
if Encoding::for_label_no_replacement(custom_encoding.as_bytes()).is_none() {
return Err(MonolithError::new(&format!(
"unknown encoding \"{}\"",
&custom_encoding
)));
}
}
}
let mut target_url = match target.as_str() {
target_str => match Url::parse(target_str) {
Ok(target_url) => match target_url.scheme() {
"data" | "file" | "http" | "https" => target_url,
unsupported_scheme => {
return Err(MonolithError::new(&format!(
"unsupported target URL scheme \"{}\"",
unsupported_scheme
)));
}
},
Err(_) => {
let path: &Path = Path::new(&target_str);
match path.exists() {
true => match path.is_file() {
true => {
let canonical_path = fs::canonicalize(path).unwrap();
match Url::from_file_path(canonical_path) {
Ok(url) => url,
Err(_) => {
return Err(MonolithError::new(&format!(
"could not generate file URL out of given path \"{}\"",
&target_str
)));
}
}
}
false => {
return Err(MonolithError::new(&format!(
"local target \"{}\" is not a file",
&target_str
)));
}
},
false => {
Url::parse(&format!("http://{}", &target_str)).unwrap()
}
}
}
},
};
let client: Client = init_client(options);
let data: Vec<u8>;
let document_encoding: Option<String>;
if target_url.scheme() == "file"
|| target_url.scheme() == "http"
|| target_url.scheme() == "https"
|| target_url.scheme() == "data"
{
match retrieve_asset(cache, &client, &target_url, &target_url, options) {
Ok((retrieved_data, final_url, media_type, charset)) => {
if !media_type.eq_ignore_ascii_case("text/html")
&& !media_type.eq_ignore_ascii_case("application/xhtml+xml")
{
return Ok((retrieved_data, None));
}
if final_url != target_url {
target_url = final_url.clone();
}
data = retrieved_data;
document_encoding = Some(charset);
}
Err(_) => {
return Err(MonolithError::new("could not retrieve target document"));
}
}
} else {
return Err(MonolithError::new("unsupported target"));
}
create_monolithic_document_from_data(
data,
options,
cache,
document_encoding,
Some(target_url.to_string()),
)
}
/// Guesses the media type of `data`, falling back to the file name in `url`.
///
/// The leading bytes of `data` are compared against `FILE_SIGNATURES`. A `.` byte
/// inside a signature matches any byte, which is what lets container formats with
/// a variable size field (`RIFF....WEBPVP8 `, `....ftyp`, ...) be recognized; a
/// literal comparison would never match real files. If no signature matches, the
/// last path segment of `url` is passed to `detect_media_type_by_file_name`.
///
/// NOTE(review): other formats that carry `ftyp` at offset 4 (e.g. AVIF, HEIC,
/// M4A) will match the `....ftyp` entry and be reported as `video/mp4` — confirm
/// this is acceptable.
///
/// Returns an empty string when the type cannot be determined.
pub fn detect_media_type(data: &[u8], url: &Url) -> String {
    // Prefix comparison in which `.` in the signature is a single-byte wildcard.
    let matches_signature = |signature: &[u8]| -> bool {
        data.len() >= signature.len()
            && signature
                .iter()
                .zip(data.iter())
                .all(|(expected, actual)| *expected == b'.' || expected == actual)
    };

    for file_signature in FILE_SIGNATURES.iter() {
        if matches_signature(file_signature[0]) {
            return String::from_utf8_lossy(file_signature[1]).into_owned();
        }
    }

    // No signature matched: fall back to the extension of the last path segment.
    let file_name: &str = url.path().rsplit('/').next().unwrap_or("");
    detect_media_type_by_file_name(file_name)
}
/// Maps a file name to a media type based on the text after its last `.`.
///
/// The comparison is case-insensitive. A name without any `.` is treated as if
/// the whole name were the extension. Returns an empty string for unknown
/// extensions.
pub fn detect_media_type_by_file_name(filename: &str) -> String {
    let lowercased = filename.to_lowercase();
    let extension = lowercased.rsplit('.').next().unwrap_or("");

    let media_type = match extension {
        "avi" => "video/avi",
        "bmp" => "image/bmp",
        "css" => "text/css",
        "flac" => "audio/flac",
        "gif" => "image/gif",
        "htm" | "html" => "text/html",
        "ico" => "image/x-icon",
        "jpeg" | "jpg" => "image/jpeg",
        "js" => "text/javascript",
        "json" => "application/json",
        "jsonld" => "application/ld+json",
        "mp3" => "audio/mpeg",
        "mp4" | "m4v" => "video/mp4",
        "ogg" => "audio/ogg",
        "ogv" => "video/ogg",
        "pdf" => "application/pdf",
        "png" => "image/png",
        "svg" => "image/svg+xml",
        "swf" => "application/x-shockwave-flash",
        "tif" | "tiff" => "image/tiff",
        "txt" => "text/plain",
        "wav" => "audio/wav",
        "webp" => "image/webp",
        "woff" => "font/woff",
        "woff2" => "font/woff2",
        "xhtml" => "application/xhtml+xml",
        "xml" => "text/xml",
        _ => "",
    };

    String::from(media_type)
}
/// Tells whether `domain` is covered by the pattern `domain_to_match_against`.
///
/// * An empty pattern matches nothing; the pattern `"."` matches everything.
/// * Labels are compared right to left, ignoring ASCII case and trailing dots.
/// * A pattern with a leading dot (e.g. `".example.com"`) also covers every
///   subdomain; without it the number of labels must not exceed the pattern's.
pub fn domain_is_within_domain(domain: &str, domain_to_match_against: &str) -> bool {
    match domain_to_match_against {
        "" => return false,
        "." => return true,
        _ => {}
    }

    let covers_subdomains = domain_to_match_against.starts_with('.');

    // Labels ordered from the top-level domain downwards.
    let host_labels: Vec<&str> = domain.trim_end_matches('.').rsplit('.').collect();
    let pattern_labels: Vec<&str> = domain_to_match_against
        .trim_end_matches('.')
        .rsplit('.')
        .collect();

    let depth = host_labels.len().max(pattern_labels.len());

    (0..depth).all(|level| {
        let pattern_label = match pattern_labels.get(level) {
            Some(label) => *label,
            // The host is deeper than the pattern: only fine for ".suffix" patterns.
            None if covers_subdomains => "",
            None => return false,
        };
        let host_label = host_labels.get(level).copied().unwrap_or("");

        // An empty pattern label (from the leading dot, or past its end) accepts anything.
        pattern_label.is_empty() || pattern_label.eq_ignore_ascii_case(host_label)
    })
}
/// Expands the `%timestamp%` and `%title%` placeholders in an output path.
///
/// * `%timestamp%` becomes the current UTC time in RFC 3339 format (second
///   precision, `Z` suffix) with `:` replaced by `_`.
/// * `%title%` becomes `document_title` with characters that are problematic in
///   file names replaced or removed, and without leading dots.
///
/// Only the substituted values are sanitized. The rest of `destination` is
/// returned exactly as the caller supplied it, so that explicit paths (for
/// example a Windows drive prefix such as `C:\`) are not rewritten.
pub fn format_output_path(destination: &str, document_title: &str) -> String {
    let timestamp: String = Utc::now()
        .to_rfc3339_opts(SecondsFormat::Secs, true)
        .replace(':', "_");

    let sanitized_title: String = document_title
        .replace(['/', '\\'], "_")
        .replace('<', "[")
        .replace('>', "]")
        .replace(':', " - ")
        .replace('\"', "")
        .replace('|', "-")
        .replace('?', "");
    // Leading dots are dropped from the substituted title.
    let sanitized_title: &str = sanitized_title.trim_start_matches('.');

    destination
        .replace("%timestamp%", &timestamp)
        .replace("%title%", sanitized_title)
}
/// Reports whether `media_type` denotes textual content.
///
/// That is the case for every `text/*` type and for the types listed in
/// `PLAINTEXT_MEDIA_TYPES`; the comparison ignores case.
pub fn is_plaintext_media_type(media_type: &str) -> bool {
    let normalized = media_type.to_lowercase();

    if normalized.starts_with("text/") {
        return true;
    }

    PLAINTEXT_MEDIA_TYPES
        .iter()
        .any(|known| *known == normalized.as_str())
}
/// Splits a `Content-Type`-style value into media type, charset and base64 flag.
///
/// The value is split on `;`. The first item, if non-empty, is the media type
/// (default `text/plain`). Among the remaining items, `base64` sets the flag and
/// `charset=<value>` sets the charset (default `US-ASCII`); both are recognized
/// regardless of ASCII case. Unknown items are ignored, and any number of items
/// is accepted.
///
/// # Returns
///
/// `(media_type, charset, is_base64)`.
pub fn parse_content_type(content_type: &str) -> (String, String, bool) {
    const CHARSET_PREFIX: &str = "charset=";

    let mut media_type: String = "text/plain".to_string();
    let mut charset: String = "US-ASCII".to_string();
    let mut is_base64: bool = false;

    let mut items = content_type.split(';').map(str::trim);

    // The first item is always the media type slot, even when it is empty.
    if let Some(first) = items.next() {
        if !first.is_empty() {
            media_type = first.to_string();
        }
    }

    for item in items {
        if item.eq_ignore_ascii_case("base64") {
            is_base64 = true;
        } else if item
            .get(..CHARSET_PREFIX.len())
            .map_or(false, |prefix| prefix.eq_ignore_ascii_case(CHARSET_PREFIX))
        {
            // `get` succeeded, so the prefix length is a valid char boundary.
            charset = item[CHARSET_PREFIX.len()..].to_string();
        }
    }

    (media_type, charset, is_base64)
}
/// Retrieves the resource at `url` from a `data:` URL, the local filesystem, the
/// cache, or the network.
///
/// # Arguments
///
/// * `cache` - optional cache consulted for, and populated by, network retrievals.
/// * `client` - HTTP client used for network requests.
/// * `parent_url` - URL of the document referencing `url`; used for the `file:`
///   security check and for the `Referer` header.
/// * `url` - the resource to retrieve.
/// * `options` - settings (domain filtering, cookies, error handling, verbosity).
///
/// # Returns
///
/// `(data, final_url, media_type, charset)`. `final_url` differs from `url` when
/// the request was redirected. For local files the charset is empty and the media
/// type is detected from the content or file name.
///
/// # Errors
///
/// Returns an error when a `file:` URL is referenced from a non-`file:` parent,
/// when a local path is invalid, missing, a directory or unreadable, when the host
/// is excluded by the domain filter, when the request fails, or when the response
/// status is not `200 OK` and `options.ignore_errors` is unset.
pub fn retrieve_asset(
    cache: &mut Option<Cache>,
    client: &Client,
    parent_url: &Url,
    url: &Url,
    options: &Options,
) -> Result<(Vec<u8>, Url, String, String), reqwest::Error> {
    // The return type is fixed to `reqwest::Error`, which this code cannot build
    // directly. A request for the empty (hence invalid) URL is therefore used
    // purely to obtain an error value; it is expected to always fail.
    let request_error = || client.get("").send().unwrap_err();

    if url.scheme() == "data" {
        let (media_type, charset, data) = parse_data_url(url);
        return Ok((data, url.clone(), media_type, charset));
    }

    let cache_key: String = clean_url(url.clone()).as_str().to_string();

    if url.scheme() == "file" {
        // Local files may only be pulled in by documents that are local themselves.
        if parent_url.scheme() != "file" {
            print_error_message(&format!("{} (security error)", &cache_key), options);
            return Err(request_error());
        }

        let path_buf = match url.to_file_path() {
            Ok(path_buf) => path_buf,
            Err(()) => {
                print_error_message(&format!("{} (invalid file path)", &cache_key), options);
                return Err(request_error());
            }
        };
        let path: &Path = path_buf.as_path();

        if !path.exists() {
            print_error_message(&format!("{} (file not found)", &url), options);
            return Err(request_error());
        }
        if path.is_dir() {
            print_error_message(&format!("{} (is a directory)", &cache_key), options);
            return Err(request_error());
        }

        return match fs::read(path) {
            Ok(file_blob) => {
                print_info_message(&cache_key, options);
                let media_type: String = detect_media_type(&file_blob, url);
                Ok((file_blob, url.clone(), media_type, "".to_string()))
            }
            Err(error) => {
                print_error_message(&format!("{} ({})", &cache_key, error), options);
                Err(request_error())
            }
        };
    }

    // Remote resource: serve it from the cache when possible.
    if let Some(store) = cache.as_ref() {
        if store.contains_key(&cache_key) {
            print_info_message(&format!("{} (from cache)", &cache_key), options);
            // Presence was checked just above; a single lookup yields all three parts.
            let cached = store.get(&cache_key).unwrap();
            return Ok((cached.0.to_vec(), url.clone(), cached.1, cached.2));
        }
    }

    // Apply the allow/deny list of domains.
    if let Some(domains) = &options.domains {
        let host: &str = url.host_str().unwrap_or("");
        let domain_matches = domains
            .iter()
            .any(|d| domain_is_within_domain(host, d.trim()));
        // Deny-list hit, or allow-list miss.
        if options.blacklist_domains == domain_matches {
            return Err(request_error());
        }
    }

    let mut headers = HeaderMap::new();

    // All applicable cookies go into ONE `Cookie` header, separated by "; ".
    // Inserting them one at a time would replace the previous value each time.
    let cookie_pairs: Vec<String> = options
        .cookies
        .iter()
        .filter(|cookie| !cookie.is_expired() && cookie.matches_url(url.as_str()))
        .map(|cookie| cookie.name.clone() + "=" + &cookie.value)
        // Skip cookies that cannot be represented in a header instead of panicking.
        .filter(|pair| HeaderValue::from_str(pair).is_ok())
        .collect();
    if !cookie_pairs.is_empty() {
        if let Ok(cookie_header_value) = HeaderValue::from_str(&cookie_pairs.join("; ")) {
            headers.insert(COOKIE, cookie_header_value);
        }
    }

    if ["https", "http"].contains(&parent_url.scheme()) && parent_url != url {
        if let Ok(referer_value) =
            HeaderValue::from_str(get_referer_url(parent_url.clone()).as_str())
        {
            headers.insert(REFERER, referer_value);
        }
    }

    let response = match client.get(url.as_str()).headers(headers).send() {
        Ok(response) => response,
        Err(error) => {
            print_error_message(&format!("{} ({})", &cache_key, error), options);
            return Err(error);
        }
    };

    // Any status other than 200 OK counts as a failure unless errors are ignored.
    if !options.ignore_errors && response.status() != reqwest::StatusCode::OK {
        print_error_message(
            &format!("{} ({})", &cache_key, response.status()),
            options,
        );
        return Err(request_error());
    }

    let response_url: Url = response.url().clone();
    if url.as_str() == response_url.as_str() {
        print_info_message(&cache_key, options);
    } else {
        print_info_message(&format!("{} -> {}", &cache_key, &response_url), options);
    }

    // The cache entry is keyed by the URL the response actually came from.
    let new_cache_key: String = clean_url(response_url.clone()).to_string();

    // The header must be parsed before `bytes()` consumes the response.
    let content_type: &str = response
        .headers()
        .get(CONTENT_TYPE)
        .and_then(|header| header.to_str().ok())
        .unwrap_or("");
    let (media_type, charset, _is_base64) = parse_content_type(content_type);

    // A body that cannot be read is reported and treated as empty.
    let data: Vec<u8> = match response.bytes() {
        Ok(bytes) => bytes.to_vec(),
        Err(error) => {
            print_error_message(&format!("{}", error), options);
            vec![]
        }
    };

    if let Some(store) = cache.as_mut() {
        store.set(&new_cache_key, &data, media_type.clone(), charset.clone());
    }

    Ok((data, response_url, media_type, charset))
}
/// Writes `text` followed by a newline to stderr, in red when color is appropriate.
///
/// Nothing is written when `options.silent` is set. Color is omitted when the
/// `NO_COLOR` environment variable is set, when stderr is not a terminal, or
/// when `TERM` is `dumb`. Write failures are ignored.
pub fn print_error_message(text: &str, options: &Options) {
    const ENV_VAR_NO_COLOR: &str = "NO_COLOR";
    const ENV_VAR_TERM: &str = "TERM";

    if options.silent {
        return;
    }

    let term_is_dumb = env::var_os(ENV_VAR_TERM).map_or(false, |term| term == "dumb");
    let no_color = term_is_dumb
        || env::var_os(ENV_VAR_NO_COLOR).is_some()
        || atty::isnt(atty::Stream::Stderr);

    let (color_on, color_off) = if no_color {
        ("", "")
    } else {
        (ANSI_COLOR_RED, ANSI_COLOR_RESET)
    };
    let line = format!("{}{}{}\n", color_on, &text, color_off);

    let mut handle = io::stderr().lock();
    // Failing to report an error is not itself reported.
    let _ = handle.write_all(line.as_bytes());
}
/// Writes `text` followed by a newline to stderr unless `options.silent` is set.
///
/// Write failures are ignored.
pub fn print_info_message(text: &str, options: &Options) {
    if options.silent {
        return;
    }

    let line = format!("{}\n", &text);
    let mut handle = io::stderr().lock();
    let _ = handle.write_all(line.as_bytes());
}