#![warn(missing_docs)]
#[cfg(feature = "image")]
mod img;
pub use epub_builder::EpubVersion;
use epub_builder::{self, EpubBuilder, EpubContent, ReferenceType, ZipLibrary};
#[cfg(feature = "image")]
pub use img::{FilterType, ImgTransform};
use kuchiki::{Attribute, ExpandedName, NodeRef};
use log::{trace, warn};
use mail_parser::{Header, HeaderName, HeaderValue, MessageParser, PartType};
use markup5ever::{namespace_url, ns, Namespace, Prefix, QualName};
use readable_readability::Readability;
use std::cmp::Reverse;
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;
use std::error::Error as StdError;
use std::fmt::{Display, Error as FmtError, Formatter};
use std::io::{Read, Write};
/// Image formats that can be stored in the generated epub.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ImageFormat {
/// JPEG image; this is the default format.
#[default]
Jpeg,
/// PNG image.
Png,
}
impl ImageFormat {
fn ext(&self) -> &'static str {
match self {
ImageFormat::Jpeg => "jpg",
ImageFormat::Png => "png",
}
}
fn mime(&self) -> &'static str {
match self {
ImageFormat::Jpeg => "image/jpeg",
ImageFormat::Png => "image/png",
}
}
}
/// How `<img>` and `<picture>` elements found in the article are treated.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ImageHandling {
/// Remove every image element from the article; this is the default.
#[default]
Strip,
/// Keep only the first occurrence of each distinct image; later repeats are removed.
Filter,
/// Keep every image that can be resolved, reusing the stored resource for repeats.
Keep,
}
/// Configuration for converting an mhtml page into an epub.
///
/// `Css` supplies the stylesheet text and `Trans` is the [`ImageTransform`] applied to every
/// image before it is stored.
#[derive(Debug)]
pub struct Repub<Css, Trans> {
/// Prepend a link to the page's `Content-Location` (when present) to the article body.
pub include_url: bool,
/// Add the title as an `<h1>` heading in the article body.
pub include_title: bool,
/// Add the byline, when one was extracted, as an `<address>` element in the article body.
pub include_byline: bool,
/// Use the page's extracted image URL as the epub cover and show it in the article body,
/// provided it resolves to an attached image that the transform accepts.
pub include_cover: bool,
/// Replace every `<a>` element with its children.
pub strip_links: bool,
/// Enables approximate image URL matching when greater than zero: an image without an exact
/// match uses the attachment with the smallest Levenshtein distance, provided that distance
/// is below `trunc(url length * href_sim_thresh)`.
pub href_sim_thresh: f64,
/// How image elements in the article are treated.
pub image_handling: ImageHandling,
/// Contents of the stylesheet stored in the epub.
pub css: Css,
/// Transformation applied to every image (including the cover) before it is stored.
pub transform: Trans,
/// Epub version passed to the epub builder.
pub epub_version: EpubVersion,
}
/// A transformation applied to each image before it is stored in the epub.
pub trait ImageTransform {
/// Reader over the transformed image bytes; it may borrow from the input buffer.
type Output<'a>: Read + 'a;
/// Transforms the raw image bytes in `buff`, whose MIME type is `mime`.
///
/// Returns the transformed data together with the format it is encoded in. When `None` is
/// returned, the caller in this file leaves the image out of the epub.
fn transform<'a, S: AsRef<str>>(
&self,
buff: &'a [u8],
mime: S,
) -> Option<(Self::Output<'a>, ImageFormat)>;
}
/// An [`ImageTransform`] that passes image bytes through unchanged.
///
/// The standard derives make the default `Repub<&'static str, NoopTransform>` usable with the
/// `Debug` implementation derived on `Repub`.
#[derive(Debug, Default, Clone, Copy)]
pub struct NoopTransform;
impl ImageTransform for NoopTransform {
    type Output<'a> = &'a [u8];

    /// Returns `buff` untouched, tagged with the format named by `mime`.
    ///
    /// Only `image/jpeg` and `image/png` are recognised; any other MIME type yields `None`.
    fn transform<'a, S: AsRef<str>>(
        &self,
        buff: &'a [u8],
        mime: S,
    ) -> Option<(Self::Output<'a>, ImageFormat)> {
        let format = match mime.as_ref() {
            "image/png" => ImageFormat::Png,
            "image/jpeg" => ImageFormat::Jpeg,
            _ => return None,
        };
        Some((buff, format))
    }
}
impl Default for Repub<&'static str, NoopTransform> {
fn default() -> Self {
Self {
include_url: false,
include_title: false,
include_byline: false,
include_cover: false,
strip_links: false,
href_sim_thresh: 0.0,
image_handling: ImageHandling::default(),
css: "",
transform: NoopTransform,
epub_version: EpubVersion::V20,
}
}
}
/// Errors that can occur while converting mhtml into an epub.
#[non_exhaustive]
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
/// NOTE(review): not produced anywhere in this file — presumably raised by the optional
/// image transform; confirm.
InvalidImageFormat,
/// The input could not be parsed as a MIME message.
MhtmlParseError,
/// The message parsed but lacks the expected layout: a leading part followed by an HTML
/// main part.
MhtmlFormatError,
/// NOTE(review): not produced anywhere in this file — presumably raised by the optional
/// image transform; confirm.
ImageConversionError,
/// The epub builder reported an error while assembling the book; every
/// `epub_builder::Error` converts to this variant.
EpubCreationError,
/// Generating the finished epub into the output writer failed.
EpubWritingError,
}
impl Display for Error {
    /// Displays the error using its `Debug` representation, i.e. the variant name.
    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), FmtError> {
        fmt.write_fmt(format_args!("{:?}", self))
    }
}
// Marker impl: `Error` relies on the default `std::error::Error` methods.
impl StdError for Error {}
impl From<epub_builder::Error> for Error {
fn from(_: epub_builder::Error) -> Self {
Error::EpubCreationError
}
}
/// Returns the value of the first header in `headers` whose name equals `header`, if any.
fn get_header<'a, 'b>(
    headers: &'a [Header<'b>],
    header: HeaderName,
) -> Option<&'a HeaderValue<'b>> {
    for candidate in headers {
        if candidate.name == header {
            return Some(&candidate.value);
        }
    }
    None
}
/// Creates an element called `name` with the given attributes and appends `children` in order.
///
/// Each attribute is `(namespace, prefix, local name, value)`.
///
/// NOTE(review): elements are created in the SVG namespace rather than the XHTML one —
/// presumably so the serializer writes explicit closing tags for every element; confirm.
fn new_elem(
    name: &str,
    attributes: impl IntoIterator<Item = (Namespace, Option<Prefix>, impl AsRef<str>, impl AsRef<str>)>,
    children: impl IntoIterator<Item = NodeRef>,
) -> NodeRef {
    let attrs = attributes.into_iter().map(|(ns, prefix, local, value)| {
        let key = ExpandedName::new(ns, local.as_ref());
        let attr = Attribute {
            prefix,
            value: value.as_ref().into(),
        };
        (key, attr)
    });
    let elem = NodeRef::new_element(QualName::new(None, ns!(svg), name.into()), attrs);
    children.into_iter().for_each(|child| elem.append(child));
    elem
}
/// Creates an element called `name` that has no attributes and appends `children` in order.
fn new_attrless_elem(name: &str, children: impl IntoIterator<Item = NodeRef>) -> NodeRef {
    // The item type has to be spelled out because an empty iterator gives inference nothing
    // to work with.
    let no_attrs = std::iter::empty::<(Namespace, Option<Prefix>, &str, &str)>();
    new_elem(name, no_attrs, children)
}
/// Next node in document order: the first child if there is one, otherwise whatever
/// `next_node_skip` returns for `node`.
fn next_node(node: &NodeRef) -> Option<NodeRef> {
    match node.first_child() {
        Some(child) => Some(child),
        None => next_node_skip(node),
    }
}
/// Next node in document order that is not a descendant of `node`: its next sibling, or
/// failing that the next sibling of the closest ancestor that has one.
fn next_node_skip(node: &NodeRef) -> Option<NodeRef> {
    if let Some(sibling) = node.next_sibling() {
        return Some(sibling);
    }
    node.ancestors().find_map(|ancestor| ancestor.next_sibling())
}
impl<C, T> Repub<C, T>
where
C: AsRef<str>,
T: ImageTransform,
{
fn find_url<'a>(
&self,
data: &'a BTreeMap<&'a str, (&'a str, &'a [u8])>,
src: &str,
) -> Option<(Reverse<usize>, String, &'a str, &'a [u8])> {
let decoded = percent_encoding::percent_decode_str(src)
.decode_utf8()
.ok()?;
if let Some((mime, data)) = data.get(decoded.as_ref()) {
Some((Reverse(0), decoded.to_string(), mime, data))
} else if self.href_sim_thresh > 0.0 {
let thresh: usize =
f64::trunc(decoded.chars().count() as f64 * self.href_sim_thresh) as usize;
let (dist, href, mime, data) = data
.iter()
.map(|(href, (mime, data))| (strsim::levenshtein(href, &decoded), href, mime, data))
.min()?;
if dist < thresh {
Some((Reverse(dist), href.to_string(), mime, data))
} else {
warn!("didn't find approximate match for image: {decoded}");
None
}
} else {
warn!("didn't find exact match for image: {decoded}");
None
}
}
pub fn mhtml_to_epub(
&self,
mhtml: impl AsRef<str>,
out: &mut impl Write,
) -> Result<Option<String>, Error> {
let msg = MessageParser::default()
.parse(mhtml.as_ref().as_bytes())
.ok_or(Error::MhtmlParseError)?;
let (first, rest) = msg.parts.split_first().ok_or(Error::MhtmlFormatError)?;
let subject = get_header(&first.headers, HeaderName::Subject).and_then(|val| match val {
HeaderValue::Text(title) => Some(title.as_ref()),
_ => None,
});
let (main, resources) = rest.split_first().ok_or(Error::MhtmlFormatError)?;
let loc =
get_header(&main.headers, HeaderName::ContentLocation).and_then(|val| match val {
HeaderValue::Text(loc) => Some(loc),
_ => None,
});
let html = if let PartType::Html(content) = &main.body {
Ok(content)
} else {
Err(Error::MhtmlFormatError)
}?;
let (node, meta) = Readability::new().parse(html);
let title = meta
.article_title
.as_ref()
.map(String::as_ref)
.or_else(|| meta.page_title.as_ref().map(String::as_ref))
.or(subject);
let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
if let Some(title) = title {
epub.metadata("title", title)?;
}
if let Some(author) = &meta.byline {
epub.metadata("author", author)?;
}
let mut image_data = BTreeMap::new();
for attach in resources {
let ctype = get_header(&attach.headers, HeaderName::ContentType);
let loc = get_header(&attach.headers, HeaderName::ContentLocation);
if let (
Some(HeaderValue::ContentType(ctype)),
Some(HeaderValue::Text(loc)),
PartType::Binary(body),
) = (ctype, loc, &attach.body)
{
if let ("image", Some(mime)) = (ctype.ctype(), ctype.subtype()) {
match image_data.entry(loc.as_ref()) {
Entry::Vacant(ent) => {
ent.insert((mime, body.as_ref()));
}
Entry::Occupied(mut ent) => {
let (_, old) = ent.get();
if old.len() < body.len() {
ent.insert((mime, body.as_ref()));
}
}
}
}
}
}
let cover_img = if self.include_cover {
if let Some((image, fmt)) = meta
.image_url
.as_ref()
.and_then(|cover| self.find_url(&image_data, cover))
.and_then(|(_, _, mime, img)| {
self.transform.transform(img, format!("image/{}", mime))
})
{
let file_name = format!("image_cover.{}", fmt.ext());
epub.add_cover_image(&file_name, image, fmt.mime())?;
Some(file_name)
} else {
None
}
} else {
None
};
let mut images = BTreeMap::new();
let mut current = node.first_child();
while let Some(node) = current {
if let Some(data) = node.as_element() {
match &*data.name.local {
"a" if self.strip_links => {
while let Some(child) = node.last_child() {
node.insert_after(child);
}
current = next_node_skip(&node);
node.detach();
}
"img" | "picture" => {
if self.image_handling != ImageHandling::Strip {
let mut matched = None;
for dec in node.inclusive_descendants() {
if let Some(dec_dat) = dec.as_element() {
let attrs = dec_dat.attributes.borrow();
if let Some(src) = attrs.get("src") {
matched =
std::cmp::max(matched, self.find_url(&image_data, src));
}
if let Some(srcset) = attrs.get("srcset") {
for src in srcset.split(',') {
matched = std::cmp::max(
matched,
self.find_url(&image_data, src.trim()),
);
}
}
}
}
if let Some((_, url, mime, img)) = matched {
let num = images.len();
let path = match (images.entry(url), self.image_handling) {
(Entry::Vacant(ent), _) => {
let trans = self
.transform
.transform(img, format!("image/{}", mime));
let name = match trans {
Some((image, fmt)) => {
let name = format!("image_{num}.{}", fmt.ext());
epub.add_resource(&name, image, fmt.mime())?;
Some(name)
}
None => None,
};
ent.insert(name).as_ref()
}
(_, ImageHandling::Filter) => None, (Entry::Occupied(ent), _) => ent.into_mut().as_ref(),
};
if let Some(image_path) = path {
node.insert_before(new_elem(
"img",
[(ns!(), None, "src", image_path)],
[],
));
}
}
}
current = next_node_skip(&node);
node.detach();
}
_ => {
current = next_node(&node);
}
}
} else {
current = next_node(&node);
}
}
let body_node = new_attrless_elem("body", []);
if self.include_url {
if let Some(url) = loc {
body_node.append(new_elem(
"a",
[(ns!(), None, "href", url.as_ref())],
[NodeRef::new_text(url.as_ref())],
));
}
}
if self.include_title {
if let Some(title) = title {
body_node.append(new_attrless_elem("h1", [NodeRef::new_text(title)]));
}
}
if self.include_byline {
if let Some(byline) = &meta.byline {
body_node.append(new_elem(
"address",
[(ns!(), None, "style", "font-style: italic")],
[NodeRef::new_text(byline)],
));
}
}
if let Some(src) = cover_img {
body_node.append(new_elem(
"div",
[(ns!(), None, "style", "margin-top: 1em")],
[new_elem("img", [(ns!(), None, "src", src)], [])],
));
}
if node
.as_element()
.map(|data| &*data.name.local == "body")
.unwrap_or(true)
{
while let Some(child) = node.first_child() {
body_node.append(child);
}
} else {
body_node.append(node);
}
let head_node = new_attrless_elem(
"head",
[
new_elem(
"meta",
[
(ns!(), None, "http-equiv", "Content-Type"),
(
ns!(),
None,
"content",
"application/xhtml+xml; charset=utf-8",
),
],
[],
),
new_elem(
"link",
[
(ns!(), None, "type", "text/css"),
(ns!(), None, "rel", "stylesheet"),
(ns!(), None, "href", "stylesheet.css"),
],
[],
),
],
);
if let Some(title) = title {
head_node.insert_after(new_attrless_elem("title", [NodeRef::new_text(title)]))
}
let html_node = new_elem(
"html",
[
(ns!(xmlns), None, "xmlns", "http://www.w3.org/1999/xhtml"),
(
ns!(xmlns),
Some("xmlns".into()),
"epub",
"http://www.w3.org/1999/xhtml",
),
],
[head_node, body_node],
);
let document = NodeRef::new_document();
document.append(NodeRef::new_doctype(r#"html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd""#, "", ""));
document.append(html_node);
let mut content: Vec<_> = r#"<?xml version="1.0" encoding="UTF-8"?>"#.as_bytes().into();
document.serialize(&mut content).unwrap();
trace!("full html: {}", std::str::from_utf8(&content).unwrap());
epub.add_content(
EpubContent::new("article.xhtml", &*content)
.title(title.unwrap_or("[missing title]"))
.reftype(ReferenceType::Text),
)?;
epub.stylesheet(self.css.as_ref().as_bytes())?;
epub.epub_version(self.epub_version);
epub.generate(out).or(Err(Error::EpubWritingError))?;
Ok(title.map(str::to_string))
}
}
#[cfg(test)]
#[cfg(feature = "image")]
mod tests {
    use super::{EpubVersion, FilterType, ImageFormat, ImageHandling, ImgTransform, Repub};
    use base64::engine::general_purpose::STANDARD;
    use base64::Engine;
    use epub::doc::EpubDoc;
    use image::DynamicImage;
    use std::io::{Cursor, Seek, Write};

    /// Builds an mhtml document for tests.
    ///
    /// `doc` is the HTML page (written quoted-printable), `loc` its location, `title` the
    /// `Subject` header, and every entry of `images` becomes an attached 1x1 PNG with that
    /// `Content-Location`.
    fn create_mhtml(
        doc: impl AsRef<str>,
        loc: impl AsRef<str>,
        title: impl AsRef<str>,
        images: impl IntoIterator<Item = impl AsRef<str>>,
    ) -> String {
        // One tiny PNG, base64 encoded once and reused for every attachment.
        let mut img = Cursor::new(Vec::new());
        DynamicImage::new_rgb8(1, 1)
            .write_to(&mut img, image::ImageFormat::Png)
            .unwrap();
        let img_str = STANDARD.encode(img.into_inner());
        let mut res = Vec::new();
        writeln!(
            res,
            r#"From: <Saved by Blink>
Snapshot-Content-Location: {loc}
Subject: {title}
Date: Sat, 7 Jan 2023 20:59:18 -0000
MIME-Version: 1.0
Content-Type: multipart/related;
type="text/html";
boundary="----multipart-boundary----"
------multipart-boundary----
Content-Type: text/html
Content-ID: <frame-0@mhtml.blink>
Content-Transfer-Encoding: quoted-printable
Content-Location: {loc}
"#,
            loc = loc.as_ref(),
            title = title.as_ref(),
        )
        .unwrap();
        // `write_all` guarantees the whole buffer is written; plain `write` may stop early.
        res.write_all(&quoted_printable::encode(doc.as_ref().as_bytes()))
            .unwrap();
        for img in images {
            writeln!(
                res,
                "------multipart-boundary----
Content-Type: image/png
Content-Transfer-Encoding: base64
Content-Location: {}
",
                img.as_ref(),
            )
            .unwrap();
            // Base64 body wrapped at 76 characters per line.
            for line in img_str.as_bytes().chunks(76) {
                res.write_all(line).unwrap();
                writeln!(res).unwrap();
            }
        }
        writeln!(res, "------multipart-boundary------").unwrap();
        String::from_utf8(res).unwrap()
    }

    /// With the default configuration the image is stripped and the title comes from the
    /// `Subject` header.
    #[test]
    fn no_images() {
        let images: [&'static str; 0] = [];
        let mhtml = create_mhtml(
            r#"<!doctype html><html><head></head><body><div><p>text</p><img src="img.png" alt="info"><p>more text</p></body></html>"#,
            "https://test.html",
            "a fake doc",
            images,
        );
        let mut buff = Cursor::new(Vec::new());
        Repub::default().mhtml_to_epub(&mhtml, &mut buff).unwrap();
        buff.rewind().unwrap();
        let mut doc = EpubDoc::from_reader(&mut buff).unwrap();
        assert_eq!(*doc.metadata.get("title").unwrap(), ["a fake doc"]);
        let (contents, _) = doc.get_current_str().unwrap();
        assert!(contents.contains("<p>text</p><p>more text</p>"),);
    }

    /// Exercises every option at once, including approximate image matching
    /// (`close_img.png` in the page against the attached `img.png`) and conversion to JPEG.
    #[test]
    #[cfg(feature = "image")]
    fn options() {
        let mhtml = create_mhtml(
            r#"<!doctype html><html><head></head><body><div><p>text</p><img src="close_img.png" alt="info"><p>more text</p></body></html>"#,
            "https://test.html",
            "a fake doc",
            ["img.png"],
        );
        let mut buff = Cursor::new(Vec::new());
        Repub {
            include_url: true,
            include_title: true,
            include_byline: true,
            include_cover: true,
            strip_links: true,
            href_sim_thresh: 1.0,
            image_handling: ImageHandling::Keep,
            css: "div { margin: 1em }",
            transform: ImgTransform {
                brightness: 1.2,
                max_width: 100,
                max_height: 100,
                filter_type: FilterType::CatmullRom,
                output_format: ImageFormat::Jpeg,
            },
            epub_version: EpubVersion::V20,
        }
        .mhtml_to_epub(&mhtml, &mut buff)
        .unwrap();
        buff.rewind().unwrap();
        let mut doc = EpubDoc::from_reader(&mut buff).unwrap();
        assert_eq!(*doc.metadata.get("title").unwrap(), ["a fake doc"]);
        assert_eq!(
            doc.resources.get("stylesheet.css"),
            Some(&("OEBPS/stylesheet.css".into(), "text/css".into()))
        );
        let (css, _) = doc.get_resource_str("stylesheet.css").unwrap();
        assert_eq!(css, "div { margin: 1em }");
        let (contents, _) = doc.get_current_str().unwrap();
        eprintln!("{}", contents);
        assert!(contents.contains(r#"<?xml version="1.0" encoding="UTF-8"?>"#));
        assert!(contents.contains(r#"<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">"#));
        assert!(contents.contains(r#"<html xmlns:epub="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml">"#));
        assert!(contents
            .contains(r#"<a href="https://test.html">https://test.html</a><h1>a fake doc</h1>"#));
        assert!(contents.contains(r#"<p>text</p><img src="image_0.jpg"></img><p>more text</p>"#));
        assert!(contents
            .contains(r#"<link href="stylesheet.css" rel="stylesheet" type="text/css"></link>"#));
    }
}