1#![warn(missing_docs)]
33#[cfg(feature = "image")]
34mod img;
35
36pub use epub_builder::EpubVersion;
37use epub_builder::{EpubBuilder, EpubContent, ReferenceType, ZipLibrary};
38use eyre::Report;
39#[cfg(feature = "image")]
40pub use img::{FilterType, ImgTransform};
41use kuchiki::{Attribute, ExpandedName, NodeRef};
42use log::{trace, warn};
43use mail_parser::{Header, HeaderName, HeaderValue, MessageParser, PartType};
44use markup5ever::{namespace_url, ns, Namespace, Prefix, QualName};
45use readable_readability::Readability;
46use std::cmp::Reverse;
47use std::collections::btree_map::Entry;
48use std::collections::BTreeMap;
49use std::error::Error as StdError;
50use std::fmt::{Display, Error as FmtError, Formatter};
51use std::io::{Read, Write};
52
/// Image encodings that can be stored in the generated epub.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ImageFormat {
    /// JPEG encoding; this is the default format.
    #[default]
    Jpeg,
    /// PNG encoding.
    Png,
}

impl ImageFormat {
    /// File extension (without a leading dot) used when naming a resource of this format.
    fn ext(&self) -> &'static str {
        match *self {
            Self::Jpeg => "jpg",
            Self::Png => "png",
        }
    }

    /// MIME type string associated with this format.
    fn mime(&self) -> &'static str {
        match *self {
            Self::Jpeg => "image/jpeg",
            Self::Png => "image/png",
        }
    }
}
80
/// How `img` and `picture` elements in the article are treated.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ImageHandling {
    /// Remove every image from the article; this is the default.
    #[default]
    Strip,
    /// Keep only the first occurrence of each matched image; later repeats are removed.
    Filter,
    /// Keep every image that can be matched to an attached resource, including repeats.
    Keep,
}
92
/// Options controlling how an mhtml page is converted into an epub.
#[derive(Debug)]
pub struct Repub<Css, Trans> {
    /// Prepend a link to the main part's `Content-Location` to the article body.
    pub include_url: bool,
    /// Prepend the title to the article body as an `h1` heading.
    pub include_title: bool,
    /// Prepend the byline to the article body in an italic `address` element.
    pub include_byline: bool,
    /// Use the image referenced by the extracted article metadata as the epub cover, and
    /// show it at the top of the article body.
    pub include_cover: bool,
    /// Replace every `a` element with its children.
    pub strip_links: bool,
    /// Tolerance for approximate matching of image urls to attached resources.
    ///
    /// When greater than zero and no attachment location matches a url exactly, the
    /// attachment with the smallest Levenshtein distance is used, provided the distance is
    /// strictly less than this fraction of the decoded url's character count. Values of zero
    /// or less disable approximate matching.
    pub href_sim_thresh: f64,
    /// How images inside the article are handled.
    pub image_handling: ImageHandling,
    /// Contents of the stylesheet written to the epub.
    pub css: Css,
    /// Transformation applied to every image before it is added to the epub.
    pub transform: Trans,
    /// Epub version to generate.
    pub epub_version: EpubVersion,
}
122
/// A conversion applied to every image before it is stored in the epub.
pub trait ImageTransform {
    /// Reader over the transformed image bytes; it may borrow from the input buffer.
    type Output<'a>: Read + 'a;

    /// Transform the raw image bytes in `buff`, whose MIME type is `mime`.
    ///
    /// Returns a reader over the resulting bytes together with their format, or `None` when
    /// the image should be left out of the epub.
    fn transform<'a, S: AsRef<str>>(
        &self,
        buff: &'a [u8],
        mime: S,
    ) -> Option<(Self::Output<'a>, ImageFormat)>;
}
137
138pub struct NoopTransform;
140
141impl ImageTransform for NoopTransform {
142 type Output<'a> = &'a [u8];
143
144 fn transform<'a, S: AsRef<str>>(
145 &self,
146 buff: &'a [u8],
147 mime: S,
148 ) -> Option<(Self::Output<'a>, ImageFormat)> {
149 let fmt = match mime.as_ref() {
150 "image/jpeg" => Some(ImageFormat::Jpeg),
151 "image/png" => Some(ImageFormat::Png),
152 _ => None,
153 }?;
154 Some((buff, fmt))
155 }
156}
157
158impl Default for Repub<&'static str, NoopTransform> {
159 fn default() -> Self {
161 Self {
162 include_url: false,
163 include_title: false,
164 include_byline: false,
165 include_cover: false,
166 strip_links: false,
167 href_sim_thresh: 0.0,
168 image_handling: ImageHandling::default(),
169 css: "",
170 transform: NoopTransform,
171 epub_version: EpubVersion::V20,
172 }
173 }
174}
175
/// Failures that can occur while converting an mhtml page into an epub.
#[non_exhaustive]
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// An image had an unsupported format.
    // NOTE(review): not produced by the code visible in this file; presumably raised by
    // image transforms defined elsewhere - confirm.
    InvalidImageFormat,
    /// The input could not be parsed as an mhtml message.
    MhtmlParseError,
    /// The message parsed, but didn't have the expected parts (headers, then an html page).
    MhtmlFormatError,
    /// An image could not be converted.
    // NOTE(review): not produced by the code visible in this file - confirm where it is used.
    ImageConversionError,
    /// Assembling the epub failed.
    EpubCreationError,
    /// Writing the finished epub to the output failed.
    EpubWritingError,
}

impl Display for Error {
    /// Displays the variant name, i.e. the same text as the `Debug` representation.
    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), FmtError> {
        fmt.write_fmt(format_args!("{:?}", self))
    }
}

impl StdError for Error {}
201
202impl From<Report> for Error {
203 fn from(_: Report) -> Self {
204 Error::EpubCreationError
205 }
206}
207
208fn get_header<'a, 'b>(
210 headers: &'a [Header<'b>],
211 header: HeaderName,
212) -> Option<&'a HeaderValue<'b>> {
213 headers
214 .iter()
215 .find(|head| head.name == header)
216 .map(|head| &head.value)
217}
218
219fn new_elem(
221 name: &str,
222 attributes: impl IntoIterator<Item = (Namespace, Option<Prefix>, impl AsRef<str>, impl AsRef<str>)>,
223 children: impl IntoIterator<Item = NodeRef>,
224) -> NodeRef {
225 let node = NodeRef::new_element(
226 QualName::new(None, ns!(svg), name.into()),
228 attributes.into_iter().map(|(ns, prefix, attr, value)| {
229 (
230 ExpandedName::new(ns, attr.as_ref()),
231 Attribute {
232 prefix,
233 value: value.as_ref().into(),
234 },
235 )
236 }),
237 );
238 for child in children {
239 node.append(child);
240 }
241 node
242}
243
244fn new_attrless_elem(name: &str, children: impl IntoIterator<Item = NodeRef>) -> NodeRef {
246 let attrs: [(Namespace, Option<Prefix>, &str, &str); 0] = [];
247 new_elem(name, attrs, children)
248}
249
250fn next_node(node: &NodeRef) -> Option<NodeRef> {
252 node.first_child().or_else(|| next_node_skip(node))
253}
254
255fn next_node_skip(node: &NodeRef) -> Option<NodeRef> {
257 node.next_sibling()
258 .or_else(|| node.ancestors().find_map(|n| n.next_sibling()))
259}
260
261impl<C, T> Repub<C, T>
262where
263 C: AsRef<str>,
264 T: ImageTransform,
265{
266 fn find_url<'a>(
268 &self,
269 data: &'a BTreeMap<&'a str, (&'a str, &'a [u8])>,
270 src: &str,
271 ) -> Option<(Reverse<usize>, String, &'a str, &'a [u8])> {
272 let decoded = percent_encoding::percent_decode_str(src)
273 .decode_utf8()
274 .ok()?;
275 if let Some((mime, data)) = data.get(decoded.as_ref()) {
276 Some((Reverse(0), decoded.to_string(), mime, data))
277 } else if self.href_sim_thresh > 0.0 {
278 let thresh: usize =
279 f64::trunc(decoded.chars().count() as f64 * self.href_sim_thresh) as usize;
280 let (dist, href, mime, data) = data
281 .iter()
282 .map(|(href, (mime, data))| (strsim::levenshtein(href, &decoded), href, mime, data))
283 .min()?;
284 if dist < thresh {
285 Some((Reverse(dist), href.to_string(), mime, data))
286 } else {
287 warn!("didn't find approximate match for image: {decoded}");
288 None
289 }
290 } else {
291 warn!("didn't find exact match for image: {decoded}");
292 None
293 }
294 }
295
296 pub fn mhtml_to_epub(
298 &self,
299 mhtml: impl AsRef<str>,
300 out: &mut impl Write,
301 ) -> Result<Option<String>, Error> {
302 let msg = MessageParser::default()
304 .parse(mhtml.as_ref().as_bytes())
305 .ok_or(Error::MhtmlParseError)?;
306 let (first, rest) = msg.parts.split_first().ok_or(Error::MhtmlFormatError)?;
307 let subject = get_header(&first.headers, HeaderName::Subject).and_then(|val| match val {
308 HeaderValue::Text(title) => Some(title.as_ref()),
309 _ => None,
310 });
311 let (main, resources) = rest.split_first().ok_or(Error::MhtmlFormatError)?;
312 let loc =
313 get_header(&main.headers, HeaderName::ContentLocation).and_then(|val| match val {
314 HeaderValue::Text(loc) => Some(loc),
315 _ => None,
316 });
317 let html = if let PartType::Html(content) = &main.body {
318 Ok(content)
319 } else {
320 Err(Error::MhtmlFormatError)
321 }?;
322 let (node, meta) = Readability::new().parse(html);
323 let title = meta
324 .article_title
325 .as_ref()
326 .map(String::as_ref)
327 .or_else(|| meta.page_title.as_ref().map(String::as_ref))
328 .or(subject);
329
330 let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
332 if let Some(title) = title {
333 epub.metadata("title", title)?;
334 }
335 if let Some(author) = &meta.byline {
336 epub.metadata("author", author)?;
337 }
338
339 let mut image_data = BTreeMap::new();
341 for attach in resources {
342 let ctype = get_header(&attach.headers, HeaderName::ContentType);
343 let loc = get_header(&attach.headers, HeaderName::ContentLocation);
344 if let (
345 Some(HeaderValue::ContentType(ctype)),
346 Some(HeaderValue::Text(loc)),
347 PartType::Binary(body),
348 ) = (ctype, loc, &attach.body)
349 {
350 if let ("image", Some(mime)) = (ctype.ctype(), ctype.subtype()) {
351 match image_data.entry(loc.as_ref()) {
352 Entry::Vacant(ent) => {
353 ent.insert((mime, body.as_ref()));
354 }
355 Entry::Occupied(mut ent) => {
356 let (_, old) = ent.get();
357 if old.len() < body.len() {
360 ent.insert((mime, body.as_ref()));
361 }
362 }
363 }
364 }
365 }
366 }
367
368 let cover_img = if self.include_cover {
370 if let Some((image, fmt)) = meta
371 .image_url
372 .as_ref()
373 .and_then(|cover| self.find_url(&image_data, cover))
375 .and_then(|(_, _, mime, img)| {
377 self.transform.transform(img, format!("image/{}", mime))
378 })
379 {
380 let file_name = format!("image_cover.{}", fmt.ext());
381 epub.add_cover_image(&file_name, image, fmt.mime())?;
382 Some(file_name)
383 } else {
384 None
385 }
386 } else {
387 None
388 };
389
390 let mut images = BTreeMap::new();
392 let mut current = node.first_child();
395 while let Some(node) = current {
396 if let Some(data) = node.as_element() {
397 match &*data.name.local {
398 "a" if self.strip_links => {
399 while let Some(child) = node.last_child() {
400 node.insert_after(child);
401 }
402 current = next_node_skip(&node);
403 node.detach();
404 }
405 "img" | "picture" => {
406 if self.image_handling != ImageHandling::Strip {
407 let mut matched = None;
409 for dec in node.inclusive_descendants() {
410 if let Some(dec_dat) = dec.as_element() {
411 let attrs = dec_dat.attributes.borrow();
412 if let Some(src) = attrs.get("src") {
413 matched =
414 std::cmp::max(matched, self.find_url(&image_data, src));
415 }
416 if let Some(srcset) = attrs.get("srcset") {
417 for src in srcset.split(',') {
418 matched = std::cmp::max(
419 matched,
420 self.find_url(&image_data, src.trim()),
421 );
422 }
423 }
424 }
425 }
426 if let Some((_, url, mime, img)) = matched {
428 let num = images.len();
429 let path = match (images.entry(url), self.image_handling) {
430 (Entry::Vacant(ent), _) => {
431 let trans = self
432 .transform
433 .transform(img, format!("image/{}", mime));
434 let name = match trans {
435 Some((image, fmt)) => {
436 let name = format!("image_{num}.{}", fmt.ext());
437 epub.add_resource(&name, image, fmt.mime())?;
438 Some(name)
439 }
440 None => None,
441 };
442 ent.insert(name).as_ref()
443 }
444 (_, ImageHandling::Filter) => None, (Entry::Occupied(ent), _) => ent.into_mut().as_ref(),
446 };
447 if let Some(image_path) = path {
449 node.insert_before(new_elem(
450 "img",
451 [(ns!(), None, "src", image_path)],
453 [],
454 ));
455 }
456 }
457 }
458 current = next_node_skip(&node);
460 node.detach();
461 }
462 _ => {
463 current = next_node(&node);
465 }
466 }
467 } else {
468 current = next_node(&node);
470 }
471 }
472
473 let body_node = new_attrless_elem("body", []);
475 if self.include_url {
477 if let Some(url) = loc {
478 body_node.append(new_elem(
479 "a",
480 [(ns!(), None, "href", url.as_ref())],
481 [NodeRef::new_text(url.as_ref())],
482 ));
483 }
484 }
485 if self.include_title {
487 if let Some(title) = title {
488 body_node.append(new_attrless_elem("h1", [NodeRef::new_text(title)]));
489 }
490 }
491 if self.include_byline {
493 if let Some(byline) = &meta.byline {
494 body_node.append(new_elem(
495 "address",
496 [(ns!(), None, "style", "font-style: italic")],
497 [NodeRef::new_text(byline)],
498 ));
499 }
500 }
501 if let Some(src) = cover_img {
503 body_node.append(new_elem(
504 "div",
505 [(ns!(), None, "style", "margin-top: 1em")],
506 [new_elem("img", [(ns!(), None, "src", src)], [])],
507 ));
508 }
509 if node
511 .as_element()
512 .map(|data| &*data.name.local == "body")
513 .unwrap_or(true)
514 {
515 while let Some(child) = node.first_child() {
516 body_node.append(child);
517 }
518 } else {
519 body_node.append(node);
520 }
521 let head_node = new_attrless_elem(
523 "head",
524 [
525 new_elem(
526 "meta",
527 [
528 (ns!(), None, "http-equiv", "Content-Type"),
529 (
530 ns!(),
531 None,
532 "content",
533 "application/xhtml+xml; charset=utf-8",
534 ),
535 ],
536 [],
537 ),
538 new_elem(
539 "link",
540 [
541 (ns!(), None, "type", "text/css"),
542 (ns!(), None, "rel", "stylesheet"),
543 (ns!(), None, "href", "stylesheet.css"),
544 ],
545 [],
546 ),
547 ],
548 );
549 if let Some(title) = title {
550 head_node.insert_after(new_attrless_elem("title", [NodeRef::new_text(title)]))
551 }
552 let html_node = new_elem(
554 "html",
555 [
556 (ns!(xmlns), None, "xmlns", "http://www.w3.org/1999/xhtml"),
557 (
558 ns!(xmlns),
559 Some("xmlns".into()),
560 "epub",
561 "http://www.w3.org/1999/xhtml",
562 ),
563 ],
564 [head_node, body_node],
565 );
566 let document = NodeRef::new_document();
568 document.append(NodeRef::new_doctype(r#"html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd""#, "", ""));
569 document.append(html_node);
570
571 let mut content: Vec<_> = r#"<?xml version="1.0" encoding="UTF-8"?>"#.as_bytes().into();
574 document.serialize(&mut content).unwrap();
575 trace!("full html: {}", std::str::from_utf8(&content).unwrap());
576
577 epub.add_content(
578 EpubContent::new("article.xhtml", &*content)
579 .title(title.unwrap_or("[missing title]"))
580 .reftype(ReferenceType::Text),
581 )?;
582
583 epub.stylesheet(self.css.as_ref().as_bytes())?;
585 epub.epub_version(self.epub_version);
586 epub.generate(out).or(Err(Error::EpubWritingError))?;
587 Ok(title.map(str::to_string))
588 }
589}
590
#[cfg(test)]
#[cfg(feature = "image")]
mod tests {
    use super::{EpubVersion, FilterType, ImageHandling, ImageFormat, ImgTransform, Repub};
    use base64::engine::general_purpose::STANDARD;
    use base64::Engine;
    use epub::doc::EpubDoc;
    use image::DynamicImage;
    use std::io::{Cursor, Seek, Write};

    /// Build a minimal mhtml document whose html page is `doc`, located at `loc` with subject
    /// `title`, followed by one attachment per entry of `images`; every attachment is the
    /// same 1x1 png stored under the given location.
    fn create_mhtml(
        doc: impl AsRef<str>,
        loc: impl AsRef<str>,
        title: impl AsRef<str>,
        images: impl IntoIterator<Item = impl AsRef<str>>,
    ) -> String {
        let mut img = Cursor::new(Vec::new());
        DynamicImage::new_rgb8(1, 1)
            .write_to(&mut img, image::ImageFormat::Png)
            .unwrap();
        let img_str = STANDARD.encode(img.into_inner());

        let mut res = Vec::new();
        writeln!(
            res,
            r#"From: <Saved by Blink>
Snapshot-Content-Location: {loc}
Subject: {title}
Date: Sat, 7 Jan 2023 20:59:18 -0000
MIME-Version: 1.0
Content-Type: multipart/related;
 type="text/html";
 boundary="----multipart-boundary----"


------multipart-boundary----
Content-Type: text/html
Content-ID: <frame-0@mhtml.blink>
Content-Transfer-Encoding: quoted-printable
Content-Location: {loc}
"#,
            loc = loc.as_ref(),
            title = title.as_ref(),
        )
        .unwrap();
        // `write_all` rather than `write`: a bare `write` may stop after a partial write
        res.write_all(&quoted_printable::encode(doc.as_ref().as_bytes()))
            .unwrap();

        for img in images {
            writeln!(
                res,
                "------multipart-boundary----
Content-Type: image/png
Content-Transfer-Encoding: base64
Content-Location: {}
",
                img.as_ref(),
            )
            .unwrap();
            // base64 body wrapped at 76 characters per line
            for line in img_str.as_bytes().chunks(76) {
                res.write_all(line).unwrap();
                writeln!(res).unwrap();
            }
        }

        writeln!(res, "------multipart-boundary------").unwrap();
        String::from_utf8(res).unwrap()
    }

    /// With the default options the image is stripped and the title comes from the subject.
    #[test]
    fn no_images() {
        let images: [&'static str; 0] = [];
        let mhtml = create_mhtml(
            r#"<!doctype html><html><head></head><body><div><p>text</p><img src="img.png" alt="info"><p>more text</p></body></html>"#,
            "https://test.html",
            "a fake doc",
            images,
        );
        let mut buff = Cursor::new(Vec::new());
        Repub::default().mhtml_to_epub(&mhtml, &mut buff).unwrap();
        buff.rewind().unwrap();
        let mut doc = EpubDoc::from_reader(&mut buff).unwrap();
        assert_eq!(*doc.metadata.get("title").unwrap(), ["a fake doc"]);
        let (contents, _) = doc.get_current_str().unwrap();
        assert!(contents.contains("<p>text</p><p>more text</p>"));
    }

    /// With every option enabled the url, title, stylesheet and an approximately matched
    /// image all end up in the generated document.
    #[test]
    fn options() {
        let mhtml = create_mhtml(
            r#"<!doctype html><html><head></head><body><div><p>text</p><img src="close_img.png" alt="info"><p>more text</p></body></html>"#,
            "https://test.html",
            "a fake doc",
            ["img.png"],
        );
        let mut buff = Cursor::new(Vec::new());
        Repub {
            include_url: true,
            include_title: true,
            include_byline: true,
            include_cover: true,
            strip_links: true,
            href_sim_thresh: 1.0,
            image_handling: ImageHandling::Keep,
            css: "div { margin: 1em }",
            transform: ImgTransform {
                brightness: 1.2,
                max_width: 100,
                max_height: 100,
                filter_type: FilterType::CatmullRom,
                output_format: ImageFormat::Jpeg,
            },
            epub_version: EpubVersion::V20,
        }
        .mhtml_to_epub(&mhtml, &mut buff)
        .unwrap();
        buff.rewind().unwrap();
        let mut doc = EpubDoc::from_reader(&mut buff).unwrap();
        assert_eq!(*doc.metadata.get("title").unwrap(), ["a fake doc"]);
        assert_eq!(
            doc.resources.get("stylesheet.css"),
            Some(&("OEBPS/stylesheet.css".into(), "text/css".into()))
        );
        let (css, _) = doc.get_resource_str("stylesheet.css").unwrap();
        assert_eq!(css, "div { margin: 1em }");
        let (contents, _) = doc.get_current_str().unwrap();
        eprintln!("{}", contents);
        assert!(contents.contains(r#"<?xml version="1.0" encoding="UTF-8"?>"#));
        assert!(contents.contains(r#"<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">"#));
        assert!(contents.contains(r#"<html xmlns:epub="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml">"#));
        assert!(contents
            .contains(r#"<a href="https://test.html">https://test.html</a><h1>a fake doc</h1>"#));
        assert!(contents.contains(r#"<p>text</p><img src="image_0.jpg"></img><p>more text</p>"#));
        assert!(contents
            .contains(r#"<link href="stylesheet.css" rel="stylesheet" type="text/css"></link>"#));
    }
}