1#![warn(missing_docs)]
33#[cfg(feature = "image")]
34mod img;
35
36pub use epub_builder::EpubVersion;
37use epub_builder::{self, EpubBuilder, EpubContent, ReferenceType, ZipLibrary};
38#[cfg(feature = "image")]
39pub use img::{FilterType, ImgTransform};
40use kuchiki::{Attribute, ExpandedName, NodeRef};
41use log::{trace, warn};
42use mail_parser::{Header, HeaderName, HeaderValue, MessageParser, PartType};
43use markup5ever::{namespace_url, ns, Namespace, Prefix, QualName};
44use readable_readability::Readability;
45use std::cmp::Reverse;
46use std::collections::btree_map::Entry;
47use std::collections::BTreeMap;
48use std::error::Error as StdError;
49use std::fmt::{Display, Error as FmtError, Formatter};
50use std::io::{Read, Write};
51
/// Image encodings that can be stored in the generated epub.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ImageFormat {
    /// JPEG encoding; this is the default variant.
    #[default]
    Jpeg,
    /// PNG encoding.
    Png,
}

impl ImageFormat {
    /// File extension (without the leading dot) used when naming resources.
    fn ext(&self) -> &'static str {
        match *self {
            Self::Jpeg => "jpg",
            Self::Png => "png",
        }
    }

    /// MIME type reported to the epub builder for this format.
    fn mime(&self) -> &'static str {
        match *self {
            Self::Jpeg => "image/jpeg",
            Self::Png => "image/png",
        }
    }
}
79
/// How `<img>` / `<picture>` elements of the article are treated.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ImageHandling {
    /// Remove every image element from the article (the default).
    #[default]
    Strip,
    /// Embed images, but drop any later occurrence of an image that was
    /// already embedded once.
    Filter,
    /// Embed images and keep every occurrence, including repeats.
    Keep,
}
91
92#[derive(Debug)]
94pub struct Repub<Css, Trans> {
95 pub include_url: bool,
97 pub include_title: bool,
99 pub include_byline: bool,
101 pub include_cover: bool,
103 pub strip_links: bool,
105 pub href_sim_thresh: f64,
112 pub image_handling: ImageHandling,
114 pub css: Css,
116 pub transform: Trans,
118 pub epub_version: EpubVersion,
120}
121
122pub trait ImageTransform {
124 type Output<'a>: Read + 'a;
126
127 fn transform<'a, S: AsRef<str>>(
131 &self,
132 buff: &'a [u8],
133 mime: S,
134 ) -> Option<(Self::Output<'a>, ImageFormat)>;
135}
136
137pub struct NoopTransform;
139
140impl ImageTransform for NoopTransform {
141 type Output<'a> = &'a [u8];
142
143 fn transform<'a, S: AsRef<str>>(
144 &self,
145 buff: &'a [u8],
146 mime: S,
147 ) -> Option<(Self::Output<'a>, ImageFormat)> {
148 let fmt = match mime.as_ref() {
149 "image/jpeg" => Some(ImageFormat::Jpeg),
150 "image/png" => Some(ImageFormat::Png),
151 _ => None,
152 }?;
153 Some((buff, fmt))
154 }
155}
156
157impl Default for Repub<&'static str, NoopTransform> {
158 fn default() -> Self {
160 Self {
161 include_url: false,
162 include_title: false,
163 include_byline: false,
164 include_cover: false,
165 strip_links: false,
166 href_sim_thresh: 0.0,
167 image_handling: ImageHandling::default(),
168 css: "",
169 transform: NoopTransform,
170 epub_version: EpubVersion::V20,
171 }
172 }
173}
174
/// Errors reported while converting MHTML to an epub.
#[non_exhaustive]
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// An image had a format that could not be handled.
    // NOTE(review): not produced by the code in this file; presumably raised by
    // the image feature module — confirm.
    InvalidImageFormat,
    /// The input could not be parsed as a MIME message.
    MhtmlParseError,
    /// The message parsed, but did not have the expected structure (a root
    /// part followed by an html part).
    MhtmlFormatError,
    /// An image could not be converted.
    // NOTE(review): not produced by the code in this file; presumably raised by
    // the image feature module — confirm.
    ImageConversionError,
    /// The epub builder reported an error while assembling the book.
    EpubCreationError,
    /// The finished epub could not be written to the output.
    EpubWritingError,
}

impl Display for Error {
    /// Renders the variant name, exactly like the `Debug` output.
    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), FmtError> {
        fmt.write_fmt(format_args!("{self:?}"))
    }
}

impl StdError for Error {}
200
201impl From<epub_builder::Error> for Error {
202 fn from(_: epub_builder::Error) -> Self {
203 Error::EpubCreationError
204 }
205}
206
207fn get_header<'a, 'b>(
209 headers: &'a [Header<'b>],
210 header: HeaderName,
211) -> Option<&'a HeaderValue<'b>> {
212 headers
213 .iter()
214 .find(|head| head.name == header)
215 .map(|head| &head.value)
216}
217
218fn new_elem(
220 name: &str,
221 attributes: impl IntoIterator<Item = (Namespace, Option<Prefix>, impl AsRef<str>, impl AsRef<str>)>,
222 children: impl IntoIterator<Item = NodeRef>,
223) -> NodeRef {
224 let node = NodeRef::new_element(
225 QualName::new(None, ns!(svg), name.into()),
227 attributes.into_iter().map(|(ns, prefix, attr, value)| {
228 (
229 ExpandedName::new(ns, attr.as_ref()),
230 Attribute {
231 prefix,
232 value: value.as_ref().into(),
233 },
234 )
235 }),
236 );
237 for child in children {
238 node.append(child);
239 }
240 node
241}
242
243fn new_attrless_elem(name: &str, children: impl IntoIterator<Item = NodeRef>) -> NodeRef {
245 let attrs: [(Namespace, Option<Prefix>, &str, &str); 0] = [];
246 new_elem(name, attrs, children)
247}
248
249fn next_node(node: &NodeRef) -> Option<NodeRef> {
251 node.first_child().or_else(|| next_node_skip(node))
252}
253
254fn next_node_skip(node: &NodeRef) -> Option<NodeRef> {
256 node.next_sibling()
257 .or_else(|| node.ancestors().find_map(|n| n.next_sibling()))
258}
259
260impl<C, T> Repub<C, T>
261where
262 C: AsRef<str>,
263 T: ImageTransform,
264{
265 fn find_url<'a>(
267 &self,
268 data: &'a BTreeMap<&'a str, (&'a str, &'a [u8])>,
269 src: &str,
270 ) -> Option<(Reverse<usize>, String, &'a str, &'a [u8])> {
271 let decoded = percent_encoding::percent_decode_str(src)
272 .decode_utf8()
273 .ok()?;
274 if let Some((mime, data)) = data.get(decoded.as_ref()) {
275 Some((Reverse(0), decoded.to_string(), mime, data))
276 } else if self.href_sim_thresh > 0.0 {
277 let thresh: usize =
278 f64::trunc(decoded.chars().count() as f64 * self.href_sim_thresh) as usize;
279 let (dist, href, mime, data) = data
280 .iter()
281 .map(|(href, (mime, data))| (strsim::levenshtein(href, &decoded), href, mime, data))
282 .min()?;
283 if dist < thresh {
284 Some((Reverse(dist), href.to_string(), mime, data))
285 } else {
286 warn!("didn't find approximate match for image: {decoded}");
287 None
288 }
289 } else {
290 warn!("didn't find exact match for image: {decoded}");
291 None
292 }
293 }
294
295 pub fn mhtml_to_epub(
297 &self,
298 mhtml: impl AsRef<str>,
299 out: &mut impl Write,
300 ) -> Result<Option<String>, Error> {
301 let msg = MessageParser::default()
303 .parse(mhtml.as_ref().as_bytes())
304 .ok_or(Error::MhtmlParseError)?;
305 let (first, rest) = msg.parts.split_first().ok_or(Error::MhtmlFormatError)?;
306 let subject = get_header(&first.headers, HeaderName::Subject).and_then(|val| match val {
307 HeaderValue::Text(title) => Some(title.as_ref()),
308 _ => None,
309 });
310 let (main, resources) = rest.split_first().ok_or(Error::MhtmlFormatError)?;
311 let loc =
312 get_header(&main.headers, HeaderName::ContentLocation).and_then(|val| match val {
313 HeaderValue::Text(loc) => Some(loc),
314 _ => None,
315 });
316 let html = if let PartType::Html(content) = &main.body {
317 Ok(content)
318 } else {
319 Err(Error::MhtmlFormatError)
320 }?;
321 let (node, meta) = Readability::new().parse(html);
322 let title = meta
323 .article_title
324 .as_ref()
325 .map(String::as_ref)
326 .or_else(|| meta.page_title.as_ref().map(String::as_ref))
327 .or(subject);
328
329 let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;
331 if let Some(title) = title {
332 epub.metadata("title", title)?;
333 }
334 if let Some(author) = &meta.byline {
335 epub.metadata("author", author)?;
336 }
337
338 let mut image_data = BTreeMap::new();
340 for attach in resources {
341 let ctype = get_header(&attach.headers, HeaderName::ContentType);
342 let loc = get_header(&attach.headers, HeaderName::ContentLocation);
343 if let (
344 Some(HeaderValue::ContentType(ctype)),
345 Some(HeaderValue::Text(loc)),
346 PartType::Binary(body),
347 ) = (ctype, loc, &attach.body)
348 {
349 if let ("image", Some(mime)) = (ctype.ctype(), ctype.subtype()) {
350 match image_data.entry(loc.as_ref()) {
351 Entry::Vacant(ent) => {
352 ent.insert((mime, body.as_ref()));
353 }
354 Entry::Occupied(mut ent) => {
355 let (_, old) = ent.get();
356 if old.len() < body.len() {
359 ent.insert((mime, body.as_ref()));
360 }
361 }
362 }
363 }
364 }
365 }
366
367 let cover_img = if self.include_cover {
369 if let Some((image, fmt)) = meta
370 .image_url
371 .as_ref()
372 .and_then(|cover| self.find_url(&image_data, cover))
374 .and_then(|(_, _, mime, img)| {
376 self.transform.transform(img, format!("image/{}", mime))
377 })
378 {
379 let file_name = format!("image_cover.{}", fmt.ext());
380 epub.add_cover_image(&file_name, image, fmt.mime())?;
381 Some(file_name)
382 } else {
383 None
384 }
385 } else {
386 None
387 };
388
389 let mut images = BTreeMap::new();
391 let mut current = node.first_child();
394 while let Some(node) = current {
395 if let Some(data) = node.as_element() {
396 match &*data.name.local {
397 "a" if self.strip_links => {
398 while let Some(child) = node.last_child() {
399 node.insert_after(child);
400 }
401 current = next_node_skip(&node);
402 node.detach();
403 }
404 "img" | "picture" => {
405 if self.image_handling != ImageHandling::Strip {
406 let mut matched = None;
408 for dec in node.inclusive_descendants() {
409 if let Some(dec_dat) = dec.as_element() {
410 let attrs = dec_dat.attributes.borrow();
411 if let Some(src) = attrs.get("src") {
412 matched =
413 std::cmp::max(matched, self.find_url(&image_data, src));
414 }
415 if let Some(srcset) = attrs.get("srcset") {
416 for src in srcset.split(',') {
417 matched = std::cmp::max(
418 matched,
419 self.find_url(&image_data, src.trim()),
420 );
421 }
422 }
423 }
424 }
425 if let Some((_, url, mime, img)) = matched {
427 let num = images.len();
428 let path = match (images.entry(url), self.image_handling) {
429 (Entry::Vacant(ent), _) => {
430 let trans = self
431 .transform
432 .transform(img, format!("image/{}", mime));
433 let name = match trans {
434 Some((image, fmt)) => {
435 let name = format!("image_{num}.{}", fmt.ext());
436 epub.add_resource(&name, image, fmt.mime())?;
437 Some(name)
438 }
439 None => None,
440 };
441 ent.insert(name).as_ref()
442 }
443 (_, ImageHandling::Filter) => None, (Entry::Occupied(ent), _) => ent.into_mut().as_ref(),
445 };
446 if let Some(image_path) = path {
448 node.insert_before(new_elem(
449 "img",
450 [(ns!(), None, "src", image_path)],
452 [],
453 ));
454 }
455 }
456 }
457 current = next_node_skip(&node);
459 node.detach();
460 }
461 _ => {
462 current = next_node(&node);
464 }
465 }
466 } else {
467 current = next_node(&node);
469 }
470 }
471
472 let body_node = new_attrless_elem("body", []);
474 if self.include_url {
476 if let Some(url) = loc {
477 body_node.append(new_elem(
478 "a",
479 [(ns!(), None, "href", url.as_ref())],
480 [NodeRef::new_text(url.as_ref())],
481 ));
482 }
483 }
484 if self.include_title {
486 if let Some(title) = title {
487 body_node.append(new_attrless_elem("h1", [NodeRef::new_text(title)]));
488 }
489 }
490 if self.include_byline {
492 if let Some(byline) = &meta.byline {
493 body_node.append(new_elem(
494 "address",
495 [(ns!(), None, "style", "font-style: italic")],
496 [NodeRef::new_text(byline)],
497 ));
498 }
499 }
500 if let Some(src) = cover_img {
502 body_node.append(new_elem(
503 "div",
504 [(ns!(), None, "style", "margin-top: 1em")],
505 [new_elem("img", [(ns!(), None, "src", src)], [])],
506 ));
507 }
508 if node
510 .as_element()
511 .map(|data| &*data.name.local == "body")
512 .unwrap_or(true)
513 {
514 while let Some(child) = node.first_child() {
515 body_node.append(child);
516 }
517 } else {
518 body_node.append(node);
519 }
520 let head_node = new_attrless_elem(
522 "head",
523 [
524 new_elem(
525 "meta",
526 [
527 (ns!(), None, "http-equiv", "Content-Type"),
528 (
529 ns!(),
530 None,
531 "content",
532 "application/xhtml+xml; charset=utf-8",
533 ),
534 ],
535 [],
536 ),
537 new_elem(
538 "link",
539 [
540 (ns!(), None, "type", "text/css"),
541 (ns!(), None, "rel", "stylesheet"),
542 (ns!(), None, "href", "stylesheet.css"),
543 ],
544 [],
545 ),
546 ],
547 );
548 if let Some(title) = title {
549 head_node.insert_after(new_attrless_elem("title", [NodeRef::new_text(title)]))
550 }
551 let html_node = new_elem(
553 "html",
554 [
555 (ns!(xmlns), None, "xmlns", "http://www.w3.org/1999/xhtml"),
556 (
557 ns!(xmlns),
558 Some("xmlns".into()),
559 "epub",
560 "http://www.w3.org/1999/xhtml",
561 ),
562 ],
563 [head_node, body_node],
564 );
565 let document = NodeRef::new_document();
567 document.append(NodeRef::new_doctype(r#"html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd""#, "", ""));
568 document.append(html_node);
569
570 let mut content: Vec<_> = r#"<?xml version="1.0" encoding="UTF-8"?>"#.as_bytes().into();
573 document.serialize(&mut content).unwrap();
574 trace!("full html: {}", std::str::from_utf8(&content).unwrap());
575
576 epub.add_content(
577 EpubContent::new("article.xhtml", &*content)
578 .title(title.unwrap_or("[missing title]"))
579 .reftype(ReferenceType::Text),
580 )?;
581
582 epub.stylesheet(self.css.as_ref().as_bytes())?;
584 epub.epub_version(self.epub_version);
585 epub.generate(out).or(Err(Error::EpubWritingError))?;
586 Ok(title.map(str::to_string))
587 }
588}
589
#[cfg(test)]
#[cfg(feature = "image")]
mod tests {
    use super::{EpubVersion, FilterType, ImageFormat, ImageHandling, ImgTransform, Repub};
    use base64::engine::general_purpose::STANDARD;
    use base64::Engine;
    use epub::doc::EpubDoc;
    use image::DynamicImage;
    use std::io::{Cursor, Seek, Write};

    /// Build a minimal MHTML document: `doc` is the quoted-printable encoded
    /// html part located at `loc`, `title` becomes the subject, and every entry
    /// of `images` becomes an attached 1x1 png with that content location.
    fn create_mhtml(
        doc: impl AsRef<str>,
        loc: impl AsRef<str>,
        title: impl AsRef<str>,
        images: impl IntoIterator<Item = impl AsRef<str>>,
    ) -> String {
        let mut img = Cursor::new(Vec::new());
        DynamicImage::new_rgb8(1, 1)
            .write_to(&mut img, image::ImageFormat::Png)
            .unwrap();
        let img_str = STANDARD.encode(img.into_inner());

        let mut res = Vec::new();
        writeln!(
            res,
            r#"From: <Saved by Blink>
Snapshot-Content-Location: {loc}
Subject: {title}
Date: Sat, 7 Jan 2023 20:59:18 -0000
MIME-Version: 1.0
Content-Type: multipart/related;
 type="text/html";
 boundary="----multipart-boundary----"


------multipart-boundary----
Content-Type: text/html
Content-ID: <frame-0@mhtml.blink>
Content-Transfer-Encoding: quoted-printable
Content-Location: {loc}
"#,
            loc = loc.as_ref(),
            title = title.as_ref(),
        )
        .unwrap();
        // write_all guarantees the whole buffer is written; a bare `write` may
        // legally stop short and its byte count was being ignored
        res.write_all(&quoted_printable::encode(doc.as_ref().as_bytes()))
            .unwrap();

        for img in images {
            writeln!(
                res,
                "------multipart-boundary----
Content-Type: image/png
Content-Transfer-Encoding: base64
Content-Location: {}
",
                img.as_ref(),
            )
            .unwrap();
            // base64 payload is emitted in 76 byte chunks, one per line
            for line in img_str.as_bytes().chunks(76) {
                res.write_all(line).unwrap();
                writeln!(res).unwrap();
            }
        }

        writeln!(res, "------multipart-boundary------").unwrap();
        String::from_utf8(res).unwrap()
    }

    /// With the default configuration images are stripped from the article.
    #[test]
    fn no_images() {
        let images: [&'static str; 0] = [];
        let mhtml = create_mhtml(
            r#"<!doctype html><html><head></head><body><div><p>text</p><img src="img.png" alt="info"><p>more text</p></body></html>"#,
            "https://test.html",
            "a fake doc",
            images,
        );
        let mut buff = Cursor::new(Vec::new());
        Repub::default().mhtml_to_epub(&mhtml, &mut buff).unwrap();
        buff.rewind().unwrap();
        let mut doc = EpubDoc::from_reader(&mut buff).unwrap();
        assert_eq!(*doc.metadata.get("title").unwrap(), ["a fake doc"]);
        let (contents, _) = doc.get_current_str().unwrap();
        assert!(contents.contains("<p>text</p><p>more text</p>"));
    }

    /// Exercise every option at once, including approximate image matching.
    #[test]
    fn options() {
        let mhtml = create_mhtml(
            r#"<!doctype html><html><head></head><body><div><p>text</p><img src="close_img.png" alt="info"><p>more text</p></body></html>"#,
            "https://test.html",
            "a fake doc",
            ["img.png"],
        );
        let mut buff = Cursor::new(Vec::new());
        Repub {
            include_url: true,
            include_title: true,
            include_byline: true,
            include_cover: true,
            strip_links: true,
            href_sim_thresh: 1.0,
            image_handling: ImageHandling::Keep,
            css: "div { margin: 1em }",
            transform: ImgTransform {
                brightness: 1.2,
                max_width: 100,
                max_height: 100,
                filter_type: FilterType::CatmullRom,
                output_format: ImageFormat::Jpeg,
            },
            epub_version: EpubVersion::V20,
        }
        .mhtml_to_epub(&mhtml, &mut buff)
        .unwrap();
        buff.rewind().unwrap();
        let mut doc = EpubDoc::from_reader(&mut buff).unwrap();
        assert_eq!(*doc.metadata.get("title").unwrap(), ["a fake doc"]);
        assert_eq!(
            doc.resources.get("stylesheet.css"),
            Some(&("OEBPS/stylesheet.css".into(), "text/css".into()))
        );
        let (css, _) = doc.get_resource_str("stylesheet.css").unwrap();
        assert_eq!(css, "div { margin: 1em }");
        let (contents, _) = doc.get_current_str().unwrap();
        eprintln!("{}", contents);
        assert!(contents.contains(r#"<?xml version="1.0" encoding="UTF-8"?>"#));
        assert!(contents.contains(r#"<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">"#));
        assert!(contents.contains(r#"<html xmlns:epub="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml">"#));
        assert!(contents
            .contains(r#"<a href="https://test.html">https://test.html</a><h1>a fake doc</h1>"#));
        assert!(contents.contains(r#"<p>text</p><img src="image_0.jpg"></img><p>more text</p>"#));
        assert!(contents
            .contains(r#"<link href="stylesheet.css" rel="stylesheet" type="text/css"></link>"#));
    }
}