pulldown-cmark-to-flowed 0.1.0

//! Library to create [`format=flowed`][rfc3676sec4] plain text from markdown parsed by
//! [`pulldown-cmark`].
//!
//! `format=flowed` is a small extension to plain text that allows for line wrapping to
//! happen on the client side depending on the screen width of the client. Its main
//! purpose is for Text E-Mail to provide an improved experience without going through
//! HTML. At least [Thunderbird] supports this natively.
//!
//! # Example
//!
//! ```
//! use pulldown_cmark::Parser;
//!
//! let md = "Your markdown goes here";
//! // only the options (and any subset thereof) returned by this function are supported
//! let opts = pulldown_cmark_to_flowed::parser_options();
//! let parser = Parser::new_ext(&md, opts);
//! let mut txt = String::new();
//! pulldown_cmark_to_flowed::push_text(&mut txt, parser);
//! ```
//!
//! If your markdown input looked like this:
//!
//! ```markdown
#![doc = concat!(include_str!("../tests/example.md"), "```")]
//!
//! Then your output looks like this:
//!
//! ```text
#![doc = concat!(include_str!("../tests/example.txt"), "```")]
//!
//! # Work in Progress
//!
//! This library does not yet support all features of [`pulldown-cmark`], and there are
//! some things that could certainly be made configurable (such as the preferred line
//! width). If you need a feature implemented or think that something could be done
//! better, please do open an [issue].
//!
//!
//!  [`pulldown-cmark`]: pulldown_cmark
//!  [rfc3676sec4]: https://datatracker.ietf.org/doc/html/rfc3676#section-4
//!  [Thunderbird]: https://en.wikipedia.org/wiki/Mozilla_Thunderbird
//!  [issue]: https://codeberg.org/proto-x/pulldown-cmark-to-flowed/issues

use hashbrown::HashMap;
use pulldown_cmark::{CowStr, Event, HeadingLevel, LinkType, Options, Tag, TagEnd};
use std::{mem, ops::AddAssign};

/// The `Content-Type` value for `format=flowed` plain text.
///
/// Use this as the value of the `Content-Type` header when sending the generated
/// text. It declares the charset as UTF-8 and the format as `flowed`.
pub const CONTENT_TYPE: &str = r#"text/plain; charset="utf-8"; format="flowed""#;

/// The maximum line width for text/plain messages.
///
/// When wrapping, a break position after the preferred width is only considered
/// usable if it lies within this width. It is also used as the length of horizontal
/// rules. Widths are measured in columns as reported by `State::column`, i.e. without
/// the leading space-stuffing character.
const MAX_LINE_WIDTH: usize = 78;
/// The preferred line width for text/plain messages.
///
/// Text that fits within this width is appended to the current line without looking
/// for a break position.
///
/// [RFC3676](https://datatracker.ietf.org/doc/html/rfc3676) recommends 66.
const PREFERRED_LINE_WIDTH: usize = 66;

/// Returns the set of [`Options`] this library has been written for.
///
/// Any subset of the returned options may be used as well. Options that are not part
/// of the returned set are currently unsupported; supporting all of them is a
/// long-term goal.
pub fn parser_options() -> Options {
	let mut options = Options::empty();
	options.insert(Options::ENABLE_FOOTNOTES);
	options.insert(Options::ENABLE_STRIKETHROUGH);
	options.insert(Options::ENABLE_SMART_PUNCTUATION);
	options.insert(Options::ENABLE_WIKILINKS);
	options
}

/// Renders the events of a markdown parser as plain text, in a style that one might
/// expect when reading an email, and appends the result to `s`.
///
/// Use the parser options returned by [`parser_options()`]! Support for arbitrary
/// options will eventually be implemented but for now it is not.
///
/// The output is suitable for `format=flowed` as defined in
/// [RFC3676](https://datatracker.ietf.org/doc/html/rfc3676).
///
/// # Panics
///
/// Panics if the iterator yields events that belong to parser extensions which are
/// not enabled by [`parser_options()`] (e.g. tables, math or task lists).
pub fn push_text<'a, I>(s: &mut String, iter: I)
where
	I: Iterator<Item = Event<'a>>
{
	// The state only borrows the output buffer, so it can be a temporary here.
	push_text_to_state(&mut State::new(s), iter);
}

/// The mutable state used while converting markdown events to
/// somewhat-nicely-styled text.
///
/// `'s` is the lifetime of the borrowed output buffer, `'a` is the lifetime of the
/// footnote labels that are used as keys of the label lookup table.
struct State<'a, 's> {
	/// The text buffer. All generated output is appended to it.
	txt: &'s mut String,
	/// The number of newlines (with indentation but no other content) that are currently
	/// appended to the string. A fresh state starts with `u8::MAX`, so requests for
	/// leading newlines are no-ops until some text has been written.
	trailing_newlines: u8,

	/// The currently active indentations. Every entry is written at the beginning of
	/// each new line; `">"` is the entry used for block quotes.
	indentations: Vec<&'static str>,

	/// The length of the text of the current heading. This is the number of
	/// characters used to underline the heading once it ends.
	heading_len: usize,
	/// The level of the current heading. `None` while no heading is open.
	heading_lvl: Option<HeadingLevel>,

	/// Whether we are currently inside of a codeblock. Text inside of a codeblock is
	/// written line by line without wrapping.
	code_block: bool,

	/// The number of HTML blocks that are currently open.
	html_blocks: u8,

	/// The stack of indices of the currently open lists. The index is None if the list
	/// is unordered. For ordered lists it is the number of the next item.
	lists: Vec<Option<u64>>,

	/// Footnotes (both text and links). Indices will be +1'ed! That is, indices into
	/// this vector are zero-based, but the footnote list written at the end of the
	/// text starts counting at 1.
	footnotes: Vec<String>,
	/// Footnote label to footnote index mapping. The stored index is the zero-based
	/// index into `footnotes`.
	footnote_labels: HashMap<CowStr<'a>, usize>,
	/// Set when we are currently parsing the footnote with the attached index. While
	/// set, text is collected in the footnote instead of being written to `txt`.
	in_footnote: Option<usize>,
	/// The footnote indices of the currently active links and images. These are
	/// one-based; 0 is pushed for autolinks and means that no footnote marker is
	/// written when the link ends.
	footnote_links: Vec<usize>
}

impl<'s> State<'_, 's> {
	/// Creates a fresh state that appends all of its output to `txt`.
	///
	/// No heading, code block, HTML block, list or footnote is open initially.
	fn new(txt: &'s mut String) -> Self {
		// Pretend that the output is preceded by as many empty lines as the counter
		// can hold, so that blocks asking for leading newlines do not emit any
		// before the first piece of text has been written.
		let trailing_newlines = u8::MAX;

		Self {
			// output
			txt,
			trailing_newlines,
			indentations: Vec::new(),

			// block-level state
			heading_lvl: None,
			heading_len: 0,
			code_block: false,
			html_blocks: 0,
			lists: Vec::new(),

			// footnotes and links
			footnotes: Vec::new(),
			footnote_labels: HashMap::new(),
			footnote_links: Vec::new(),
			in_footnote: None
		}
	}
}

/// Allows `state += text` as a shorthand for appending raw text to the output
/// buffer. No other state (such as the trailing newline counter) is touched.
impl<T: AsRef<str>> AddAssign<T> for State<'_, '_> {
	fn add_assign(&mut self, rhs: T) {
		let text: &str = rhs.as_ref();
		self.txt.push_str(text);
	}
}

impl<'a> State<'a, '_> {
	/// Returns the current column of the current line.
	///
	/// The column is counted in characters (not bytes) and does not include the
	/// leading space-stuffing character. The current line is everything after the
	/// last `"\r\n"` of the buffer, or the whole buffer if there is none.
	fn column(&self) -> usize {
		let line_begin_idx = self.txt.rfind("\r\n").map_or(0, |idx| idx + 2);
		// -1 because we space-stuff all lines; saturating so that an entirely empty
		// line (one that was started without space-stuffing) is column 0 instead of
		// an arithmetic underflow
		self.txt[line_begin_idx ..]
			.chars()
			.count()
			.saturating_sub(1)
	}

	/// Ends the current line with `"\r\n"` and starts a new one, including the
	/// currently active indentations.
	///
	/// - `space_stuffing`: if set, the new line starts with a space unless the
	///   outermost indentation is a quote (`>`), and a space is added after the
	///   indentations if the innermost indentation is a quote.
	/// - `quotes_only`: if set, only quote (`>`) indentations are written and all
	///   other indentations are skipped.
	///
	/// This always writes a line break and increments the trailing newline counter;
	/// use [`Self::newlines`] to only add line breaks that are still missing.
	fn newline_impl(&mut self, space_stuffing: bool, quotes_only: bool) {
		// if the previous line contained only whitespace, remove that whitespace
		if let Some(mut idx) = self.txt.rfind("\r\n").map(|idx| idx + 2) {
			while self.txt[idx ..].starts_with('>') {
				idx += 1;
			}
			if self.txt[idx ..].chars().all(|ch| ch == ' ') {
				self.txt.truncate(idx);
			}
		}

		*self.txt += "\r\n";
		if space_stuffing
			&& self
				.indentations
				.first()
				.is_none_or(|indent| *indent != ">")
		{
			self.txt.push(' ');
		}
		// The counter starts at u8::MAX for a fresh state, so a plain `+= 1` would
		// overflow (and panic in debug builds) if no text has been written yet.
		self.trailing_newlines = self.trailing_newlines.saturating_add(1);

		// add "indentations" (not necessarily whitespace)
		for indent in &self.indentations {
			if !quotes_only || *indent == ">" {
				*self.txt += indent;
			}
		}

		// Space-stuff the inner content of the quotation line. We do this because
		// it looks nicer for clients that don't support format=flowed.
		if space_stuffing && self.indentations.last().is_some_and(|i| *i == ">") {
			self.txt.push(' ');
		}
	}

	/// Add newlines as necessary to achieve the number of trailing newlines as
	/// requested.
	fn newlines(&mut self, count: u8) {
		while self.trailing_newlines < count {
			// For simplicity and for better display in mail clients that don't
			// understand format=flowed, we space-stuff all lines
			self.newline_impl(true, false);
		}
	}

	/// Appends `text` to the current line without any wrapping.
	///
	/// Marks the current line as non-empty and, while a heading is open, keeps track
	/// of the heading length (ignoring a single trailing space). Empty text is
	/// ignored entirely.
	fn add_text_unwrapped(&mut self, text: &str) {
		if text.is_empty() {
			return;
		}

		*self.txt += text;
		self.trailing_newlines = 0;

		if self.heading_lvl.is_some() {
			let mut column = self.column();
			if self.txt.ends_with(' ') {
				column = column.saturating_sub(1);
			}
			self.heading_len = self.heading_len.max(column);
		}
	}

	/// Appends `text` to the output, inserting soft line breaks at spaces so that
	/// lines stay close to [`PREFERRED_LINE_WIDTH`] and, where possible, within
	/// [`MAX_LINE_WIDTH`].
	///
	/// All widths are measured in characters. Lines are broken after a space so that
	/// the line ends with that space, which marks a soft line break. Text without
	/// any usable space is appended as is.
	fn add_text_wrapping(&mut self, text: &str) {
		if text.is_empty() {
			return;
		}

		let column = self.column();
		let optimal_length = PREFERRED_LINE_WIDTH.saturating_sub(column);
		let max_length = MAX_LINE_WIDTH.saturating_sub(column);

		// Compare characters with characters: the byte length would make text with
		// multi-byte characters look longer than it is.
		if text.chars().count() <= optimal_length {
			self.add_text_unwrapped(text);
			return;
		}

		// Find the last space at or before the optimal length and the first space
		// after it. Both are stored as (character index, byte index): the character
		// index is used to compare against the line widths, the byte index is used
		// to split the text.
		let mut space_before: Option<(usize, usize)> = None;
		let mut space_after: Option<(usize, usize)> = None;
		for (char_idx, (byte_idx, ch)) in text.char_indices().enumerate() {
			if ch != ' ' {
				continue;
			}
			if char_idx <= optimal_length {
				space_before = Some((char_idx, byte_idx));
			} else {
				space_after = Some((char_idx, byte_idx));
				break;
			}
		}
		let space_after_within_max_length =
			space_after.is_some_and(|(char_idx, _)| char_idx <= max_length);

		let space = match (space_before, space_after) {
			(Some((_, before)), None) => before,
			(None, Some((_, after))) if space_after_within_max_length => after,
			(Some((before_chars, before)), Some((after_chars, after))) => {
				if !space_after_within_max_length {
					// space_after is unusable because it is too long
					before
				} else if optimal_length - before_chars <= after_chars - optimal_length {
					// space_before deviates no more than space_after from the
					// preferred line width (before_chars <= optimal_length <
					// after_chars, so neither subtraction can underflow)
					before
				} else {
					after
				}
			},
			(None, _) if self.trailing_newlines == 0 && self.txt.ends_with(' ') => {
				// The text cannot be broken early enough, but the current line
				// already has content and ends with a space: break the line there
				// and retry on the fresh line.
				self.newline_impl(true, true);
				self.add_text_wrapping(text);
				return;
			},
			(None, Some((_, after))) => {
				// unable to fit into max line width, so use anyways
				after
			},
			(None, None) => {
				// unable to fit and/or unable to break, so just append as is
				self.add_text_unwrapped(text);
				return;
			}
		};

		// Split after the whitespace: If the whitespace is at the end of the
		// line, then it is treated as a soft line break.
		let (before, after) = text.split_at(space + 1);
		self.add_text_unwrapped(before);
		self.newline_impl(true, true);
		self.add_text_wrapping(after);
	}

	/// Returns the zero-based index into `footnotes` that belongs to `label`. If the
	/// label has not been seen before, a new, empty footnote is created for it.
	fn get_or_create_footnote(&mut self, label: CowStr<'a>) -> usize {
		match self.footnote_labels.get(&label) {
			Some(footnote_idx) => *footnote_idx,
			None => {
				let footnote_idx = self.footnotes.len();
				self.footnotes.push(String::new());
				self.footnote_labels.insert(label, footnote_idx);
				footnote_idx
			}
		}
	}
}

// TODO verify the format=flowed claim and fix any incompatibilities
/// Processes all events of `iter` and writes the resulting text to the buffer of
/// `txt`.
///
/// Text and links of footnotes are collected while processing the events and are
/// written as a numbered list, separated from the text by a `-- ` line, after the
/// last event. The output always ends with a line break.
///
/// # Panics
///
/// Panics on events of parser extensions that are not enabled by
/// `parser_options()`, and if the events are not properly nested (e.g. a list item
/// outside of a list).
fn push_text_to_state<'a, I>(txt: &mut State<'a, '_>, iter: I)
where
	I: Iterator<Item = Event<'a>>
{
	// The space-stuffing of the initial line might be missing.
	if txt.txt.is_empty() || txt.txt.ends_with('\n') {
		txt.txt.push(' ');
	}

	// Go through all events generated by the parser.
	for event in iter {
		match event {
			Event::Start(Tag::Paragraph) => {
				if let Some(footnote_idx) = txt.in_footnote {
					// paragraphs of a footnote are separated by an empty line
					let footnote_txt = &mut txt.footnotes[footnote_idx];
					if !footnote_txt.is_empty() {
						*footnote_txt += "\n\n";
					}
				} else {
					txt.newlines(2);
				}
			},

			Event::Start(Tag::Heading { level, .. }) => {
				txt.newlines(3);
				txt.heading_lvl = Some(level);
				txt.heading_len = 0;
			},

			Event::Start(Tag::BlockQuote(_)) => {
				// add one newline without the `>` indentation in expectation that the
				// next block will probably ask for the indentation anyways
				txt.newlines(1);
				txt.indentations.push(">");
			},

			Event::Start(Tag::CodeBlock(_)) => {
				txt.newlines(2);
				txt.code_block = true;
			},

			Event::Start(Tag::HtmlBlock) => {
				txt.html_blocks += 1;
			},

			Event::Start(Tag::List(list_idx)) => {
				txt.newlines(2);
				txt.lists.push(list_idx);
			},

			Event::Start(Tag::Item) => {
				txt.newlines(2);
				let list_idx = txt
					.lists
					.last_mut()
					.expect("Markdown parser found a list item outside of a list");
				if let Some(list_idx) = list_idx {
					// right-align the number so that the item text starts at the
					// same column as the indentation pushed below
					let list_idx_str = format!("{list_idx}. ");
					for _ in 0 .. 4usize.saturating_sub(list_idx_str.len()) {
						txt.txt.push(' ');
					}
					*txt.txt += &list_idx_str;
					*list_idx += 1;
				} else {
					*txt += "  • ";
				}
				txt.indentations.push("    ");
			},

			Event::Start(Tag::FootnoteDefinition(label)) => {
				let footnote_idx = txt.get_or_create_footnote(label);
				txt.in_footnote = Some(footnote_idx);
			},

			Event::Start(Tag::DefinitionList)
			| Event::Start(Tag::DefinitionListTitle)
			| Event::Start(Tag::DefinitionListDefinition) => {
				unreachable!("Definition lists are not enabled in the parser options")
			},

			Event::Start(Tag::Table(_))
			| Event::Start(Tag::TableHead)
			| Event::Start(Tag::TableRow)
			| Event::Start(Tag::TableCell) => {
				unreachable!("Tables are not enabled in the parser options")
			},

			Event::Start(Tag::Emphasis)
			| Event::Start(Tag::Strong)
			| Event::Start(Tag::Strikethrough) => {
				// those cannot be represented in plain text, so we do nothing
			},

			Event::Start(Tag::Superscript) | Event::Start(Tag::Subscript) => {
				unreachable!("Super/Subscript are not enabled in the parser options")
			},

			Event::Start(Tag::Link {
				link_type: LinkType::Autolink,
				..
			}) => {
				// the link is already written to the text, so set the idx to 0 to avoid
				// it being written twice
				txt.footnote_links.push(0);
			},

			Event::Start(Tag::Link { dest_url, .. })
			| Event::Start(Tag::Image { dest_url, .. }) => {
				// the pushed index is one-based (length after the push), which
				// leaves 0 free as the "no marker" value used for autolinks
				txt.footnotes.push(dest_url.into_string());
				txt.footnote_links.push(txt.footnotes.len());
			},

			Event::Start(Tag::MetadataBlock(_)) => {
				unreachable!("Metadata blocks are not enabled in the parser options")
			},

			Event::End(TagEnd::Paragraph) => {
				// TODO do we need to do anything here?
			},

			Event::End(TagEnd::Heading(level)) => {
				// underline the heading
				txt.newlines(1);
				let ch = match level {
					HeadingLevel::H1 => '=',
					_ => '-'
				};
				for _ in 0 .. txt.heading_len {
					txt.txt.push(ch);
				}
				txt.trailing_newlines = 0;
				txt.newlines(2);
				txt.heading_lvl = None;
			},

			Event::End(TagEnd::BlockQuote(_)) => {
				let indent = txt.indentations.pop();
				debug_assert_eq!(indent, Some(">"));
			},

			Event::End(TagEnd::CodeBlock) => {
				debug_assert!(txt.code_block);
				txt.code_block = false;
			},

			Event::End(TagEnd::HtmlBlock) => {
				txt.html_blocks -= 1;
			},

			Event::End(TagEnd::List(_)) => {
				let list_idx = txt.lists.pop();
				debug_assert!(list_idx.is_some());
			},

			Event::End(TagEnd::Item) => {
				let indent = txt.indentations.pop();
				debug_assert_eq!(indent, Some("    "));
			},

			Event::End(TagEnd::FootnoteDefinition) => {
				txt.in_footnote = None;
			},

			Event::End(TagEnd::DefinitionList)
			| Event::End(TagEnd::DefinitionListTitle)
			| Event::End(TagEnd::DefinitionListDefinition) => {
				unreachable!("Definition lists are not enabled in the parser options")
			},

			Event::End(TagEnd::Table)
			| Event::End(TagEnd::TableHead)
			| Event::End(TagEnd::TableRow)
			| Event::End(TagEnd::TableCell) => {
				unreachable!("Tables are not enabled in the parser options")
			},

			Event::End(TagEnd::Emphasis)
			| Event::End(TagEnd::Strong)
			| Event::End(TagEnd::Strikethrough) => {
				// those cannot be represented in plain text, so we do nothing
			},

			Event::End(TagEnd::Superscript) | Event::End(TagEnd::Subscript) => {
				unreachable!("Super/Subscript are not enabled in the parser options")
			},

			Event::End(TagEnd::Link) | Event::End(TagEnd::Image) => {
				let footnote_idx = txt
					.footnote_links
					.pop()
					.expect("Markdown parser found a closing link/image that isn't open");
				if footnote_idx != 0 {
					let marker = format!(" [{footnote_idx}]");
					// a link inside of a footnote definition belongs to the text of
					// that footnote, not to the text written so far
					if let Some(current_footnote) = txt.in_footnote {
						txt.footnotes[current_footnote].push_str(&marker);
					} else {
						txt.add_text_wrapping(&marker);
					}
				}
			},

			Event::End(TagEnd::MetadataBlock(_)) => {
				unreachable!("Metadata blocks are not enabled in the parser options")
			},

			// Event::Code is `inline code`, no special treatment there
			Event::Text(text) | Event::Code(text) => {
				// TODO respect the current state before printing text

				// footnotes are written later, so we just cache the text for now
				if let Some(footnote_idx) = txt.in_footnote {
					txt.footnotes[footnote_idx] += &text;
				}
				// but special treatment to code blocks: no wrapping here
				else if txt.code_block {
					for line in text.lines() {
						// remove trailing whitespace to avoid accidental flowing
						*txt += line.trim_end_matches(' ');
						txt.trailing_newlines = 0;
						txt.newlines(1);
					}
				} else {
					txt.add_text_wrapping(&text);
				}
			},

			Event::InlineMath(_) | Event::DisplayMath(_) => {
				unreachable!("Math is not enabled in the parser options")
			},

			Event::Html(_) | Event::InlineHtml(_) => {
				// intentionally ignoring HTML blocks
			},

			Event::FootnoteReference(label) => {
				let footnote_idx = txt.get_or_create_footnote(label);
				// The stored index is zero-based, but the footnote list at the end
				// (and the markers of links) count from 1.
				let marker = format!("[{}]", footnote_idx + 1);
				if let Some(current_footnote) = txt.in_footnote {
					txt.footnotes[current_footnote].push_str(&marker);
				} else {
					txt.add_text_wrapping(&marker);
				}
			},

			Event::SoftBreak => {
				// soft breaks just get translated to a space
				if let Some(footnote_idx) = txt.in_footnote {
					txt.footnotes[footnote_idx].push(' ');
				} else {
					txt.add_text_wrapping(" ");
				}
			},

			Event::HardBreak => {
				if let Some(footnote_idx) = txt.in_footnote {
					txt.footnotes[footnote_idx] += "\n";
				} else {
					// we must trim whitespace to ensure the hard break is actually a
					// hard break
					while txt.txt.ends_with(' ') {
						txt.txt.pop();
					}
					// force a newline
					txt.trailing_newlines = 0;
					txt.newlines(1);
				}
			},

			Event::Rule => {
				txt.newlines(1);
				txt.add_text_unwrapped(&"-".repeat(MAX_LINE_WIDTH));
				txt.newlines(1);
			},

			Event::TaskListMarker(_) => {
				unreachable!("Task lists are not enabled in the parser options")
			}
		}
	}

	// Now let's add all of the footnotes
	if !txt.footnotes.is_empty() {
		txt.newlines(1);
		// We use the "signature separator" as a "footnote separator"
		txt.newline_impl(false, true);
		debug_assert!(txt.txt.ends_with("\r\n"));
		txt.add_text_unwrapped("-- ");
		for (i, f) in mem::take(&mut txt.footnotes).into_iter().enumerate() {
			// footnotes spanning several lines are set apart by empty lines
			let multiline = f.contains('\n');
			txt.newlines(1);
			if multiline {
				txt.newlines(2);
			}
			// right-align the label so that the footnote text starts at the same
			// column as the indentation pushed below
			let f_label = format!("[{}]: ", i + 1);
			for _ in 0 .. 6usize.saturating_sub(f_label.len()) {
				txt.txt.push(' ');
			}
			*txt += &f_label;
			txt.indentations.push("      ");
			for line in f.lines() {
				txt.newlines(1);
				txt.add_text_wrapping(line);
				// manually set the line as "dirty" so that consecutive newlines in the
				// footnotes (such as those produced by the paragraph separation) are
				// written to the text
				txt.trailing_newlines = 0;
			}
			txt.indentations.pop();
			if multiline {
				txt.newlines(2);
			}
		}
	}

	// Always add a trailing newline
	txt.newline_impl(false, true);
}