Skip to main content

pulldown_cmark_to_flowed/
lib.rs

1//! Library to create [`format=flowed`][rfc3676sec4] plain text from markdown parsed by
2//! [`pulldown-cmark`].
3//!
4//! `format=flowed` is a small extension to plain text that allows for line wrapping to
5//! happen on the client side depending on the screen width of the client. Its main
6//! purpose is for Text E-Mail to provide an improved experience without going through
7//! HTML. At least [Thunderbird] supports this natively.
8//!
9//! # Example
10//!
11//! ```
12//! use pulldown_cmark::Parser;
13//!
14//! let md = "Your markdown goes here";
15//! // only the options (and any subset thereof) returned by this function are supported
16//! let opts = pulldown_cmark_to_flowed::parser_options();
17//! let parser = Parser::new_ext(&md, opts);
18//! let mut txt = String::new();
19//! pulldown_cmark_to_flowed::push_text(&mut txt, parser);
20//! ```
21//!
22//! If your markdown input looked like this:
23//!
24//! ```markdown
25#![doc = concat!(include_str!("../tests/example.md"), "```")]
26//!
27//! Then your output looks like this:
28//!
29//! ```text
30#![doc = concat!(include_str!("../tests/example.txt"), "```")]
31//!
32//! # Work in Progress
33//!
34//! This library does not yet support all features of [`pulldown-cmark`], and there are
35//! some things that could certainly be made configurable (such as the preferred line
36//! width). If you need a feature implemented or think that something could be done
37//! better, please do open an [issue].
38//!
39//!
40//!  [`pulldown-cmark`]: pulldown_cmark
41//!  [rfc3676sec4]: https://datatracker.ietf.org/doc/html/rfc3676#section-4
42//!  [Thunderbird]: https://en.wikipedia.org/wiki/Mozilla_Thunderbird
43//!  [issue]: https://codeberg.org/proto-x/pulldown-cmark-to-flowed/issues
44
45use hashbrown::HashMap;
46use pulldown_cmark::{CowStr, Event, HeadingLevel, LinkType, Options, Tag, TagEnd};
47use std::{mem, ops::AddAssign};
48
/// Value to put into the `Content-Type` header of a message whose body was produced
/// by this library: UTF-8 encoded plain text marked as `format=flowed`.
pub const CONTENT_TYPE: &str = r#"text/plain; charset="utf-8"; format="flowed""#;
51
/// Upper bound for the width of a line in text/plain messages. Wrapping only goes
/// beyond [`PREFERRED_LINE_WIDTH`] up to this limit when there is no earlier
/// opportunity to break the line.
const MAX_LINE_WIDTH: usize = 78;
/// Width at which lines of text/plain messages are wrapped whenever possible.
///
/// [RFC3676](https://datatracker.ietf.org/doc/html/rfc3676) recommends 66.
const PREFERRED_LINE_WIDTH: usize = 66;
58
59/// The parser options that this library is designed for.
60///
61/// Eventually we will try to support all options, but for now, all options not enabled
62/// here are unsupported.
63pub fn parser_options() -> Options {
64	Options::ENABLE_FOOTNOTES
65		| Options::ENABLE_STRIKETHROUGH
66		| Options::ENABLE_SMART_PUNCTUATION
67		| Options::ENABLE_WIKILINKS
68}
69
70/// Convert the markdown parser to a nicer text representation that one might expect
71/// when reading an email.
72///
73/// Use the parser options returned by [`parser_options()`]! Support for arbitrary options
74/// will eventually be implemented but for now it is not.
75///
76/// The output is suitable for `format=flowed` as definied in
77/// [RFC3676](https://datatracker.ietf.org/doc/html/rfc3676).
78pub fn push_text<'a, I>(s: &mut String, iter: I)
79where
80	I: Iterator<Item = Event<'a>>
81{
82	let mut state = State::new(s);
83	push_text_to_state(&mut state, iter);
84}
85
86/// Convert markdown to somewhat-nicely-styled text.
87struct State<'a, 's> {
88	/// The text buffer.
89	txt: &'s mut String,
90	/// The number of newlines (with indentation but no other content) that are currently
91	/// appended to the string.
92	trailing_newlines: u8,
93
94	/// The currently active indentations.
95	indentations: Vec<&'static str>,
96
97	/// The length of the text of the current heading.
98	heading_len: usize,
99	/// The level of the current heading.
100	heading_lvl: Option<HeadingLevel>,
101
102	/// Whether we are currently inside of a codeblock.
103	code_block: bool,
104
105	/// The number of HTML blocks that are currently open.
106	html_blocks: u8,
107
108	/// The stack of indices of the currently open lists. The index is None if the list
109	/// is unordered.
110	lists: Vec<Option<u64>>,
111
112	/// Footnotes (both text and links). Indices will be +1'ed!
113	footnotes: Vec<String>,
114	/// Footnote label to footnote index mapping.
115	footnote_labels: HashMap<CowStr<'a>, usize>,
116	/// Set when we are currently parsing the footnote with the attached index.
117	in_footnote: Option<usize>,
118	/// The footnote indices of the currently active links and images
119	footnote_links: Vec<usize>
120}
121
122impl<'s> State<'_, 's> {
123	fn new(txt: &'s mut String) -> Self {
124		Self {
125			txt,
126			trailing_newlines: u8::MAX,
127
128			indentations: Vec::new(),
129
130			heading_len: 0,
131			heading_lvl: None,
132
133			code_block: false,
134
135			html_blocks: 0,
136
137			lists: Vec::new(),
138
139			footnotes: Vec::new(),
140			footnote_labels: HashMap::new(),
141			in_footnote: None,
142			footnote_links: Vec::new()
143		}
144	}
145}
146
147impl<T: AsRef<str>> AddAssign<T> for State<'_, '_> {
148	fn add_assign(&mut self, rhs: T) {
149		*self.txt += rhs.as_ref();
150	}
151}
152
153impl<'a> State<'a, '_> {
154	/// Returns the current column of the current line.
155	fn column(&self) -> usize {
156		let line_begin_idx = self.txt.rfind("\r\n").map(|idx| idx + 2).unwrap_or(0);
157		// -1 because we space-stuff all lines
158		self.txt[line_begin_idx ..].chars().count() - 1
159	}
160
161	fn newline_impl(&mut self, space_stuffing: bool, quotes_only: bool) {
162		// if the previous line contained only whitespace, remove that whitespace
163		if let Some(mut idx) = self.txt.rfind("\r\n").map(|idx| idx + 2) {
164			while self.txt[idx ..].starts_with('>') {
165				idx += 1;
166			}
167			if self.txt[idx ..].chars().all(|ch| ch == ' ') {
168				self.txt.truncate(idx);
169			}
170		}
171
172		*self.txt += "\r\n";
173		if space_stuffing
174			&& self
175				.indentations
176				.first()
177				.is_none_or(|indent| *indent != ">")
178		{
179			self.txt.push(' ');
180		}
181		self.trailing_newlines += 1;
182
183		// add "indentations" (not necessarily whitespace)
184		for indent in &self.indentations {
185			if !quotes_only || *indent == ">" {
186				*self.txt += indent;
187			}
188		}
189
190		// Space-stuff the inner content of the quotation line. We do this because
191		// it looks nicer for clients that don't support format=flowed.
192		if space_stuffing && self.indentations.last().is_some_and(|i| *i == ">") {
193			self.txt.push(' ');
194		}
195	}
196
197	/// Add newlines as necessary to acchieve the number of trailing newlines as
198	/// requested.
199	fn newlines(&mut self, count: u8) {
200		while self.trailing_newlines < count {
201			// For simplicity and for better display in mail clients that don't
202			// understand format=flowed, we space-stuff all lines
203			self.newline_impl(true, false);
204		}
205	}
206
207	fn add_text_unwrapped(&mut self, text: &str) {
208		if text.is_empty() {
209			return;
210		}
211
212		*self.txt += text;
213		self.trailing_newlines = 0;
214
215		if self.heading_lvl.is_some() {
216			let mut column = self.column();
217			if self.txt.ends_with(' ') {
218				column -= 1;
219			}
220			self.heading_len = self.heading_len.max(column);
221		}
222	}
223
224	fn add_text_wrapping(&mut self, text: &str) {
225		if text.is_empty() {
226			return;
227		}
228
229		let column = self.column();
230		let optimal_length = PREFERRED_LINE_WIDTH.saturating_sub(column);
231		let max_length = MAX_LINE_WIDTH.saturating_sub(column);
232
233		if text.len() <= optimal_length {
234			self.add_text_unwrapped(text);
235			return;
236		}
237
238		// Find the index of the spaces before and after the optimal length
239		let mut space_before = None;
240		let mut space_after = None;
241		let mut space_after_within_max_length = false;
242		let mut byte = 0;
243		for (i, ch) in text.chars().enumerate() {
244			if ch == ' ' {
245				if i <= optimal_length {
246					space_before = Some(byte);
247				} else {
248					space_after = Some(byte);
249					space_after_within_max_length = i <= max_length;
250					break;
251				}
252			}
253			byte += ch.len_utf8();
254		}
255		let space = match (space_before, space_after) {
256			(Some(space_before), None) => space_before,
257			(None, Some(space_after)) if space_after_within_max_length => space_after,
258			(Some(space_before), Some(space_after)) => {
259				if !space_after_within_max_length {
260					// space_after is unusable because it is too long
261					space_before
262				} else if (optimal_length - space_before) <= space_after {
263					// space_before deviates no more than space_after from the
264					// preferred line width
265					space_before
266				} else {
267					space_after
268				}
269			},
270			(None, _) if self.trailing_newlines == 0 && self.txt.ends_with(' ') => {
271				self.newline_impl(true, true);
272				self.add_text_wrapping(text);
273				return;
274			},
275			(None, Some(space_after)) => {
276				// unable to fit into max line width, so use anyways
277				space_after
278			},
279			_ => {
280				// unable to fit and/or unable to break, so just append as is
281				self.add_text_unwrapped(text);
282				return;
283			}
284		};
285
286		// if space >= text.len() {
287		// 	eprintln!("ERROR: {space} is >= {}", text.len());
288		// 	eprintln!(
289		// 		" HELP: column={column}, optimal_length={optimal_length}, max_length={max_length}"
290		// 	);
291		// 	self.add_text_unwrapped(text);
292		// }
293
294		// Split after the whitespace: If the whitespace is at the end of the
295		// line, then it is treated as a soft line break.
296		let (before, after) = text.split_at(space + 1);
297		self.add_text_unwrapped(before);
298		self.newline_impl(true, true);
299		self.add_text_wrapping(after);
300	}
301
302	fn get_or_create_footnote(&mut self, label: CowStr<'a>) -> usize {
303		match self.footnote_labels.get(&label) {
304			Some(footnote_idx) => *footnote_idx,
305			None => {
306				let footnote_idx = self.footnotes.len();
307				self.footnotes.push(String::new());
308				self.footnote_labels.insert(label, footnote_idx);
309				footnote_idx
310			}
311		}
312	}
313}
314
315// TODO verify the format=flowed claim and fix any incompatibilities
316fn push_text_to_state<'a, I>(txt: &mut State<'a, '_>, iter: I)
317where
318	I: Iterator<Item = Event<'a>>
319{
320	// The space-stuffing of the initial line might be missing.
321	if txt.txt.is_empty() || txt.txt.ends_with('\n') {
322		txt.txt.push(' ');
323	}
324
325	// Go through all events generated by the parser.
326	for event in iter {
327		match event {
328			Event::Start(Tag::Paragraph) => {
329				if let Some(footnote_idx) = txt.in_footnote {
330					let footnote_txt = &mut txt.footnotes[footnote_idx];
331					if !footnote_txt.is_empty() {
332						*footnote_txt += "\n\n";
333					}
334				} else {
335					txt.newlines(2);
336				}
337			},
338
339			Event::Start(Tag::Heading { level, .. }) => {
340				txt.newlines(3);
341				txt.heading_lvl = Some(level);
342				txt.heading_len = 0;
343			},
344
345			Event::Start(Tag::BlockQuote(_)) => {
346				// add one newline without the `>` indentation in expectation that the
347				// next block will probably ask for the indentation anyways
348				txt.newlines(1);
349				txt.indentations.push(">");
350			},
351
352			Event::Start(Tag::CodeBlock(_)) => {
353				txt.newlines(2);
354				txt.code_block = true;
355			},
356
357			Event::Start(Tag::HtmlBlock) => {
358				txt.html_blocks += 1;
359			},
360
361			Event::Start(Tag::List(list_idx)) => {
362				txt.newlines(2);
363				txt.lists.push(list_idx);
364			},
365
366			Event::Start(Tag::Item) => {
367				txt.newlines(2);
368				let list_idx = txt
369					.lists
370					.last_mut()
371					.expect("Markdown parser found a list item outside of a list");
372				if let Some(list_idx) = list_idx {
373					let list_idx_str = format!("{list_idx}. ");
374					for _ in 0 .. 4usize.saturating_sub(list_idx_str.len()) {
375						txt.txt.push(' ');
376					}
377					*txt.txt += &list_idx_str;
378					*list_idx += 1;
379				} else {
380					*txt += "  • ";
381				}
382				txt.indentations.push("    ");
383			},
384
385			Event::Start(Tag::FootnoteDefinition(label)) => {
386				let footnote_idx = txt.get_or_create_footnote(label);
387				txt.in_footnote = Some(footnote_idx);
388			},
389
390			Event::Start(Tag::DefinitionList)
391			| Event::Start(Tag::DefinitionListTitle)
392			| Event::Start(Tag::DefinitionListDefinition) => {
393				unreachable!("Definition lists are not enabled in the parser options")
394			},
395
396			Event::Start(Tag::Table(_))
397			| Event::Start(Tag::TableHead)
398			| Event::Start(Tag::TableRow)
399			| Event::Start(Tag::TableCell) => {
400				unreachable!("Tables are not enabled in the parser options")
401			},
402
403			Event::Start(Tag::Emphasis)
404			| Event::Start(Tag::Strong)
405			| Event::Start(Tag::Strikethrough) => {
406				// those cannot be represented in plain text, so we do nothing
407			},
408
409			Event::Start(Tag::Superscript) | Event::Start(Tag::Subscript) => {
410				unreachable!("Super/Subscript are not enabled in the parser options")
411			},
412
413			Event::Start(Tag::Link {
414				link_type: LinkType::Autolink,
415				..
416			}) => {
417				// the link is already written to the text, so set the idx to 0 to avoid
418				// it being written twice
419				txt.footnote_links.push(0);
420			},
421
422			Event::Start(Tag::Link { dest_url, .. })
423			| Event::Start(Tag::Image { dest_url, .. }) => {
424				txt.footnotes.push(dest_url.into_string());
425				txt.footnote_links.push(txt.footnotes.len());
426			},
427
428			Event::Start(Tag::MetadataBlock(_)) => {
429				unreachable!("Metadata blacks are not enabled in the parser options")
430			},
431
432			Event::End(TagEnd::Paragraph) => {
433				// TODO do we need to do anything here?
434			},
435
436			Event::End(TagEnd::Heading(level)) => {
437				txt.newlines(1);
438				let ch = match level {
439					HeadingLevel::H1 => '=',
440					_ => '-'
441				};
442				for _ in 0 .. txt.heading_len {
443					txt.txt.push(ch);
444				}
445				txt.trailing_newlines = 0;
446				txt.newlines(2);
447				txt.heading_lvl = None;
448			},
449
450			Event::End(TagEnd::BlockQuote(_)) => {
451				let indent = txt.indentations.pop();
452				debug_assert_eq!(indent, Some(">"));
453			},
454
455			Event::End(TagEnd::CodeBlock) => {
456				debug_assert!(txt.code_block);
457				txt.code_block = false;
458			},
459
460			Event::End(TagEnd::HtmlBlock) => {
461				txt.html_blocks -= 1;
462			},
463
464			Event::End(TagEnd::List(_)) => {
465				let list_idx = txt.lists.pop();
466				debug_assert!(list_idx.is_some());
467			},
468
469			Event::End(TagEnd::Item) => {
470				let indent = txt.indentations.pop();
471				debug_assert_eq!(indent, Some("    "));
472			},
473
474			Event::End(TagEnd::FootnoteDefinition) => {
475				txt.in_footnote = None;
476			},
477
478			Event::End(TagEnd::DefinitionList)
479			| Event::End(TagEnd::DefinitionListTitle)
480			| Event::End(TagEnd::DefinitionListDefinition) => {
481				unreachable!("Definition lists are not enabled in the parser options")
482			},
483
484			Event::End(TagEnd::Table)
485			| Event::End(TagEnd::TableHead)
486			| Event::End(TagEnd::TableRow)
487			| Event::End(TagEnd::TableCell) => {
488				unreachable!("Tables are not enabled in the parser options")
489			},
490
491			Event::End(TagEnd::Emphasis)
492			| Event::End(TagEnd::Strong)
493			| Event::End(TagEnd::Strikethrough) => {
494				// those cannot be represented in plain text, so we do nothing
495			},
496
497			Event::End(TagEnd::Superscript) | Event::End(TagEnd::Subscript) => {
498				unreachable!("Super/Subscript are not enabled in the parser options")
499			},
500
501			Event::End(TagEnd::Link) | Event::End(TagEnd::Image) => {
502				let footnote_idx = txt
503					.footnote_links
504					.pop()
505					.expect("Markdown parser found a closing link/image that isn't open");
506				if footnote_idx != 0 {
507					txt.add_text_wrapping(&format!(" [{footnote_idx}]"));
508				}
509			},
510
511			Event::End(TagEnd::MetadataBlock(_)) => {
512				unreachable!("Metadata blocks are not enabled in the parser options")
513			},
514
515			// Event::Code is `inline code`, no special treatment there
516			Event::Text(text) | Event::Code(text) => {
517				// TODO respect the current state before printing text
518
519				// footnotes are written later, so we just cache the text for now
520				if let Some(footnote_idx) = txt.in_footnote {
521					txt.footnotes[footnote_idx] += &text;
522				}
523				// but special treatment to code blocks: no wrapping here
524				else if txt.code_block {
525					for line in text.lines() {
526						// remove trailing whitespace to avoid accidental flowing
527						*txt += line.trim_end_matches(' ');
528						txt.trailing_newlines = 0;
529						txt.newlines(1);
530					}
531				} else {
532					txt.add_text_wrapping(&text);
533				}
534			},
535
536			Event::InlineMath(_) | Event::DisplayMath(_) => {
537				unreachable!("Math is not enabled in the parser options")
538			},
539
540			Event::Html(_) | Event::InlineHtml(_) => {
541				// intentionally ignoring HTML blocks
542			},
543
544			Event::FootnoteReference(label) => {
545				let footnote_idx = txt.get_or_create_footnote(label);
546				txt.add_text_wrapping(&format!("[{footnote_idx}]"));
547			},
548
549			Event::SoftBreak => {
550				// soft breaks just get translated to a space
551				if let Some(footnote_idx) = txt.in_footnote {
552					txt.footnotes[footnote_idx].push(' ');
553				} else {
554					txt.add_text_wrapping(" ");
555				}
556			},
557
558			Event::HardBreak => {
559				if let Some(footnote_idx) = txt.in_footnote {
560					txt.footnotes[footnote_idx] += "\n";
561				} else {
562					// we must trim whitespace to ensure the hard break is actually a
563					// hard break
564					while txt.txt.ends_with(' ') {
565						txt.txt.pop();
566					}
567					// force a newline
568					txt.trailing_newlines = 0;
569					txt.newlines(1);
570				}
571			},
572
573			Event::Rule => {
574				txt.newlines(1);
575				for _ in 0 .. MAX_LINE_WIDTH {
576					txt.add_text_unwrapped("-");
577				}
578				txt.newlines(1);
579			},
580
581			Event::TaskListMarker(_) => {
582				unreachable!("Task lists are not enabled in the parser options")
583			}
584		}
585	}
586
587	// Now let's add all of the footnotes
588	if !txt.footnotes.is_empty() {
589		txt.newlines(1);
590		// We use the "signature separator" as a "footnote separator"
591		txt.newline_impl(false, true);
592		debug_assert!(txt.txt.ends_with("\r\n"));
593		txt.add_text_unwrapped("-- ");
594		for (i, f) in mem::take(&mut txt.footnotes).into_iter().enumerate() {
595			let multiline = f.contains('\n');
596			txt.newlines(1);
597			if multiline {
598				txt.newlines(2);
599			}
600			let f_label = format!("[{}]: ", i + 1);
601			for _ in 0 .. 6usize.saturating_sub(f_label.len()) {
602				txt.txt.push(' ');
603			}
604			*txt += &f_label;
605			txt.indentations.push("      ");
606			for line in f.lines() {
607				txt.newlines(1);
608				txt.add_text_wrapping(line);
609				// manually set the line as "dirty" so that consequtive newlines in the
610				// footnotes (such as those produces by the paragraph separation) are
611				// written to the text
612				txt.trailing_newlines = 0;
613			}
614			txt.indentations.pop();
615			if multiline {
616				txt.newlines(2);
617			}
618		}
619	}
620
621	// Always add a trailing newline
622	txt.newline_impl(false, true);
623}