pulldown-cmark-to-flowed 0.1.0

Convert Markdown to Plain Text with format=flowed
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
//! Library to create [`format=flowed`][rfc3676sec4] plain text from markdown parsed by
//! [`pulldown-cmark`].
//!
//! `format=flowed` is a small extension to plain text that allows for line wrapping to
//! happen on the client side depending on the screen width of the client. Its main
//! purpose is for Text E-Mail to provide an improved experience without going through
//! HTML. At least [Thunderbird] supports this natively.
//!
//! # Example
//!
//! ```
//! use pulldown_cmark::Parser;
//!
//! let md = "Your markdown goes here";
//! // only the options (and any subset thereof) returned by this function are supported
//! let opts = pulldown_cmark_to_flowed::parser_options();
//! let parser = Parser::new_ext(&md, opts);
//! let mut txt = String::new();
//! pulldown_cmark_to_flowed::push_text(&mut txt, parser);
//! ```
//!
//! If your markdown input looked like this:
//!
//! ```markdown
#![doc = concat!(include_str!("../tests/example.md"), "```")]
//!
//! Then your output looks like this:
//!
//! ```text
#![doc = concat!(include_str!("../tests/example.txt"), "```")]
//!
//! # Work in Progress
//!
//! This library does not yet support all features of [`pulldown-cmark`], and there are
//! some things that could certainly be made configurable (such as the preferred line
//! width). If you need a feature implemented or think that something could be done
//! better, please do open an [issue].
//!
//!
//!  [`pulldown-cmark`]: pulldown_cmark
//!  [rfc3676sec4]: https://datatracker.ietf.org/doc/html/rfc3676#section-4
//!  [Thunderbird]: https://en.wikipedia.org/wiki/Mozilla_Thunderbird
//!  [issue]: https://codeberg.org/proto-x/pulldown-cmark-to-flowed/issues

use hashbrown::HashMap;
use pulldown_cmark::{CowStr, Event, HeadingLevel, LinkType, Options, Tag, TagEnd};
use std::{mem, ops::AddAssign};

/// The `Content-Type` value for `format=flowed` plain text.
///
/// Use this as the content type of the e-mail (part) that carries the text
/// produced by [`push_text()`].
pub const CONTENT_TYPE: &str = r#"text/plain; charset="utf-8"; format="flowed""#;

/// The maximum line width for text/plain messages.
///
/// The wrapping logic only accepts a break point after the preferred width if
/// it still lies within this limit. Horizontal rules are written with exactly
/// this many dashes.
const MAX_LINE_WIDTH: usize = 78;
/// The preferred line width for text/plain messages.
///
/// [RFC3676](https://datatracker.ietf.org/doc/html/rfc3676) recommends 66.
/// Text that would exceed this width on the current line is broken at a space.
const PREFERRED_LINE_WIDTH: usize = 66;

/// Returns the set of [`Options`] this library knows how to render.
///
/// Any subset of the returned flags is fine. Flags that are not part of the
/// returned set are not supported yet; support is planned to be added
/// eventually.
pub fn parser_options() -> Options {
	[
		Options::ENABLE_FOOTNOTES,
		Options::ENABLE_STRIKETHROUGH,
		Options::ENABLE_SMART_PUNCTUATION,
		Options::ENABLE_WIKILINKS
	]
	.into_iter()
	.fold(Options::empty(), |enabled, flag| enabled | flag)
}

/// Render the events of a markdown parser as plain text, the way one would
/// expect to read it in an email, and append the result to `s`.
///
/// Use the parser options returned by [`parser_options()`]! Support for arbitrary
/// options will eventually be implemented but for now it is not.
///
/// The output is suitable for `format=flowed` as defined in
/// [RFC3676](https://datatracker.ietf.org/doc/html/rfc3676).
///
/// # Panics
///
/// Panics if `iter` yields an event that belongs to a parser option outside of
/// [`parser_options()`] (tables, definition lists, math, task lists, metadata
/// blocks, super-/subscript), or a list item that is not inside of a list.
pub fn push_text<'a, I>(s: &mut String, iter: I)
where
	I: Iterator<Item = Event<'a>>
{
	push_text_to_state(&mut State::new(s), iter)
}

/// The working state used while converting markdown events to
/// somewhat-nicely-styled text.
///
/// `'a` is the lifetime of the markdown events (footnote labels are kept as
/// [`CowStr`]), `'s` is the lifetime of the borrowed output buffer.
struct State<'a, 's> {
	/// The text buffer. All output is appended to this string.
	txt: &'s mut String,
	/// The number of newlines (with indentation but no other content) that are currently
	/// appended to the string.
	///
	/// Starts out as `u8::MAX`, is reset to 0 whenever text is written through
	/// `add_text_unwrapped`, and is incremented for every line break that is
	/// emitted. Code that writes to `txt` directly does not touch this counter.
	trailing_newlines: u8,

	/// The currently active indentations.
	///
	/// These prefixes are written at the start of every new line: `">"` for block
	/// quotes and runs of spaces for list items and footnote bodies.
	indentations: Vec<&'static str>,

	/// The length of the text of the current heading.
	///
	/// This is the largest column reached while the heading was written and
	/// determines the length of the underline.
	heading_len: usize,
	/// The level of the current heading. `None` when not inside of a heading.
	heading_lvl: Option<HeadingLevel>,

	/// Whether we are currently inside of a codeblock.
	code_block: bool,

	/// The number of HTML blocks that are currently open.
	// NOTE(review): this counter is maintained but does not appear to be read
	// anywhere in the visible part of this file.
	html_blocks: u8,

	/// The stack of indices of the currently open lists. The index is None if the list
	/// is unordered.
	///
	/// For ordered lists the index is the number of the next item.
	lists: Vec<Option<u64>>,

	/// Footnotes (both text and links). Indices will be +1'ed!
	///
	/// That is, the entry at position `i` is displayed as footnote `i + 1`.
	footnotes: Vec<String>,
	/// Footnote label to footnote index mapping.
	///
	/// The stored index is the (0-based) position inside of `footnotes`.
	footnote_labels: HashMap<CowStr<'a>, usize>,
	/// Set when we are currently parsing the footnote with the attached index.
	///
	/// The index is the (0-based) position inside of `footnotes`.
	in_footnote: Option<usize>,
	/// The footnote indices of the currently active links and images
	///
	/// Unlike the other indices these are the displayed (1-based) footnote
	/// numbers; 0 is pushed for autolinks, which get no footnote marker.
	footnote_links: Vec<usize>
}

impl<'s> State<'_, 's> {
	fn new(txt: &'s mut String) -> Self {
		Self {
			txt,
			trailing_newlines: u8::MAX,

			indentations: Vec::new(),

			heading_len: 0,
			heading_lvl: None,

			code_block: false,

			html_blocks: 0,

			lists: Vec::new(),

			footnotes: Vec::new(),
			footnote_labels: HashMap::new(),
			in_footnote: None,
			footnote_links: Vec::new()
		}
	}
}

/// `state += text` appends `text` verbatim to the output buffer.
///
/// This neither wraps the text nor updates `trailing_newlines` or the heading
/// length; use the `add_text_*` methods when that bookkeeping is needed.
impl<T: AsRef<str>> AddAssign<T> for State<'_, '_> {
	fn add_assign(&mut self, rhs: T) {
		self.txt.push_str(rhs.as_ref());
	}
}

impl<'a> State<'a, '_> {
	/// Returns the current column of the current line.
	///
	/// The column is counted in characters (not bytes) from the last `\r\n` and
	/// does not include the leading space-stuffing character.
	fn column(&self) -> usize {
		let line_begin_idx = self.txt.rfind("\r\n").map_or(0, |idx| idx + 2);
		// -1 because we space-stuff all lines. Saturating, because a line that was
		// started without space-stuffing can be completely empty.
		self.txt[line_begin_idx ..]
			.chars()
			.count()
			.saturating_sub(1)
	}

	/// Unconditionally starts a new line.
	///
	/// If the line that is being finished consists only of quote markers and
	/// spaces, those trailing spaces are removed first. The new line is then
	/// prefixed with the active indentations.
	///
	/// - `space_stuffing`: prefix the line with a space (unless the line starts
	///   with a quote marker) and add a space after the quote markers if the
	///   innermost indentation is a quote.
	/// - `quotes_only`: only repeat the `>` indentations and skip all others.
	fn newline_impl(&mut self, space_stuffing: bool, quotes_only: bool) {
		// if the previous line contained only whitespace, remove that whitespace
		if let Some(mut idx) = self.txt.rfind("\r\n").map(|idx| idx + 2) {
			while self.txt[idx ..].starts_with('>') {
				idx += 1;
			}
			if self.txt[idx ..].chars().all(|ch| ch == ' ') {
				self.txt.truncate(idx);
			}
		}

		*self.txt += "\r\n";
		if space_stuffing
			&& self
				.indentations
				.first()
				.is_none_or(|indent| *indent != ">")
		{
			self.txt.push(' ');
		}
		// The counter starts out as u8::MAX and this method is also called
		// directly (not only through `newlines()`), so a plain `+= 1` overflows
		// when no text has been written yet.
		self.trailing_newlines = self.trailing_newlines.saturating_add(1);

		// add "indentations" (not necessarily whitespace)
		for indent in &self.indentations {
			if !quotes_only || *indent == ">" {
				*self.txt += indent;
			}
		}

		// Space-stuff the inner content of the quotation line. We do this because
		// it looks nicer for clients that don't support format=flowed.
		if space_stuffing && self.indentations.last().is_some_and(|i| *i == ">") {
			self.txt.push(' ');
		}
	}

	/// Add newlines as necessary to achieve the number of trailing newlines as
	/// requested.
	fn newlines(&mut self, count: u8) {
		while self.trailing_newlines < count {
			// For simplicity and for better display in mail clients that don't
			// understand format=flowed, we space-stuff all lines
			self.newline_impl(true, false);
		}
	}

	/// Appends `text` to the current line without any wrapping.
	///
	/// Marks the current line as non-empty and, while inside of a heading, keeps
	/// track of the widest column that was reached (ignoring one trailing space).
	fn add_text_unwrapped(&mut self, text: &str) {
		if text.is_empty() {
			return;
		}

		*self.txt += text;
		self.trailing_newlines = 0;

		if self.heading_lvl.is_some() {
			let mut column = self.column();
			if self.txt.ends_with(' ') {
				column = column.saturating_sub(1);
			}
			self.heading_len = self.heading_len.max(column);
		}
	}

	/// Appends `text` to the output and inserts soft line breaks (a space
	/// followed by a line break) so that lines stay within the preferred line
	/// width where possible.
	///
	/// Lines are only ever broken after a space. Text without a usable space is
	/// appended as is, even if that exceeds the maximum line width.
	fn add_text_wrapping(&mut self, text: &str) {
		if text.is_empty() {
			return;
		}

		let column = self.column();
		let optimal_length = PREFERRED_LINE_WIDTH.saturating_sub(column);
		let max_length = MAX_LINE_WIDTH.saturating_sub(column);

		// The column is measured in characters, so the text has to be measured in
		// characters as well. Comparing the byte length would wrap text with
		// multi-byte characters even though it fits.
		if text.chars().count() <= optimal_length {
			self.add_text_unwrapped(text);
			return;
		}

		// Find the last space before and the first space after the optimal
		// length. Positions are compared in characters, but the byte index is
		// what we need for splitting.
		let mut space_before = None;
		let mut space_after = None;
		for (char_idx, (byte_idx, ch)) in text.char_indices().enumerate() {
			if ch != ' ' {
				continue;
			}
			if char_idx <= optimal_length {
				space_before = Some(byte_idx);
			} else {
				space_after = Some((char_idx, byte_idx));
				break;
			}
		}
		let space_after_within_max_length =
			space_after.is_some_and(|(char_idx, _)| char_idx <= max_length);
		let space_after = space_after.map(|(_, byte_idx)| byte_idx);

		let space = match (space_before, space_after) {
			// NOTE(review): a space within the preferred width always wins. The
			// previous code compared `optimal_length - space_before` against the
			// absolute position of the following space, which is always true for
			// single-byte text (and could underflow for multi-byte text). If the
			// space closest to the preferred width is wanted instead, compare
			// against the following space's distance from `optimal_length`.
			(Some(space_before), _) => space_before,
			(None, Some(space_after)) if space_after_within_max_length => space_after,
			(None, _) if self.trailing_newlines == 0 && self.txt.ends_with(' ') => {
				// The current line ends in a space, so we can break right here
				// and retry with a fresh line.
				self.newline_impl(true, true);
				self.add_text_wrapping(text);
				return;
			},
			(None, Some(space_after)) => {
				// unable to fit into max line width, so use anyways
				space_after
			},
			(None, None) => {
				// unable to fit and/or unable to break, so just append as is
				self.add_text_unwrapped(text);
				return;
			}
		};

		// Split after the whitespace: If the whitespace is at the end of the
		// line, then it is treated as a soft line break.
		let (before, after) = text.split_at(space + 1);
		self.add_text_unwrapped(before);
		self.newline_impl(true, true);
		self.add_text_wrapping(after);
	}

	/// Returns the (0-based) index into `footnotes` that belongs to `label`. If
	/// the label is seen for the first time, an empty footnote is created for it.
	fn get_or_create_footnote(&mut self, label: CowStr<'a>) -> usize {
		let footnotes = &mut self.footnotes;
		*self.footnote_labels.entry(label).or_insert_with(|| {
			footnotes.push(String::new());
			footnotes.len() - 1
		})
	}
}

// TODO verify the format=flowed claim and fix any incompatibilities
/// Renders all events of `iter` into the text buffer of `txt`.
///
/// Block-level events adjust the state (blank lines, indentation, list
/// counters), inline text is wrapped and appended, and the destinations of
/// links and images as well as the text of footnote definitions are collected
/// and written as a numbered list after a `-- ` separator line once all events
/// have been processed. The output always ends with a line break.
///
/// # Panics
///
/// Panics if an event is encountered that belongs to a parser option that is
/// not part of `parser_options()`, or if a list item or closing link/image
/// shows up without its opening counterpart.
fn push_text_to_state<'a, I>(txt: &mut State<'a, '_>, iter: I)
where
	I: Iterator<Item = Event<'a>>
{
	// The space-stuffing of the initial line might be missing.
	if txt.txt.is_empty() || txt.txt.ends_with('\n') {
		txt.txt.push(' ');
	}

	// Go through all events generated by the parser.
	for event in iter {
		match event {
			Event::Start(Tag::Paragraph) => {
				if let Some(footnote_idx) = txt.in_footnote {
					let footnote_txt = &mut txt.footnotes[footnote_idx];
					if !footnote_txt.is_empty() {
						*footnote_txt += "\n\n";
					}
				} else {
					txt.newlines(2);
				}
			},

			Event::Start(Tag::Heading { level, .. }) => {
				txt.newlines(3);
				txt.heading_lvl = Some(level);
				txt.heading_len = 0;
			},

			Event::Start(Tag::BlockQuote(_)) => {
				// add one newline without the `>` indentation in expectation that the
				// next block will probably ask for the indentation anyways
				txt.newlines(1);
				txt.indentations.push(">");
			},

			Event::Start(Tag::CodeBlock(_)) => {
				txt.newlines(2);
				txt.code_block = true;
			},

			Event::Start(Tag::HtmlBlock) => {
				txt.html_blocks += 1;
			},

			Event::Start(Tag::List(list_idx)) => {
				txt.newlines(2);
				txt.lists.push(list_idx);
			},

			Event::Start(Tag::Item) => {
				txt.newlines(2);
				let list_idx = txt
					.lists
					.last_mut()
					.expect("Markdown parser found a list item outside of a list");
				// The marker is written directly to the buffer so that the line
				// still counts as empty and the item's first paragraph starts on
				// the same line.
				if let Some(list_idx) = list_idx {
					let list_idx_str = format!("{list_idx}. ");
					for _ in 0 .. 4usize.saturating_sub(list_idx_str.len()) {
						txt.txt.push(' ');
					}
					*txt.txt += &list_idx_str;
					*list_idx += 1;
				} else {
					*txt += "  • ";
				}
				txt.indentations.push("    ");
			},

			Event::Start(Tag::FootnoteDefinition(label)) => {
				let footnote_idx = txt.get_or_create_footnote(label);
				txt.in_footnote = Some(footnote_idx);
			},

			Event::Start(Tag::DefinitionList)
			| Event::Start(Tag::DefinitionListTitle)
			| Event::Start(Tag::DefinitionListDefinition) => {
				unreachable!("Definition lists are not enabled in the parser options")
			},

			Event::Start(Tag::Table(_))
			| Event::Start(Tag::TableHead)
			| Event::Start(Tag::TableRow)
			| Event::Start(Tag::TableCell) => {
				unreachable!("Tables are not enabled in the parser options")
			},

			Event::Start(Tag::Emphasis)
			| Event::Start(Tag::Strong)
			| Event::Start(Tag::Strikethrough) => {
				// those cannot be represented in plain text, so we do nothing
			},

			Event::Start(Tag::Superscript) | Event::Start(Tag::Subscript) => {
				unreachable!("Super/Subscript are not enabled in the parser options")
			},

			Event::Start(Tag::Link {
				link_type: LinkType::Autolink,
				..
			}) => {
				// the link is already written to the text, so set the idx to 0 to avoid
				// it being written twice
				txt.footnote_links.push(0);
			},

			Event::Start(Tag::Link { dest_url, .. })
			| Event::Start(Tag::Image { dest_url, .. }) => {
				// the length after pushing is the displayed (1-based) number of
				// the footnote, so it can never collide with the 0 of autolinks
				txt.footnotes.push(dest_url.into_string());
				txt.footnote_links.push(txt.footnotes.len());
			},

			Event::Start(Tag::MetadataBlock(_)) => {
				unreachable!("Metadata blocks are not enabled in the parser options")
			},

			Event::End(TagEnd::Paragraph) => {
				// TODO do we need to do anything here?
			},

			Event::End(TagEnd::Heading(level)) => {
				// underline the heading with as many characters as its longest line
				txt.newlines(1);
				let ch = match level {
					HeadingLevel::H1 => '=',
					_ => '-'
				};
				let underline_len = txt.heading_len;
				txt.txt.extend(std::iter::repeat(ch).take(underline_len));
				txt.trailing_newlines = 0;
				txt.newlines(2);
				txt.heading_lvl = None;
			},

			Event::End(TagEnd::BlockQuote(_)) => {
				let indent = txt.indentations.pop();
				debug_assert_eq!(indent, Some(">"));
			},

			Event::End(TagEnd::CodeBlock) => {
				debug_assert!(txt.code_block);
				txt.code_block = false;
			},

			Event::End(TagEnd::HtmlBlock) => {
				txt.html_blocks -= 1;
			},

			Event::End(TagEnd::List(_)) => {
				let list_idx = txt.lists.pop();
				debug_assert!(list_idx.is_some());
			},

			Event::End(TagEnd::Item) => {
				let indent = txt.indentations.pop();
				debug_assert_eq!(indent, Some("    "));
			},

			Event::End(TagEnd::FootnoteDefinition) => {
				txt.in_footnote = None;
			},

			Event::End(TagEnd::DefinitionList)
			| Event::End(TagEnd::DefinitionListTitle)
			| Event::End(TagEnd::DefinitionListDefinition) => {
				unreachable!("Definition lists are not enabled in the parser options")
			},

			Event::End(TagEnd::Table)
			| Event::End(TagEnd::TableHead)
			| Event::End(TagEnd::TableRow)
			| Event::End(TagEnd::TableCell) => {
				unreachable!("Tables are not enabled in the parser options")
			},

			Event::End(TagEnd::Emphasis)
			| Event::End(TagEnd::Strong)
			| Event::End(TagEnd::Strikethrough) => {
				// those cannot be represented in plain text, so we do nothing
			},

			Event::End(TagEnd::Superscript) | Event::End(TagEnd::Subscript) => {
				unreachable!("Super/Subscript are not enabled in the parser options")
			},

			Event::End(TagEnd::Link) | Event::End(TagEnd::Image) => {
				let footnote_idx = txt
					.footnote_links
					.pop()
					.expect("Markdown parser found a closing link/image that isn't open");
				if footnote_idx != 0 {
					let marker = format!(" [{footnote_idx}]");
					// the marker belongs to the text of the link, which is cached
					// while we are inside of a footnote definition
					if let Some(current_footnote) = txt.in_footnote {
						txt.footnotes[current_footnote] += &marker;
					} else {
						txt.add_text_wrapping(&marker);
					}
				}
			},

			Event::End(TagEnd::MetadataBlock(_)) => {
				unreachable!("Metadata blocks are not enabled in the parser options")
			},

			// Event::Code is `inline code`, no special treatment there
			Event::Text(text) | Event::Code(text) => {
				// TODO respect the current state before printing text

				// footnotes are written later, so we just cache the text for now
				if let Some(footnote_idx) = txt.in_footnote {
					txt.footnotes[footnote_idx] += &text;
				}
				// but special treatment to code blocks: no wrapping here
				else if txt.code_block {
					for line in text.lines() {
						// remove trailing whitespace to avoid accidental flowing
						*txt += line.trim_end_matches(' ');
						txt.trailing_newlines = 0;
						txt.newlines(1);
					}
				} else {
					txt.add_text_wrapping(&text);
				}
			},

			Event::InlineMath(_) | Event::DisplayMath(_) => {
				unreachable!("Math is not enabled in the parser options")
			},

			Event::Html(_) | Event::InlineHtml(_) => {
				// intentionally ignoring HTML blocks
			},

			Event::FootnoteReference(label) => {
				let footnote_idx = txt.get_or_create_footnote(label);
				// The index is 0-based, but footnotes are displayed 1-based (both
				// in link markers and in the footnote list at the end).
				let marker = format!("[{}]", footnote_idx + 1);
				if let Some(current_footnote) = txt.in_footnote {
					txt.footnotes[current_footnote] += &marker;
				} else {
					txt.add_text_wrapping(&marker);
				}
			},

			Event::SoftBreak => {
				// soft breaks just get translated to a space
				if let Some(footnote_idx) = txt.in_footnote {
					txt.footnotes[footnote_idx].push(' ');
				} else {
					txt.add_text_wrapping(" ");
				}
			},

			Event::HardBreak => {
				if let Some(footnote_idx) = txt.in_footnote {
					txt.footnotes[footnote_idx] += "\n";
				} else {
					// we must trim whitespace to ensure the hard break is actually a
					// hard break
					while txt.txt.ends_with(' ') {
						txt.txt.pop();
					}
					// force a newline
					txt.trailing_newlines = 0;
					txt.newlines(1);
				}
			},

			Event::Rule => {
				txt.newlines(1);
				txt.add_text_unwrapped(&"-".repeat(MAX_LINE_WIDTH));
				txt.newlines(1);
			},

			Event::TaskListMarker(_) => {
				unreachable!("Task lists are not enabled in the parser options")
			}
		}
	}

	// Now let's add all of the footnotes
	if !txt.footnotes.is_empty() {
		txt.newlines(1);
		// We use the "signature separator" as a "footnote separator"
		txt.newline_impl(false, true);
		debug_assert!(txt.txt.ends_with("\r\n"));
		txt.add_text_unwrapped("-- ");
		for (i, f) in mem::take(&mut txt.footnotes).into_iter().enumerate() {
			let multiline = f.contains('\n');
			txt.newlines(1);
			if multiline {
				txt.newlines(2);
			}
			// The label is right-aligned to the width of the indentation and
			// written directly to the buffer, so that the line still counts as
			// empty and the first line of the footnote ends up next to the label.
			let f_label = format!("[{}]: ", i + 1);
			for _ in 0 .. 6usize.saturating_sub(f_label.len()) {
				txt.txt.push(' ');
			}
			*txt += &f_label;
			txt.indentations.push("      ");
			for line in f.lines() {
				txt.newlines(1);
				txt.add_text_wrapping(line);
				// manually set the line as "dirty" so that consecutive newlines in the
				// footnotes (such as those produced by the paragraph separation) are
				// written to the text
				txt.trailing_newlines = 0;
			}
			txt.indentations.pop();
			if f.is_empty() {
				// Nothing follows the label (e.g. a link without destination).
				// Remove the trailing space, as it would mark the line as flowed.
				while txt.txt.ends_with(' ') {
					txt.txt.pop();
				}
			}
			// The label alone makes the line non-empty. Without this, the label
			// of the next footnote would be appended to the same line whenever
			// the footnote has no text.
			txt.trailing_newlines = 0;
			if multiline {
				txt.newlines(2);
			}
		}
	}

	// Always add a trailing newline
	txt.newline_impl(false, true);
}