// gemrendr 0.3.1 - Docs.rs

//! gemrendr   Turns Gemtext into idiomatic HTML.
//! Copyright (C) 2025  AverageHelper
//!
//! This program is free software: you can redistribute it and/or modify
//! it under the terms of the GNU General Public License as published by
//! the Free Software Foundation, either version 3 of the License, or
//! (at your option) any later version.
//!
//! This program is distributed in the hope that it will be useful,
//! but WITHOUT ANY WARRANTY; without even the implied warranty of
//! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//! GNU General Public License for more details.
//!
//! You should have received a copy of the GNU General Public License
//! along with this program.  If not, see <https://www.gnu.org/licenses/>.

use crate::gemtext::{Document, GemtextContentBlock, HeadingLevel};
use alloc::string::String;
use alloc::{borrow::ToOwned, vec::Vec};
use core::str::FromStr;

/// Converts a whole Gemtext document into its sequence of content blocks.
///
/// The input is consumed one line at a time. Heading, link, quote, and
/// text lines each become one block. Consecutive list items are merged
/// into a single `List` block. The lines that follow a preformat toggle,
/// up to the next toggle or the end of the document, are joined with
/// `\n` into a single `Pre` block; if that joined content is empty, no
/// block is produced. An empty line becomes a `Text` block whose content
/// is `"\n"`.
fn parse_document(document: &str) -> Vec<GemtextContentBlock> {
	use GemtextContentBlock as Block;

	// Spec: "Gemtext is a line-oriented format. A document consists of one or more lines."
	// TODO: Make a stateful line-mode parser instead? That way, we'll have no chance of running out of memory if a file has a bajillion lines of mixed text, and we can even do a REPL!
	let mut blocks: Vec<Block> = Vec::with_capacity(512);
	let mut remaining = document.lines().peekable();

	// Spec: "The parser MUST be in "normal mode" at the beginning of parsing a document."
	while let Some(current) = remaining.next() {
		let block = match GemtextLine::parse_normal(current) {
			GemtextLine::Heading { level, content } => Block::Heading { level, content },
			GemtextLine::Link { target, label } => Block::Link { target, label },
			GemtextLine::Quote { content } => Block::Quote { content },
			GemtextLine::Text { content } => Block::Text {
				// An empty line is kept as an explicit newline
				content: if content.is_empty() {
					String::from("\n")
				} else {
					content
				},
			},
			GemtextLine::ListItem { content } => {
				// Gather every directly-following list item into the same block
				let mut items: Vec<String> = Vec::with_capacity(8);
				items.push(content);
				// Peek rather than consume: the first non-item line must still be handled by the outer loop
				while let Some(&upcoming) = remaining.peek() {
					match GemtextLine::parse_normal(upcoming) {
						GemtextLine::ListItem { content } => {
							items.push(content);
							remaining.next();
						}
						_ => break,
					}
				}
				Block::List { items }
			}
			GemtextLine::PreformatToggle { alt_text } => {
				// Pre-formatted mode: take lines verbatim until the closing toggle or the end of input
				let mut pre_lines: Vec<String> = Vec::with_capacity(8);
				for pre_line in remaining.by_ref() {
					match GemtextLine::parse_pre_formatted(pre_line) {
						GemtextLine::Text { content } => pre_lines.push(content),
						// The closing toggle is consumed here; any alt text on it is ignored
						GemtextLine::PreformatToggle { .. } => break,
						_ => unreachable!("got something other than Text or Pre in preformat mode"),
					}
				}
				let content = pre_lines.join("\n");
				if content.is_empty() {
					// Nothing to show, so no block is emitted
					continue;
				}
				Block::Pre { alt_text, content }
			}
		};
		blocks.push(block);
	}

	blocks
}

/// A line of Gemtext content.
///
/// Unlike `GemtextContentBlock`, these always correspond to a single
/// line of Gemtext.
///
/// From the spec:
/// > There are six distinct line types. Each line belongs to exactly
/// > one type, and it is possible to unambiguously determine this
/// > type by inspecting the first three characters of the line.
#[derive(Debug)]
enum GemtextLine {
	/// A line that matches no other line type.
	///
	/// `content` is the line exactly as given. The one exception is a
	/// normal-mode `=>` line with no URL, which is reported as `Text`
	/// with surrounding whitespace trimmed.
	Text {
		content: String,
	},
	/// A normal-mode line starting with `=>` that carries a URL.
	///
	/// `target` is the first whitespace-delimited token after the marker.
	/// `label` is the remaining text with surrounding whitespace trimmed,
	/// or `None` when nothing follows the target.
	Link {
		target: String,
		label: Option<String>,
	},
	/// A normal-mode line starting with one, two or three `#` characters.
	///
	/// `level` reflects the number of `#` characters matched; `content`
	/// is the rest of the line with surrounding whitespace trimmed.
	Heading {
		level: HeadingLevel,
		content: String,
	},
	/// A normal-mode line starting with `* ` (asterisk, then a space).
	///
	/// `content` is the rest of the line with surrounding whitespace
	/// trimmed.
	ListItem {
		content: String,
	},
	/// A normal-mode line starting with `>`.
	///
	/// `content` is the rest of the line with surrounding whitespace
	/// trimmed.
	Quote {
		content: String,
	},
	/// A line starting with three consecutive backticks, which switches
	/// between normal mode and pre-formatted mode.
	///
	/// In normal mode, `alt_text` is the trimmed text after the marker,
	/// or `None` if there is none. In pre-formatted mode it is always
	/// `None`.
	PreformatToggle {
		alt_text: Option<String>,
	},
}

impl GemtextLine {
	/// Parses the given text as a line of normal (not "pre-formatted")
	/// Gemtext.
	///
	/// Prefixes are tested in a fixed order: `###`, `##`, `#`, `=>`, `* `,
	/// `>`, and finally three consecutive backticks. A line matching none
	/// of them is returned unchanged as [`GemtextLine::Text`]. A `=>` line
	/// that has no URL is also returned as text (trimmed), since there is
	/// nothing to link to.
	///
	/// From the spec:
	/// > There are six distinct line types. Each line belongs to
	/// > exactly one type, and it is possible to unambiguously determine
	/// > this type by inspecting the first three characters of the line.
	/// > A line's type, in conjunction with the parser state (see below)
	/// > determines the manner in which it should be presented to the
	/// > user. Any details of presentation or rendering associated with
	/// > a particular line type are strictly limited in scope to that
	/// > individual line.
	/// >
	/// > \[...\]
	/// >
	/// > A compliant gemtext parser must maintain a single bit of
	/// > internal state, corresponding to whether the parser is in
	/// > "normal mode" or "pre-formatted mode". The state of the parser
	/// > controls how the type of a line is recognised and how a line is
	/// > to be handled given its type.
	fn parse_normal(line: &str) -> Self {
		// Spec: "In normal mode [...]"

		// Spec: "Lines beginning with "#" are heading lines. Heading lines consist of one, two or three consecutive "#" characters, followed by optional whitespace, followed by heading text. The number of # characters indicates the "level" of heading;"
		// The longest prefix is tested first, so "###" is never taken for "##" or "#".
		if let Some(heading) = line.strip_prefix("###") {
			// Spec: "### lines are sub-sub headings"
			Self::Heading {
				level: HeadingLevel::Three,
				content: heading.trim().to_owned(),
			}
		} else if let Some(heading) = line.strip_prefix("##") {
			// Spec: "## lines are sub-headings"
			Self::Heading {
				level: HeadingLevel::Two,
				content: heading.trim().to_owned(),
			}
		} else if let Some(heading) = line.strip_prefix('#') {
			// Spec: "# lines are headings"
			Self::Heading {
				level: HeadingLevel::One,
				content: heading.trim().to_owned(),
			}
		} else if let Some(link) = line.strip_prefix("=>") {
			// Spec: "All lines beginning with the two characters "=>" are link lines."

			// Spec: "Link lines have the following syntax:
			// =>[<whitespace>]<URL>[<whitespace><USER-FRIENDLY LINK NAME>]
			// where:
			// * <whitespace> is any non-zero number of consecutive spaces or tabs
			// * Square brackets indicate that the enclosed content is optional.
			// * <URL> is a URL, which may be absolute or relative."
			let parts = link.trim();
			if parts.is_empty() {
				Self::Text {
					content: line.trim().to_owned(), // line only starts with "=>", no URL means there's no link
				}
			} else {
				// Both ends are already trimmed, so the URL runs from the start up to the
				// first whitespace (or to the end when there is no label). It therefore
				// contains no whitespace and needs no further trimming.
				let target_end_idx = parts.find(char::is_whitespace).unwrap_or(parts.len());
				let target = &parts[..target_end_idx];
				let label = parts[target_end_idx..].trim();
				let label = if label.is_empty() {
					None
				} else {
					// TODO: Mitigate cases where the label might be or look like a URL that differs from the target
					Some(label.to_owned())
				};
				Self::Link {
					target: target.to_owned(),
					label,
				}
			}
		} else if let Some(item) = line.strip_prefix("* ") {
			// Spec: "Lines beginning with "* " are list items."
			Self::ListItem {
				content: item.trim().to_owned(),
			}
		} else if let Some(quote) = line.strip_prefix('>') {
			// Spec: "Lines beginning with ">" are quote lines."
			Self::Quote {
				content: quote.trim().to_owned(),
			}
		} else if let Some(alt_text) = line.strip_prefix("```") {
			// Spec: "Any line whose first three characters are "```" (i.e. three consecutive back ticks with no leading whitespace) are preformatted toggle lines."
			// Spec: "Any text following the leading "```" of a preformat toggle line MAY be interpreted by the client as "alt text" pertaining to the preformatted text lines which follow the toggle line."
			let alt_text = alt_text.trim();
			Self::PreformatToggle {
				alt_text: if alt_text.is_empty() {
					None
				} else {
					Some(alt_text.to_owned())
				},
			}
		} else {
			// Spec: "Text lines are the "default" line type, in the sense that all other line types are recognised by virtue of beginning with a specific identifying prefix. Any line which does not begin with such a prefix is a text line."
			Self::Text {
				content: line.to_owned(),
			}
		}
	}

	/// Parses the given text as a line of "pre-formatted" Gemtext.
	///
	/// Only two results are possible. A line starting with three
	/// consecutive backticks is a [`GemtextLine::PreformatToggle`] whose
	/// `alt_text` is always `None`. Every other line is returned unchanged
	/// as [`GemtextLine::Text`], even if it looks like normal-mode markup.
	///
	/// From the spec:
	/// > There are six distinct line types. Each line belongs to
	/// > exactly one type, and it is possible to unambiguously determine
	/// > this type by inspecting the first three characters of the line.
	/// > A line's type, in conjunction with the parser state (see below)
	/// > determines the manner in which it should be presented to the
	/// > user. Any details of presentation or rendering associated with
	/// > a particular line type are strictly limited in scope to that
	/// > individual line.
	/// >
	/// > \[...\]
	/// >
	/// > A compliant gemtext parser must maintain a single bit of
	/// > internal state, corresponding to whether the parser is in
	/// > "normal mode" or "pre-formatted mode". The state of the parser
	/// > controls how the type of a line is recognised and how a line is
	/// > to be handled given its type.
	fn parse_pre_formatted(line: &str) -> Self {
		// Spec: "In pre-formatted mode[...]"

		if line.starts_with("```") {
			// Spec: "Any line whose first three characters are "```" (i.e. three consecutive back ticks with no leading whitespace) are preformatted toggle lines."
			// Spec: "Any text following the leading "```" of a preformat toggle line MUST be ignored by clients."
			Self::PreformatToggle { alt_text: None }
		} else {
			// Spec: "Any line which does not begin with the three characters "```" is a text line."
			Self::Text {
				content: line.to_owned(),
			}
		}
	}
}

/// Lets a `Document` be built from Gemtext source via `str::parse`.
///
/// The error type is `Infallible`: `parse_document` hands back its blocks
/// directly rather than a `Result`, so there is no error to report.
impl FromStr for Document {
	type Err = core::convert::Infallible;

	fn from_str(s: &str) -> Result<Self, Self::Err> {
		Ok(Self {
			contents: parse_document(s),
		})
	}
}

// MARK: - Tests

#[cfg(test)]
mod tests {
	use super::*;
	use alloc::vec;

	/// Heading lines parse to the right level, and whitespace on either
	/// side of the heading text is trimmed away.
	///
	/// Every level is checked with the same four shapes: no space after
	/// the marker, one space, a mix of spaces and tabs, and trailing
	/// whitespace.
	#[test]
	fn test_parses_headings() {
		use GemtextContentBlock::Heading;
		use HeadingLevel::{One, Three, Two};

		let h1 = Heading {
			level: One,
			content: "Heading".into(),
		};
		let h2 = Heading {
			level: Two,
			content: "Heading".into(),
		};
		let h3 = Heading {
			level: Three,
			content: "Heading".into(),
		};
		let cases = [
			// Level one
			("#Heading", h1.clone()),
			("# Heading", h1.clone()),
			("#     	Heading", h1.clone()),
			("#     	Heading     ", h1.clone()),
			("# Heading     ", h1),
			// Level two
			("##Heading", h2.clone()),
			("## Heading", h2.clone()),
			("##    	 Heading", h2.clone()),
			("##    	 Heading     ", h2),
			// Level three
			("###Heading", h3.clone()),
			("### Heading", h3.clone()),
			("###   	   Heading", h3.clone()),
			("###   	   Heading     ", h3),
		];
		for (test, expected) in cases {
			let document: Document = test.parse().unwrap();
			let lines = document.contents;
			// Each input is one line, so exactly one block must come back
			assert_eq!(lines.len(), 1);
			let result = lines.first().expect("single-line document");
			assert_eq!(*result, expected);
		}
	}

	/// Link lines split into a target and an optional label.
	#[test]
	fn test_parses_links() {
		// Builds the expected `Link` block from borrowed strings
		let link = |target: &str, label: Option<&str>| GemtextContentBlock::Link {
			target: String::from(target),
			label: label.map(String::from),
		};

		let cases = [
			("=> test", link("test", None)),
			("=> test link", link("test", Some("link"))),
			("=> /foo", link("/foo", None)),
			("=> foo://bar", link("foo://bar", None)),
			("=> foo://bar ext", link("foo://bar", Some("ext"))),
			("=> foo://bar foo://baz", link("foo://bar", Some("foo://baz"))),
		];
		for (input, expected) in cases {
			let blocks = input.parse::<Document>().unwrap().contents;
			// Each input is one line, so exactly one block must come back
			assert_eq!(blocks.len(), 1);
			let parsed = blocks.first().expect("single-line document");
			assert_eq!(*parsed, expected);
		}
	}

	/// Two text lines separated by a single newline come back as two
	/// `Text` blocks, with no blank-line block between them.
	#[test]
	fn test_one_newline_same_paragraph() {
		use GemtextContentBlock::Text;

		let first = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.";
		let second = "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.";

		// The document is the two lines with exactly one newline between them
		let blocks = parse_document(&[first, second].join("\n"));
		assert_eq!(
			blocks,
			vec![
				Text {
					content: String::from(first)
				},
				Text {
					content: String::from(second)
				},
			]
		);
	}

	/// A blank line between two text lines shows up as its own `Text`
	/// block containing a single newline.
	#[test]
	fn test_two_newlines_different_paragraphs() {
		let first = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.";
		let second = "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.";

		// The empty middle entry yields the two consecutive newlines
		let blocks = parse_document(&[first, "", second].join("\n"));
		assert_eq!(
			blocks,
			vec![
				GemtextContentBlock::Text {
					content: String::from(first),
				},
				GemtextContentBlock::Text {
					content: String::from("\n"),
				},
				GemtextContentBlock::Text {
					content: String::from(second),
				},
			]
		);
	}

	/// A preformatted block that is never closed still produces a `Pre`
	/// block holding the lines that follow the opening toggle.
	#[test]
	fn test_parse_unclosed_pre() {
		// Spec: "The state of the parser [whether "pre-formatted mode" or "normal mode"] at the end of a document has no meaning or consequences."

		use pretty_assertions::assert_eq;

		let parsed = parse_document(include_str!("../tests/samples/no_close_pre.gmi"));
		assert_eq!(
			parsed,
			vec![
				GemtextContentBlock::Heading {
					level: HeadingLevel::One,
					content: "Test".into(),
				},
				GemtextContentBlock::Text {
					content: "The following is an un-closed pre-formatted block. It should parse the same as if it were properly closed:".into()
				},
				GemtextContentBlock::Pre {
					alt_text: None,
					content: "This is the block.".into()
				},
			]
		);
	}

	/// Preformatted blocks keep their alt text, and Gemtext markup inside
	/// them is kept as literal text.
	#[test]
	fn test_parse_pre_blocks() {
		use GemtextContentBlock::Pre;
		use pretty_assertions::assert_eq;

		const SAMPLE: &str = include_str!("../tests/samples/pre_blocks.gmi");

		// Expected (alt text, content) pairs, in document order
		let expected: Vec<GemtextContentBlock> = [
			(None, "This one has no alt text."),
			(Some("Things and stuff"), "This has alt text."),
			(
				Some("gemtext"),
				"This has Gemtext markup.\n=> gemini://geminiprotocol.net/docs/gemtext-specification.gmi Gemtext Specification",
			),
		]
		.iter()
		.map(|&(alt_text, content): &(Option<&str>, &str)| Pre {
			alt_text: alt_text.map(String::from),
			content: String::from(content),
		})
		.collect();

		assert_eq!(parse_document(SAMPLE), expected);
	}

	/// End-to-end check: parses the `sample.gmi` fixture and compares the
	/// result, block by block, against the expected sequence below.
	///
	/// The `Text` blocks whose content is `"\n"` are how `parse_document`
	/// represents empty lines.
	#[test]
	fn test_parse_gemtext_sample() {
		use GemtextContentBlock::{Heading, Link, List, Pre, Quote, Text};
		use HeadingLevel::{One, Three, Two};
		use pretty_assertions::assert_eq;

		let content = include_str!("../tests/samples/sample.gmi");
		let lines = parse_document(content);
		assert_eq!(
			lines,
			vec![
				// --- Headings and plain text ---
				Heading {
					level: One,
					content: String::from("This is a heading!"),
				},
				Text {
					content: String::from("\n")
				},
				Heading {
					level: One,
					content: String::from("This is also a heading!"),
				},
				Text {
					content: String::from("\n")
				},
				Text {
					content: String::from("This is some text. It is not special.")
				},
				Text {
					content: String::from("\n")
				},
				Heading {
					level: Two,
					content: String::from("Another heading")
				},
				Text {
					content: String::from("\n")
				},
				Heading {
					level: Two,
					content: String::from("yes, leading spaces are optional")
				},
				Text {
					content: String::from("\n")
				},
				// --- Links: with and without labels, assorted whitespace ---
				Link {
					target: String::from(
						"gemini://geminiprotocol.net/docs/gemtext-specification.gmi"
					),
					label: None
				},
				Text {
					content: String::from("\n")
				},
				Link {
					target: String::from(
						"gemini://geminiprotocol.net/docs/gemtext-specification.gmi"
					),
					label: Some(String::from(
						"Here is a friendly link to the Gemtext specification"
					))
				},
				Text {
					content: String::from("\n")
				},
				Link {
					target: String::from(
						"https://geminiprotocol.net/docs/gemtext-specification.gmi"
					),
					label: Some(String::from("gemini protocol isn't special for links"))
				},
				Text {
					content: String::from("\n")
				},
				Link {
					target: String::from("gemini://geminiprotocol.net/"),
					label: Some(String::from("multiple kinds of whitespace"))
				},
				Text {
					content: String::from("\n")
				},
				Link {
					target: String::from("gemini://geminiprotocol.net/"),
					label: Some(String::from("multiple whitespace between parts!"))
				},
				Text {
					content: String::from("\n")
				},
				// Whitespace inside the label itself is preserved; only the ends are trimmed
				Link {
					target: String::from("gemini://geminiprotocol.net/"),
					label: Some(String::from("weird      whitespace within tag!")),
				},
				Text {
					content: String::from("\n")
				},
				Link {
					target: String::from("/foo/bar/baz.txt"),
					label: Some(String::from("leading slash still counts"))
				},
				Text {
					content: String::from("\n")
				},
				Link {
					target: String::from("/foo/bar.png"),
					label: Some(String::from("Inline image!")),
				},
				Text {
					content: String::from("\n")
				},
				Text {
					content: String::from(
						"Links may also be closer together. Here are some examples from the spec:"
					)
				},
				// Consecutive link lines stay separate blocks (no grouping, unlike list items)
				Link {
					target: String::from("gemini://example.org/"),
					label: None
				},
				Link {
					target: String::from("gemini://example.org/"),
					label: Some(String::from("An example link"))
				},
				Link {
					target: String::from("gemini://example.org/foo"),
					label: Some(String::from("Another example link at the same host"))
				},
				Link {
					target: String::from("foo/bar/baz.txt"),
					label: Some(String::from("A relative link"))
				},
				Link {
					target: String::from("gopher://example.org:70/1"),
					label: Some(String::from("A gopher link"))
				},
				Text {
					content: String::from("\n")
				},
				Link {
					target: String::from("this"),
					label: Some(String::from("shouldn't be a link, but it is."))
				},
				Link {
					target: String::from("gemini://example.org/"),
					label: Some(String::from("the leading space is optional"))
				},
				Text {
					content: String::from("\n")
				},
				Text {
					content: String::from("Not a link:"),
				},
				// A "=>" line with no URL is reported as plain text
				Text {
					content: String::from("=>"),
				},
				Text {
					content: String::from("\n"),
				},
				// --- Lists ---
				Heading {
					level: Three,
					content: String::from("Why not try lists?")
				},
				Text {
					content: String::from("\n")
				},
				Text {
					content: String::from("\n")
				},
				Heading {
					level: Three,
					content: String::from("spaces are still optional")
				},
				Text {
					content: String::from("\n")
				},
				// Adjacent list items are collected into one `List` block
				List {
					items: vec![
						String::from("This is a list item."),
						String::from("Here's another."),
						String::from("Still a list item!"),
					]
				},
				// Only the exact prefix "* " (asterisk + space) marks a list item
				Text {
					content: String::from("*This is not.")
				},
				Text {
					content: String::from("** This is also not.")
				},
				Text {
					content: String::from("*	Neither is this.")
				},
				Text {
					content: String::from("- Not a list item.")
				},
				Text {
					content: String::from("  - Not a list item.")
				},
				Text {
					content: String::from("\n")
				},
				// --- Quotes: every quote line is its own block ---
				Quote {
					content: String::from("Someone said this.")
				},
				Text {
					content: String::from("\n")
				},
				Quote {
					content: String::from("Someone also said this.")
				},
				Text {
					content: String::from("\n")
				},
				Quote {
					content: String::from("this is not the same quote."),
				},
				Quote {
					content: String::new(),
				},
				Quote {
					content: String::from("multiline quote! if only."),
				},
				Text {
					content: String::from("\n")
				},
				// --- Preformatted blocks: markup inside is kept as literal text ---
				Pre {
					alt_text: None,
					content: String::from(
						r"This text is preformatted.

# Hello, world!

This isn't to be treated as Gemtext:
=> gemini://example.com

> no one said this"
					)
				},
				Text {
					content: String::from("\n")
				},
				Pre {
					alt_text: None,
					content: String::from("This is also plaintext")
				},
				Text {
					content: String::from("\n")
				},
				Text {
					content: String::from("\n")
				},
				Pre {
					alt_text: Some(String::from("This is alt text")),
					content: String::from("This text is also preformatted.")
				},
				Text {
					content: String::from("\n")
				},
				Pre {
					alt_text: Some(String::from(
						r#"Art by Joan Stark of a camp site. A small tent faces a small campfire. There is a log nearby, perfect for sitting on. The initials "jgs" can be seen."#
					)),
					content: String::from(
						r#"        ______
jgs    /     /\
      /     /  \
     /_____/----\_    (
    "     "          ).
   _ ___          o (:') o
  (@))_))        o ~/~~\~ o
                  o  o  o"#
					)
				},
				Text {
					content: String::from("\n")
				},
				Text {
					content: String::from(
						"Syntax highlighting may be applied to preformatted blocks:"
					)
				},
				// Leading whitespace inside a preformatted block is preserved
				Pre {
					alt_text: Some(String::from("javascript")),
					content: String::from("	column.substring(0,num)"),
				},
				Text {
					content: String::from("\n")
				},
				Text {
					content: String::from(
						r#"The spec says, "Any text following the leading "```" of a preformat toggle line MUST be ignored by clients." So..."#
					)
				},
				Text {
					content: String::from("wow, what WAS that??")
				},
			]
		);
	}

	/// Rough speed guard: parsing the sample document must average under
	/// 80,000 ns over 500 runs. Compiled out when the `tarpaulin` cfg is
	/// set.
	#[test]
	#[cfg(not(tarpaulin))]
	fn bench_parsing() {
		use std::time::Instant;

		const ITERS: u128 = 500;
		const LIMIT: u128 = 80_000; // nanoseconds
		const INPUT: &str = include_str!("../tests/samples/sample.gmi");

		let mut total_nanos: u128 = 0;
		for _ in 0..ITERS {
			let start = Instant::now();
			let _parsed = parse_document(INPUT);
			// The clock is read before `_parsed` is dropped at the end of the iteration
			total_nanos += start.elapsed().as_nanos();
		}

		let average = total_nanos / ITERS;
		assert!(average < LIMIT, "{average} ns is too slow!");
	}
}