rg_formats 0.1.2

//! Parsing, Processing and serializing of `.msd` files. This is the file format that
//! underpins `.sm` and `.ssc`.
//!
//! **You might be looking for an `.sm` parser. Use [`sm`][crate::sm] for that.**
//!
//! MSD or "Music Score Data" is an extremely old format for very early dance pad
//! emulators. While nowadays the `.msd` extension is effectively dead, it lives on in
//! `.sm` and `.ssc`, as its grammar underpins both.
//!
//! This module implements a StepMania-accurate MSD parser, with error correction and all.
//!
//! This includes handling *extremely* obscure parse cases, like `#FOO;BAR;BAR`, which
//! every other SM parser I've tested chokes on.
//!
//! # What is MSD?
//!
//! An average MSD file looks like this:
//!
//! ```msd
//! #TITLE:Hello World;
//! #ARTIST:McLusky;
//! //--------------- dance-single - Todestrieb ----------------
//! #NOTES:
//!      dance-single:
//!      Todestrieb:
//!      Hard:
//!      15:
//!      0,0,0,0,0:
//! 0000
//! 1000
//! 0000
//! 1000;
//! ```
//!
//! Despite what you might intuit by looking at it, MSD parses into a 2 dimensional array.
//!
//! The `#` value indicates a tag. A file may have multiple of the same tag, and there are
//! formats implemented on top of MSD (like SSC) that depend on this behaviour.
//!
//! Subsequent colons -- `:` -- separate the values associated with that tag. For
//! example, the values of the `#NOTES` tag in the above example should parse into:
//!
//! ```json
//! ["dance-single", "Todestrieb", "Hard", "15", "0,0,0,0,0", "0000\n1000\n0000\n1000"]
//! ```
//!
//! MSD is a grammar which is theoretically as follows:
//!
//! ```ebnf
//! "#" tag_name (":" value)* ";"
//! ```
//!
//! However, many people back in the day were editing these files in notepad, so Stepmania
//! added very helpful error correction to `.sm` files. This allowed things such as:
//!
//! ```msd
//! #ARTIST:McLusky
//!   #Title:Without MSD I Am Nothing;
//! ```
//!
//! to parse perfectly fine (note the lack of semi-colon, indentation, and missing
//! capitalization on the `#TITLE` field.)
//!
//! Even more devilishly, SM's error correction allows this to parse:
//!
//! ```msd
//! #NOTES:
//!      dance-single
//!      Todestrieb; // <-- note the SEMICOLON HERE!!!
//!      Hard:
//!      15:
//!      0,0,0,0,0:
//! 0000
//! 1000
//! 0000
//! 1000;
//! ```
//!
//! As a result of this error correction, Stepmania's implementation of MSD ends up
//! looking like this:
//!
//! ```ebnf
//! ("\n" | start_of_file) "#" tag_name ( ( ":" | ";" ) tag_value )+
//! ```
//!
//! # Other Things
//!
//! MSD also supports `//` comments, and `\` as an escape character. This allows:
//!
//! ```msd
//! #TITLE\//:SongTitle\;
//! \ #ARTIST:McLusky
//! ```
//!
//! to parse as
//! ```json
//! ["TITLE//", "SongTitle"]
//! ["ARTIST", "McLusky"]
//! ```
//!
//! [sm]: crate::sm

use std::fmt::Debug;

use super::utils::ByteIterator;
use crate::{read, utils::ByteString};

/// An MSD file is a vector of [`MsdElement`]s. Tags may be repeated, and will *always*
/// be uppercased for SM compatibility (this is done by [`from_bytes`] when it builds
/// each element).
#[derive(Debug, Clone, Default)]
pub struct MsdFile {
	/// The elements this MsdFile defined. [`from_bytes`] pushes them in the order they
	/// were parsed, so repeated tags keep their relative order.
	pub elements: Vec<MsdElement>,
}

impl MsdFile {
	/// Make an empty MsdFile.
	pub fn new() -> Self {
		Self::default()
	}

	/// Get a copy of the first element with this tag.
	///
	/// `tag` is ASCII-uppercased before comparing, so the lookup is case-insensitive
	/// for elements whose stored tag is uppercase. Returns `None` when no element
	/// matches.
	pub fn with_tag(&self, tag: &str) -> Option<MsdElement> {
		self.find_tag(tag).cloned()
	}

	/// Get the first instance of this tag, and get the first value.
	/// Common for things like
	///
	/// ```txt
	/// #TITLE:Foo;
	///        ^^^
	/// ```
	///
	/// which returns `Some` of the bytes `Foo`.
	///
	/// Returns `None` when the tag is missing, when it has no values at all, or when
	/// its first value is empty (empty strings count as none).
	pub fn first_tag_first_val(&self, tag: &str) -> Option<Box<[u8]>> {
		// Borrow the element instead of cloning it: only the single value that is
		// handed back needs to be copied, not every value attached to the tag.
		self.find_tag(tag)?
			.values
			.first()
			.filter(|value| !value.is_empty())
			.cloned()
	}

	/// Get all elements with this tag, in file order. This is for things like
	/// `#NOTES`, which may appear multiple times.
	pub fn all_with_tag(&self, tag: &str) -> Vec<&MsdElement> {
		let wanted = tag.to_ascii_uppercase();
		let wanted = wanted.as_bytes();

		self.elements
			.iter()
			.filter(|el| *el.tag == *wanted)
			.collect()
	}

	/// Borrow the first element whose tag equals the ASCII-uppercased `tag`.
	fn find_tag(&self, tag: &str) -> Option<&MsdElement> {
		let wanted = tag.to_ascii_uppercase();
		let wanted = wanted.as_bytes();

		self.elements.iter().find(|el| *el.tag == *wanted)
	}
}

/// An element in the MSD format: one tag plus the values attached to it. This is
/// parsed from data like:
///
/// ```txt
/// #TITLE:Hello World;
/// #ARTIST:Foo;
/// ```
#[derive(Clone)]
pub struct MsdElement {
	/// The first value in this element. In the file it is prefixed with a `#`; the
	/// `#` itself is not part of the stored bytes. Elements built by [`from_bytes`]
	/// ALWAYS have an uppercased tag.
	pub tag: ByteString,
	/// The values this tag pointed to. MSD elements can have an arbitrary amount of
	/// values tied to their tag:
	///
	/// ```txt
	/// #TAG:Value1:value2:Value3;
	/// ```
	pub values: Vec<ByteString>,
}

/// Human-readable debug output: the raw bytes of the tag and of every value are shown
/// as lossily-decoded UTF-8 text rather than as byte arrays.
impl Debug for MsdElement {
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		let tag = String::from_utf8_lossy(&self.tag);

		let mut values = Vec::with_capacity(self.values.len());
		for raw in &self.values {
			values.push(String::from_utf8_lossy(raw));
		}

		let mut out = f.debug_struct("MsdElement");
		out.field("tag", &tag);
		out.field("values", &values);
		out.finish()
	}
}

/// Append `param` to `data`, escaping everything the parser would otherwise treat as
/// syntax so that the bytes survive a serialize/parse round trip.
///
/// * `:`, `;`, `#` and `\` are each prefixed with a backslash.
/// * A `//` pair is written as `\/\/`. Left unescaped it would start a comment when
///   read back, silently dropping the rest of the line (think of a value like
///   `http://example.com`). A lone `/` needs no escaping and is written as-is.
fn write_param(data: &mut Vec<u8>, param: &ByteString) {
	let mut bytes = param.iter().copied().peekable();

	while let Some(byte) = bytes.next() {
		match byte {
			b'/' if bytes.peek() == Some(&b'/') => {
				// consume the second slash as well, and escape the pair as a whole.
				bytes.next();
				data.extend_from_slice(br"\/\/");
			}
			b':' | b';' | b'#' | b'\\' => {
				data.push(b'\\');
				data.push(byte);
			}
			_ => data.push(byte),
		}
	}
}

/// Given a list of [`MsdElement`]s, serialize them. This can be used to serialize SM
/// files, or SSC files, or whatnot.
pub fn serialize_msd_elements(elements: Vec<MsdElement>) -> Box<[u8]> {
	let mut data = vec![];

	for el in elements {
		data.push(b'#');

		write_param(&mut data, &el.tag);

		for value in el.values {
			data.push(b':');

			write_param(&mut data, &value);
		}

		data.push(b';');
		data.push(b'\n');
	}

	data.into()
}

/// Parse a byte buffer as MSD, the format that internally underpins SM, SSC, DWI and
/// SMA files.
///
/// On paper MSD is a list of tuples with this grammar:
///
/// ```bnf
/// "#" tag_name (":" value)+ ";"
/// ```
///
/// Stepmania's error correction makes the grammar that is actually accepted *vastly*
/// different: `:` and `;` are interchangeable separators, and an element only really
/// ends at a `#` that starts a line (or at the end of the input):
///
/// ```bnf
/// separator ::= ":" | ";"
///
/// "#" tag_name (separator value)+ \n
/// ```
///
/// The result may look like a hashmap, but repeated keys are perfectly allowed and in
/// fact important to the syntax, so elements are returned as an ordered list. Every
/// tag is ASCII-uppercased; values are kept as parsed. Parameter lists that come back
/// empty produce no element.
pub fn from_bytes(buf: &[u8]) -> MsdFile {
	let mut cursor = ByteIterator::new(buf);
	let mut file = MsdFile {
		elements: Vec::new(),
	};

	loop {
		let (params, reason) = parse_param_list(&mut cursor);

		// the first param is the tag, whatever follows are its values.
		// no first param means there is nothing to record.
		let mut params = params.into_iter();
		if let Some(tag) = params.next() {
			file.elements.push(MsdElement {
				tag: tag.to_ascii_uppercase().into(),
				values: params.collect(),
			});
		}

		match reason {
			// that was the last thing in this file
			ParamListTermReason::Eof => return file,
			// another element follows, go around again
			ParamListTermReason::Hash => {}
		}
	}
}

/// Why `parse_param_list` stopped reading a parameter list.
enum ParamListTermReason {
	/// The input ran out; there is nothing further to parse.
	Eof,
	/// A `#` that starts a new element was found. The iterator has been rewound so the
	/// next call sees that `#` again.
	Hash,
}

/// Read one whole element -- `#TAG:VALUE:VALUE2;` -- as the flat list
/// `["TAG", "VALUE", "VALUE2"]`, and report why reading stopped.
///
/// Anything in front of the opening `#` is thrown away. If the input ends before a
/// `#` shows up, the list is empty and the reason is `Eof`.
fn parse_param_list(b_iter: &mut ByteIterator) -> (Vec<ByteString>, ParamListTermReason) {
	// discard everything in front of the `#` that opens the element.
	loop {
		match b_iter.read() {
			None => return (Vec::new(), ParamListTermReason::Eof),
			Some(b'#') => break,
			Some(_) => {}
		}
	}

	let mut collected = Vec::new();

	loop {
		let (param, reason) = parse_param(b_iter);

		let list_reason = match reason {
			// `:` and `;` are completely equivalent separators, so
			// #TAG:VALUE;#TAG2:VALUE2;
			// parses as ["TAG", "VALUE", "#TAG2", "VALUE2"]. weird, but intended.
			// keep slurping as many params as we can.
			ParamTermReason::Colon | ParamTermReason::Semicolon => {
				collected.push(param);
				continue;
			}
			ParamTermReason::Hash => {
				// we have already *passed* the "#", but the next call needs to
				// see it, so step back one.
				b_iter.rewind();
				ParamListTermReason::Hash
			}
			ParamTermReason::Eof => ParamListTermReason::Eof,
		};

		// the final param is dropped when empty, otherwise
		// #ARTIST:FOO;
		// would come out as ["ARTIST", "FOO", ""]
		if !param.is_empty() {
			collected.push(param);
		}

		return (collected, list_reason);
	}
}

/// Why `parse_param` stopped reading a single parameter. It is derived from the byte
/// that `ByteIterator::prev` reports once the parameter has been read.
enum ParamTermReason {
	/// Stopped at a `#`.
	Hash,
	/// Stopped at a `:`.
	Colon,
	/// Stopped at a `;`.
	Semicolon,
	/// `ByteIterator::prev` reported no byte; callers treat this as end of input.
	Eof,
}

/// Read a single parameter (a tag name or one value) starting at the iterator's
/// current position.
///
/// Returns the parameter with leading and trailing ASCII whitespace trimmed, together
/// with the reason reading stopped. The reason is derived from `b_iter.prev()` after
/// the parameter has been read.
///
/// While reading:
/// * `\` makes the following byte part of the parameter, whatever it is.
/// * `//` starts a comment: everything up to and including the next newline is
///   consumed, and a single `\n` is kept in the parameter.
/// * `:` and `;` end the parameter.
/// * `#` ends the parameter only when nothing but spaces and tabs sit between it and
///   the previous newline already collected for this parameter; otherwise it is kept
///   literally.
///
/// # Panics
///
/// Panics if `b_iter.prev()` reports a byte other than `#`, `:` or `;` once reading
/// has stopped.
fn parse_param(b_iter: &mut ByteIterator) -> (ByteString, ParamTermReason) {
	// inner function for convenience since we have multiple annoying returns.
	// NOTE(review): `read!` is a crate macro that is not visible from this file;
	// presumably it yields the next byte and leaves the enclosing loop once the input
	// is exhausted -- confirm against its definition.
	fn parse_val_inner(b_iter: &mut ByteIterator) -> ByteString {
		let mut param = vec![];

		loop {
			let byte = read!(b_iter);

			match byte {
				b'\\' => {
					// escape char means read the next char literally.
					let next_byte = read!(b_iter);

					// even if this is a special char, add it to the param.
					param.push(next_byte);
				}
				// since read!() increments the cursor
				// peek will be off-by-one.
				// comments are two /'s literally in a row.
				b'/' if b_iter.peek() == Some(&b'/') => {
					// it's a comment, read until newline or eof

					let mut chomp = read!(b_iter);
					while chomp != b'\n' {
						chomp = read!(b_iter);
					}

					// keep the newline
					param.push(b'\n');
				}
				b'#' => {
					// a hash could be part of the value:
					// #TITLE:Magical #girl;
					// or could be the sign of a missing semicolon:
					// #TITLE:Magical
					// #ARTIST:Girl

					// this depends on whether we see a newline preceding this.
					// go backwards through what we've already read.
					// (an empty param never enters this loop, so a `#` directly after
					// a separator is always kept literally.)
					for backtrack_byte in param.iter().rev() {
						if *backtrack_byte == b'\n' {
							// this was meant to be a separator, exit the function.
							return param.into();
						}

						if *backtrack_byte == b' ' || *backtrack_byte == b'\t' {
							// things like
							// #Title:foo
							//    #artist:bar
							// should still be understood as a missing semicolon
							continue;
						}

						// no, this # is part of the value literally
						break;
					}

					param.push(byte);
				}
				b':' | b';' => {
					// end of value
					return param.into();
				}
				_ => {
					// everything else is just part of the value
					param.push(byte)
				}
			}
		}

		param.into()
	}

	// trim all leading and trailing whitespace from the value.
	let trimmed = parse_val_inner(b_iter).trim_ascii().into();

	// work out why the inner function stopped from the byte the iterator reports as
	// previous: a separator we returned on, or nothing at all.
	let t_reason = match b_iter.prev() {
		Some(b'#') => ParamTermReason::Hash,
		Some(b':') => ParamTermReason::Colon,
		Some(b';') => ParamTermReason::Semicolon,
		None => ParamTermReason::Eof,

		// should be impossible
		u => panic!("Unexpected terminator {u:?}"),
	};

	(trimmed, t_reason)
}

#[cfg(test)]
mod tests {
	use super::*;
	use crate::test_utils::test_file_read;
	use pretty_assertions::assert_eq;

	/// Run `parse_param` on `$input` with the iterator positioned at byte `$index`,
	/// and assert that the returned (trimmed) param equals `$out`.
	macro_rules! test_parse_prm {
		($input: expr, $index: expr, $out: expr) => {{
			let mut biter = ByteIterator::new($input);
			biter.set_index($index);

			assert_eq!(
				std::str::from_utf8(&parse_param(&mut biter).0).unwrap(),
				$out
			)
		}};
	}

	/// The tag and the value of a simple element are each read as one param.
	#[test]
	fn p_val() {
		test_parse_prm!(b"#TITLE:AMONG US;", 1, "TITLE");
		test_parse_prm!(b"#TITLE:AMONG US;", 7, "AMONG US");
	}

	/// Newlines inside values are kept, comments are stripped up to their newline, and
	/// a `#` at the start of a line (optionally indented) ends the value.
	#[test]
	fn newlines() {
		test_parse_prm!(
			b"#NOTEDATA:
0000
0100
0000
0001, // measure 1
0000
0000
0000
0000
, // measure 2
;",
			10,
			"0000
0100
0000
0001, 
0000
0000
0000
0000
,"
		);

		test_parse_prm!(
			b"#TITLE: New Line
Here;",
			7,
			"New Line
Here"
		);

		test_parse_prm!(
			b"#TITLE: Missing Semicolon
#ARTIST: foo;",
			7,
			"Missing Semicolon"
		);

		test_parse_prm!(
			b"#TITLE: Missing Semicolon wspace
	  #ARTIST: foo;",
			7,
			"Missing Semicolon wspace"
		);

		test_parse_prm!(
			b"#TITLE: Missing Semicolon
	  heyo #ARTIST: foo;",
			7,
			"Missing Semicolon
	  heyo #ARTIST"
		);
	}

	/// A backslash makes the following byte literal, including separators and another
	/// backslash.
	#[test]
	fn escape() {
		test_parse_prm!(br"#TITLE: foo\:bar;", 7, r"foo:bar");
		test_parse_prm!(br"#TITLE: foo\\:bar;", 7, r"foo\");

		test_parse_prm!(br"#TITLE: foo\\\:bar;", 7, r"foo\:bar");
		test_parse_prm!(
			br"#TITLE: foo\
#artist",
			7,
			r"foo"
		);
	}

	/// Each value of a multi-value element is read as its own param.
	#[test]
	fn multival() {
		test_parse_prm!(b"#MULTI:VALUE:VALUE2;", 7, "VALUE");
		test_parse_prm!(b"#MULTI:VALUE:VALUE2;", 13, "VALUE2");
	}

	/// A single `/` is part of the value; `//` starts a comment.
	#[test]
	fn comment() {
		test_parse_prm!(b"#TITLE: A/BCD;", 7, "A/BCD");
		test_parse_prm!(b"#TITLE: A//BCD;", 7, "A");
	}

	/// Run `parse_param_list` from the start of `$input` and assert that the returned
	/// params, read as UTF-8, equal `$out`.
	macro_rules! test_parse_plist {
		($input: expr, $out: expr) => {{
			let mut biter = ByteIterator::new($input);

			assert_eq!(
				parse_param_list(&mut biter)
					.0
					.iter()
					.map(|b| std::str::from_utf8(&b).unwrap())
					.collect::<Vec<_>>(),
				$out
			);
		}};
	}

	/// Only the first parameter list of the input is checked in each case.
	#[test]
	fn list() {
		test_parse_plist!(b"#TITLE:FOO", vec!["TITLE", "FOO"]);
		test_parse_plist!(b"#TITLE:FOO;BAR:BAZ", vec!["TITLE", "FOO", "BAR", "BAZ"]);

		test_parse_plist!(
			b"#TITLE:FOO;
#ARTIST:AMONG_US;",
			vec!["TITLE", "FOO"]
		);

		// lol, wtf
		// very surprising parse, but this is how SM handles it aswell.
		// /shrug
		test_parse_plist!(b"#TITLE:FOO;#BAR:BAZ", vec!["TITLE", "FOO", "#BAR", "BAZ"]);

		test_parse_plist!(
			b"#TITLE:FOO
;#BAR:BAZ",
			vec!["TITLE", "FOO", "#BAR", "BAZ"]
		);

		test_parse_plist!(
			b"#TITLE:FOO
#BAR:BAZ",
			vec!["TITLE", "FOO"]
		);
		test_parse_plist!(
			b"#TITLE:FOO:
#BAR:BAZ",
			vec!["TITLE", "FOO"]
		);

		test_parse_plist!(
			br"#TITLE:FOO:
\ #BAR:BAZ",
			vec!["TITLE", "FOO"]
		);
	}

	/// Parse `buf` with `from_bytes` and compare the leading elements against
	/// `expected`, where each inner vec is `[tag, value, value, ...]`. Elements parsed
	/// beyond `expected.len()` are not checked.
	fn full_comp(buf: &[u8], expected: Vec<Vec<&str>>) {
		let parse_res = from_bytes(buf);

		for (i, values) in expected.iter().enumerate() {
			let e = parse_res.elements[i].clone();

			assert_eq!(
				std::str::from_utf8(&e.tag).unwrap(),
				values.first().unwrap().to_owned()
			);

			assert_eq!(
				e.values
					.into_iter()
					.map(|f| std::str::from_utf8(&f).unwrap().to_owned())
					.collect::<Vec<String>>(),
				values
					.iter()
					.skip(1)
					// lolwhat
					.map(|f| f.to_owned().to_owned())
					.collect::<Vec<String>>()
			);
		}
	}

	/// Checks the first three elements of the `zk-test.sm` fixture file.
	#[test]
	fn full_parse() {
		let expected = vec![
			vec!["TITLE", "[11] [120] Le Perv (zk test ed.)"],
			vec!["ARTIST", "Carpenter Brut"],
			vec!["SUBTITLE", ""],
		];

		full_comp(&test_file_read("zk-test.sm"), expected);
	}

	/// End-to-end parse of a small inline file: tags are uppercased, empty values are
	/// kept, and values are whitespace-trimmed.
	#[test]
	fn full_parse_simple() {
		let expected = vec![
			vec!["TITLE", "Song Title"],
			vec!["ARTIST", "Song Artist"],
			vec!["SUBTITLE", ""],
			vec!["WHITESPACE", "Foo"],
		];

		full_comp(
			br"#TITLE:Song Title;
#Artist:Song Artist;
#SUBTITLE:;
#WHITESPACE:   Foo;",
			expected,
		);
	}
}