parse-changelog 0.5.3

/*!
Simple changelog parser, written in Rust.

# Examples

```rust
let changelog = "\
## 0.1.2 - 2020-03-01

- Bug fixes.

## 0.1.1 - 2020-02-01

- Added `Foo`.
- Added `Bar`.

## 0.1.0 - 2020-01-01

Initial release
";

// Parse changelog.
let changelog = parse_changelog::parse(changelog).unwrap();

// Get the latest release.
assert_eq!(changelog[0].version, "0.1.2");
assert_eq!(changelog[0].title, "0.1.2 - 2020-03-01");
assert_eq!(changelog[0].notes, "- Bug fixes.");

// Get the specified release.
assert_eq!(changelog["0.1.0"].title, "0.1.0 - 2020-01-01");
assert_eq!(changelog["0.1.0"].notes, "Initial release");
assert_eq!(changelog["0.1.1"].title, "0.1.1 - 2020-02-01");
assert_eq!(
    changelog["0.1.1"].notes,
    "- Added `Foo`.\n\
     - Added `Bar`."
);
```

The key of the map returned does not include prefixes such as
"v", "Version ", etc.

```rust
let changelog_a = "\
## Version 0.1.0 - 2020-01-01
Initial release
";
let changelog_b = "\
## v0.1.0 - 2020-02-01
Initial release
";

let changelog_a = parse_changelog::parse(changelog_a).unwrap();
let changelog_b = parse_changelog::parse(changelog_b).unwrap();
// Not `changelog_a["Version 0.1.0"]`
assert_eq!(changelog_a["0.1.0"].version, "0.1.0");
assert_eq!(changelog_a["0.1.0"].title, "Version 0.1.0 - 2020-01-01");
assert_eq!(changelog_a["0.1.0"].notes, "Initial release");
// Not `changelog_b["v0.1.0"]`
assert_eq!(changelog_b["0.1.0"].version, "0.1.0");
assert_eq!(changelog_b["0.1.0"].title, "v0.1.0 - 2020-02-01");
assert_eq!(changelog_b["0.1.0"].notes, "Initial release");
```

# Supported Format

By default, this crate is intended to support markdown-based changelogs
in which the title of each release starts with a version in a format based on
[Semantic Versioning][semver] (e.g., [Keep a Changelog][keepachangelog]'s
changelog format).

## Headings

The heading for each release must be Atx-style (1-6 `#`) or
Setext-style (`=` or `-` in a line under text), and the heading levels
must match with other releases.

Atx-style headings:

```markdown
## 0.1.0
```

```markdown
### 0.1.0
```

Setext-style headings:

```markdown
0.1.0
=====
```

```markdown
0.1.0
-----
```

## Titles

The title of each release must start with a text or a link text (text with
`[` and `]`) that starts with a valid [version format](#versions) or
[prefix format](#prefixes). For example:

```markdown
## [0.2.0]

description...

## 0.1.0

description...
```

### Prefixes

You can include characters before the version as prefix.

```text
### Version 0.1.0
    ^^^^^^^^
```

By default only "v", "Version ", "Release ", and "" (no prefix) are
allowed as prefixes.

To customize the prefix format, use the [`Parser::prefix_format`] method.

### Versions

```text
### v0.1.0 -- 2020-01-01
     ^^^^^
```

The default version format is based on [Semantic Versioning][semver].

This is parsed by using the following regular expression:

```text
^(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-[0-9A-Za-z\.-]+)?(\+[0-9A-Za-z\.-]+)?$|^Unreleased$
```

**Note:** To get the 'Unreleased' section in the CLI, you need to explicitly specify 'Unreleased' as the version.

To customize the version format, use the [`Parser::version_format`] method.

### Suffixes

You can freely include characters after the version.

```text
## 0.1.0 - 2020-01-01
        ^^^^^^^^^^^^^
```

# Optional features

- **`serde`** — Implements [`serde::Serialize`] trait for parse-changelog types.

[`serde::Serialize`]: https://docs.rs/serde/1/serde/trait.Serialize.html
[keepachangelog]: https://keepachangelog.com
[semver]: https://semver.org
*/

#![doc(test(
    no_crate_inject,
    attr(
        deny(warnings, rust_2018_idioms, single_use_lifetimes),
        allow(dead_code, unused_variables)
    )
))]
#![forbid(unsafe_code)]
#![warn(
    missing_debug_implementations,
    missing_docs,
    rust_2018_idioms,
    single_use_lifetimes,
    unreachable_pub
)]
#![warn(
    clippy::pedantic,
    // lints for public library
    clippy::alloc_instead_of_core,
    clippy::exhaustive_enums,
    clippy::exhaustive_structs,
    clippy::std_instead_of_alloc,
    clippy::std_instead_of_core,
)]
#![allow(clippy::cast_possible_truncation, clippy::must_use_candidate)]

// Attach README.md as documentation to a dummy constant that only exists in
// doctest builds, so that the code examples in the README are compiled and run
// together with the other doctests.
#[cfg(doctest)]
// NOTE(review): the doubled `cfg_attr` presumably works around the rustc issue
// linked below - confirm before simplifying.
// https://github.com/rust-lang/rust/issues/82768
#[cfg_attr(doctest, cfg_attr(doctest, doc = include_str!("../README.md")))]
const _README: () = ();

#[cfg(test)]
mod tests;

#[cfg(test)]
#[path = "gen/assert_impl.rs"]
mod assert_impl;

mod error;

use core::mem;

use indexmap::IndexMap;
use memchr::memmem;
use once_cell::sync::Lazy;
use regex::Regex;

pub use crate::error::Error;
use crate::error::Result;

/// A changelog.
///
/// The key is a version, and the value is the release note for that version.
/// The key is the bare version (the same value as the `version` field of
/// [`Release`]), so it does not include prefixes such as "v" or "Version ".
///
/// The order is the same as the order written in the original text. (e.g., if
/// [the latest version comes first][keepachangelog], `changelog[0]` is the
/// release note for the latest version)
///
/// A release can be looked up either by position (`changelog[0]`) or by
/// version (`changelog["0.1.0"]`).
///
/// This type is returned by [`parse`] function or [`Parser::parse`] method.
///
/// [keepachangelog]: https://keepachangelog.com
pub type Changelog<'a> = IndexMap<&'a str, Release<'a>>;

/// Parses the release notes contained in `text` into a [`Changelog`].
///
/// This is a shorthand for creating a [`Parser`] with the default version and
/// prefix format and calling [`Parser::parse`] on it. If you want to use
/// another format, use the [`Parser`] type directly.
///
/// See crate level documentation for changelog and version format supported
/// by default.
///
/// # Errors
///
/// Returns an error if any of the following:
///
/// - There are multiple release notes for one version.
/// - No release was found. This usually means that the changelog isn't
///   written in the supported format.
pub fn parse(text: &str) -> Result<Changelog<'_>> {
    // A freshly created parser has no custom formats, i.e. it uses the defaults.
    let parser = Parser::new();
    parser.parse(text)
}

/// Returns a lazy iterator over all release notes in the given `text`.
///
/// Unlike [`parse`] function, the returned iterator doesn't error on
/// duplicate release notes or empty changelog.
///
/// This function uses the default version and prefix format. If you want to use
/// another format, consider using the [`Parser`] type instead.
///
/// See crate level documentation for changelog and version format supported
/// by default.
pub fn parse_iter(text: &str) -> ParseIter<'_, 'static> {
    // `None` selects the built-in default for the respective format.
    let version_format = None;
    let prefix_format = None;
    ParseIter::new(text, version_format, prefix_format)
}

/// A release note for a version.
///
/// All fields borrow from the text that was passed to the parser.
#[allow(single_use_lifetimes)] // https://github.com/rust-lang/rust/issues/69952
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde_crate::Serialize))]
#[cfg_attr(feature = "serde", serde(crate = "serde_crate"))]
#[non_exhaustive]
pub struct Release<'a> {
    /// The version of this release.
    ///
    /// ```text
    /// ### Version 0.1.0 -- 2020-01-01
    ///             ^^^^^
    /// ```
    ///
    /// This is the same value as the key of the [`Changelog`] type.
    pub version: &'a str,
    /// The title of this release.
    ///
    /// ```text
    /// ### Version 0.1.0 -- 2020-01-01
    ///     ^^^^^^^^^^^^^^^^^^^^^^^^^^^
    /// ```
    ///
    /// Note that leading and trailing [whitespaces](char::is_whitespace) have
    /// been removed.
    pub title: &'a str,
    /// The descriptions of this release.
    ///
    /// Note that blank lines directly after the heading and trailing
    /// whitespace (including newlines) have been removed. This is empty if
    /// the release has no content below its heading.
    pub notes: &'a str,
}

/// A changelog parser.
///
/// A parser holds the (optional) custom formats used to recognize release
/// headings. A newly created parser has no custom formats and therefore uses
/// the default ones; use [`Parser::version_format`] and
/// [`Parser::prefix_format`] to override them, then call [`Parser::parse`] or
/// [`Parser::parse_iter`].
#[derive(Debug, Default)]
pub struct Parser {
    /// Version format. e.g., "0.1.0" in "# v0.1.0 (2020-01-01)".
    ///
    /// If `None`, `DEFAULT_VERSION_FORMAT` is used.
    version_format: Option<Regex>,
    /// Prefix format. e.g., "v" in "# v0.1.0 (2020-01-01)", "Version " in
    /// "# Version 0.1.0 (2020-01-01)".
    ///
    /// If `None`, `DEFAULT_PREFIX_FORMAT` is used.
    prefix_format: Option<Regex>,
}

impl Parser {
    /// Creates a new changelog parser.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the version format.
    ///
    /// ```text
    /// ### v0.1.0 -- 2020-01-01
    ///     ^^^^^
    /// ```
    ///
    /// *Tip*: To customize the text before the version number (e.g., "v" in "# v0.1.0",
    /// "Version " in "# Version 0.1.0", etc.), use the [`prefix_format`] method
    /// instead of this method.
    ///
    /// # Default
    ///
    /// The default version format is based on [Semantic Versioning][semver].
    ///
    /// This is parsed by using the following regular expression:
    ///
    /// ```text
    /// ^(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-[0-9A-Za-z\.-]+)?(\+[0-9A-Za-z\.-]+)?$|^Unreleased$
    /// ```
    ///
    /// **Note:** To get the 'Unreleased' section in the CLI, you need to explicitly specify 'Unreleased' as the version.
    ///
    /// # Errors
    ///
    /// Returns an error if any of the following:
    ///
    /// - The specified format is not a valid regular expression or supported by
    ///   [regex] crate.
    /// - The specified format is empty or contains only
    ///   [whitespace](char::is_whitespace).
    ///
    /// [`prefix_format`]: Self::prefix_format
    /// [regex]: https://docs.rs/regex
    /// [semver]: https://semver.org
    pub fn version_format(&mut self, format: &str) -> Result<&mut Self> {
        if format.trim().is_empty() {
            return Err(Error::format("empty or whitespace version format"));
        }
        self.version_format = Some(Regex::new(format).map_err(Error::new)?);
        Ok(self)
    }

    /// Sets the prefix format.
    ///
    /// "Prefix" means the range from the first non-whitespace character after
    /// heading to the character before the version (including whitespace
    /// characters). For example:
    ///
    /// ```text
    /// ### Version 0.1.0 -- 2020-01-01
    ///    ^^^^^^^^
    /// ```
    ///
    /// ```text
    /// ### v0.1.0 -- 2020-01-01
    ///    ^
    /// ```
    ///
    /// # Default
    ///
    /// By default only "v", "Version ", "Release ", and "" (no prefix) are
    /// allowed as prefixes.
    ///
    /// This is parsed by using the following regular expression:
    ///
    /// ```text
    /// ^(v|Version |Release )?
    /// ```
    ///
    /// # Errors
    ///
    /// Returns an error if any of the following:
    ///
    /// - The specified format is not a valid regular expression or supported by
    ///   [regex] crate.
    ///
    /// [regex]: https://docs.rs/regex
    pub fn prefix_format(&mut self, format: &str) -> Result<&mut Self> {
        self.prefix_format = Some(Regex::new(format).map_err(Error::new)?);
        Ok(self)
    }

    /// Parses release notes from the given `text`.
    ///
    /// See crate level documentation for changelog and version format supported
    /// by default.
    ///
    /// # Errors
    ///
    /// Returns an error if any of the following:
    ///
    /// - There are multiple release notes for one version.
    /// - No release was found. This usually means that the changelog isn't
    ///   written in the supported format, or that the specified format is wrong
    ///   if you specify your own format.
    pub fn parse<'a>(&self, text: &'a str) -> Result<Changelog<'a>> {
        let mut map = IndexMap::new();
        for release in self.parse_iter(text) {
            if let Some(release) = map.insert(release.version, release) {
                return Err(Error::parse(format!(
                    "multiple release notes for '{}'",
                    release.version
                )));
            }
        }
        if map.is_empty() {
            return Err(Error::parse("no release was found"));
        }
        Ok(map)
    }

    /// Returns an iterator over all release notes in the given `text`.
    ///
    /// Unlike [`parse`] method, the returned iterator doesn't error on
    /// duplicate release notes or empty changelog.
    ///
    /// See crate level documentation for changelog and version format supported
    /// by default.
    ///
    /// [`parse`]: Self::parse
    pub fn parse_iter<'a, 'r>(&'r self, text: &'a str) -> ParseIter<'a, 'r> {
        ParseIter::new(text, self.version_format.as_ref(), self.prefix_format.as_ref())
    }
}

/// An iterator over release notes.
///
/// This type is returned by [`parse_iter`] function or [`Parser::parse_iter`] method.
///
/// `'a` is the lifetime of the changelog text, `'r` is the lifetime of the
/// regular expressions used to recognize release headings.
#[allow(missing_debug_implementations)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct ParseIter<'a, 'r> {
    /// Regular expression that the version extracted from a heading must match.
    version_format: &'r Regex,
    /// Regular expression that matches the prefix in front of the version.
    prefix_format: &'r Regex,
    /// Searcher for the start of an HTML comment (`<!--`).
    find_open: memmem::Finder<'static>,
    /// Searcher for the end of an HTML comment (`-->`).
    find_close: memmem::Finder<'static>,
    /// The lines of the changelog text that have not been consumed yet.
    lines: Lines<'a>,
    /// The heading level of release sections. 1-6
    ///
    /// `None` until the first release heading has been found.
    level: Option<u8>,
}

// Delimiters of an HTML comment. Headings between them are ignored.
const OPEN: &[u8] = b"<!--";
const CLOSE: &[u8] = b"-->";

// Default formats, used when the `Parser` has no custom format set.
// `Lazy` compiles each regular expression on first use. The patterns are
// constants, so `unwrap` only fails if a pattern here is itself invalid.
static DEFAULT_PREFIX_FORMAT: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"^(v|Version |Release )?").unwrap());
static DEFAULT_VERSION_FORMAT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"^(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-[0-9A-Za-z\.-]+)?(\+[0-9A-Za-z\.-]+)?$|^Unreleased$")
    .unwrap()
});

impl<'a, 'r> ParseIter<'a, 'r> {
    /// Creates an iterator over the releases in `text`.
    ///
    /// A format that is `None` is replaced by the corresponding default.
    fn new(
        text: &'a str,
        version_format: Option<&'r Regex>,
        prefix_format: Option<&'r Regex>,
    ) -> Self {
        let version_format = version_format.unwrap_or(&DEFAULT_VERSION_FORMAT);
        let prefix_format = prefix_format.unwrap_or(&DEFAULT_PREFIX_FORMAT);
        Self {
            version_format,
            prefix_format,
            find_open: memmem::Finder::new(OPEN),
            find_close: memmem::Finder::new(CLOSE),
            lines: Lines::new(text),
            level: None,
        }
    }

    /// Finishes `cur_release` when the heading that starts at byte offset
    /// `line_start` terminates it.
    ///
    /// The notes are the text from `release_note_start` up to the newline in
    /// front of that heading, with trailing whitespace removed. If the heading
    /// directly follows the release heading, the notes are left untouched.
    ///
    /// # Panics
    ///
    /// Panics if the version of `cur_release` is empty.
    fn end_release(
        &self,
        mut cur_release: Release<'a>,
        release_note_start: usize,
        line_start: usize,
    ) -> Release<'a> {
        assert!(!cur_release.version.is_empty());
        if release_note_start < line_start {
            // `line_start - 1` is the newline that precedes the terminating
            // heading; it is not part of the notes.
            let notes_end = line_start - 1;
            let notes = &self.lines.text[release_note_start..notes_end];
            // Remove trailing newlines.
            cur_release.notes = notes.trim_end();
        }
        cur_release
    }

    /// Updates `on_comment` according to the comment delimiters (`<!--` and
    /// `-->`) found in `line`.
    ///
    /// After the call, `on_comment` tells whether a comment is still open at
    /// the end of `line`.
    fn handle_comment(&self, on_comment: &mut bool, line: &'a str) {
        let mut rest = line;
        loop {
            let bytes = rest.as_bytes();
            let open = self.find_open.find(bytes);
            let close = self.find_close.find(bytes);
            // Offset behind the first delimiter when both kinds are present;
            // the remainder of the line has to be examined again.
            let resume_at = match (open, close) {
                (None, None) => return,
                // <!-- ...
                (Some(_), None) => {
                    *on_comment = true;
                    return;
                }
                // ... -->
                (None, Some(_)) => {
                    *on_comment = false;
                    return;
                }
                // <!-- ... -->
                (Some(open), Some(close)) if open < close => {
                    *on_comment = false;
                    close + CLOSE.len()
                }
                // --> ... <!--
                (Some(open), Some(_)) => {
                    *on_comment = true;
                    open + OPEN.len()
                }
            };
            match rest.get(resume_at..) {
                Some(tail) => rest = tail,
                None => return,
            }
        }
    }
}

/// Yields the release sections of the changelog one by one, in the order in
/// which they appear in the text.
impl<'a> Iterator for ParseIter<'a, '_> {
    type Item = Release<'a>;

    /// Scans forward to the next release heading and returns that release
    /// together with its notes, or `None` if the remaining text contains no
    /// release heading.
    fn next(&mut self) -> Option<Self::Item> {
        // If `true`, we are in a code block ("```").
        let mut on_code_block = false;
        // If `true`, we are in a comment (`<!--` and `-->`).
        let mut on_comment = false;
        // Byte offset at which the notes of the current release start.
        // `None` as long as no release heading has been found in this call.
        let mut release_note_start = None;
        // The release being assembled. An empty version means "none yet".
        let mut cur_release = Release { version: "", title: "", notes: "" };

        // Lines are peeked first and consumed explicitly: a heading that
        // terminates the current release at the same level must stay in the
        // stream so that the next call can start with it.
        while let Some((line, line_start, line_end)) = self.lines.peek() {
            // Headings inside code blocks and comments are not headings.
            let heading =
                if on_code_block || on_comment { None } else { heading(line, &mut self.lines) };
            if heading.is_none() {
                self.lines.next();
                if trim(line).starts_with("```") {
                    on_code_block = !on_code_block;
                }

                if !on_code_block {
                    self.handle_comment(&mut on_comment, line);
                }

                // Non-heading lines are always considered part of the current
                // section.

                // This was the last line of the text.
                if line_end == self.lines.text.len() {
                    break;
                }
                continue;
            }
            let heading = heading.unwrap();
            if let Some(release_level) = self.level {
                if heading.level > release_level {
                    // Consider sections that have lower heading levels than
                    // release sections are part of the current section.
                    self.lines.next();
                    if line_end == self.lines.text.len() {
                        break;
                    }
                    continue;
                }
                if heading.level < release_level {
                    // Ignore sections that have higher heading levels than
                    // release sections.
                    self.lines.next();
                    // Such a heading still terminates the current release.
                    if let Some(release_note_start) = release_note_start {
                        return Some(self.end_release(cur_release, release_note_start, line_start));
                    }
                    if line_end == self.lines.text.len() {
                        break;
                    }
                    continue;
                }
                // A heading at the release level ends the current release. The
                // heading is not consumed, so the next call starts with it.
                if let Some(release_note_start) = release_note_start {
                    return Some(self.end_release(cur_release, release_note_start, line_start));
                }
            }

            // From here on the heading is a candidate for a release heading.
            debug_assert!(release_note_start.is_none());
            let version = extract_version_from_title(heading.text, self.prefix_format);
            if !self.version_format.is_match(version) {
                // Ignore non-release sections that have the same heading
                // levels as release sections.
                self.lines.next();
                if line_end == self.lines.text.len() {
                    break;
                }
                continue;
            };

            cur_release.version = version;
            cur_release.title = heading.text;
            // The first release heading determines the heading level of all
            // release sections.
            self.level.get_or_insert(heading.level);

            self.lines.next();
            if heading.style == HeadingStyle::Setext {
                // Skip an underline after a Setext-style heading.
                self.lines.next();
            }
            while let Some((next, ..)) = self.lines.peek() {
                if next.trim().is_empty() {
                    // Skip newlines after a heading.
                    self.lines.next();
                } else {
                    break;
                }
            }
            // The notes start at the first non-blank line after the heading.
            if let Some((_, line_start, _)) = self.lines.peek() {
                release_note_start = Some(line_start);
            } else {
                // Nothing but blank lines follows the heading.
                break;
            }
        }

        // The end of the text was reached. If a release is in progress, its
        // notes extend to the end of the text.
        if !cur_release.version.is_empty() {
            if let Some(release_note_start) = release_note_start {
                if let Some(nodes) = self.lines.text.get(release_note_start..) {
                    // Remove trailing newlines.
                    cur_release.notes = nodes.trim_end();
                }
            }
            return Some(cur_release);
        }

        None
    }
}

/// An iterator over the lines of a text that supports looking ahead by up to
/// two lines.
///
/// Each item is a `(line, line_start, line_end)` triple, where `line` is the
/// text of the line without its terminating `\n` and `line_start`/`line_end`
/// are the byte offsets of the line within `text`.
struct Lines<'a> {
    /// The complete text that is split into lines.
    text: &'a str,
    /// Iterator over the byte offsets of the `\n` characters in `text`.
    iter: memchr::Memchr<'a>,
    /// Byte offset at which the next line that has not been read yet starts.
    line_start: usize,
    /// Looked-ahead line that is returned by the next call to `next`.
    peeked: Option<(&'a str, usize, usize)>,
    /// Looked-ahead line that is returned after `peeked`.
    peeked2: Option<(&'a str, usize, usize)>,
}

impl<'a> Lines<'a> {
    /// Creates a line iterator positioned at the start of `text`.
    fn new(text: &'a str) -> Self {
        let iter = memchr::memchr_iter(b'\n', text.as_bytes());
        Self { text, iter, line_start: 0, peeked: None, peeked2: None }
    }

    /// Returns the next line without consuming it.
    ///
    /// The line is buffered and handed out again by the next call to `next`.
    fn peek(&mut self) -> Option<(&'a str, usize, usize)> {
        let upcoming = self.next();
        self.peeked = upcoming;
        upcoming
    }

    /// Returns the line after the next line without consuming either of them.
    ///
    /// Both lines are buffered and handed out again, in order, by the
    /// following calls to `next`.
    fn peek2(&mut self) -> Option<(&'a str, usize, usize)> {
        let first = self.next();
        let second = self.next();
        self.peeked = first;
        self.peeked2 = second;
        second
    }
}

impl<'a> Iterator for Lines<'a> {
    type Item = (&'a str, usize, usize);

    fn next(&mut self) -> Option<Self::Item> {
        if let Some(triple) = self.peeked.take() {
            return Some(triple);
        }
        if let Some(triple) = self.peeked2.take() {
            return Some(triple);
        }
        let (line, line_end) = match self.iter.next() {
            Some(line_end) => (&self.text[self.line_start..line_end], line_end),
            None => (self.text.get(self.line_start..)?, self.text.len()),
        };
        let line_start = mem::replace(&mut self.line_start, line_end + 1);
        Some((line, line_start, line_end))
    }
}

/// A markdown heading found in the changelog text.
struct Heading<'a> {
    /// The text of the heading without the heading markers.
    text: &'a str,
    /// The heading level: 1-6 for Atx-style headings, 1 or 2 for Setext-style
    /// headings.
    level: u8,
    /// The syntax that was used to write the heading.
    style: HeadingStyle,
}

/// The syntax used to write a heading.
///
/// The parser needs to know the style because a Setext-style heading occupies
/// two lines (text and underline), whereas an Atx-style heading occupies one.
#[derive(Eq, PartialEq)]
enum HeadingStyle {
    /// Atx-style headings use 1-6 `#` characters at the start of the line,
    /// corresponding to header levels 1-6.
    Atx,
    /// Setext-style headings are “underlined” using equal signs `=` (for
    /// first-level headings) and dashes `-` (for second-level headings).
    Setext,
}

/// Tries to interpret `line` as a markdown heading.
///
/// The caller is expected to have just peeked `line` from `lines`; `lines` is
/// only used to look one line further ahead for a Setext underline.
///
/// Two forms are recognized (indentation of less than four spaces is ignored):
///
/// - Atx-style: 1-6 `#` characters that are followed by a space or by the end
///   of the line.
/// - Setext-style: a non-empty line that is followed by a line that consists
///   solely of `=` (level 1) or solely of `-` (level 2).
///
/// A blank line is never a Setext-style heading, because Setext headings
/// cannot be empty in CommonMark. Without this rule, a thematic break (`---`)
/// that follows a blank line would be reported as an empty level 2 heading
/// and would end the current release section.
///
/// Returns `None` if `line` is not a heading.
fn heading<'a>(line: &'a str, lines: &mut Lines<'a>) -> Option<Heading<'a>> {
    let line = trim(line);
    if line.starts_with('#') {
        // Count the leading `#` characters. Counting stops at 8, which is
        // already more than any valid heading can have.
        let mut level = 0;
        while level <= 7 && line.as_bytes().get(level) == Some(&b'#') {
            level += 1;
        }
        // The `#` characters must be followed by a space or the end of the
        // line, otherwise this is not a heading (e.g. "#1234").
        // https://pandoc.org/try/?params=%7B%22text%22%3A%22%23%23%23%23%23%23%5Cn%3D%3D%3D%5Cn%5Cn%23%23%23%23%23%23%23%5Cn%3D%3D%3D%5Cn%5Cn%23%23%23%23%23%23+%5Cn%3D%3D%3D%5Cn%5Cn%23%23%23%23%23%23+a%5Cn%3D%3D%3D%5Cn%5Cn%23%23%23%23%23%23+b%5Cn%22%2C%22to%22%3A%22html5%22%2C%22from%22%3A%22commonmark%22%2C%22standalone%22%3Afalse%2C%22embed-resources%22%3Afalse%2C%22table-of-contents%22%3Afalse%2C%22number-sections%22%3Afalse%2C%22citeproc%22%3Afalse%2C%22html-math-method%22%3A%22plain%22%2C%22wrap%22%3A%22auto%22%2C%22highlight-style%22%3Anull%2C%22files%22%3A%7B%7D%2C%22template%22%3Anull%7D
        if level < 7 && line.as_bytes().get(level).map_or(true, |&b| b == b' ') {
            return Some(Heading {
                text: line.get(level + 1..).unwrap_or_default().trim(),
                level: level as _,
                style: HeadingStyle::Atx,
            });
        }
    }

    // Setext headings cannot be empty, so a blank line followed by `---` or
    // `===` is not a heading. No look-ahead is needed in that case.
    if line.is_empty() {
        return None;
    }

    // Look at the line after `line` to detect a Setext underline.
    let (next, ..) = lines.peek2()?;
    let next = trim(next);
    if next.is_empty() {
        return None;
    }
    let level = if next.as_bytes().iter().all(|&b| b == b'=') {
        1
    } else if next.as_bytes().iter().all(|&b| b == b'-') {
        2
    } else {
        return None;
    };
    Some(Heading { text: line, level, style: HeadingStyle::Setext })
}

/// Removes trailing whitespace and, if the line is indented by fewer than
/// four spaces, the indentation as well.
///
/// Lines that are indented by four or more spaces keep their indentation.
fn trim(s: &str) -> &str {
    // Only the first four bytes matter for the decision below.
    let indent = s.bytes().take(4).take_while(|&b| b == b' ').count();
    // Indents less than 4 are ignored.
    let unindented = if indent < 4 { &s[indent..] } else { s };
    unindented.trim_end()
}

/// Extracts the version candidate from the heading text `text`.
///
/// The candidate is the first whitespace-delimited word behind the prefix
/// matched by `prefix_format`, with markdown link brackets removed. Whether
/// the candidate is a valid version is not checked here.
fn extract_version_from_title<'a>(text: &'a str, prefix_format: &Regex) -> &'a str {
    // Remove link from prefix
    // [Version 1.0.0 2022-01-01]
    // ^
    let unbracketed = text.strip_prefix('[').unwrap_or(text);
    // Remove prefix
    // Version 1.0.0 2022-01-01]
    // ^^^^^^^^
    let after_prefix = match prefix_format.find(unbracketed) {
        Some(found) => &unbracketed[found.end()..],
        None => unbracketed,
    };
    // Remove whitespace after the version and the strings following it
    // 1.0.0 2022-01-01]
    //      ^^^^^^^^^^^^
    // `split` always yields at least one item, so `unwrap` cannot fail.
    let candidate = after_prefix.split(char::is_whitespace).next().unwrap();
    // Remove link from version
    // Version [1.0.0 2022-01-01]
    //         ^
    // [Version 1.0.0] 2022-01-01
    //               ^
    // Version [1.0.0] 2022-01-01
    //         ^     ^
    unlink(candidate)
}

/// Strips markdown link syntax from a single word.
///
/// A leading `[` is always removed. If the word then ends with `]`, or with
/// `]` followed by exactly one `(...)` or `[...]` group, everything from that
/// `]` on is removed as well. Otherwise the rest is returned unchanged.
///
/// # Note
///
/// This is not a full "unlink" on markdown, but this is enough as this crate
/// does not parse a string at the end of headings.
fn unlink(s: &str) -> &str {
    // Returns `true` if `rest` starts with `open` and the first `close` in
    // `rest` is its last character.
    fn is_single_group(rest: &str, open: char, close: char) -> bool {
        rest.starts_with(open) && rest.find(close).map_or(false, |at| at + 1 == rest.len())
    }

    // [1.0.0]
    // ^
    let s = s.strip_prefix('[').unwrap_or(s);
    let closing = match s.find(']') {
        Some(closing) => closing,
        None => return s,
    };
    let label = &s[..closing];
    let tail = &s[closing + 1..];
    // 1.0.0]
    //      ^
    // 1.0.0](link)
    //      ^^^^^^^
    // 1.0.0][link]
    //      ^^^^^^^
    if tail.is_empty() || is_single_group(tail, '(', ')') || is_single_group(tail, '[', ']') {
        label
    } else {
        s
    }
}