pulldown-cmark 0.5.2

A pull parser for CommonMark
Documentation
// Copyright 2018 Google LLC
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//! Link label parsing and matching.

use unicase::UniCase;

use crate::strings::CowStr;
use crate::scanners::{scan_eol, is_ascii_whitespace};

pub enum ReferenceLabel<'a> {
    Link(CowStr<'a>),
    Footnote(CowStr<'a>),
}

pub type LinkLabel<'a> = UniCase<CowStr<'a>>;

/// Returns number of bytes (including brackets) and label on success.
pub(crate) fn scan_link_label(text: &str) -> Option<(usize, ReferenceLabel<'_>)> {
    if text.len() < 2 || text.as_bytes()[0] != b'[' { return None; }
    let pair = if b'^' == text.as_bytes()[1] {
        let (byte_index, cow) = scan_link_label_rest(&text[2..])?;
        (byte_index + 2, ReferenceLabel::Footnote(cow))
    } else {
        let (byte_index, cow) = scan_link_label_rest(&text[1..])?;
        (byte_index + 1, ReferenceLabel::Link(cow))
    };
    Some(pair)
}

/// Assumes the opening bracket has already been scanned.
/// Returns the number of bytes read (including closing bracket) and label on success.
pub(crate) fn scan_link_label_rest(text: &str) -> Option<(usize, CowStr<'_>)> {
    let bytes = text.as_bytes();
    let mut ix = 0;
    let mut only_white_space = true;
    let mut codepoints = 0;
    // no worries, doesnt allocate until we push things onto it
    let mut label = String::new();
    let mut mark = 0;

    loop {
        if codepoints >= 1000 { return None; }
        match *bytes.get(ix)? {
            b'[' => return None,
            b']' => break,
            b'\\' => {
                ix += 2;
                codepoints += 2;
                only_white_space = false;
            }
            b if is_ascii_whitespace(b) => {
                // normalize labels by collapsing whitespaces, including linebreaks
                let mut whitespaces = 0;
                let mut linebreaks = 0;
                let whitespace_start = ix;

                while ix < bytes.len() && is_ascii_whitespace(bytes[ix]) {
                    if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
                        linebreaks += 1;
                        if linebreaks > 1 {
                            return None;
                        }
                        ix += eol_bytes;
                        whitespaces += 2; // indicate that we need to replace
                    } else {
                        whitespaces += if bytes[ix] == b' ' {
                            1
                        } else {
                            2
                        };
                        ix += 1;
                    }
                }
                if whitespaces > 1 {
                    label.push_str(&text[mark..whitespace_start]);
                    label.push(' ');
                    mark = ix;
                    codepoints += ix - whitespace_start;
                } else {
                    codepoints += 1;
                }
            }
            b => {
                only_white_space = false;
                ix += 1;
                if b & 0b1000_0000 != 0 {
                    codepoints += 1;
                }
            }
        }
    }

    if only_white_space {
        None
    } else {
        let cow = if mark == 0 {
            text[..ix].into()
        } else {
            label.push_str(&text[mark..ix]);
            label.into()
        };
        Some((ix + 1, cow))
    }
}


#[cfg(test)]
mod test {
    use super::scan_link_label_rest;

    #[test]
    fn whitespace_normalization() {
        let input = "«\t\tBlurry Eyes\t\t»][blurry_eyes]";
        let expected_output = "« Blurry Eyes »"; // regular spaces!

        let (_bytes, normalized_label) = scan_link_label_rest(input).unwrap();
        assert_eq!(expected_output, normalized_label.as_ref());
    }

    #[test]
    fn return_carriage_linefeed_ok() {
        let input = "hello\r\nworld\r\n]";
        assert!(scan_link_label_rest(input).is_some());
    }
}