mwtitle 0.2.0-alpha.1

MediaWiki title validation and formatting
Documentation
/*
Copyright (C) 2021 Erutuon

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

use std::ops::Range;

use tinyvec::ArrayVec;

/// One maximal run of bytes in a candidate IPv6 string, as produced by
/// `ipv6_split_iter_rev`.
///
/// The wrapped range holds byte offsets into the input string.
#[derive(Debug, Clone)]
pub(crate) enum Segment {
    /// A run of consecutive bytes that are not `:`.
    /// Whether the bytes are valid hex digits is checked by `parse_ipv6_rev`.
    Num(Range<usize>),
    /// A run of one or more consecutive `:` bytes.
    Colons(Range<usize>),
}

impl Segment {
    fn is_colons(&self) -> bool {
        matches!(self, Segment::Colons(_))
    }

    fn range(&self) -> &Range<usize> {
        match self {
            Segment::Num(range) | Segment::Colons(range) => range,
        }
    }

    fn range_mut(&mut self) -> &mut Range<usize> {
        match self {
            Segment::Num(range) | Segment::Colons(range) => range,
        }
    }
}

/// This implementation is solely to allow the use of `ArrayVec`,
/// which requires its item `T` to implement `Default` to avoid
/// having to put unsafe `std::mem::MaybeUninit<T>` in not-yet-filled slots.
/// In the functions that use `Segment`, the default Segment is invalid
/// and it will never be encountered unless we have bugs
/// because `ipv6_split_iter_rev` constructs `Segment`s
/// containing ranges that enclose valid char encodings.
impl Default for Segment {
    fn default() -> Self {
        Self::Colons(0..0)
    }
}

/// Splits `ip` into maximal runs of colon bytes and non-colon bytes,
/// yielding them lazily from the end of the string towards the start.
///
/// Each yielded `Segment` carries the byte range of its run within `ip`.
/// An empty input yields nothing.
fn ipv6_split_iter_rev(ip: &str) -> impl Iterator<Item = Segment> + '_ {
    let bytes = ip.as_bytes();
    // Exclusive end of the part of the input that has not been yielded yet.
    let mut end = bytes.len();
    std::iter::from_fn(move || {
        if end == 0 {
            return None;
        }
        // The kind of the run is decided by its last byte.
        let run_is_colons = bytes[end - 1] == b':';
        // The run starts just after the nearest preceding byte of the
        // other kind, or at the very beginning if there is none.
        let start = bytes[..end]
            .iter()
            .rposition(|&b| (b == b':') != run_is_colons)
            .map_or(0, |boundary| boundary + 1);
        let span = start..end;
        end = start;
        Some(if run_is_colons {
            Segment::Colons(span)
        } else {
            Segment::Num(span)
        })
    })
}

// TODO: Get rid of this; TitleCodec doesn't care
// why something isn't an IPv6 address.
/// The reason `parse_ipv6_rev` rejected its input.
///
/// Ranges are byte ranges into the input string.
#[derive(Debug)]
pub(crate) enum Error {
    /// A run of non-colon bytes is longer than 4 bytes; holds the run's range.
    TooManyConsecutiveDigits(Range<usize>),
    /// A run of colons is longer than 2; holds the run's range.
    TooManyConsecutiveColons(Range<usize>),
    /// A second `::` was found.
    MoreThanOneDoubleColon,
    /// There are more than 8 segments (numbers plus the double colon),
    /// or the input was not consumed all the way to its start.
    TooManySegments,
    /// There is no `::` and fewer than 8 numbers were found;
    /// holds the number of segments counted.
    TooFewSegments(usize),
    /// A run of non-colon bytes contains a byte that is not an
    /// ASCII hex digit; holds the run's range.
    InvalidDigit(Range<usize>),
    /// The input ends with a single colon.
    TrailingColon,
    /// The input starts with a single colon.
    LeadingColon,
}

/// Inline, fixed-capacity storage for the segments returned by `parse_ipv6_rev`.
/// The capacity is 9: the 8 segments a valid address can have, plus 1 extra
/// so that a leading single colon can still be seen and reported.
type SegmentVec = ArrayVec<[Segment; 9]>;

pub(crate) fn parse_ipv6_rev(ip: &str) -> Result<SegmentVec, Error> {
    let mut found_double_colon = false;
    let mut number_segment_count = 0;
    use Error::*;
    use Segment::*;
    let segments = ipv6_split_iter_rev(ip)
    // Skip single colons that are not at the beginning or end.
    // Single colons must be between two valid hex u16 numbers.
    // Because the hex u16 condition is checked elsewhere,
    // we can only need to look at single colons that are
    // at the very beginning or very end of the input;
    // we can skip any in the middle.
    .filter(|segment| {
        !matches!(segment, Colons(range) if range.len() == 1 && !(range.start == 0 || range.end == ip.len()))
    }).map(|segment| {
        match segment {
            Num(ref range) => {
                if range.len() > 4 {
                    return Err(TooManyConsecutiveDigits(range.clone()));
                } else if !ip[range.clone()]
                    .bytes()
                    .all(|b| b.is_ascii_hexdigit())
                {
                    return Err(InvalidDigit(range.clone()));
                }
                number_segment_count += 1;
            }
            Colons(ref range) => match range.len() {
                1 => {
                    // Single colons in the middle have already been filtered out.
                    if range.start == 0 {
                        return Err(LeadingColon);
                    } else if range.end == ip.len() {
                        return Err(TrailingColon);
                    }
                }
                2 => {
                    if found_double_colon {
                        return Err(MoreThanOneDoubleColon);
                    } else {
                        found_double_colon = true;
                        number_segment_count += 1;
                    }
                }
                _ => return Err(TooManyConsecutiveColons(range.clone())),
            },
        }
        Ok(segment)
    })
    // 8 is the maximum number of segments in a valid IP address,
    // when single colons have been filtered out.
    // Collect 1 extra so `map` will see leading colons
    // and emit `LeadingColon`.
    // (Leading and not trailing colon
    // because we are iterating from the end to the start.)
    // Otherwise the code below would emit `TooManySegments`.
    .take(9);
    // SAFETY: `collect` won't try to insert more items than the `SegmentVec` can hold
    // because take(9) has limited the iterator to 9 items.
    let segments = segments.collect::<Result<SegmentVec, _>>()?;
    // The address must have exactly 8 Num(_)s or a double colon.
    if number_segment_count < 8 && !found_double_colon {
        Err(TooFewSegments(number_segment_count))
    // Check that we don't have more than 8 numerical segments or double colons
    // and that we consumed all the input.
    } else if segments.len() > 8
        || segments
            .last()
            .map(|segment| segment.range().start != 0)
            .unwrap_or(false)
    {
        Err(TooManySegments)
    } else {
        Ok(segments)
    }
}

/// Every input below is malformed in some way and must be rejected
/// by `parse_ipv6_rev`.
#[test]
fn parser_rejects_invalid_ipv6_addresses() {
    // `count` copies of `group`, joined by single colons.
    let rep = |group: &str, count: usize| vec![group; count].join(":");

    let invalid_inputs: Vec<String> = vec![
        // too many consecutive colons
        ":::".to_string(),
        ":::1".to_string(),
        "1:::".to_string(),
        "1:::1".to_string(),
        // trailing and leading single colons
        format!("{}:", rep("01", 8)),
        format!(":{}", rep("01", 8)),
        // too few segments
        rep("01", 7),
        // too many segments (numbers plus double colon)
        rep("01", 9),
        format!("::{}", rep("01", 8)),
        format!("{}::", rep("1", 8)),
        // 8 or fewer segments in total, but more than one double colon
        format!("::{}::", rep("01", 6)),
        format!("::{}::{}::", rep("01", 2), rep("1", 3)),
        format!("{}::{}::{}", rep("01", 2), rep("1", 2), rep("1", 2)),
        // too many digits in one number
        "00001:1:1:1:1:1:1:1".to_string(),
        "1:1:1:1:1:1:1:00000".to_string(),
        // extraneous characters
        format!(" {}", rep("01", 7)),
        format!(" :{}", rep("01", 8)),
        format!("{} ", rep("01", 7)),
        rep("g", 8),
    ];

    for owned in &invalid_inputs {
        let input = owned.as_str();
        // On unexpected success, turn the segments back into text
        // so the failure message shows what was accepted.
        let parsed = parse_ipv6_rev(input).map(|segments| {
            segments
                .into_iter()
                .map(|segment| input.get(segment.range().clone()))
                .collect::<Vec<_>>()
        });
        assert!(
            parsed.is_err(),
            "{:?}\nshould not be successfully parsed, but it yielded the following numerical segments:\n{:?}",
            input,
            parsed
        );
    }
}