1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
/*
 * normal.rs
 *
 * wikidot-normalize - Library to provide Wikidot-compatible normalization.
 * Copyright (c) 2019 Ammon Smith
 *
 * wikidot-normalize is available free of charge under the terms of the MIT
 * License. You are free to redistribute and/or modify it under those
 * terms. It is distributed in the hopes that it will be useful, but
 * WITHOUT ANY WARRANTY. See the LICENSE file for more details.
 *
 */

use regex::Regex;
use std::borrow::Cow;
use std::mem;
use std::str::Utf8Error;

// Regular expressions used by `normalize` and `is_normal` below.
// Each pattern is a fixed literal, so the `unwrap()` on compilation
// can only fail if the pattern text itself is edited into something invalid.
lazy_static! {
    // Either a run of one or more characters that are not word characters
    // (`\w`), ':', '/' or '-', or a run of two or more dashes.
    static ref NON_URL: Regex = Regex::new(r"([^\w:/\-]+|-{2,})").unwrap();
    // A run of two or more colons.
    static ref MULTIPLE_COLONS: Regex = Regex::new(r":{2,}").unwrap();
    // One or more dashes (group `dash`) at the start of the string or
    // directly after one or more slashes.
    static ref START_DASHES: Regex = Regex::new(r"(^|/+)(?P<dash>-+)").unwrap();
    // One or more dashes (group `dash`) directly before the end of the
    // string or before one or more slashes.
    static ref END_DASHES: Regex = Regex::new(r"(?P<dash>-+)($|/+)").unwrap();
    // A run of two or more slashes.
    static ref MULTIPLE_SLASHES: Regex = Regex::new(r"/{2,}").unwrap();
    // One or more slashes at the very end of the string.
    static ref END_SLASHES: Regex = Regex::new(r"/+$").unwrap();
}

#[inline]
fn percent_decode(input: &str) -> Result<Cow<str>, Utf8Error> {
    use percent_encoding::percent_decode_str;

    percent_decode_str(input).decode_utf8()
}

/// Converts an arbitrary string into Wikidot normalized form, first decoding percent notation.
///
/// See [`normalize`] for information on normal form.
///
/// [`normalize`]: ./fn.normalize.html
pub fn normalize_decode(name: &mut String) {
    // Perform percent-decoding
    match percent_decode(&name) {
        Ok(Cow::Borrowed(_)) => (),
        Ok(Cow::Owned(mut decoded)) => mem::swap(name, &mut decoded),
        Err(_) => warn!("Error decoding percent string"),
    }

    normalize(name);
}

/// Converts an arbitrary string into Wikidot normalized form.
///
/// This will convert non-alphanumeric characters to dashes and
/// makes it lowercase.
///
/// Examples:
/// * `Big Cheese Horace` -> `big-cheese-horace`
/// * `bottom--Text` -> `bottom-text`
/// * `Tufto's Proposal` -> `tufto-s-proposal`
/// * `-test-` -> `test`
pub fn normalize(name: &mut String) {
    // Lowercase
    name.make_ascii_lowercase();

    // Squash multiple colons
    while let Some(mtch) = MULTIPLE_COLONS.find(name) {
        let start = mtch.start();
        let end = mtch.end();
        name.replace_range(start..end, ":");
    }

    // Convert non-URL characters to dashes
    while let Some(mtch) = NON_URL.find(name) {
        let start = mtch.start();
        let end = mtch.end();
        name.replace_range(start..end, "-");
    }

    // Remove leading and trailing dashes
    let get_range = |captures: regex::Captures| {
        let mtch = captures.name("dash").unwrap();
        let start = mtch.start();
        let end = mtch.end();

        start..end
    };

    while let Some(captures) = START_DASHES.captures(name) {
        let range = get_range(captures);
        name.replace_range(range, "");
    }

    while let Some(captures) = END_DASHES.captures(name) {
        let range = get_range(captures);
        name.replace_range(range, "");
    }

    // Squash multiple slashes
    while let Some(mtch) = MULTIPLE_SLASHES.find(name) {
        let start = mtch.start();
        let end = mtch.end();
        name.replace_range(start..end, "/");
    }

    // Remove trailing slashes, unless it's just '/'
    if name.len() > 1 {
        while let Some(mtch) = END_SLASHES.find(name) {
            let start = mtch.start();
            let end = mtch.end();
            name.replace_range(start..end, "");
        }
    }
}

/// Reports whether `name` is already in Wikidot normalized form.
///
/// A normalized name consists only of ASCII lowercase letters, ASCII digits,
/// `:`, `-` and `_`, and additionally `/` when `slash` is true. It must also
/// not match any of the patterns that [`normalize`] rewrites: repeated
/// dashes, colons or slashes, dashes at the start or end of the string or
/// next to a slash, and trailing slashes (a lone `/` is allowed).
///
/// [`normalize`]: ./fn.normalize.html
pub fn is_normal(name: &str, slash: bool) -> bool {
    // Character whitelist; '/' is only acceptable when the caller opts in.
    let allowed = |ch: char| -> bool {
        match ch {
            'a'..='z' | '0'..='9' | ':' | '-' | '_' => true,
            '/' => slash,
            _ => false,
        }
    };

    if !name.chars().all(allowed) {
        return false;
    }

    // Any of these patterns matching means `normalize` would still change
    // the string. The trailing-slash rule exempts strings of one byte.
    let needs_rewrite = NON_URL.is_match(name)
        || MULTIPLE_COLONS.is_match(name)
        || START_DASHES.is_match(name)
        || END_DASHES.is_match(name)
        || MULTIPLE_SLASHES.is_match(name)
        || (name.len() > 1 && END_SLASHES.is_match(name));

    !needs_rewrite
}