wikidot_normalize/
normal.rs

1/*
2 * normal.rs
3 *
4 * wikidot-normalize - Library to provide Wikidot-compatible normalization.
5 * Copyright (c) 2019-2023 Emmie Maeda
6 *
7 * wikidot-normalize is available free of charge under the terms of the MIT
8 * License. You are free to redistribute and/or modify it under those
9 * terms. It is distributed in the hopes that it will be useful, but
10 * WITHOUT ANY WARRANTY. See the LICENSE file for more details.
11 *
12 */
13
14use crate::category::merge_multi_categories;
15use crate::underscore::replace_underscores;
16use crate::unicode::{casefold, normalize_nfkc};
17use once_cell::sync::Lazy;
18use regex::Regex;
19use trim_in_place::TrimInPlace;
20
21macro_rules! regex {
22    ($name:tt, $expr:expr) => {
23        static $name: Lazy<Regex> = Lazy::new(|| Regex::new($expr).unwrap());
24    };
25}
26
27regex!(NON_NORMAL, r"[^\p{L}\p{N}\-:_]");
28regex!(LEADING_OR_TRAILING_DASHES, r"(^-+)|(-+$)");
29regex!(MULTIPLE_DASHES, r"-{2,}");
30regex!(MULTIPLE_COLONS, r":{2,}");
31regex!(COLON_DASH, r"(:-)|(-:)");
32regex!(UNDERSCORE_DASH, r"(_-)|(-_)");
33regex!(LEADING_OR_TRAILING_COLON, r"(^:)|(:$)");
34
35/// Converts an arbitrary string into Wikidot normalized form.
36///
37/// This will convert non-alphanumeric characters to dashes and
38/// case fold it.
39///
40/// Examples:
41/// * `Big Cheese Horace` -> `big-cheese-horace`
42/// * `bottom--Text` -> `bottom-text`
43/// * `Tufto's Proposal` -> `tufto-s-proposal`
44/// * `-test-` -> `test`
45pub fn normalize(text: &mut String) {
46    // Remove leading and trailing whitespace
47    //
48    // Note that stdlib .trim() is &str -> &str,
49    // we want this to be in-place on a String.
50    text.trim_in_place();
51
52    // Remove leading slash, if present.
53    if text.starts_with('/') {
54        text.replace_range(..1, "");
55    }
56
57    // Normalize to unicode NFKC.
58    normalize_nfkc(text);
59
60    // Perform case folding.
61    // This lowercases all the characters in the string, based on
62    // unicode codepoint data.
63    casefold(text);
64
65    // Replace all characters not allowed in normal form.
66    replace_in_place(text, &NON_NORMAL, "-");
67
68    // Replace all prior colons with dashes, to make an "extra long category".
69    // See https://scuttle.atlassian.net/browse/WJ-355
70    merge_multi_categories(text);
71
72    // Replace non-leading underscores with dashes.
73    //
74    // Permits names like "_template" or "category:_template".
75    replace_underscores(text);
76
77    // Remove any leading or trailing dashes.
78    replace_in_place(text, &LEADING_OR_TRAILING_DASHES, "");
79
80    // Merge multiple dashes and colons into one.
81    replace_in_place(text, &MULTIPLE_DASHES, "-");
82    replace_in_place(text, &MULTIPLE_COLONS, ":");
83
84    // Remove any leading or trailing dashes next to colons or underscores.
85    replace_in_place(text, &COLON_DASH, ":");
86    replace_in_place(text, &UNDERSCORE_DASH, "_");
87
88    // Remove any leading or trailing colons.
89    replace_in_place(text, &LEADING_OR_TRAILING_COLON, "");
90
91    // Remove explicit _default category, if it exists.
92    if text.starts_with("_default:") {
93        text.replace_range(..9, "");
94    }
95}
96
97fn replace_in_place(text: &mut String, regex: &Regex, replace_with: &str) {
98    use regex::Captures;
99    use std::ops::Range;
100
101    fn get_range(captures: Captures) -> Range<usize> {
102        let mtch = captures.get(0).unwrap();
103        let start = mtch.start();
104        let end = mtch.end();
105
106        start..end
107    }
108
109    while let Some(captures) = regex.captures(text) {
110        let range = get_range(captures);
111        text.replace_range(range, replace_with);
112    }
113}
114
/// Runs `normalize` over a table of inputs and compares each result
/// with the expected normalized form.
#[test]
fn test_normalize() {
    // Each entry is `(input, expected)`.
    const CASES: &[(&str, &str)] = &[
        ("", ""),
        ("Big Cheese Horace", "big-cheese-horace"),
        ("bottom--Text", "bottom-text"),
        ("Tufto's Proposal", "tufto-s-proposal"),
        (" - Test - ", "test"),
        ("--TEST--", "test"),
        ("-test-", "test"),
        (":test", "test"),
        ("test:", "test"),
        (":test:", "test"),
        ("/Some Page", "some-page"),
        ("some/Page", "some-page"),
        ("some,Page", "some-page"),
        ("End of Death Hub", "end-of-death-hub"),
        ("$100 is a lot of money", "100-is-a-lot-of-money"),
        ("$100 is a lot of money!", "100-is-a-lot-of-money"),
        ("snake_case", "snake-case"),
        ("long__snake__case", "long-snake-case"),
        ("snake-_dash", "snake-dash"),
        ("snake_-dash", "snake-dash"),
        ("snake_-_dash", "snake-dash"),
        ("_template", "_template"),
        ("_template_", "_template"),
        ("__template", "_template"),
        ("__template_", "_template"),
        ("template_", "template"),
        ("template__", "template"),
        ("_Template", "_template"),
        ("_Template_", "_template"),
        ("__Template", "_template"),
        ("__Template_", "_template"),
        ("Template_", "template"),
        ("Template__", "template"),
        (" <[ TEST ]> ", "test"),
        ("ÄÀ-áö ðñæ_þß*řƒŦ", "äà-áö-ðñæ-þß-řƒŧ"),
        ("Site-五", "site-五"),
        ("ᒥᐢᑕᓇᐢᑯᐍᐤ--1", "ᒥᐢᑕᓇᐢᑯᐍᐤ-1"),
        ("ᒥᐢᑕᓇᐢᑯᐍᐤ:_template", "ᒥᐢᑕᓇᐢᑯᐍᐤ:_template"),
        ("🚗A‱B⁜C", "a-b-c"),
        ("Ⰰ_á_X", "ⰰ-á-x"),
        ("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", ""),
        ("Component:image block", "component:image-block"),
        ("fragment:scp-4447-2", "fragment:scp-4447-2"),
        ("fragment::scp-4447-2", "fragment:scp-4447-2"),
        ("FRAGMENT:SCP-4447 (2)", "fragment:scp-4447-2"),
        ("protected_:fragment_:page", "protected-fragment:page"),
        ("protected:_fragment_:page", "protected-fragment:page"),
        ("fragment:_template", "fragment:_template"),
        ("fragment:__template", "fragment:_template"),
        ("fragment:_template_", "fragment:_template"),
        ("fragment::_template", "fragment:_template"),
        ("_default:_template", "_template"),
        ("_default:__template", "_template"),
        ("_default:_template_", "_template"),
        ("_default::_template", "_template"),
        ("/fragment:_template", "fragment:_template"),
        ("/fragment:__template", "fragment:_template"),
        ("/fragment:_template_", "fragment:_template"),
        ("/fragment::_template", "fragment:_template"),
        ("/_default:_template", "_template"),
        ("/_default:__template", "_template"),
        ("/_default:_template_", "_template"),
        ("/_default::_template", "_template"),
        ("protected:fragment:_template", "protected-fragment:_template"),
        ("protected:fragment:__template", "protected-fragment:_template"),
        ("protected:fragment:_template_", "protected-fragment:_template"),
        ("protected:fragment::_template", "protected-fragment:_template"),
        ("protected::fragment:_template", "protected-fragment:_template"),
        ("protected::fragment::_template", "protected-fragment:_template"),
        ("protected:archived:fragment:page", "protected-archived-fragment:page"),
    ];

    // Cases run in table order; the first mismatch fails the test.
    for &(input, expected) in CASES {
        let mut text = str!(input);
        normalize(&mut text);
        assert_eq!(text, expected, "Normalized text doesn't match expected");
    }
}