//! Sanitise names for use in file systems and the like.
//!
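//! The output string is guaranteed to be shorter than or equal to the input string in length,
//! with two exceptions: file names that are reserved on Windows (see [`Options::windows_safe`]),
//! in which case an underscore is appended to the base name (e.g. NUL → NUL_, aux.h → aux_.h);
//! and names that would otherwise end up empty, which are replaced by
//! [`Options::six_measures_of_barley`] (`_` by default).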
//!
//! The key parts of the API:
//!
#![cfg_attr(feature = "alloc", doc = "\
- <code>[sanitise][](input: &str) -> String</code>: the simplest thing to call;
- <code>[sanitise_with_options][](input: &str, options: &Options<_>) -> String</code>:
when you want to tweak the nature of the sanitisation; and")]
#![cfg_attr(not(feature = "alloc"), doc = "\
- <s><code>[sanitise][](input: &str) -> String</code>: the simplest thing to call</s>
*(disabled in this build due to compiling without the `alloc` feature)*;
- <s><code>[sanitise_with_options][](input: &str, options: &Options<_>) -> String</code>:
when you want to tweak the nature of the sanitisation</s> *(disabled in this build due to
compiling without the `alloc` feature)*; and")]
//!
//! - [`Options`], with detailed descriptions of each option.
//!
//! And for advanced users that want to control allocations or other similar things:
//!
//! - <code>[sanitise_to][](input: &str, options: &Options<_>, out: &mut <em>String</em>)</code>,
//! sanitising into
#![cfg_attr(feature = "alloc", doc = " a `String`")]
#![cfg_attr(not(feature = "alloc"), doc = " a string")]
#![cfg_attr(feature = "tinyvec_string", doc = " or [`tinyvec_string::ArrayString`]")]
#![cfg_attr(all(docsrs, feature = "tinyvec_string"), doc = " (when enabled)")]
//! that you provide,
//! for which the following methods may help:
//!
//! - <code>[max_alloc_size][](options: &Options<_>)</code> or
//! <code>[max_alloc_size_const][](options: &Options<Option<char>>)</code>,
//! to suggest a size for scratch buffer
#![cfg_attr(feature = "tinyvec_string", doc = " or `ArrayString`")]
//! applications; and
//!
//! - <code>[sufficient_alloc_size][](input: &str, options: &Options<_>) -> usize</code>, to
//! suggest a size that will definitely be sufficient for one given input (mainly useful when you
//! are crafting a path with stuff before and after it).
//!
//! … but that’s dangerous territory, deep rabbit holes; ask if you actually *need* them—don’t be
//! like me. (When I am laid in earth, may my wrongs create no trouble in thy breast. Remember me,
//! but ah! forget my fate.)
//!
//! ### Conditional compilation/Cargo features
//!
//! This crate has several features:
//!
//! - **std**, enabled by default. Implies *alloc*. Disable it to get `#![no_std]` operation.
//!
//! - **alloc**, enabled by default via *std*. Provides the ability to sanitise to a `String` in
//! `sanitise_to`, and the `sanitise` and `sanitise_with_options` functions.
//!
//! - **tinyvec_string**, disabled by default. Provides the ability to sanitise to
//! `tinyvec_string::ArrayString`, which works without *alloc*.
//!
//! - **const-fn-trait-bound**, disabled by default, requires rustc nightly at the time of writing.
//! Makes [`max_alloc_size`] const.
//!
//! These docs were built with these features enabled:
#![cfg_attr(feature = "std", doc = " <span class='stab portability'><code>std</code></span>")]
#![cfg_attr(feature = "alloc", doc = " <span class='stab portability'><code>alloc</code></span>")]
#![cfg_attr(feature = "tinyvec_string", doc = " <span class='stab portability'><code>tinyvec_string</code></span>")]
#![cfg_attr(feature = "const-fn-trait-bound", doc = " <span class='stab portability'><code>const-fn-trait-bound</code></span>")]
#![cfg_attr(
all(
not(feature = "std"),
not(feature = "alloc"),
not(feature = "tinyvec_string"),
not(feature = "const-fn-trait-bound"),
),
doc = " *(none of them)*")]
//!
//! … and these features disabled:
#![cfg_attr(not(feature = "std"), doc = " <span class='stab portability'><code>std</code></span>")]
#![cfg_attr(not(feature = "alloc"), doc = " <span class='stab portability'><code>alloc</code></span>")]
#![cfg_attr(not(feature = "tinyvec_string"), doc = " <span class='stab portability'><code>tinyvec_string</code></span>")]
#![cfg_attr(not(feature = "const-fn-trait-bound"), doc = " <span class='stab portability'><code>const-fn-trait-bound</code></span>")]
#![cfg_attr(
all(
feature = "std",
feature = "alloc",
feature = "tinyvec_string",
feature = "const-fn-trait-bound",
),
doc = " *(none of them)*")]
// End docs.
#![cfg_attr(not(feature = "std"), no_std)]
#![cfg_attr(feature = "const-fn-trait-bound", feature(const_fn_trait_bound))]
#![cfg_attr(docsrs, feature(doc_cfg))]
#![cfg_attr(not(feature = "alloc"), allow(rustdoc::broken_intra_doc_links))] // I’m lazy.
#[cfg(feature = "alloc")]
extern crate alloc;
#[cfg(feature = "alloc")]
use alloc::string::String;
use core::ops::{Deref, Index, Range, RangeFrom, RangeBounds};
/// Sanitisation options. Defaults marked on each field.
///
/// Take a look around, but I think everything’s pretty sane by default; the ones I think you’re
/// most likely to want to change are `url_safe` and `windows_safe`, though `replace_with`,
/// `collapse_replacements` and `six_measures_of_barley` can be interesting too for yielding
/// prettier results.
///
/// If you set `length_limit` to `usize::MAX`, all the bool fields to `false`, and
/// `six_measures_of_barley` to an empty string, `sanitise` will not alter the input string in any
/// way. But that would be a rather expensive alternative to `.clone()`. In practice, I doubt you
/// ever want to disable `most_fs_safe`, which is a good baseline.
#[derive(Debug)]
pub struct Options<R: Replace> {
/// Limit the complete file name to this many UTF-8 code units. The default is **255**, which
/// is suitable for all practical platforms.
///
/// (Some file systems limit lengths in UTF-8 code units and some in UTF-16 code units, but
/// UTF-16 never takes more code units than UTF-8 to encode a given Unicode string, so we can
/// ignore it.)
///
/// Reasons you might want to reduce it:
///
/// 1. You haven’t appended the extension yet, and so want to subtract the extension’s length.
/// (In that case I suggest writing `Options::DEFAULT.length_limit` instead of hard coding
/// 255—that’ll work in const context.)
///
/// 2. You want smoother Windows support, for on Windows some things start falling over if the
/// total path length is greater than 260 characters; so measuring or estimating the path
/// length could potentially be useful—but unless you know, probably don’t worry too much,
/// someone’ll probably drop it deep in a node_modules tree at some point and then you’ll be
/// in trouble anyway. 😀 <!-- Okay, okay, so node_modules trees are typically flattened
/// almost entirely these days; but let me have my joke, please? -->
///
/// One other mildly significant note here: if you care about Apple’s pre-2017 HFS+ file
/// system, you should perform Unicode normalisation to NFD (most likely via the
/// `unicode-normalization` crate) before performing sanitisation, because the decomposed form
/// may be longer; if you don’t, then the path will be normalised to NFD by the file system
/// when you try to write it, which could take it over 255 and make it fail. I don’t think
/// there are any popular file systems that normalise any more, though APFS kinda prefers NFC,
/// so you might want to normalise to NFC. I do not know if normalising to NFC will ever
/// lengthen a UTF-8 string, but the spec allows it to (UAX #15, goal 3.2).
///
/// The minimum permitted value is 10, for reasons of implementation convenience and because I
/// don’t think there’s any legitimate use case for a smaller value. If you provide a value
/// less than ten, you’ll get an empty string back every time.
///
/// Truncations are performed at `char` granularity (Unicode scalar value), which means that
/// extended grapheme clusters could be broken. This could change in the future (it’ll be an
/// optional dependency on `unicode-segmentation`), but for now it was just too much thought.
/// If I ever implement this, I’ll probably ditch the minimum value of 10 too.
// (Most significantly, it doesn’t play terribly nicely with extension cleverness: six would no
// longer be sufficient to guarantee a base name, so more involved calculations and overflow
// tracking would need to be done. It’s perfectly achievable, but painful.)
pub length_limit: usize,
/// When allocating the string (since it allocates as small a string as possible), reserve at
/// least this many extra bytes. This is good for efficiency when you append the extension
/// after sanitisation (in which case, also disable `extension_cleverness`). Default **0**.
pub reserve_extra: usize,
/// Make other options try to be clever about a file extension in the input. Default `true`.
///
/// Specifically, if a file extension is detected (done by looking for the last full stop in
/// the name, and splitting at that point into base name and extension):
///
/// 1. `length_limit` will try to keep the extension intact, truncating the base name rather
/// than the extension. “Try”, because if the extension is longer than six code units less
/// than the length limit, it will be deemed unsalvageable. (Why six? The base name must
/// retain at least one character, so for convenience that’s four UTF-8 code units, plus one
/// more for the dot, and if `windows_safe` is on, the longest reserved name causes a five
/// code unit base name like `LPT1_`, and ridiculously long extensions are a corner case
/// anyway so I decided to just call it a day at six. If I subsequently implement
/// grapheme-cluster-aware truncation, this six will increase if the first grapheme cluster
/// in the base name is more than five code units long.) An unsalvageable extension is the
/// only case where sanitisation may take two steps to quiesce, rather than one: if the
/// extension is entirely truncated and the base name contains a dot which in a subsequent
/// run will be interpreted as the extension separator, trimming will happen around it on
/// that subsequent run but not the first.
///
/// 2. `windows_safe` will detect reserved names with extensions.
///
/// 3. `trim_spaces_and_full_stops` and `trim_more_punctuation` will trim those characters from
/// the end of the base name and the start of the extension, in addition to the start and
/// end of the full name. (Expressed otherwise, the base name and extension will be trimmed
/// independently.)
///
/// If you’re appending the extension after sanitisation, you should turn this to false.
pub extension_cleverness: bool,
/// Remove characters that are not safe on just about any file system. Default `true`, and if
/// you actually want to disable it you’re probably using the wrong crate.
///
/// This plus `length_limit` is enough to satisfy most platforms other than Windows, though
/// cleaning somewhat more is probably a good idea.
///
/// Characters removed:
///
/// - `/` (slash)
/// - ␀ (null, character zero)
///
/// Also disallows names comprising exclusively dots (`"."`, `".."`, `"..."`, *&c.*), NOT using
/// `replace_with` on them but yielding an empty string.
///
/// This is a tiny subset of `windows_safe`.
pub most_fs_safe: bool,
/// Ensure the file name is safe on Windows. Default `true`.
///
/// [These are the rules applied:](https://docs.microsoft.com/en-au/windows/win32/fileio/naming-a-file#naming-conventions)
///
/// - These characters are removed (and `replace_with` employed):
///
/// - `<` (less than)
/// - `>` (greater than)
/// - `:` (colon)
/// - `"` (double quote)
/// - `/` (forward slash)
/// - `\` (backslash)
/// - `|` (vertical bar/pipe)
/// - `?` (question mark)
/// - `*` (asterisk)
/// - The C0 control characters, 0–31 and 127 (U+0000–U+001F, U+007F); note that U+007F isn’t
/// actually part of C0, but Microsoft included it in this list so I do too.
///
/// - Names must not end with a space or a dot (so these are removed recursively—for reasons of
/// technical convenience, `replace_with` is NOT employed).
///
/// - These names are reserved (and so a trailing underscore is added to the base name),
/// including with an extension if `extension_cleverness` is enabled:
///
/// - CON, PRN, AUX, NUL,
/// - COM1, COM2, COM3, COM4, COM5, COM6, COM7, COM8, COM9,
/// - LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, and LPT9
///
/// Most of these restrictions are actually not quite universal in Windows, but getting around
/// them requires switching into POSIX mode or using long UNC paths (e.g. `\\.\C:\CON`,
/// `\\?\D:\aux.h`), and your life will certainly be miserable if you try using them; so
/// they’re all considered not Windows-safe.
pub windows_safe: bool,
/// Remove characters that may be problematic in the usual places in URLs. Default `false`.
///
/// If you want something URL-safe, consider slugifying instead (see below).
///
/// This removes any character that is not what’s called a [*URL code point*], also removes the
/// characters `&`, `/` and `?`, and forbids the names `.` and `..` which have a special
/// meaning in paths. The result is either an empty string, or suitable for use as a path
/// component, query string value or fragment, without generally *needing* percent-encoding:
/// such a URL will be correctly parsed by a WHATWG URL Standard parser, though nominally
/// invalid¹, but older or poorer-quality URL parsers may need percent-encoding to cope with
/// the non-ASCII that is retained.
///
/// Some notable characters that are removed: `/`, `\`, `%`, `?`, `#`, `&`, `"`, and space.
///
/// Almost all non-ASCII is retained.
///
/// Notes on using these URLs in some common formats:
///
/// - In HTML, no escaping is needed in `<a href="http://www.example/fïle_ñamê">`, because `&`
/// and `"` are the only two characters needing escaping in a double-quoted attribute value,
/// and both are removed by `url_safe`.
///
/// - In plain text formats following the longstanding convention of angle bracket delimitation
/// (`<http://www.example/lïke_τhis>`), no escaping should be required as `>` is removed by
/// `url_safe`. This includes Markdown. However, some such parsers could be stricter about
/// what’s allowed inside the angle brackets, so you may need or want to use a URL Standard
/// serialiser to do percent-encoding of the non-ASCII.
///
/// - In Markdown `[text](href)` links, you’ll want to manually percent-encode `(` to `%28` and
/// `)` to `%29`. This is yet another bad choice in Markdown’s technical foundation:
/// parentheses aren’t percent-encoded, never have been; so using a URL Standard serialiser
/// won’t help you, you’ll instead need to manually encode them, or unpaired parentheses will
/// break the link and possibly eat your laundry².
///
/// Given that this produces nominally-invalid URLs, you may be wondering why to bother at all;
/// it really comes down to characters like `?`, `/` and `#`: you *can* include them in paths
/// by percent-encoding, but it’s too likely that *somewhere* along the way, *something* will
/// mangle your path, not encoding it properly, and everything will break—basically the entire
/// *system* has to process the URL correctly; ever tried a path component containing `%2F`?
/// But if you’ve removed the genuinely problematic characters, then in theory things can no
/// longer go wrong once you’re past the parser. And being able to skip percent-encoding your
/// URLs when you know you’ll be using a proper URL parser is nice.
///
/// I deliberately haven’t provided an option for removing characters that would make a URL
/// nominally invalid (which is “non-ASCII”), because I think that goes too far: in such a
/// case, I don’t think you should *strip* such characters, but rather slugify the whole thing
/// (which can do things like `Voilà!` → `voila`).
///
/// `replace_with` is used for the character removals, but NOT for the forbidding of the names
/// `"."` and `".."`, for which it will instead yield an empty string.
///
/// —⁂—
///
/// ¹ “Invalid” is just a label in WHATWG specs; it doesn’t change anything, and parsing is
/// still well-defined, it’s generally just a hint that either you may have made a mistake,
/// or that older tools might not handle this case the same way.
///
/// ² When Americans say “eat your laundry” they mean the *clothes*. An Australian seeking to
/// express *that* concept would say “eat your washing” (and probably be looked at strangely
/// because it’s not an expression in common use). The laundry is the room in which clothes
/// are washed; so when I say injection attacks might eat your laundry——
///
/// [*URL code point*]: https://url.spec.whatwg.org/#url-code-points
pub url_safe: bool,
/// Replace all sequences of whitespace with one space. Default `true`.
///
/// This uses the Unicode `White_Space` property to decide ([`char::is_whitespace`]).
///
/// This is done in two phases:
///
/// 1. Before safety character replacements, each whitespace character is normalised to a
/// U+0020 SPACE; `replace_with` is not invoked.
///
/// 2. After all character replacements, adjacent spaces (including any produced by
/// `replace_with`, independent of `collapse_replacements`) are collapsed to just one.
pub normalise_whitespace: bool,
/// Remove spaces and full stops (`.`) from the start and end of the name. Default `true`.
///
/// `normalise_whitespace` is performed before this; with it on, this will trim all whitespace,
/// with it off it’ll only trim U+0020 SPACE.
///
/// All things that invoke `replace_with` are performed before this; thus, if you replace a
/// character with a space or full stop, that could get trimmed. `replace_with` is not invoked
/// on any characters removed by this.
///
/// If `extension_cleverness` is enabled (which it is by default), on names with an extension
/// this trims from the start and end of the base name and extension independently, rather than
/// just the start and end of the full string. That is, `" foo . bar . baz "` will become
/// `"foo . bar.baz"` with `extension_cleverness`, and `"foo . bar . baz"` without.
///
/// This is independent of `windows_safe`, which also trims trailing spaces and dots from the
/// complete name.
// BTW: U+002E is named FULL STOP, oh uncouth Americans. 😀
pub trim_spaces_and_full_stops: bool,
/// Remove a few more punctuationy characters from the start and end of the name.
/// Default `true`.
///
/// This is a more aggressive supplement to `trim_spaces_and_full_stops`, trimming from the
/// same places in the same way. These characters are removed:
///
/// - `_` (underscore; especially significant because `replace_with` defaults to an underscore)
/// - `-` (hyphen/dash/minus)
/// - `,` (comma)
/// - `;` (semicolon)
pub trim_more_punctuation: bool,
/// Remove control characters. Default `true`.
///
/// This removes all characters with the general category *Control*: C0 controls U+0000–U+001F,
/// control character U+007F, and C1 controls U+0080–U+009F.
///
/// `replace_with` is invoked on these removals.
pub remove_control_characters: bool,
/// Remove BiDi control characters that are relevant to reordering attacks. Default `true`.
///
/// <https://trojansource.codes/trojan-source.pdf> is a paper with info about the attack.
///
/// This removes U+202A–U+202E and U+2066–U+2069. It does NOT remove the remaining three
/// Bidi_Control characters U+061C, U+200E and U+200F (ALM, LRM, RLM),
/// which are not implicated in the attack and are conceivably useful in file names.
///
/// `replace_with` is invoked on these removals.
pub remove_reordering_characters: bool,
/// Where characters are removed (except as marked), replace them with this.
/// Default `Some('_')`.
///
/// If you provide a character that would normally be removed, it will not be removed: that
/// processing is done once only.
///
/// If you provide a character that would be trimmed, it may or may not be trimmed: end matches
/// will be trimmed, but start matches will only be trimmed if ridiculously long names and/or
/// extensions force unusual truncation, exposing the start of the string (so that it gets
/// trimmed to nothing).
pub replace_with: R,
/// Where multiple adjacent characters are to be replaced, only replace the first, and remove
/// any subsequent ones. Default `false`.
///
/// See also `normalise_whitespace`, which can collapse replacements if you replace with
/// whitespace.
pub collapse_replacements: bool,
/// If sanitisation would leave the path empty, return this string instead. Default `"_"`.
///
/// This exists because I found myself writing `if name.is_empty() { name.push('_') }` after
/// every time I called `sanitise`. I think most of the time you don’t want to be left with an
/// empty string, and inserting *something* is tolerable, so this is on by default as something
/// fairly neutral that aligns with the `replace_with` default as well. You can effectively
/// disable this by setting this to an empty string.
///
/// `length_limit` is not taken into account on this. If you put something ridiculously long in
/// it, you brought it on yourself and I wash my hands of it, as Pontius Pilate of old.
///
/// (Read *Ruth 3:15–17* from the Bible to understand the name of this option.)
pub six_measures_of_barley: &'static str,
}
// Implemented on just one type for inference reasons. One might wonder why I use an associated
// constant at all. This would not be an unreasonable thing to wonder.
impl Options<Option<char>> {
/// The default options. This is more useful than `Options::default()` (which just returns
/// this) because it’s const, so you can access `Options::DEFAULT.length_limit` in const
/// context.
pub const DEFAULT: Self = Options {
length_limit: 255,
reserve_extra: 0,
extension_cleverness: true,
most_fs_safe: true,
windows_safe: true,
url_safe: false,
normalise_whitespace: true,
trim_spaces_and_full_stops: true,
trim_more_punctuation: true,
remove_control_characters: true,
remove_reordering_characters: true,
replace_with: Some('_'),
collapse_replacements: false,
six_measures_of_barley: "_",
};
}
impl Default for Options<Option<char>> {
fn default() -> Self {
Self::DEFAULT
}
}
impl<R: Replace> Options<R> {
    /// Swap in a different `replace_with`, changing the type parameter along the way.
    ///
    /// Every other option is carried over unchanged. This exists because struct update syntax
    /// can’t currently change types, so the following doesn’t compile (`Options::DEFAULT` is an
    /// `Options<Option<char>>`, whereas the closure calls for an `Options<[closure@…]>`):
    ///
    /// ```rust,ignore
    /// Options { replace_with: |c| /* … */, ..Options::DEFAULT }
    /// ```
    ///
    /// Write one of these instead:
    ///
    /// ```rust,ignore
    /// Options::DEFAULT.with_replace_with(|c| /* … */)
    /// Options { /* … */, ..Options::DEFAULT }.with_replace_with(|c| /* … */)
    /// ```
    ///
    /// On nightly rustc you can alternatively try the [incomplete type-changing-struct-update
    /// feature](https://github.com/rust-lang/rust/issues/86555), which lets the first form work
    /// (so long as that unstable and incomplete feature is working):
    ///
    /// ```rust,ignore
    /// #![feature(type_changing_struct_update)]
    /// use sanitise_file_name::Options;
    ///
    /// fn main() {
    ///     Options { replace_with: |c| /* … */, ..Options::DEFAULT }
    /// }
    /// ```
    pub fn with_replace_with<R2: Replace>(self, new_replace_with: R2) -> Options<R2> {
        // Take `self` apart field by field; the previous replacer is simply discarded.
        let Options {
            length_limit,
            reserve_extra,
            extension_cleverness,
            most_fs_safe,
            windows_safe,
            url_safe,
            normalise_whitespace,
            trim_spaces_and_full_stops,
            trim_more_punctuation,
            remove_control_characters,
            remove_reordering_characters,
            replace_with: _,
            collapse_replacements,
            six_measures_of_barley,
        } = self;

        Options {
            length_limit,
            reserve_extra,
            extension_cleverness,
            most_fs_safe,
            windows_safe,
            url_safe,
            normalise_whitespace,
            trim_spaces_and_full_stops,
            trim_more_punctuation,
            remove_control_characters,
            remove_reordering_characters,
            replace_with: new_replace_with,
            collapse_replacements,
            six_measures_of_barley,
        }
    }
}
/// See [`Options::replace_with`].
pub trait Replace {
// “Why no *string* replacement?” I hear you ask.
// Because then I couldn’t guarantee one allocation.
/// Decide what becomes of a character that is being removed: return `None` to drop it
/// entirely, or `Some(c)` to put `c` in its place.
fn replace(&self, char_being_removed: char) -> Option<char>;
}
/// A fixed replacement, the same for every removed character:
/// `None` removes the character without replacing it,
/// `Some(c)` replaces the character with `c`.
impl Replace for Option<char> {
    fn replace(&self, _removed: char) -> Option<char> {
        // The answer does not depend on which character is being removed.
        self.as_ref().copied()
    }
}
/// Any `Fn(char) -> Option<char>` can act as a replacer: it is called with the character that
/// is being removed, and if it returns a character, that character is used as the replacement.
impl<F> Replace for F
where
    F: Fn(char) -> Option<char>,
{
    fn replace(&self, removed: char) -> Option<char> {
        (self)(removed)
    }
}
/// Whether `c` is acceptable on just about any file system: anything but `/` and NUL.
fn is_most_fs_safe_char(c: char) -> bool {
    !matches!(c, '/' | '\0')
}
/// Whether `c` is in the *URL code point* set, minus `&`, `/` and `?`.
fn is_url_safe_char(c: char) -> bool {
    // From the URL Standard: the URL code points are ASCII alphanumeric, U+0021 (!), U+0024 ($),
    // U+0026 (&), U+0027 ('), U+0028 LEFT PARENTHESIS, U+0029 RIGHT PARENTHESIS, U+002A (*),
    // U+002B (+), U+002C (,), U+002D (-), U+002E (.), U+002F (/), U+003A (:), U+003B (;),
    // U+003D (=), U+003F (?), U+0040 (@), U+005F (_), U+007E (~), and code points in the range
    // U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
    //
    // From the Infra Standard: a noncharacter is a code point in the range U+FDD0 to U+FDEF,
    // inclusive, or the last two code points of any plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, …,
    // U+10FFFE, U+10FFFF).
    //
    // Surrogates need no handling: the `char` data type already excludes them.

    // `&`, `/` and `?` are URL code points but are deliberately left out here.
    let allowed_ascii = c.is_ascii_alphanumeric()
        || matches!(
            c,
            '!' | '$' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' | '.' | ':' | ';' | '=' | '@'
                | '_' | '~'
        );

    // U+00A0 to U+10FFFD, with the noncharacter block U+FDD0–U+FDEF cut out of the middle.
    let allowed_non_ascii = matches!(c, '\u{a0}'..='\u{fdcf}' | '\u{fdf0}'..='\u{10fffd}');

    // The remaining noncharacters, U+??FFFE and U+??FFFF.
    let ends_a_plane = (c as u32) & 0xfffe == 0xfffe;

    (allowed_ascii || allowed_non_ascii) && !ends_a_plane
}
/// Whether `c` may appear in a Windows file name: not one of `< > : " / \ | ? *`, and not a
/// control character in U+0000–U+001F or U+007F.
fn is_windows_safe_char(c: char) -> bool {
    let forbidden_punctuation = matches!(c, '<' | '>' | ':' | '"' | '/' | '\\' | '|' | '?' | '*');
    let control = c <= '\u{1f}' || c == '\u{7f}';
    !(forbidden_punctuation || control)
}
/// Whether `c` is U+0020 SPACE or U+002E FULL STOP.
fn is_space_or_full_stop(c: char) -> bool {
    c == ' ' || c == '.'
}
/// Whether `c` is one of the extra trimmable punctuation characters: `_`, `-`, `,` or `;`.
fn is_more_punctuation_character(c: char) -> bool {
    ['_', '-', ',', ';'].contains(&c)
}
/// Whether `c` lies in U+202A–U+202E or U+2066–U+2069.
fn is_reordering_character(c: char) -> bool {
    let in_first_range = ('\u{202A}'..='\u{202E}').contains(&c);
    let in_second_range = ('\u{2066}'..='\u{2069}').contains(&c);
    in_first_range || in_second_range
}
/// Whether `name` is, ignoring ASCII case, exactly one of CON, PRN, AUX, NUL, COM1–COM9 or
/// LPT1–LPT9.
fn is_reserved_windows_file_name(name: &str) -> bool {
    const THREE_LETTER_NAMES: [&[u8]; 4] = [b"CON", b"PRN", b"AUX", b"NUL"];
    const NUMBERED_STEMS: [&[u8]; 2] = [b"COM", b"LPT"];

    // Work on bytes so that non-ASCII input can never be split inside a character.
    let bytes = name.as_bytes();
    match bytes.len() {
        3 => THREE_LETTER_NAMES
            .iter()
            .any(|reserved| bytes.eq_ignore_ascii_case(reserved)),
        4 => {
            let (stem, digit) = bytes.split_at(3);
            // Only the digits 1 through 9 count; zero does not.
            (b'1'..=b'9').contains(&digit[0])
                && NUMBERED_STEMS
                    .iter()
                    .any(|reserved| stem.eq_ignore_ascii_case(reserved))
        }
        _ => false,
    }
}
/// Split a name at its last `.` into `(base name, extension)`, the dot itself being dropped.
/// Returns `None` if the name contains no dot. Either part (or both) may be empty.
fn split_extension(input: &str) -> Option<(&str, &str)> {
    input.rsplit_once('.')
}
/// Sanitise a file name using [`Options::DEFAULT`].
/// See [`Options`] for a description of what all the options do.
///
/// The result should be suitable as a file name, and is never empty: a name that *would* come
/// out empty becomes `_` instead, per the default of [`Options::six_measures_of_barley`].
#[cfg(feature = "alloc")]
#[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
pub fn sanitise(s: &str) -> String {
    let defaults = Options::DEFAULT;
    sanitise_with_options(s, &defaults)
}
/// Calculate an allocation size that is sufficient for sanitising this particular `input`.
///
/// Returns 0 if `options.length_limit` is less than 10. Otherwise, provided
/// `options.six_measures_of_barley` is no longer than `input.len() + 1` bytes, the result never
/// exceeds `input.len() + 1 + options.reserve_extra`, and it will be less on ridiculously long
/// inputs.
///
/// Only intended for use by crazy allocation-counters like me.
pub fn sufficient_alloc_size<R: Replace>(input: &str, options: &Options<R>) -> usize {
    let limit = options.length_limit;
    if limit < 10 {
        return 0;
    }

    let underscore = '_'.len_utf8();
    let fallback_len = options.six_measures_of_barley.len();

    let split = if options.extension_cleverness {
        split_extension(input)
    } else {
        None
    };

    match split {
        Some((base_name, extension)) => {
            // Reserved Windows names are three or four bytes long, and gain an underscore.
            let with_underscore = |len: usize| match len {
                3 | 4 if options.windows_safe => len + underscore,
                _ => len,
            };
            // Space needed *while working*: the base name and the extension are each capped
            // separately. No reserve_extra here, as that is only needed once we’re done.
            let working = with_underscore(base_name.len()).min(limit)
                + '.'.len_utf8()
                + extension.len().min(limit - 6);
            // Space needed for the finished name (or its fallback), plus the requested reserve.
            let finished = with_underscore(input.len()).min(limit).max(fallback_len)
                + options.reserve_extra;
            working.max(finished)
        }
        None => {
            let underscore_allowance = if options.windows_safe { underscore } else { 0 };
            (input.len().min(limit) + underscore_allowance).max(fallback_len)
                + options.reserve_extra
        }
    }
}
// A const stand-in for <usize as Ord>::max, which alas isn’t const.
const fn max(a: usize, b: usize) -> usize {
    if b >= a {
        b
    } else {
        a
    }
}
// Shared body of `max_alloc_size` and `max_alloc_size_const`, written as a macro so that it can be
// expanded inside both const and non-const functions. `$options` names a reference to an
// `Options`; the expansion evaluates to a `usize`, and contains an early `return 0`, so it must be
// expanded directly inside a function returning `usize`.
//
// All additions saturate: a `length_limit` of `usize::MAX` is a legitimate way of saying “no
// limit”, and plain `+` would overflow on it (panicking in debug builds, and wrapping around to
// a uselessly small size in release builds).
macro_rules! max_alloc_size_body {
    ($options:ident) => {{
        // Limits below 10 are unsupported and always produce an empty string.
        if $options.length_limit < 10 {
            return 0;
        }
        // The finished name (or the fallback used in place of an empty one), plus the
        // requested spare capacity.
        let baseline = max($options.length_limit, $options.six_measures_of_barley.len())
            .saturating_add($options.reserve_extra);
        if $options.extension_cleverness {
            // While working, the base name and the extension are capped separately, so the
            // buffer may briefly hold a full-length base name, the dot, and the longest
            // extension that is deemed salvageable.
            let extension_length_limit = $options.length_limit - 6;
            let working = $options
                .length_limit
                .saturating_add('.'.len_utf8())
                .saturating_add(extension_length_limit);
            max(working, baseline)
        } else {
            baseline
        }
    }};
}
#[cfg(not(feature = "const-fn-trait-bound"))]
/// Calculate the maximum allocation size required for a given set of options, to correctly handle
/// any input.
///
/// Returns 0 if `options.length_limit` is less than 10 (the minimum supported value).
///
/// This is intended for the scratch buffer approach, where you keep one string around and keep on
/// sanitising a whole bunch of inputs into it in turn, or for array-allocated strings like with
/// `tinyvec_string`.
///
/// This is unfortunately not currently a const fn. If you need a const fn (e.g. to craft a
/// precisely-sized `ArrayString`), you may:
///
/// 1. Enable the `const-fn-trait-bound` feature on this crate (requires nightly rustc), which will
/// change this function to be const, or
///
/// 2. Use [`max_alloc_size_const`] instead, which requires `R = Option<char>`.
/// (There’s also a `tinyvec_string` usage demonstration there.)
pub fn max_alloc_size<R: Replace>(options: &Options<R>) -> usize {
max_alloc_size_body!(options)
}
#[cfg(feature = "const-fn-trait-bound")]
/// Calculate the maximum allocation size required for a given set of options, to correctly handle
/// any input.
///
/// Returns 0 if `options.length_limit` is less than 10 (the minimum supported value).
///
/// This is intended for the scratch buffer approach, where you keep one string around and keep on
/// sanitising a whole bunch of inputs into it in turn, or for array-allocated strings like with
/// `tinyvec_string`.
///
/// This is a const fn because this crate was compiled with the `const-fn-trait-bound` feature
/// enabled (which requires nightly rustc at the time of writing).
///
/// See also [`max_alloc_size_const`] for an example of using this with `tinyvec_string`.
pub const fn max_alloc_size<R: Replace>(options: &Options<R>) -> usize {
max_alloc_size_body!(options)
}
/// A `const` variant of [`max_alloc_size`], available only for `Options<Option<char>>`.
///
/// Returns 0 if `options.length_limit` is less than 10 (the minimum supported value).
///
/// Sample usage, combined with `tinyvec_string` (with its `rustc_1_55` feature enabled):
///
/// ```rust,ignore
/// use tinyvec_string::ArrayString;
/// let mut string =
/// ArrayString::<[u8; max_alloc_size_const(&Options::DEFAULT)]>::new();
/// sanitise_to("input name", &Options::DEFAULT, &mut string);
/// ```
///
/// Once `const-fn-trait-bound` is stabilised, this method will be deprecated.
#[cfg_attr(feature = "const-fn-trait-bound", doc = "\n \
Since you compiled this crate with the `const-fn-trait-bound` feature, you don’t need this
method. Be cheerful and use `max_alloc_size` instead!")]
pub const fn max_alloc_size_const(options: &Options<Option<char>>) -> usize {
max_alloc_size_body!(options)
}
/// Sanitise a file name into a freshly allocated `String`.
/// See [`Options`] for a description of what all the options do.
///
/// The result should be suitable as a file name for the specified options, unless it’s empty,
/// which can only happen if the option `six_measures_of_barley` is empty (or, actually, if the
/// `length_limit` option is illegally small).
#[cfg(feature = "alloc")]
#[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
pub fn sanitise_with_options<R: Replace>(input: &str, options: &Options<R>) -> String {
    let capacity = sufficient_alloc_size(input, options);
    let mut out = String::with_capacity(capacity);

    #[cfg(test)]
    let initial_capacity = out.capacity();

    sanitise_to(input, options, &mut out);

    // I’m serious about this making exactly one allocation. No reallocating allowed.
    #[cfg(test)]
    assert!(
        initial_capacity == out.capacity(),
        "Capacity changed from {initial_capacity} to {} (on {:?} → {:?})",
        out.capacity(), input, out
    );

    out
}
/// A target for sanitisation: essentially the subset of `String` functionality used.
///
/// It might have been nice to use something like `Read + Write + Seek` instead, but the need to
/// delete things after writing means that you need still more, and in the end it’s much easier to
/// treat it as a string.
///
/// I’ve provided implementations for `String` (if the *alloc* feature is enabled, which it is by
/// default) and `tinyvec_string::ArrayString` (if the *tinyvec_string* feature is enabled),
/// but there’s nothing preventing you from implementing it on other similar string types.
///
/// Every method is expected to behave like the `String` method of the same name; all lengths and
/// ranges are byte offsets into the UTF-8 contents. Beyond these methods, sanitisation relies on
/// the supertraits: slicing by `Range`/`RangeFrom` to get a `str`, `Deref<Target = str>` for
/// things like `len()`, and `Extend<char>` for bulk appending.
pub trait Stringy:
Index<Range<usize>, Output = str> +
Index<RangeFrom<usize>, Output = str> +
Deref<Target = str> +
Extend<char>
{
/// Append a single character to the end, like `String::push`.
fn push(&mut self, ch: char);
/// Append a string slice to the end, like `String::push_str`.
fn push_str(&mut self, string: &str);
/// Remove and return the last character, or `None` if the string is empty, like `String::pop`.
fn pop(&mut self) -> Option<char>;
/// Shorten the string to `new_len` bytes, like `String::truncate`.
fn truncate(&mut self, new_len: usize);
/// Replace the given byte range with `replace_with`, like `String::replace_range`.
/// (Sanitisation only ever uses this to delete a range, passing an empty replacement.)
fn replace_range<R>(&mut self, range: R, replace_with: &str) where R: RangeBounds<usize>;
}
// `String` already has every operation `Stringy` asks for as an inherent method, so each trait
// method forwards to its namesake. The calls are spelled `String::method(self, …)` to make it
// plain that the inherent method is the one being called.
#[cfg(feature = "alloc")]
#[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
impl Stringy for String {
    #[inline]
    fn push(&mut self, ch: char) {
        String::push(self, ch);
    }

    #[inline]
    fn push_str(&mut self, string: &str) {
        String::push_str(self, string);
    }

    #[inline]
    fn pop(&mut self) -> Option<char> {
        String::pop(self)
    }

    #[inline]
    fn truncate(&mut self, new_len: usize) {
        String::truncate(self, new_len);
    }

    #[inline]
    fn replace_range<R>(&mut self, range: R, replace_with: &str)
    where
        R: RangeBounds<usize>,
    {
        String::replace_range(self, range, replace_with);
    }
}
// Each trait method forwards to the `ArrayString` method of the same name.
// NOTE(review): this presumably relies on those being inherent methods of `ArrayString` (which
// method-call syntax prefers over trait methods); if one were ever missing, the call would resolve
// to the trait method itself and recurse. Worth confirming when upgrading `tinyvec_string`.
#[cfg(feature = "tinyvec_string")]
#[cfg_attr(docsrs, doc(cfg(feature = "tinyvec_string")))]
impl<A: tinyvec_string::bytearray::ByteArray> Stringy for tinyvec_string::ArrayString<A> {
#[inline] fn push(&mut self, ch: char) { self.push(ch) }
#[inline] fn push_str(&mut self, string: &str) { self.push_str(string) }
#[inline] fn pop(&mut self) -> Option<char> { self.pop() }
#[inline] fn truncate(&mut self, new_len: usize) { self.truncate(new_len) }
#[inline] fn replace_range<R>(&mut self, range: R, replace_with: &str)
where R: RangeBounds<usize>
{ self.replace_range(range, replace_with) }
}
/// Sanitise a file name into an existing `String`. Intended for power users only.
///
/// When you use [`sanitise`] or [`sanitise_with_options`], the perfect allocation is artisanally
/// crafted (or something). If you use this carelessly, you may actually cause *more* allocations
/// to be made, rather than less. You may therefore wish to use [`sufficient_alloc_size`] in some
/// cases to calculate how much more to reserve ahead of time.
///
/// See [`Options`] for a description of what all the options do.
///
/// After calling this, `out` will be the same length or longer, never shorter. If you want to know
/// *how much* longer, store and compare the length yourself.
///
/// Whatever `out` already contains is left untouched: the sanitised name is appended after it, and
/// all trimming and truncation is confined to the appended part.
///
/// If `options.length_limit` is less than 10, nothing at all is appended (not even
/// `six_measures_of_barley`).
///
/// With a fixed-capacity `out` that is too small for the intermediate result, the underlying
/// string type decides what happens; the tests demonstrate a panic for
/// `tinyvec_string::ArrayString`.
pub fn sanitise_to<R: Replace, S: Stringy>(input: &str, options: &Options<R>, out: &mut S) {
// Byte length of the pre-existing content. Everything below only reads or modifies
// `out[protected..]`.
let protected = out.len();
// I said in the docs don’t set it to less than 10, but without this zero leads to some
// unreachable!() being reached, which is æsthetically displeasing, so I’m just going to return
// empty strings for unreasonably small length limits. 🙂
if options.length_limit < 10 {
return;
}
// When label-break-value stabilises I’ll switch to that, but until then, loop it is.
#[allow(clippy::never_loop)]
loop { // breaks after exactly one iteration.
if options.extension_cleverness {
if let Some((base_name, extension)) = split_extension(input) {
// With extension-awareness, when the path exceeds length_limit, we prefer
// to truncate from the base name rather than from the extension. But we don’t
// know how much we’ll need to truncate until we’ve finished processing the
// extension, so we have to allocate a maximum of roughly twice as much as
// we’ll end up needing.
//
// For implementation convenience in this corner case, we’ve declared a maximum
// extension length of six less than length_limit (explained on
// Options::extension_cleverness).
let extension_length_limit = options.length_limit - 6;
sanitise_part(base_name, options, options.length_limit, false, out, protected);
// Byte length of the sanitised base name, including any underscore that was appended
// to defuse a reserved Windows name.
let base_len = out.len() - protected;
out.push('.');
let extension_truncated = sanitise_part(
extension,
options,
extension_length_limit,
true,
out,
// It’s OK for trimming to take out the entire file name (so we’re deliberately
// not including base_len in this), but we mustn’t touch what’s not ours.
protected,
);
// Byte length of everything written so far: base name, dot and extension.
let mut total_len = out.len() - protected;
if total_len > options.length_limit {
if extension_truncated {
// Extension is unsalvageable: truncate from the end.
while total_len > options.length_limit {
match out.pop() {
Some(c) => total_len -= c.len_utf8(),
None => unreachable!(),
}
}
// Length is now acceptable, but that could have left us with
// undesirable trailing characters, so run trim again.
out.truncate(protected + trim_end(&out[protected..], options, true).len());
} else {
// (Sigh. Whose brilliant idea was it to try to preserve extensions anyway?
// Maybe if I’d realised the pain it’d cause I wouldn’t have bothered.
// It’s not like anyone *else* does it. But it is good, say I. And now I’m
// even mulling over grapheme-cluster-aware truncation. Am I mad?)
//
// The extension fits, so the excess comes out of the tail of the base name. First
// walk backwards over whole characters of the base name (without modifying `out`
// yet) until enough bytes have been dropped…
let base_name_end_index = base_len;
let mut base_chars = out[protected..protected + base_name_end_index].chars();
while total_len > options.length_limit {
match base_chars.next_back() {
Some(c) => {
total_len -= c.len_utf8();
}
None => unreachable!(),
}
}
// …then end-trim what remains of the base name, so the cut doesn’t leave undesirable
// trailing characters just before the dot…
let base_name = trim_end(
&out[protected..protected + base_chars.as_str().len()],
options,
false,
);
// …and finally delete everything between the trimmed base name and the dot in one go.
let range = protected + base_name.len()..protected + base_name_end_index;
out.replace_range(range, "");
}
}
break;
}
}
// Extension cleverness disabled, or no extension found: the much simpler path!
sanitise_part(input, options, options.length_limit, false, out, protected);
break;
}
// Finally one last bit of processing: checking names that are all dots
// (though normally windows_safe will already have truncated them to zero).
if (options.url_safe && (&out[protected..] == "." || &out[protected..] == "..")) ||
(options.most_fs_safe && out[protected..].chars().all(|c| c == '.'))
{
out.truncate(protected);
}
// Nothing survived sanitisation: fall back to the configured placeholder (which may itself be
// empty).
if out[protected..].is_empty() {
out.push_str(options.six_measures_of_barley);
}
}
// Sanitise one part of a name (the whole name, a base name, or an extension) and append it to `out`.
//
// - `input`: the raw text of this part.
// - `length_limit`: the most bytes this call may append from `input`. (The reserved-name
//   underscore, if any, is added after this limit has been applied.)
// - `is_extension`: whether this part is an extension; this affects end-trimming (see `trim_end`)
//   and disables the reserved Windows name check.
// - `protected`: byte offset in `out` before which nothing may be touched. End-trimming and the
//   reserved-name check both look at the whole of `out[protected..]`, not just at what this call
//   appended.
//
// Returns true if at least one character was dropped because it would have exceeded
// `length_limit`.
fn sanitise_part<R: Replace, S: Stringy>(
input: &str,
options: &Options<R>,
length_limit: usize,
is_extension: bool,
out: &mut S,
protected: usize,
) -> bool {
// Bytes appended by this call (counted before any end-trimming).
let mut len = 0;
let mut did_truncate = false;
// State for collapsing runs of removed characters, and runs of spaces, respectively.
let mut last_was_remove = false;
let mut last_was_whitespace = false;
out.extend(input.chars()
// Stage 1: optionally turn all whitespace into a plain space, and decide whether each
// character is to be removed under the enabled options.
.map(|mut c| {
c = if options.normalise_whitespace && c.is_whitespace() { ' ' } else { c };
(c,
(options.most_fs_safe && !is_most_fs_safe_char(c)) ||
(options.windows_safe && !is_windows_safe_char(c)) ||
(options.url_safe && !is_url_safe_char(c)) ||
(options.remove_control_characters && c.is_control()) ||
(options.remove_reordering_characters && is_reordering_character(c))
)
})
// Stage 2: substitute removed characters (the replacer may also yield nothing), optionally
// emitting only one replacement per run of removed characters, then collapse runs of spaces.
.filter_map(|(c, remove)| {
if options.collapse_replacements {
if remove && last_was_remove {
return None;
}
last_was_remove = remove;
}
// The `.filter` below is `Option::filter`, applied to the result of the whole if/else, so
// replacement characters go through the space-collapsing check too.
if remove { options.replace_with.replace(c) } else { Some(c) }
.filter(|&c| {
if options.normalise_whitespace {
let is_whitespace = c == ' ';
let drop = last_was_whitespace && is_whitespace;
last_was_whitespace = is_whitespace;
!drop
} else {
true
}
})
})
// Stage 3: trim the start. This happens before the length limit, so leading characters
// that get trimmed don’t count towards it.
.skip_while(|&c| {
(options.trim_spaces_and_full_stops && is_space_or_full_stop(c)) ||
(options.trim_more_punctuation && is_more_punctuation_character(c))
})
// Stage 4: stop at the first character that would push this part past `length_limit`
// bytes, remembering that something was cut off.
.take_while(|&c| {
let new_len = len + c.len_utf8();
if new_len <= length_limit {
len = new_len;
true
} else {
did_truncate = true;
false
}
})
);
if len > 0 {
// We’ve added something non-trimmed, that’ll guard the potential reserved name underscore.
out.truncate(protected + trim_end(&out[protected..], options, is_extension).len());
}
if !is_extension && options.windows_safe && is_reserved_windows_file_name(&out[protected..]) {
// This underscore looks to be in danger of being end-trimmed,
// but in practice we’ve ensured that it won’t be
// (except maybe one case with a lower length_limit than permitted).
out.push('_');
}
// Whew. Finally done. Breathe a sigh of relief.
did_truncate
}
// Return `out` minus any trailing characters that the options say must not end a name.
//
// Spaces and full stops are stripped if `trim_spaces_and_full_stops` is set, or if `windows_safe`
// is set and this is either an extension or a name handled without extension cleverness.
// Further punctuation is stripped if `trim_more_punctuation` is set.
fn trim_end<'a, R: Replace>(out: &'a str, options: &Options<R>, is_extension: bool) -> &'a str {
    let windows_end_matters =
        options.windows_safe && (is_extension || !options.extension_cleverness);
    let strip_spaces_and_full_stops = options.trim_spaces_and_full_stops || windows_end_matters;
    let strip_more_punctuation = options.trim_more_punctuation;

    let is_trimmable = |c: char| {
        (strip_spaces_and_full_stops && is_space_or_full_stop(c))
            || (strip_more_punctuation && is_more_punctuation_character(c))
    };
    out.trim_end_matches(is_trimmable)
}
// A concession to those poor Americans et al. 😀
#[cfg(feature = "alloc")]
#[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
pub use sanitise as sanitize;
#[cfg(feature = "alloc")]
#[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
pub use sanitise_with_options as sanitize_with_options;
pub use sanitise_to as sanitize_to;
// How did this get to almost a thousand lines by this point? I’m sure I started out with only
// twenty or so. But then I got careful about allocations, and added extension cleverness, and
// added more features, and documented exhaustingly, and oops, a thousand lines, lines that are
// convoluted at times. Well, I succeeded in all my *functional* goals, with better precision,
// theoretically better but untested speed, better behaviour around extensions, single-allocation
// and even *no*-allocation operation; but utterly lost sight of simple and obviously-correct code.
// Was it worth it? Eh, probably.
// --- Tests ---
#[cfg(feature = "alloc")]
#[test]
fn test_length_limit_things() {
    // These were written before the matrix test existed. They overlap with it, but they’re cheap
    // to keep around.
    let short = Options {
        length_limit: 10,
        ..Options::DEFAULT
    };
    let short_not_windows_safe = Options {
        windows_safe: false,
        ..short
    };

    // (input, options, expected output)
    let with_options = [
        ("abcdef.ghij", &short, "abcde.ghij"),
        // Unsalvageable extension
        ("abcde.fghij", &short, "abcde.fghi"),
        // Windows reserved name protection
        ("AUX.abcdef", &short, "AUX_.abcd"),
        ("AUX.abcdef", &short_not_windows_safe, "AUX.abcd"),
        ("lpT7.abcdef", &short, "lpT7_.abcd"),
        ("cOm6.abcdef", &short_not_windows_safe, "cOm6.abcd"),
    ];
    for (input, options, expected) in with_options {
        assert_eq!(sanitise_with_options(input, options), expected);
    }

    // (input, expected output), all with the default options.
    let with_defaults = [
        ("CON", "CON_"),
        ("aux.h", "aux_.h"),
        ("Lpt1.exe", "Lpt1_.exe"),
        ("xyz", "xyz"),
        ("", "_"),
        ("nül", "nül"),
        ("COM1.jpg.png", "COM1.jpg.png"),
    ];
    for (input, expected) in with_defaults {
        assert_eq!(sanitise(input), expected);
    }
}
// The big one: run every line of each input set (`tests/{set}.txt`) through a matrix of option
// sets. For each combination this
//
// - checks the length limit and the reserved Windows name protection on every output;
// - checks that `sanitise_to` (into a prefixed scratch `String`, and into an `ArrayString` when
//   that feature is enabled) agrees with `sanitise_with_options`, without reallocating;
// - checks whether sanitising the output again changes it (an “unsteady state”); and
// - writes the outputs and their capacities to `tests/{set}.{options}.sanitised` and
//   `tests/{set}.{options}.capacity`, which are then compared with the committed versions via
//   `git diff --exit-code`.
#[cfg(feature = "alloc")]
#[test]
fn matrix() {
    // Look, I know I said I didn’t want std, but I *need* it for these tests, y’see?
    #[cfg(not(feature = "std"))]
    extern crate std;
    use std::prelude::rust_2021::*;
    use std::fmt::Write;
    use std::{eprintln, println, format, vec};

    // Run one (input set, option set) combination; see the comment on `matrix` for what is checked.
    fn case<
        R: Replace,
        #[cfg(feature = "tinyvec_string")]
        A: tinyvec_string::bytearray::ByteArray,
    >(
        set_name: &'static str,
        options_name: &'static str,
        options: &Options<R>,
        paths: &mut Vec<String>,
        unsteady_state: &mut Vec<(&'static str, &'static str, String, String, String)>,
        // Apparently you can’t do `case::<#[cfg] A>()`, so we have to do this instead.
        #[cfg(feature = "tinyvec_string")]
        _: std::marker::PhantomData<A>,
    ) {
        println!("Sanitising {set_name} with options {options_name}");
        #[cfg(feature = "tinyvec_string")]
        let mut array_string = tinyvec_string::ArrayString::<A>::new();
        let mut sanitised = String::new();
        let mut capacity = String::new();
        let mut scratch = if options_name == "passthrough" {
            // “memory allocation of 18446744073709551615 bytes failed” 😀
            String::new()
        } else {
            String::with_capacity(max_alloc_size(options) + 1)
        };
        let scratch_size = scratch.capacity();
        for input in std::fs::read_to_string(format!("tests/{set_name}.txt")).unwrap().lines() {
            let output = sanitise_with_options(input, options);

            // A couple of sanity checks make sense here.
            if output.len() > options.length_limit {
                panic!(
                    "Input {input} sanitised to {output}, which at {len} is greater than the allowed {max}",
                    len = output.len(),
                    max = options.length_limit,
                );
            }
            if options.windows_safe && is_reserved_windows_file_name(
                options.extension_cleverness.then(|| &*output)
                    .and_then(split_extension)
                    .map(|(base, _)| base)
                    .unwrap_or(&output)
            ) {
                panic!("Input {input} sanitised to {output}, which is a reserved Windows file name");
            }

            // And ensure sanitise_to is working properly also.
            scratch.truncate(0);
            scratch.push('.'); // A trimmable character, and not six_measures_of_barley.
            sanitise_to(input, options, &mut scratch);
            assert_eq!(scratch.chars().next(), Some('.'));
            assert_eq!(scratch[1..], output);

            #[cfg(feature = "tinyvec_string")]
            {
                if array_string.capacity() > 0 {
                    array_string.truncate(0);
                    array_string.push(' '); // A trimmable character, and not six_measures_of_barley.
                    sanitise_to(input, options, &mut array_string);
                    assert_eq!(array_string.chars().next(), Some(' '));
                    assert_eq!(array_string[1..], output);
                }
            }

            sanitised.push_str(&output);
            sanitised.push('\n');
            let _ = writeln!(capacity, "{}", output.capacity());
            if input != output {
                if options_name == "passthrough" {
                    unsteady_state.push(
                        (set_name, options_name, input.to_owned(), output.clone(), output),
                    );
                } else {
                    let repeated = sanitise_with_options(&output, options);
                    if repeated != output {
                        sanitised.push_str("⚠ Sanitisation did not reach a steady state. Next line shows the effect of resanitising the line above. ⚠\n");
                        sanitised.push_str(&repeated);
                        sanitised.push('\n');
                        unsteady_state.push(
                            (set_name, options_name, input.to_owned(), output, repeated),
                        );
                    }
                }
            }
        }
        let sanitised_name = format!("tests/{set_name}.{options_name}.sanitised");
        let capacity_name = format!("tests/{set_name}.{options_name}.capacity");
        std::fs::write(&sanitised_name, sanitised).unwrap();
        std::fs::write(&capacity_name, capacity).unwrap();
        paths.push(sanitised_name);
        paths.push(capacity_name);
        if options_name != "passthrough" {
            assert_eq!(scratch_size, scratch.capacity(), "scratch buffer reallocated");
        }
    }

    let mut unsteady_state = vec![];
    let mut paths = vec![];
    let d = Options::DEFAULT;
    for name in ["blns", "misc"] {
        macro_rules! case {
            // On $array_size: I tried using roughly { max_alloc_size_const(&options) + 1 },
            // but threading it all through was just too painful, especially in the absence of
            // const-fn-trait-bound. So I’ll just do one separate test for that.
            ($array_size:literal, $options_name:expr, $options:expr) => {
                let options = &$options;
                // +1 for the ' ' we prefix.
                let required_size = max_alloc_size(options).saturating_add(1);
                assert!($array_size == 0 || required_size <= $array_size,
                    "Test case design error: array being given {} bytes, but {} are needed",
                    $array_size,
                    required_size
                );
                case(
                    name,
                    $options_name,
                    options,
                    &mut paths,
                    &mut unsteady_state,
                    // TODO: after https://github.com/ThatsNoMoon/tinyvec_string/issues/3 is
                    // resolved, ditch $array_size and use max_alloc_size instead.
                    #[cfg(feature = "tinyvec_string")]
                    std::marker::PhantomData::<[u8; $array_size]>,
                );
            }
        }

        // Assumption that I decline to “test” because it’d be silly:
        // sanitise(…) == sanitise_with_options(…, &Options::DEFAULT).
        case!(512, "default", d);
        case!(512, "realistic-length_limit-reduction", Options { length_limit: Options::DEFAULT.length_limit - 4, ..d });
        case!(512, "url_safe", Options { url_safe: true, ..d });
        case!(512, "silly-replace_with", Options::DEFAULT.with_replace_with(|c| char::from_u32(c as u32 + 1)));
        case!(512, "no-windows_safe", Options { windows_safe: false, ..d });
        case!(256, "no-extension_cleverness", Options { extension_cleverness: false, ..d });
        // 10 + 1
        case!(11, "short-sans-extension_cleverness", Options { length_limit: 10, extension_cleverness: false, ..d });
        // 15 + 1
        case!(16, "short", Options { length_limit: 10, ..d });
        case!(0, "passthrough", Options {
            length_limit: usize::MAX,
            reserve_extra: 0,
            extension_cleverness: false,
            most_fs_safe: false,
            windows_safe: false,
            url_safe: false,
            normalise_whitespace: false,
            trim_spaces_and_full_stops: false,
            trim_more_punctuation: false,
            remove_control_characters: false,
            remove_reordering_characters: false,
            replace_with: None,
            collapse_replacements: false,
            six_measures_of_barley: "",
        });

        macro_rules! case_only {
            ($option:ident) => {{
                let mut options = Options {
                    most_fs_safe: false,
                    windows_safe: false,
                    url_safe: false,
                    normalise_whitespace: false,
                    trim_spaces_and_full_stops: false,
                    trim_more_punctuation: false,
                    remove_control_characters: false,
                    remove_reordering_characters: false,
                    ..d
                };
                options.$option = true;
                case!(512, concat!("just-", stringify!($option)), options);
            }}
        }
        case_only!(most_fs_safe);
        case_only!(windows_safe);
        case_only!(url_safe);
        case_only!(normalise_whitespace);
        case_only!(trim_spaces_and_full_stops);
        case_only!(trim_more_punctuation);
        case_only!(remove_control_characters);
        case_only!(remove_reordering_characters);
        // Eh, I’m bored now. That’ll do.
    }

    let mut complain_of_unsteady_states = false;
    for (set, options, original, first, second) in &unsteady_state {
        match (*set, *options, &**original, &**first, &**second) {
            ("blns", "short", "Dr. Herman I. Libshitz", "Dr. Herman", "Dr.Herm") |
            ("blns", "short", r#"{{ "".__class__.__mro__[2].__subclasses__()[40]("/etc/passwd").read() }}"#, "{{ __.__cl", "{{.cl") => {
                // Skip known cases of unsalvageable extensions combining with dots in the base
                // name to effectively give a new extension, making quiescence take two steps.
                // Making these steady-state would take too much effort, and the harm is
                // minimal (the unsteady state is still a correctly sanitised name).
            },
            (_, "silly-replace_with", _, _, _) => {
                // Certainly this one isn’t steady-state!
            },
            (_, "passthrough", _, _, _) => {
                complain_of_unsteady_states = true;
                // Passthrough output should equal the input, so point at the input file itself
                // (tests/{set}.txt), which is what the outputs were produced from.
                eprintln!("Unsteady state in {set} with {options} options, diff tests/{set}.txt and tests/{set}.{options}.sanitised");
            },
            _ => {
                complain_of_unsteady_states = true;
                eprintln!("Unknown unsteady state in {set} with {options} options, look for the ⚠ symbol in tests/{set}.{options}.sanitised");
            },
        }
    }

    // Compare what was just written with the committed expectations.
    if !std::process::Command::new("git")
        .arg("diff")
        .arg("--exit-code")
        .arg("--text")
        .args(&paths)
        .status()
        .unwrap()
        .success()
    {
        panic!("sanitisation produced different results than are known, review the diffs");
    }

    // A guard against committing an unsteady state.
    if complain_of_unsteady_states {
        panic!("Some sanitisations unexpectedly failed to reach a steady state.");
    }
}
#[cfg(feature = "tinyvec_string")]
#[test]
#[should_panic]
fn test_tinyvec_string_panic() {
    // The other tests cover the normal cases, including ridiculously long inputs not overflowing
    // moderately sized arrays under moderate length limits. What they don’t show is the panic you
    // get when the array is simply too short for the output, so here it is. 🙂
    let mut too_short = tinyvec_string::ArrayString::<[u8; 12]>::new();
    sanitise_to("Watch me panic!", &Options::DEFAULT, &mut too_short);
}
#[cfg(feature = "tinyvec_string")]
#[test]
fn test_tinyvec_string_max_alloc_size() {
    use tinyvec_string::ArrayString;

    // An unreasonably long input, with full stops placed so that both the base name and the
    // extension are long.
    let input =
        "Watch everything being hunky dory, even when I throw unreasonably long values at it all. \
        Even when dots become extension separators; yes, even then. Then further: into the deep \
        reaches of testing, where things start to get garbled, and having written at least 255 \
        characters, I have to now write just as much of extension—horror. But that was a full \
        stop so that this could now be the extension, and I can’t put a dot for the next while› \
        ¿Whatever will I do? I suspect things are getting out of hand here, but I can’t stop now; \
        I’m running out of things to write, but it should be enough by now!";
    let expected = "Watch everything being hunky dory, even when I throw unreasonably long \
        values at it all. Even when dots become extension separators; yes, even then. Then \
        further_ into the deep reaches of testing, where things start to get garbled, and having \
        written at l";

    // Note: this is *currently* 505, but I declare that not part of the compatibility contract;
    // extension cleverness/grapheme cluster changes could lead to it increasing to 510.
    // (The explicit type on the left makes the build fail if the constant ever changes.)
    let mut string: ArrayString<[u8; 505]> =
        ArrayString::<[u8; max_alloc_size_const(&Options::DEFAULT)]>::new();
    assert_eq!(string.capacity(), 505);

    sanitise_to(input, &Options::DEFAULT, &mut string);
    assert_eq!(string, expected);
}