posix_string 0.1.0

Intelligent conversion from Unicode to POSIX portable strings
Documentation
//! Converts Unicode strings to ones containing only characters from the [POSIX Portable File Name Character
//! Set (Wikipedia)](https://en.wikipedia.org/wiki/Portable_character_set#Portable_filename_character_set).
//! Other characters are converted to the closest ASCII representation using
//! [deunicode (docs.rs)](https://docs.rs/deunicode/latest/deunicode/) where possible and removed otherwise,
//! and delimiters are automatically inserted where necessary using an algorithm described further down. The
//! converted strings may then be used as user-facing filenames or keys in systems where portability is
//! required. 
//!
//! There are primarily two APIs:
//! - `posix_string::convert`: performs a conversion from Unicode to POSIX portable characters. 
//! - `posix_string::convert_filename`: performs the same conversion as above and additionally enforces a
//! maximum length of 255 while attempting to leave the extension unchanged. 
//!
//! Each API also has a non-allocating `*_iter` variant which takes a `char` iterator as input and produces a
//! `char` iterator as output.
//!
//! ## Examples
//! ```
//! assert_eq!(posix_string::convert("Horsey 🦄🦄"), "horsey_unicorn_unicorn");
//! assert_eq!(posix_string::convert("Næstved"), "naestved");
//! assert_eq!(posix_string::convert("晒后假日"), "shai_hou_jia_ri");
//! assert_eq!(posix_string::convert("Београд - Добановци"), "beograd-dobanovtsi");
//! assert_eq!(posix_string::convert(" 🌵 . 🌵 Prickly/delimiters 🌵!"), "cactus.cactus_prickly_delimiters_cactus");
//! 
//! assert_eq!(posix_string::convert_filename("Güneş Sonrası", b"json"), "gunes_sonrasi.json");
//! assert_eq!(posix_string::convert_filename("😃-My filename-😃", b".toml"), "smiley-my_filename-smiley.toml");
//! ```
//!
//! ## Delimiter insertion algorithm
//! The goal is to insert a delimiting `_` before and after a conversion like `😃 → smiley` to ensure
//! that e.g. `晒后假日` gets converted as `shai_hou_jia_ri` and not `shaihoujiari`. However, this can't be
//! done unconditionally since the input symbol may already be surrounded by delimiters or string terminals;
//! e.g., we would otherwise get conversions like `😃.😃 → _smiley_._smiley_`. Instead of inserting a
//! delimiter directly, we therefore insert a special _marker_ character, which indicates "a delimiter is
//! needed here". A marker is then reified as a delimiting `_` if both of the following conditions are met:
//! - The next character is not a delimiter, string terminal, or marker. 
//! - The previous non-marker character is not a delimiter or string terminal.
//! 
//! The "non-marker" clauses above ensure that multiple sequential markers get reified as at most one
//! delimiter. 
//! 
//! Markers are inserted around a conversion if either of the following conditions is met:
//! - There was no viable conversion. If so, we assume that the input character was non-alphabetic and is
//! therefore best represented as a delimiter.
//! - The conversion has length > 1 and was from a non-alphabetic input character. This ensures that we're
//! not adding markers around e.g. `ä` in `aäa → aaa` or `æ` in `aæa → aaea`. 
//! 
//! Additionally, an input character is wholly replaced with a marker if it is an ASCII symbol not among the
//! allowed ones (`._-`). We do this instead of directly replacing it with an allowed delimiter since it
//! ensures that multiple sequential non-allowed symbols are replaced with at most one delimiter. E.g.,
//! `a!"#b` gets converted as `a_b` and not `a___b`. 
//!
//! Note that there are simpler ways of ensuring the same delimiter requirements by creating an intermediate
//! buffer and filtering superfluous delimiters, but this would require dynamic allocations. 

/// Converts Unicode characters to POSIX portable characters inside an iterator. 
///
/// See the [crate-level documentation](self) for more information. 
///
/// The output may be longer or shorter than the input. No internal allocations are performed. 
///
/// # Examples
/// ```
/// assert!(posix_string::convert_iter("Horsey 🦄🦄".chars()).eq("horsey_unicorn_unicorn".chars()));
/// assert!(posix_string::convert_iter("Београд - Добановци".chars()).eq("beograd-dobanovtsi".chars()));
/// ```
pub fn convert_iter(input: impl Iterator<Item = char>) -> impl Iterator<Item = char> {
    const LEGAL_SYMBOLS: &str = "._-";

    #[derive(Clone, Copy)]
    enum Cell {
        Normal(char), 
        Delimiter(char), 
        Marker, 
    }

    input
        .map(|char| (char, deunicode::deunicode_char(char)))
        .flat_map(|(char, conversion)| {
            let marker = conversion
                .is_none_or(|string| string.len() > 1 && !char.is_alphabetic())
                .then_some(Cell::Marker);
            let symbols = conversion
                .unwrap_or("")
                .chars()
                .map(|char| match char {
                    x if x.is_ascii_alphanumeric() => Cell::Normal(x.to_ascii_lowercase()),
                    x if LEGAL_SYMBOLS.contains(x) => Cell::Delimiter(x),
                    _ => Cell::Marker, 
                });
            std::iter::chain(std::iter::chain(marker, symbols), marker)
        })
        .scan((false, true), |(prev_marker, prev_delimiter), cell| {
            let (curr_marker, curr_delimiter) = match cell {
                Cell::Normal(_) => (false, false),
                Cell::Delimiter(_) => (false, true),
                Cell::Marker => (true, *prev_delimiter), 
            };
            let prev_marker = std::mem::replace(prev_marker, curr_marker);
            let prev_delimiter = std::mem::replace(prev_delimiter, curr_delimiter);
            let needs_delimiter = prev_marker && !prev_delimiter && !curr_delimiter;

            Some(match cell {
                Cell::Normal(x) => Some((needs_delimiter, x)),
                Cell::Delimiter(x) => Some((needs_delimiter, x)),
                Cell::Marker => None, 
            })
        })
        .flatten()
        .flat_map(|(needs_delimiter, char)| {
            let delimiter = needs_delimiter.then_some('_');
            let char = std::iter::once(char);
            std::iter::chain(delimiter, char)
        })
}

/// Transliterates a Unicode string into a `String` made up solely of POSIX portable filename characters.
///
/// See the [crate-level documentation](crate) for more information on the conversion.
///
/// The output may be longer or shorter than the input. This collects the output of [`convert_iter`], so
/// the only internal allocation is the returned string; use [`convert_iter`] directly for a completely
/// non-allocating variant.
///
/// # Examples
/// ```
/// assert_eq!(posix_string::convert("Horsey 🦄🦄"), "horsey_unicorn_unicorn");
/// assert_eq!(posix_string::convert("Næstved"), "naestved");
/// assert_eq!(posix_string::convert("晒后假日"), "shai_hou_jia_ri");
/// assert_eq!(posix_string::convert("Београд - Добановци"), "beograd-dobanovtsi");
/// assert_eq!(posix_string::convert(" 🌵 . 🌵 Prickly/delimiters 🌵!"), "cactus.cactus_prickly_delimiters_cactus");
/// ```
pub fn convert(input: &str) -> String {
    let portable_chars = convert_iter(input.chars());
    portable_chars.collect()
}

/// Converts Unicode characters to POSIX portable characters inside a filename iterator, and appends an
/// extension while ensuring that the output does not exceed 255 characters. 
///
/// See the [crate-level documentation](self) for more information on the conversion.
///
/// If characters must be removed, they are removed from the filename first such that the extension remains
/// unchanged. If the extension is ≥ 255 characters long, characters are removed from the extension instead.
/// The extension is assumed to be composed of POSIX portable characters, and a leading `.` is automatically
/// inserted if missing. 
///
///
/// The output may be longer or shorter than the input. No internal allocations are performed. 
///
/// # Examples
/// ```
/// assert_eq!(posix_string::convert_filename("Güneş Sonrası", b"json"), "gunes_sonrasi.json");
/// assert_eq!(posix_string::convert_filename("😃-My filename-😃", b".toml"), "smiley-my_filename-smiley.toml");
/// ```
pub fn convert_filename_iter(filename: impl Iterator<Item = char>, extension: &[u8]) -> impl Iterator<Item =  char> {
    const MAX_LENGTH: usize = 255;

    // insert leading period in extension if missing
    let extension_period = match extension.starts_with(b".") {
        true => None, 
        false => Some(&b'.'), 
    };
    let extension = || std::iter::chain(extension_period, extension)
        .map(|&ascii| ascii as char);

    // remove overflowing characters from the filename, unless the extension is itself overflowing (in which
    // case there's no way to guarantee extension integrity anyways)
    let filename_max_length = match MAX_LENGTH.checked_sub(extension().count()) {
        Some(max_length) => max_length,
        None => MAX_LENGTH, 
    };

    convert_iter(filename)
        .take(filename_max_length)
        .chain(extension())
        .take(MAX_LENGTH)
}

/// Converts a filename to POSIX portable characters and appends an extension, returning a `String` of at
/// most 255 characters.
///
/// See the [crate-level documentation](crate) for more information on the conversion.
///
/// This collects the output of [`convert_filename_iter`], so the extension is handled in exactly the same
/// way: when characters must be dropped they are taken from the filename first so that the extension stays
/// intact, and only an extension that is itself too long gets cut. The extension is assumed to consist of
/// POSIX portable characters, and a leading `.` is inserted if missing.
///
/// The output may be longer or shorter than the input. The only internal allocation is the output string;
/// use [`convert_filename_iter`] directly for a completely non-allocating variant.
///
/// # Examples
/// ```
/// assert_eq!(posix_string::convert_filename("Güneş Sonrası", b"json"), "gunes_sonrasi.json");
/// assert_eq!(posix_string::convert_filename("😃-My filename-😃", b".toml"), "smiley-my_filename-smiley.toml");
/// ```
pub fn convert_filename(filename: &str, extension: &[u8]) -> String {
    convert_filename_iter(filename.chars(), extension).collect::<String>()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `convert` on every input and compares the result with the paired expected output.
    fn assert_conversions(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(convert(input), expected);
        }
    }

    #[test]
    fn basic() {
        assert_conversions(&[
            ("hello world", "hello_world"),
            ("åäöæø", "aaoaeo"),
            ("晒后假日", "shai_hou_jia_ri"),
            ("Güneş Sonrası", "gunes_sonrasi"),
            ("😃", "smiley"),
            ("😃.😃", "smiley.smiley"),
            ("My filename 😃😃", "my_filename_smiley_smiley"),
        ]);
    }

    #[test]
    fn marker_edge_cases() {
        // runs of markers between two conversions collapse into at most one delimiter
        assert_conversions(&[
            ("😃  😃", "smiley_smiley"),
            ("😃\r\n\t😃", "smiley_smiley"),
            ("😃    😃", "smiley_smiley"),
            ("😃 - 😃", "smiley-smiley"),
            ("😃  -  😃", "smiley-smiley"),
        ]);

        // markers next to a string terminal or an explicit delimiter are dropped
        assert_conversions(&[
            (", abc", "abc"),
            ("_,abc", "_abc"),
            ("abc ,", "abc"),
            ("abc,_", "abc_"),
        ]);
    }

    #[test]
    fn filenames() {
        // the leading period is supplied only when missing
        assert_eq!(convert_filename("abc", b"toml"), "abc.toml");
        assert_eq!(convert_filename("abc", b".toml"), "abc.toml");

        // 256 hexadecimal digits: one character more than the limit
        let overlong = "0123456789abcdef".repeat(16);

        // an overlong filename is truncated while the extension survives intact
        let truncated_stem = convert_filename(&overlong, b"toml");
        assert_eq!(truncated_stem.len(), 255);
        assert!(truncated_stem.ends_with("0123456789.toml"));

        // an overlong extension is the part that gets truncated
        let truncated_extension = convert_filename("filename", overlong.as_bytes());
        assert_eq!(truncated_extension.len(), 255);
        assert!(truncated_extension.ends_with("012345"));
    }
}