urlable 0.2.0

A comprehensive URL manipulation library for Rust, providing utilities for parsing, encoding, and manipulating URLs, with support for query strings, path manipulation, Punycode domains, and more.
Documentation
use std::char;

// Constants for Punycode encoding
// (the values match the parameter set given for Punycode in RFC 3492, section 5)

/// Radix of the variable-length digits; `encode_digit` maps 0..=35 to `a`-`z`, `0`-`9`.
const BASE: u32 = 36;
/// Lower clamp for the per-digit threshold computed in `encode_delta`.
const T_MIN: u32 = 1;
/// Upper clamp for the per-digit threshold computed in `encode_delta`.
const T_MAX: u32 = 26;
/// Added to the denominator of the final term in `adapt`.
const SKEW: u32 = 38;
/// Divisor applied to the delta on the first call to `adapt`.
const DAMP: u32 = 700;
/// Bias in effect before the first delta has been encoded.
const INITIAL_BIAS: u32 = 72;
/// First code point considered by the encoder loop (128 is the first non-ASCII value).
const INITIAL_N: u32 = 128;

/// Computes the bias to use for the next delta.
///
/// * `delta` - the delta that was just encoded.
/// * `num_points` - number of code points handled so far, counting the one
///   just emitted; it is used as a divisor and must be non-zero.
/// * `first_time` - `true` for the first adaptation, which scales the delta
///   down by `DAMP` instead of halving it.
fn adapt(delta: u32, num_points: u32, first_time: bool) -> u32 {
    // Scale the delta down, then bump it proportionally to the progress made.
    let scaled = if first_time { delta / DAMP } else { delta / 2 };
    let mut remaining = scaled + scaled / num_points;

    // Repeatedly divide until the value fits the threshold window,
    // accumulating one BASE step per division.
    let span = BASE - T_MIN;
    let limit = (span * T_MAX) / 2;
    let mut k = 0;
    while remaining > limit {
        remaining /= span;
        k += BASE;
    }

    k + ((span + 1) * remaining) / (remaining + SKEW)
}

/// Encodes `input` as an `xn--`-prefixed Punycode string.
///
/// Pure-ASCII input (including the empty string) is returned unchanged.
/// Otherwise the entire input is encoded as one unit: its ASCII characters
/// are copied first, followed by a `-` delimiter when there is at least one
/// of them, and then the encoded non-ASCII characters. The input is not
/// split on `.` before encoding.
pub fn to_ascii(input: &str) -> String {
    // Nothing to encode when every character is already ASCII.
    if input.is_ascii() {
        return input.to_owned();
    }

    let code_points: Vec<char> = input.chars().collect();

    // The helpers only ever append, so the prefix can be written up front.
    let mut encoded = String::from("xn--");

    let basic_count = process_ascii_chars(&code_points, &mut encoded);
    if basic_count != 0 {
        encoded.push('-');
    }

    process_non_ascii_chars(&code_points, basic_count, &mut encoded);

    encoded
}

/// Appends every ASCII character of `chars` to `output`, in order, and
/// returns how many characters were appended.
fn process_ascii_chars(chars: &[char], output: &mut String) -> u32 {
    let mut appended = 0;
    for ch in chars.iter().copied().filter(|ch| ch.is_ascii()) {
        output.push(ch);
        appended += 1;
    }
    appended
}

/// Encodes the non-ASCII characters of `chars` and appends the result to
/// `output`.
///
/// `basic_length` is the number of ASCII characters already copied to
/// `output`; the loop runs until the number of handled characters reaches
/// `chars.len()`. Code points are handled in increasing numeric order, and
/// for each one the accumulated delta is written with `encode_delta`, after
/// which the bias is recomputed with `adapt`.
///
/// NOTE(review): all arithmetic here is plain `u32` with no overflow checks,
/// so a very long input combined with a high code point can overflow `delta`.
fn process_non_ascii_chars(chars: &[char], basic_length: u32, output: &mut String) {
    let total = chars.len() as u32;
    let mut threshold = INITIAL_N;
    let mut bias = INITIAL_BIAS;
    let mut delta = 0;
    let mut handled = basic_length;

    while handled < total {
        // Smallest code point that has not been handled yet.
        let next_cp = chars
            .iter()
            .map(|&ch| u32::from(ch))
            .filter(|&cp| cp >= threshold)
            .min()
            .unwrap_or(0x10ffff);

        // Jump straight to that code point, accounting for every position
        // skipped along the way.
        delta += (next_cp - threshold) * (handled + 1);
        threshold = next_cp;

        for cp in chars.iter().map(|&ch| u32::from(ch)) {
            if cp == threshold {
                encode_delta(delta, bias, output);
                // The first adaptation is the one made before any non-ASCII
                // character has been handled.
                bias = adapt(delta, handled + 1, handled == basic_length);
                delta = 0;
                handled += 1;
            } else if cp < threshold {
                delta += 1;
            }
        }

        delta += 1;
        threshold += 1;
    }
}

/// Appends `delta` to `output` as a variable-length sequence of base-`BASE`
/// digits, using `bias` to pick the threshold for each digit position.
///
/// Digits are emitted least-significant first; the sequence ends with the
/// first digit that is below its position's threshold.
fn encode_delta(delta: u32, bias: u32, output: &mut String) {
    /// Threshold for position `k`: `k - bias` clamped to `T_MIN..=T_MAX`.
    fn threshold(k: u32, bias: u32) -> u32 {
        if k <= bias {
            T_MIN
        } else if k >= bias + T_MAX {
            T_MAX
        } else {
            k - bias
        }
    }

    let mut remaining = delta;
    let mut k = BASE;

    loop {
        let t = threshold(k, bias);
        if remaining < t {
            break;
        }

        // Emit one non-terminal digit and carry the quotient forward.
        let span = BASE - t;
        let excess = remaining - t;
        output.push(encode_digit(t + excess % span));
        remaining = excess / span;
        k += BASE;
    }

    // Terminal digit: whatever is left is below the current threshold.
    output.push(encode_digit(remaining));
}

/// Maps a digit value to its output character: 0..=25 become `a`..=`z`
/// and 26..=35 become `0`..=`9`.
///
/// Panics if the computed value is not a valid `char`.
fn encode_digit(d: u32) -> char {
    let code_point = match d {
        0..=25 => d + 'a' as u32,
        _ => d + '0' as u32 - 26,
    };
    char::from_u32(code_point).unwrap()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end checks for `to_ascii`: ASCII pass-through, non-ASCII
    /// input, single symbols, mixed input, and trivial edge cases.
    #[test]
    fn test_to_ascii() {
        // Basic ASCII strings are returned unchanged
        assert_eq!(to_ascii("hello"), "hello");
        assert_eq!(to_ascii("test123"), "test123");
        assert_eq!(to_ascii("example.com"), "example.com");

        // Unicode characters
        assert_eq!(to_ascii("München"), "xn--Mnchen-3ya");
        assert_eq!(to_ascii("北京"), "xn--1lq90i");
        assert_eq!(to_ascii("東京"), "xn--1lqs71d");
        assert_eq!(to_ascii("παράδειγμα"), "xn--hxajbheg2az3al");
        assert_eq!(to_ascii("bücher"), "xn--bcher-kva");

        // Single symbols. The inputs are written as escapes so they cannot be
        // silently dropped by tools that strip such characters; each one is
        // the code point that its expected label decodes to.
        assert_eq!(to_ascii("\u{2603}"), "xn--n3h"); // SNOWMAN
        assert_eq!(to_ascii("\u{2318}"), "xn--bih"); // PLACE OF INTEREST SIGN
        assert_eq!(to_ascii("\u{2605}"), "xn--p3h"); // BLACK STAR
        assert_eq!(to_ascii("\u{2665}"), "xn--g6h"); // BLACK HEART SUIT

        // Mixed ASCII and Unicode (the whole string is encoded as one unit)
        assert_eq!(to_ascii("hello 世界"), "xn--hello -ur7iy09x");
        assert_eq!(to_ascii("test☃test"), "xn--testtest-qt4e");
        assert_eq!(to_ascii("example⌘.com"), "xn--example.com-t00g");

        // Edge cases: empty and single-character ASCII input pass through
        assert_eq!(to_ascii(""), "");
        assert_eq!(to_ascii(" "), " ");
        assert_eq!(to_ascii("-"), "-");
        assert_eq!(to_ascii("."), ".");
    }

    /// Checks the digit-to-character mapping at both range boundaries and at
    /// one interior value of each range.
    #[test]
    fn test_encode_digit() {
        // Test lowercase letters (0-25)
        assert_eq!(encode_digit(0), 'a');
        assert_eq!(encode_digit(25), 'z');

        // Test numbers (26-35)
        assert_eq!(encode_digit(26), '0');
        assert_eq!(encode_digit(35), '9');

        // Test some values in between
        assert_eq!(encode_digit(13), 'n');
        assert_eq!(encode_digit(30), '4');
    }
}