unic_idna_punycode/
lib.rs

1// Copyright 2013 The rust-url developers.
2// Copyright 2017 The UNIC Project Developers.
3//
4// See the COPYRIGHT file at the top-level directory of this distribution.
5//
6// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9// option. This file may not be copied, modified, or distributed
10// except according to those terms.
11
12#![warn(
13    bad_style,
14    missing_debug_implementations,
15    missing_docs,
16    unconditional_recursion
17)]
18#![deny(unsafe_code)]
19
20//! # UNIC — IDNA — Punycode (RFC 3492)
21//!
22//! A component of [`unic`: Unicode and Internationalization Crates for Rust](/unic/).
23//!
//! Implementation of the Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) algorithm.
25//!
26//! Since Punycode fundamentally works on Unicode Code-Points,
27//! `encode` and `decode` take and return slices and vectors of `char`.
28//! `encode_str` and `decode_to_string` provide convenience wrappers
29//! that convert from and to Rust’s UTF-8 based `str` and `String` types.
30
31use std::char;
32use std::u32;
33
34mod pkg_info;
35pub use crate::pkg_info::{PKG_DESCRIPTION, PKG_NAME, PKG_VERSION};
36
// Bootstring parameters for Punycode.
//
// These are plain compile-time values, so they are declared `const` rather
// than `static`: nothing needs them to have a single fixed memory address.

/// Number of distinct digit values (`a`-`z` map to 0..=25, `0`-`9` to 26..=35).
const BASE: u32 = 36;
/// Lower clamp for the per-position digit threshold.
const T_MIN: u32 = 1;
/// Upper clamp for the per-position digit threshold.
const T_MAX: u32 = 26;
/// Skew term used by `adapt` when computing the next bias.
const SKEW: u32 = 38;
/// Divisor `adapt` applies to the very first delta.
const DAMP: u32 = 700;
/// Bias in effect before any delta has been processed.
const INITIAL_BIAS: u32 = 72;
/// First code point handled by the delta encoding (the first non-ASCII one).
const INITIAL_N: u32 = 0x80;
/// Separates the literal ASCII prefix from the encoded deltas.
const DELIMITER: char = '-';
46
47#[inline]
48fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
49    delta /= if first_time { DAMP } else { 2 };
50    delta += delta / num_points;
51    let mut k = 0;
52    while delta > ((BASE - T_MIN) * T_MAX) / 2 {
53        delta /= BASE - T_MIN;
54        k += BASE;
55    }
56    k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW))
57}
58
59/// Convert Punycode to an Unicode `String`.
60///
61/// This is a convenience wrapper around `decode`.
62#[inline]
63pub fn decode_to_string(input: &str) -> Option<String> {
64    decode(input).map(|chars| chars.into_iter().collect())
65}
66
67/// Convert Punycode to Unicode.
68///
69/// Return None on malformed input or overflow.
70/// Overflow can only happen on inputs that take more than
71/// 63 encoded bytes, the DNS limit on domain name labels.
72#[cfg_attr(feature = "cargo-clippy", allow(cast_lossless))]
73pub fn decode(input: &str) -> Option<Vec<char>> {
74    // Handle "basic" (ASCII) code points.
75    // They are encoded as-is before the last delimiter, if any.
76    let (mut output, input) = match input.rfind(DELIMITER) {
77        None => (Vec::new(), input),
78        Some(position) => (
79            input[..position].chars().collect(),
80            if position > 0 {
81                &input[position + 1..]
82            } else {
83                input
84            },
85        ),
86    };
87    let mut code_point = INITIAL_N;
88    let mut bias = INITIAL_BIAS;
89    let mut i = 0;
90    let mut iter = input.bytes();
91    loop {
92        let previous_i = i;
93        let mut weight = 1;
94        let mut k = BASE;
95        let mut byte = match iter.next() {
96            None => break,
97            Some(byte) => byte,
98        };
99        // Decode a generalized variable-length integer into delta,
100        // which gets added to i.
101        loop {
102            let digit = match byte {
103                byte @ b'0'..=b'9' => byte - b'0' + 26,
104                byte @ b'A'..=b'Z' => byte - b'A',
105                byte @ b'a'..=b'z' => byte - b'a',
106                _ => return None,
107            } as u32;
108            if digit > (u32::MAX - i) / weight {
109                return None; // Overflow
110            }
111            i += digit * weight;
112            let t = if k <= bias {
113                T_MIN
114            } else if k >= bias + T_MAX {
115                T_MAX
116            } else {
117                k - bias
118            };
119            if digit < t {
120                break;
121            }
122            if weight > u32::MAX / (BASE - t) {
123                return None; // Overflow
124            }
125            weight *= BASE - t;
126            k += BASE;
127            byte = match iter.next() {
128                None => return None, // End of input before the end of this delta
129                Some(byte) => byte,
130            };
131        }
132        let length = output.len() as u32;
133        bias = adapt(i - previous_i, length + 1, previous_i == 0);
134        if i / (length + 1) > u32::MAX - code_point {
135            return None; // Overflow
136        }
137        // i was supposed to wrap around from length+1 to 0,
138        // incrementing code_point each time.
139        code_point += i / (length + 1);
140        i %= length + 1;
141        let c = match char::from_u32(code_point) {
142            Some(c) => c,
143            None => return None,
144        };
145        output.insert(i as usize, c);
146        i += 1;
147    }
148    Some(output)
149}
150
151/// Convert an Unicode `str` to Punycode.
152///
153/// This is a convenience wrapper around `encode`.
154#[inline]
155pub fn encode_str(input: &str) -> Option<String> {
156    encode(&input.chars().collect::<Vec<char>>())
157}
158
159/// Convert Unicode to Punycode.
160///
161/// Return None on overflow, which can only happen on inputs that would take more than
162/// 63 encoded bytes, the DNS limit on domain name labels.
163#[allow(unsafe_code)]
164pub fn encode(input: &[char]) -> Option<String> {
165    // Handle "basic" (ASCII) code points. They are encoded as-is.
166    let output_bytes = input
167        .iter()
168        .filter_map(|&c| if c.is_ascii() { Some(c as u8) } else { None })
169        .collect();
170    let mut output = unsafe { String::from_utf8_unchecked(output_bytes) };
171    let basic_length = output.len() as u32;
172    if basic_length > 0 {
173        output.push_str("-")
174    }
175    let mut code_point = INITIAL_N;
176    let mut delta = 0;
177    let mut bias = INITIAL_BIAS;
178    let mut processed = basic_length;
179    let input_length = input.len() as u32;
180    while processed < input_length {
181        // All code points < code_point have been handled already.
182        // Find the next larger one.
183        let min_code_point = input
184            .iter()
185            .map(|&c| c as u32)
186            .filter(|&c| c >= code_point)
187            .min()
188            .unwrap();
189        if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
190            return None; // Overflow
191        }
192        // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
193        delta += (min_code_point - code_point) * (processed + 1);
194        code_point = min_code_point;
195        for &c in input {
196            let c = c as u32;
197            if c < code_point {
198                delta += 1;
199                if delta == 0 {
200                    return None; // Overflow
201                }
202            }
203            if c == code_point {
204                // Represent delta as a generalized variable-length integer:
205                let mut q = delta;
206                let mut k = BASE;
207                loop {
208                    let t = if k <= bias {
209                        T_MIN
210                    } else if k >= bias + T_MAX {
211                        T_MAX
212                    } else {
213                        k - bias
214                    };
215                    if q < t {
216                        break;
217                    }
218                    let value = t + ((q - t) % (BASE - t));
219                    output.push(value_to_digit(value));
220                    q = (q - t) / (BASE - t);
221                    k += BASE;
222                }
223                output.push(value_to_digit(q));
224                bias = adapt(delta, processed + 1, processed == basic_length);
225                delta = 0;
226                processed += 1;
227            }
228        }
229        delta += 1;
230        code_point += 1;
231    }
232    Some(output)
233}
234
/// Map a digit value to the character that encodes it.
///
/// Values 0 to 25 become `a` to `z`, values 26 to 35 become `0` to `9`.
///
/// # Panics
///
/// Panics if `value` is greater than 35.
#[inline]
fn value_to_digit(value: u32) -> char {
    if value < 26 {
        // a..=z
        (b'a' + value as u8) as char
    } else if value < 36 {
        // 0..=9
        (b'0' + (value as u8 - 26)) as char
    } else {
        panic!("Value larger than BASE: {}", value)
    }
}
242}