use crate::bootstring::{adapt_bias, encode_digit, threshold, BASE, INITIAL_BIAS};
/// Marker placed at the start of every string produced by `encode_impl`.
/// Inputs that already begin with this marker are always reported as needing
/// encoding by `needs_encoding`.
pub(crate) const PREFIX: &str = "_N_";
/// Separator written between the literally retained ("basic") characters and
/// the encoded insertion data in the output of `encode_impl`.
pub(crate) const DELIMITER: &str = "__";
/// Reports whether `s` is shaped like an identifier.
///
/// The first character must be either `_` or satisfy
/// `unicode_ident::is_xid_start`; every following character must satisfy
/// `unicode_ident::is_xid_continue`. The empty string is rejected, while a
/// lone `_` is accepted.
pub fn is_xid_identifier(s: &str) -> bool {
    let mut rest = s.chars();
    match rest.next() {
        // A valid head character: the verdict now depends only on the tail.
        Some(head) if head == '_' || unicode_ident::is_xid_start(head) => {
            rest.all(unicode_ident::is_xid_continue)
        }
        // Empty input, or a head that may not start an identifier.
        _ => false,
    }
}
/// Decides whether `s` has to go through the encoder.
///
/// Returns `false` for the empty string. Otherwise returns `true` when `s`
/// starts with [`PREFIX`] (even if it is otherwise a valid identifier) or
/// when [`is_xid_identifier`] rejects it.
pub(crate) fn needs_encoding(s: &str) -> bool {
    // The prefix check comes first so that prefixed inputs short-circuit
    // before the identifier scan, exactly as the guard clauses would.
    !s.is_empty() && (s.starts_with(PREFIX) || !is_xid_identifier(s))
}
/// Encodes `input` so that the result can be used as an identifier.
///
/// * The empty string and any input for which [`needs_encoding`] is `false`
///   are returned unchanged.
/// * An input that starts with [`PREFIX`], decodes successfully via
///   `crate::decode::decode`, and whose decoded form itself needs encoding is
///   treated as already encoded and returned unchanged.
/// * Everything else is passed to [`encode_impl`].
///
/// # Panics
///
/// In builds with debug assertions enabled, panics with "identity violation"
/// if an input accepted as already encoded is not what [`encode_impl`]
/// produces for its decoded form.
pub fn encode(input: &str) -> String {
    if input.is_empty() {
        return String::new();
    }
    if !needs_encoding(input) {
        return input.to_owned();
    }

    if input.starts_with(PREFIX) {
        match crate::decode::decode(input) {
            Ok(decoded) if needs_encoding(&decoded) => {
                // Sanity check only: re-encoding the decoded text is expected
                // to reproduce the input exactly.
                debug_assert_eq!(encode_impl(&decoded), input, "identity violation");
                return input.to_owned();
            }
            // Undecodable, or decodes to something that would never have been
            // encoded in the first place: treat the input as raw text.
            _ => {}
        }
    }

    encode_impl(input)
}
/// Unconditionally encodes `input`, without checking whether it needs it.
///
/// Each character is classified as either *basic* (copied literally into the
/// output) or *non-basic* (recorded as a `(char index, char)` insertion and
/// handed to [`encode_insertions`]). The result is
/// `PREFIX + basic` when there are no insertions, and
/// `PREFIX + basic + DELIMITER + encoded insertions` otherwise.
///
/// Classification rules, applied in order:
/// 1. A character failing `unicode_ident::is_xid_continue` is non-basic, and
///    so is the second and every later underscore of an uninterrupted run.
/// 2. If anything is non-basic, underscores that would end up at the tail of
///    the literal part are demoted to non-basic as well.
/// 3. While assembling the literal part, an underscore that would directly
///    follow another literal underscore is demoted.
///
/// Rules 2 and 3 presumably keep the literal part free of a trailing `_` and
/// of an embedded [`DELIMITER`] so the boundary stays unambiguous for the
/// decoder — confirm against `crate::decode`.
pub(crate) fn encode_impl(input: &str) -> String {
    let chars: Vec<char> = input.chars().collect();

    // Rule 1: per-character classification with a running underscore count.
    let mut underscore_run = 0;
    let mut is_basic: Vec<bool> = chars
        .iter()
        .map(|&c| {
            if !unicode_ident::is_xid_continue(c) {
                underscore_run = 0;
                false
            } else if c == '_' {
                underscore_run += 1;
                underscore_run < 2
            } else {
                underscore_run = 0;
                true
            }
        })
        .collect();

    // Rule 2: walk backwards, skipping characters that are already non-basic,
    // and demote basic underscores until a basic non-underscore is reached.
    if is_basic.contains(&false) {
        for (flag, &c) in is_basic.iter_mut().zip(chars.iter()).rev() {
            if !*flag {
                continue;
            }
            if c != '_' {
                break;
            }
            *flag = false;
        }
    }

    // Rule 3 and final partition into literal text and insertions.
    let mut basic = String::new();
    let mut non_basic: Vec<(usize, char)> = Vec::new();
    for (i, (&c, &keep)) in chars.iter().zip(is_basic.iter()).enumerate() {
        // `basic` only ever grows by kept characters, so its last character
        // is exactly the most recently kept one.
        let doubles_underscore = c == '_' && basic.ends_with('_');
        if keep && !doubles_underscore {
            basic.push(c);
        } else {
            non_basic.push((i, c));
        }
    }

    if non_basic.is_empty() {
        format!("{}{}", PREFIX, basic)
    } else {
        let encoded = encode_insertions(&non_basic);
        format!("{}{}{}{}", PREFIX, basic, DELIMITER, encoded)
    }
}
/// Serialises a list of `(char index, char)` insertions into digit text.
///
/// `insertions` is expected to be ordered by strictly increasing index (the
/// gap computation subtracts the previous index plus one). For every entry
/// two variable-length integers are appended via [`encode_varint`]: first the
/// gap — the absolute index for the first entry, otherwise the number of
/// positions skipped since the previous entry — and then the character's
/// code point. The bias is re-derived with `adapt_bias` after each integer,
/// starting from `INITIAL_BIAS`.
fn encode_insertions(insertions: &[(usize, char)]) -> String {
    let mut encoded = String::new();
    let mut bias: u32 = INITIAL_BIAS;
    // `None` until the first entry has been written.
    let mut previous: Option<usize> = None;

    for (ordinal, &(pos, ch)) in insertions.iter().enumerate() {
        let gap = match previous {
            None => pos,
            Some(prev) => pos - prev - 1,
        };
        let is_first = previous.is_none();

        encode_varint(&mut encoded, gap as u32, bias);
        bias = adapt_bias(gap as u32, (ordinal + 1) as u32, is_first);

        let code_point = ch as u32;
        encode_varint(&mut encoded, code_point, bias);
        bias = adapt_bias(code_point, (ordinal + 2) as u32, false);

        previous = Some(pos);
    }

    encoded
}
/// Appends `value` to `output` as a variable-length sequence of digits.
///
/// The loop has the same shape as the generalized variable-length integer
/// encoding of Bootstring (RFC 3492): at each position `k` (stepping by
/// `BASE`) a threshold `t = threshold(k, bias)` is computed; while the
/// remaining value is at least `t`, a digit in `t..BASE` is emitted and the
/// value is reduced, and the first value below `t` is emitted as the final
/// digit.
///
/// # Panics
///
/// Panics if `encode_digit` rejects a digit, which is not expected as long as
/// `threshold` returns values below `BASE`.
fn encode_varint(output: &mut String, mut value: u32, bias: u32) {
    let mut k: u32 = BASE;
    loop {
        let t = threshold(k, bias);
        if value < t {
            break;
        }
        // Number of digit values available for a non-final digit here.
        let span = BASE - t;
        let excess = value - t;
        output.push(encode_digit(t + excess % span).expect("digit should be < BASE"));
        value = excess / span;
        k += BASE;
    }
    // Terminating digit: strictly below the current threshold.
    output.push(encode_digit(value).expect("value should be < BASE"));
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Identifier predicate: accepts a leading underscore and non-ASCII
    /// letters, rejects empty input, leading digits, spaces and hyphens.
    #[test]
    fn test_is_xid_identifier() {
        let accepted = ["foo", "_foo", "foo123", "café", "名前", "_1", "_"];
        for name in accepted.iter() {
            assert!(is_xid_identifier(name));
        }

        let rejected = ["", "123", "foo bar", "foo-bar"];
        for name in rejected.iter() {
            assert!(!is_xid_identifier(name));
        }
    }

    /// Valid identifiers (including ones containing `__`) and the empty
    /// string pass through; invalid or prefix-colliding inputs do not.
    #[test]
    fn test_needs_encoding() {
        let untouched = ["foo", "café", "", "foo__bar"];
        for name in untouched.iter() {
            assert!(!needs_encoding(name));
        }

        let to_encode = ["foo bar", "foo-bar", "123foo", "_N_test", "_N_foo__bar"];
        for name in to_encode.iter() {
            assert!(needs_encoding(name));
        }
    }

    /// Inputs that are already identifiers come back unchanged.
    #[test]
    fn test_encode_valid_xid() {
        let identifiers = ["foo", "café", "名前", "foo123"];
        for name in identifiers.iter() {
            assert_eq!(encode(name), *name);
        }
    }

    #[test]
    fn test_encode_empty() {
        assert_eq!(encode(""), "");
    }

    /// The space is removed from the literal part and a delimiter appears.
    #[test]
    fn test_encode_with_space() {
        let output = encode("hello world");
        assert!(output.starts_with(PREFIX));
        assert!(output.contains(DELIMITER));
        assert!(output.contains("helloworld"));
    }

    #[test]
    fn test_encode_with_hyphen() {
        let output = encode("foo-bar");
        assert!(output.starts_with(PREFIX));
        assert!(output.contains("foobar"));
    }

    #[test]
    fn test_encode_starts_with_digit() {
        let output = encode("123foo");
        assert!(output.starts_with(PREFIX));
    }

    /// A raw input that merely looks prefixed must be re-encoded.
    #[test]
    fn test_encode_prefix_collision() {
        let output = encode("_N_test");
        assert!(output.starts_with(PREFIX));
        assert_ne!(output, "_N_test");
    }

    /// A prefixed input that is re-encoded rather than passed through must
    /// round-trip back to itself.
    #[test]
    fn test_encode_prefix_collision_invalid_encoding() {
        let raw = "_N_abc__9";
        let output = encode(raw);
        assert!(output.starts_with(PREFIX));
        assert_ne!(output, raw);
        let round_trip = crate::decode::decode(&output).unwrap();
        assert_eq!(round_trip, raw);
    }

    #[test]
    fn test_encode_consecutive_underscores_with_non_basic() {
        let output = encode("a__b c");
        assert!(output.starts_with(PREFIX));
        let round_trip = crate::decode::decode(&output).unwrap();
        assert_eq!(round_trip, "a__b c");
    }

    /// Double underscores inside an otherwise valid identifier are left alone.
    #[test]
    fn test_encode_double_underscore_passthrough() {
        assert_eq!(encode("foo__bar"), "foo__bar");
        assert_eq!(encode("a__b__c"), "a__b__c");
    }

    /// A string that is already a valid encoding is returned as-is, and it
    /// decodes to something other than itself.
    #[test]
    fn test_encode_prefix_with_double_underscore() {
        let output = encode("_N_foo__bar");
        assert!(output.starts_with(PREFIX));
        assert_eq!(output, "_N_foo__bar");
        let round_trip = crate::decode::decode(&output).unwrap();
        assert_ne!(round_trip, "_N_foo__bar");
    }

    /// The literal part in front of the delimiter never ends with `_`.
    #[test]
    fn test_encode_trailing_underscore() {
        let output = encode("_ ");
        assert!(output.starts_with(PREFIX));
        assert!(output.contains(DELIMITER));

        let payload = &output[PREFIX.len()..];
        let cut = payload.find(DELIMITER).unwrap();
        let basic = &payload[..cut];
        assert!(
            !basic.ends_with('_'),
            "basic '{}' ends with underscore",
            basic
        );
    }
}