/// Shifts every ASCII character of `input` up by `0xE0000`, landing it in the
/// code point range U+E0000..=U+E007F; all other characters are copied as-is.
///
/// The result can be decoded by subtracting `0xE0000` from each code point in
/// that range.
#[must_use]
pub fn tag_char_encode(input: &str) -> String {
    // Code points at U+E0000 and above take four UTF-8 bytes, so four bytes
    // per input byte is always enough.
    let mut encoded = String::with_capacity(input.len() * 4);
    encoded.extend(input.chars().map(|ch| {
        if ch.is_ascii() {
            // The shifted value is always a valid scalar; the fallback keeps
            // the original character if the conversion were ever to fail.
            char::from_u32(0xE0000 + u32::from(ch)).unwrap_or(ch)
        } else {
            ch
        }
    }));
    encoded
}
/// Appends a variation selector after every character of `input`.
///
/// `selector` is used as given when it lies in U+FE00..=U+FE0F or
/// U+E0100..=U+E01EF; any other character is replaced by U+FE0F.
///
/// Returns a new string in which each original character is immediately
/// followed by the chosen selector. An empty `input` yields an empty string.
#[must_use]
pub fn variation_selector_pad(input: &str, selector: char) -> String {
    let sel = match selector {
        '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}' => selector,
        _ => '\u{FE0F}',
    };
    // Exact output size: the original bytes plus one selector per character.
    let mut out = String::with_capacity(input.len() + input.chars().count() * sel.len_utf8());
    for c in input.chars() {
        out.push(c);
        out.push(sel);
    }
    out
}
/// Follows every character of `input` with a selector taken from
/// U+E0100..=U+E01EF, cycling through that range by character position.
///
/// The character at position `i` (zero-based) is followed by the code point
/// `0xE0100 + (i % 0xF0)`, so the selector sequence repeats every 240
/// characters.
#[must_use]
pub fn variation_selector_supplementary_pad(input: &str) -> String {
    let mut padded = String::with_capacity(input.len() * 5);
    for (ch, position) in input.chars().zip(0_u32..) {
        padded.push(ch);
        // `from_u32` returns an Option; extending with it pushes the selector
        // only when the code point is valid.
        padded.extend(char::from_u32(0xE0100 + position % 0xF0));
    }
    padded
}
/// Replaces letter sequences that have a single-code-point ligature in
/// U+FB00..=U+FB06 with that ligature, scanning `input` left to right.
///
/// At each position the table is tried in order and the first matching entry
/// wins; three-letter patterns are listed before their two-letter prefixes so
/// that the longest match is taken. Characters that start no pattern are
/// copied unchanged.
#[must_use]
pub fn ligature_encode(input: &str) -> String {
    // Order matters: "ffi"/"ffl" must be tried before "ff".
    const LIGATURE_TABLE: &[(&str, char)] = &[
        ("ffi", '\u{FB03}'),
        ("ffl", '\u{FB04}'),
        ("ff", '\u{FB00}'),
        ("fi", '\u{FB01}'),
        ("fl", '\u{FB02}'),
        ("st", '\u{FB06}'),
        ("ſt", '\u{FB05}'),
    ];

    let mut encoded = String::with_capacity(input.len());
    let mut remaining = input;
    while let Some(head) = remaining.chars().next() {
        let hit = LIGATURE_TABLE.iter().find_map(|&(pattern, glyph)| {
            remaining.strip_prefix(pattern).map(|tail| (glyph, tail))
        });
        match hit {
            Some((glyph, tail)) => {
                encoded.push(glyph);
                remaining = tail;
            }
            None => {
                // No pattern starts here: copy one character and move past it.
                encoded.push(head);
                remaining = &remaining[head.len_utf8()..];
            }
        }
    }
    encoded
}
/// Maps ASCII letters to their circled counterparts: `A`..=`Z` to the run
/// starting at U+24B6 and `a`..=`z` to the run starting at U+24D0.
///
/// Every other character is copied through unchanged.
#[must_use]
pub fn circled_letter_encode(input: &str) -> String {
    let mut out = String::with_capacity(input.len() * 4);
    for ch in input.chars() {
        // Pick the start of the target run and the letter it corresponds to.
        let anchor = if ch.is_ascii_uppercase() {
            Some((0x24B6_u32, 'A'))
        } else if ch.is_ascii_lowercase() {
            Some((0x24D0_u32, 'a'))
        } else {
            None
        };
        let circled = anchor.and_then(|(block_start, first_letter)| {
            char::from_u32(block_start + (u32::from(ch) - u32::from(first_letter)))
        });
        out.push(circled.unwrap_or(ch));
    }
    out
}
/// Maps ASCII letters to their parenthesized counterparts: `A`..=`Z` to the
/// run starting at U+1F110 and `a`..=`z` to the run starting at U+249C.
///
/// Every other character is copied through unchanged.
#[must_use]
pub fn parenthesized_letter_encode(input: &str) -> String {
    let mut wrapped = String::with_capacity(input.len() * 4);
    wrapped.extend(input.chars().map(|ch| {
        let target = match ch {
            'A'..='Z' => 0x1F110 + (u32::from(ch) - u32::from('A')),
            'a'..='z' => 0x249C + (u32::from(ch) - u32::from('a')),
            // Not an ASCII letter: leave it alone.
            _ => return ch,
        };
        char::from_u32(target).unwrap_or(ch)
    }));
    wrapped
}
/// Inserts a soft hyphen (U+00AD) between every pair of adjacent characters
/// of `input`.
///
/// Nothing is added before the first or after the last character, so empty
/// and single-character inputs come back unchanged.
#[must_use]
pub fn soft_hyphen_inject(input: &str) -> String {
    let mut rest = input.chars();
    let head = match rest.next() {
        Some(ch) => ch,
        None => return String::new(),
    };
    // One two-byte soft hyphen goes in front of each remaining character.
    let gaps = rest.clone().count();
    let mut out = String::with_capacity(input.len() + gaps * 2);
    out.push(head);
    for ch in rest {
        out.push('\u{00AD}');
        out.push(ch);
    }
    out
}
/// Surrounds every character of `input` with word joiners (U+2060).
///
/// A joiner is emitted before each character and once more at the very end,
/// so an input of `n` characters produces `n + 1` joiners. An empty input
/// therefore yields a single joiner.
#[must_use]
pub fn word_joiner_wrap(input: &str) -> String {
    const WORD_JOINER: char = '\u{2060}';
    // Exact output size: the original bytes plus one joiner per character and
    // the trailing joiner. Reserving this up front avoids a reallocation on
    // the final push.
    let joiner_count = input.chars().count() + 1;
    let mut out = String::with_capacity(input.len() + joiner_count * WORD_JOINER.len_utf8());
    for c in input.chars() {
        out.push(WORD_JOINER);
        out.push(c);
    }
    out.push(WORD_JOINER);
    out
}
/// Names of the eight public encoder functions defined above, as string
/// literals, listed in the same order in which the functions appear.
///
/// The list is maintained by hand; nothing in this file ties an entry to the
/// function of the same name.
pub const INVISIBLE_ENCODER_NAMES: &[&str] = &[
"tag_char_encode",
"variation_selector_pad",
"variation_selector_supplementary_pad",
"ligature_encode",
"circled_letter_encode",
"parenthesized_letter_encode",
"soft_hyphen_inject",
"word_joiner_wrap",
];
// Unit tests for the encoders above: per-encoder behavior checks first, then
// cross-encoder sanity checks (non-empty output, determinism, empty input,
// large input, non-ASCII input).
#[cfg(test)]
mod tests {
use super::*;
// Subtracting 0xE0000 from each code point in U+E0000..=U+E007F must give
// back the original ASCII text.
#[test]
fn tag_char_round_trips_via_codepoint_subtraction() {
let encoded = tag_char_encode("SELECT");
let recovered: String = encoded
.chars()
.map(|c| {
let cp = c as u32;
if (0xE0000..=0xE007F).contains(&cp) {
char::from_u32(cp - 0xE0000).unwrap_or(c)
} else {
c
}
})
.collect();
assert_eq!(recovered, "SELECT");
}
// Characters above 0x7F are not shifted by tag_char_encode.
#[test]
fn tag_char_preserves_non_ascii() {
let encoded = tag_char_encode("SELECT' OR Ä");
assert!(
encoded.contains('Ä'),
"non-ASCII passes through: {encoded:?}"
);
}
// For an all-ASCII input every output character lies in U+E0000..=U+E007F.
#[test]
fn tag_char_every_byte_changes() {
let raw = "SELECT";
let encoded = tag_char_encode(raw);
assert_ne!(raw, encoded);
for c in encoded.chars() {
let cp = c as u32;
assert!((0xE0000..=0xE007F).contains(&cp), "non-tag codepoint: {c}");
}
}
#[test]
fn tag_char_handles_empty() {
assert_eq!(tag_char_encode(""), "");
}
// Two input characters plus one selector each gives four characters.
// NOTE(review): U+FE0F is passed explicitly here, so this exercises the
// accepted-selector path rather than the fallback the name hints at.
#[test]
fn variation_selector_default_is_fe0f() {
let out = variation_selector_pad("AB", '\u{FE0F}');
assert!(out.contains('\u{FE0F}'));
assert_eq!(out.chars().count(), 4); }
// 'a' is outside both accepted selector ranges, so U+FE0F is substituted.
#[test]
fn variation_selector_invalid_falls_back_to_fe0f() {
let out = variation_selector_pad("X", 'a');
assert!(out.contains('\u{FE0F}'), "fallback selector: {out:?}");
}
#[test]
fn variation_selector_accepts_supplementary_range() {
let out = variation_selector_pad("X", '\u{E0100}');
assert!(out.contains('\u{E0100}'));
}
// Positions 0 and 1 receive U+E0100 and U+E0101 respectively, so the two
// selectors collected here must differ.
#[test]
fn variation_selector_supplementary_varies_per_position() {
let out = variation_selector_supplementary_pad("AB");
let selectors: Vec<char> = out
.chars()
.filter(|c| (0xE0100..=0xE01EF).contains(&(*c as u32)))
.collect();
assert_eq!(selectors.len(), 2);
assert_ne!(
selectors[0], selectors[1],
"selectors must differ per position"
);
}
// One word per ligature: "effect" -> U+FB00, "official" -> U+FB03,
// "offload" -> U+FB04.
// NOTE(review): the right-hand side of each arrow in the messages below reads
// as plain letters; presumably the ligature glyphs were intended — confirm.
#[test]
fn ligature_encode_replaces_known_digraphs() {
let out = ligature_encode("effect official offload");
assert!(out.contains('\u{FB00}'), "ff → ff in 'effect': {out:?}");
assert!(out.contains('\u{FB03}'), "ffi → ffi in 'official': {out:?}");
assert!(out.contains('\u{FB04}'), "ffl → ffl in 'offload': {out:?}");
}
// "ffi" must become the single three-letter ligature, not "ff" + "i".
#[test]
fn ligature_encode_prefers_longest_match() {
let out = ligature_encode("ffi");
assert_eq!(out, "\u{FB03}");
assert!(!out.contains('\u{FB00}'));
}
#[test]
fn ligature_encode_passes_unmatched_chars() {
let out = ligature_encode("axyz");
assert_eq!(out, "axyz");
}
#[test]
fn ligature_encode_handles_empty() {
assert_eq!(ligature_encode(""), "");
}
#[test]
fn circled_letter_uppercase_and_lowercase() {
let out = circled_letter_encode("Aa");
assert!(out.contains('\u{24B6}'), "A → Ⓐ: {out:?}");
assert!(out.contains('\u{24D0}'), "a → ⓐ: {out:?}");
}
#[test]
fn circled_letter_preserves_punctuation() {
let out = circled_letter_encode("A'B");
assert!(out.contains('\''), "quote preserved: {out:?}");
}
#[test]
fn parenthesized_letter_uppercase_and_lowercase() {
let out = parenthesized_letter_encode("Bb");
assert!(out.contains('\u{1F111}'), "B → 🄑: {out:?}");
assert!(out.contains('\u{249D}'), "b → ⒝: {out:?}");
}
// The two letter encoders target different code point runs, so the same
// input must not encode to the same string.
#[test]
fn circled_and_parenthesized_produce_different_bytes() {
let raw = "SELECT";
let circled = circled_letter_encode(raw);
let parens = parenthesized_letter_encode(raw);
assert_ne!(
circled, parens,
"rotation partners must produce distinct byte streams"
);
}
// Three characters have two gaps, hence two soft hyphens.
#[test]
fn soft_hyphen_inject_between_each_pair() {
let out = soft_hyphen_inject("ABC");
let count = out.chars().filter(|&c| c == '\u{00AD}').count();
assert_eq!(count, 2, "soft hyphen between each pair: {out:?}");
}
#[test]
fn soft_hyphen_inject_empty_is_empty() {
assert_eq!(soft_hyphen_inject(""), "");
}
#[test]
fn soft_hyphen_inject_single_char_unchanged() {
assert_eq!(soft_hyphen_inject("A"), "A");
}
// One joiner before each of the two characters plus the trailing one.
#[test]
fn word_joiner_wraps_both_ends() {
let out = word_joiner_wrap("AB");
let count = out.chars().filter(|&c| c == '\u{2060}').count();
assert_eq!(count, 3, "wrap with joiner at each boundary: {out:?}");
}
// NOTE(review): despite the name, the only assertion is that each encoder
// returns a non-empty string for a non-empty payload; the return type is
// already `String`, so no separate UTF-8 check is performed here.
#[test]
fn all_encoders_preserve_utf8_validity() {
let payload = "' OR 1=1 -- SELECT * FROM users";
let encoders: &[fn(&str) -> String] = &[
tag_char_encode,
|s| variation_selector_pad(s, '\u{FE0F}'),
variation_selector_supplementary_pad,
ligature_encode,
circled_letter_encode,
parenthesized_letter_encode,
soft_hyphen_inject,
word_joiner_wrap,
];
for (i, enc) in encoders.iter().enumerate() {
let out = enc(payload);
assert!(
!out.is_empty(),
"encoder #{i} produced empty on non-empty input"
);
}
}
// Calls every encoder twice on the same payload and compares the results.
#[test]
fn all_encoders_are_deterministic() {
let payload = "SELECT' OR 1=1";
let encoders: &[fn(&str) -> String] = &[
tag_char_encode,
|s| variation_selector_pad(s, '\u{FE0F}'),
variation_selector_supplementary_pad,
ligature_encode,
circled_letter_encode,
parenthesized_letter_encode,
soft_hyphen_inject,
word_joiner_wrap,
];
for enc in encoders {
assert_eq!(enc(payload), enc(payload), "encoder must be deterministic");
}
}
// The bound is a length limit rather than equality with "" because
// word_joiner_wrap emits one U+2060 (three UTF-8 bytes) even for empty input.
#[test]
fn all_encoders_handle_empty_input() {
let encoders: &[fn(&str) -> String] = &[
tag_char_encode,
|s| variation_selector_pad(s, '\u{FE0F}'),
variation_selector_supplementary_pad,
ligature_encode,
circled_letter_encode,
parenthesized_letter_encode,
soft_hyphen_inject,
word_joiner_wrap,
];
for enc in encoders {
let out = enc("");
assert!(out.len() < 8, "empty input must produce ~empty output");
}
}
// NOTE(review): this checks only the entry count and that each name consists
// of lowercase ASCII letters and underscores; it does not compare the entries
// against the actual function names.
#[test]
fn invisible_encoder_names_match_pub_fns() {
assert_eq!(INVISIBLE_ENCODER_NAMES.len(), 8);
for name in INVISIBLE_ENCODER_NAMES {
assert!(!name.is_empty());
assert!(
name.chars().all(|c| c.is_ascii_lowercase() || c == '_'),
"encoder names must be snake_case: {name}"
);
}
}
// Smoke test: each encoder is run on a 10,000-character input and the result
// is discarded; the test passes as long as nothing panics.
#[test]
fn adversarial_large_input_does_not_panic() {
let big = "A".repeat(10_000);
let _ = tag_char_encode(&big);
let _ = variation_selector_pad(&big, '\u{FE0F}');
let _ = variation_selector_supplementary_pad(&big);
let _ = ligature_encode(&big);
let _ = circled_letter_encode(&big);
let _ = parenthesized_letter_encode(&big);
let _ = soft_hyphen_inject(&big);
let _ = word_joiner_wrap(&big);
}
// Passes if at least one of the three non-ASCII characters survives in the
// output of each encoder.
// NOTE(review): variation_selector_supplementary_pad is not in this list,
// unlike the other cross-encoder tests — confirm whether that is intentional.
#[test]
fn unicode_input_round_trip_safe() {
let payload = "Ä' OR ñ=1 -- 日本";
let encoders: &[fn(&str) -> String] = &[
tag_char_encode,
|s| variation_selector_pad(s, '\u{FE0F}'),
ligature_encode,
circled_letter_encode,
parenthesized_letter_encode,
soft_hyphen_inject,
word_joiner_wrap,
];
for enc in encoders {
let out = enc(payload);
assert!(out.contains('日') || out.contains('Ä') || out.contains('ñ'));
}
}
}