wafrift_encoding/encoding/
invisible.rs1#[must_use]
/// Shifts every ASCII character of `input` into the `U+E0000..=U+E007F` range
/// by adding `0xE0000` to its code point; non-ASCII characters are copied
/// through unchanged.
///
/// The mapping is reversible by subtracting `0xE0000` from each code point in
/// that range.
pub fn tag_char_encode(input: &str) -> String {
    const TAG_BASE: u32 = 0xE0000;

    input
        .chars()
        .map(|ch| {
            if ch.is_ascii() {
                // Every value in TAG_BASE..=TAG_BASE + 0x7F is a valid scalar,
                // so the fallback is never expected to trigger.
                char::from_u32(TAG_BASE + u32::from(ch)).unwrap_or(ch)
            } else {
                ch
            }
        })
        .collect()
}
66
/// Appends a variation selector after every character of `input`.
///
/// `selector` is used as given when it lies in `U+FE00..=U+FE0F` or
/// `U+E0100..=U+E01EF`; any other character is replaced by `U+FE0F`.
/// An empty `input` yields an empty string.
#[must_use]
pub fn variation_selector_pad(input: &str, selector: char) -> String {
    let sel = match selector {
        '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}' => selector,
        _ => '\u{FE0F}',
    };
    // Exact output size: the input bytes plus one selector per character.
    // Counting the input bytes more than once would only over-allocate.
    let mut out = String::with_capacity(input.len() + input.chars().count() * sel.len_utf8());
    for c in input.chars() {
        out.push(c);
        out.push(sel);
    }
    out
}
85
/// Follows every character of `input` with a selector from
/// `U+E0100..=U+E01EF`, chosen by the character's position.
///
/// Position `i` (counted in characters, from zero) receives
/// `U+E0100 + (i mod 240)`, so the selector sequence repeats every 240
/// characters.
#[must_use]
pub fn variation_selector_supplementary_pad(input: &str) -> String {
    const FIRST_SELECTOR: u32 = 0xE0100;
    const SELECTOR_COUNT: u32 = 0xF0;

    let mut padded = String::with_capacity(input.len() * 5);
    for (ch, position) in input.chars().zip(0_u32..) {
        padded.push(ch);
        // `extend` on an `Option<char>` pushes the selector only when the code
        // point is a valid scalar, which holds for the whole selector range.
        padded.extend(char::from_u32(FIRST_SELECTOR + position % SELECTOR_COUNT));
    }
    padded
}
103
/// Replaces Latin letter sequences that have a presentation-form ligature
/// (`ffi`, `ffl`, `ff`, `fi`, `fl`, `st`, `ſt`) with the single ligature
/// character from `U+FB00..=U+FB06`.
///
/// Matching is greedy from left to right and tries the three-letter patterns
/// before the two-letter ones; characters that start no pattern are copied
/// through unchanged.
#[must_use]
pub fn ligature_encode(input: &str) -> String {
    // Order matters: longer patterns come first so "ffi"/"ffl" win over "ff".
    const TABLE: [(&str, char); 7] = [
        ("ffi", '\u{FB03}'),
        ("ffl", '\u{FB04}'),
        ("ff", '\u{FB00}'),
        ("fi", '\u{FB01}'),
        ("fl", '\u{FB02}'),
        ("st", '\u{FB06}'),
        ("ſt", '\u{FB05}'),
    ];

    let mut encoded = String::with_capacity(input.len());
    let mut remaining = input;
    while let Some(next) = remaining.chars().next() {
        let hit = TABLE.iter().find_map(|&(pattern, ligature)| {
            remaining.strip_prefix(pattern).map(|tail| (ligature, tail))
        });
        match hit {
            Some((ligature, tail)) => {
                encoded.push(ligature);
                remaining = tail;
            }
            None => {
                // No pattern starts here: emit one character and advance past it.
                encoded.push(next);
                remaining = &remaining[next.len_utf8()..];
            }
        }
    }
    encoded
}
142
/// Maps ASCII letters to their circled counterparts: `A..=Z` to
/// `U+24B6..=U+24CF` (Ⓐ…) and `a..=z` to `U+24D0..=U+24E9` (ⓐ…).
///
/// Every other character is copied through unchanged.
#[must_use]
pub fn circled_letter_encode(input: &str) -> String {
    input
        .chars()
        .map(|ch| {
            let circled = match ch {
                'A'..='Z' => char::from_u32(0x24B6 + (u32::from(ch) - u32::from('A'))),
                'a'..='z' => char::from_u32(0x24D0 + (u32::from(ch) - u32::from('a'))),
                _ => None,
            };
            // Fall back to the original character when no mapping applies.
            circled.unwrap_or(ch)
        })
        .collect()
}
174
/// Maps ASCII letters to their parenthesized counterparts: `A..=Z` to
/// `U+1F110..=U+1F129` (🄐…) and `a..=z` to `U+249C..=U+24B5` (⒜…).
///
/// Every other character is copied through unchanged.
#[must_use]
pub fn parenthesized_letter_encode(input: &str) -> String {
    const UPPER_BASE: u32 = 0x1F110;
    const LOWER_BASE: u32 = 0x249C;

    let mut encoded = String::with_capacity(input.len() * 4);
    for ch in input.chars() {
        let mapped = if ch.is_ascii_uppercase() {
            char::from_u32(UPPER_BASE + (ch as u32 - 'A' as u32))
        } else if ch.is_ascii_lowercase() {
            char::from_u32(LOWER_BASE + (ch as u32 - 'a' as u32))
        } else {
            None
        };
        // Non-letters (and any failed conversion) keep the original character.
        encoded.push(mapped.unwrap_or(ch));
    }
    encoded
}
207
/// Inserts a soft hyphen (`U+00AD`) between every pair of adjacent characters
/// of `input`.
///
/// Nothing is added before the first or after the last character, so empty
/// and single-character inputs come back unchanged.
#[must_use]
pub fn soft_hyphen_inject(input: &str) -> String {
    const SOFT_HYPHEN: char = '\u{00AD}';

    let mut chars = input.chars();
    match chars.next() {
        None => String::new(),
        Some(head) => {
            // One separator per remaining character.
            let gaps = chars.clone().count();
            let mut joined =
                String::with_capacity(input.len() + gaps * SOFT_HYPHEN.len_utf8());
            joined.push(head);
            for ch in chars {
                joined.push(SOFT_HYPHEN);
                joined.push(ch);
            }
            joined
        }
    }
}
238
/// Surrounds every character of `input` with word joiners (`U+2060`): one
/// before the first character, one between each pair, and one after the last.
///
/// An empty `input` therefore yields a single word joiner.
#[must_use]
pub fn word_joiner_wrap(input: &str) -> String {
    const WORD_JOINER: char = '\u{2060}';

    let mut wrapped = String::with_capacity(input.len() * 4);
    // Leading joiner, then each character followed by its own joiner.
    wrapped.push(WORD_JOINER);
    for ch in input.chars() {
        wrapped.push(ch);
        wrapped.push(WORD_JOINER);
    }
    wrapped
}
254
/// Names of the eight public encoder functions defined in this module, listed
/// in the order the functions are declared.
// NOTE(review): presumably looked up by name from a registry elsewhere in the
// crate — confirm against the callers before renaming any entry.
pub const INVISIBLE_ENCODER_NAMES: &[&str] = &[
    "tag_char_encode",
    "variation_selector_pad",
    "variation_selector_supplementary_pad",
    "ligature_encode",
    "circled_letter_encode",
    "parenthesized_letter_encode",
    "soft_hyphen_inject",
    "word_joiner_wrap",
];
268
#[cfg(test)]
mod tests {
    use super::*;

    /// Signature shared by every single-argument encoder under test.
    type Encoder = fn(&str) -> String;

    /// `variation_selector_pad` pinned to `U+FE0F` so it fits the `Encoder` shape.
    fn pad_with_fe0f(s: &str) -> String {
        variation_selector_pad(s, '\u{FE0F}')
    }

    /// Every public encoder, in the same order as `INVISIBLE_ENCODER_NAMES`.
    ///
    /// Kept in one place so a cross-encoder test cannot silently skip an encoder.
    const ALL_ENCODERS: [Encoder; 8] = [
        tag_char_encode,
        pad_with_fe0f,
        variation_selector_supplementary_pad,
        ligature_encode,
        circled_letter_encode,
        parenthesized_letter_encode,
        soft_hyphen_inject,
        word_joiner_wrap,
    ];

    #[test]
    fn tag_char_round_trips_via_codepoint_subtraction() {
        let encoded = tag_char_encode("SELECT");
        let recovered: String = encoded
            .chars()
            .map(|c| {
                let cp = c as u32;
                if (0xE0000..=0xE007F).contains(&cp) {
                    char::from_u32(cp - 0xE0000).unwrap_or(c)
                } else {
                    c
                }
            })
            .collect();
        assert_eq!(recovered, "SELECT");
    }

    #[test]
    fn tag_char_preserves_non_ascii() {
        let encoded = tag_char_encode("SELECT' OR Ä");
        assert!(
            encoded.contains('Ä'),
            "non-ASCII passes through: {encoded:?}"
        );
    }

    #[test]
    fn tag_char_every_byte_changes() {
        let raw = "SELECT";
        let encoded = tag_char_encode(raw);
        assert_ne!(raw, encoded);
        for c in encoded.chars() {
            let cp = c as u32;
            assert!((0xE0000..=0xE007F).contains(&cp), "non-tag codepoint: {c}");
        }
    }

    #[test]
    fn tag_char_handles_empty() {
        assert_eq!(tag_char_encode(""), "");
    }

    #[test]
    fn variation_selector_default_is_fe0f() {
        let out = variation_selector_pad("AB", '\u{FE0F}');
        assert!(out.contains('\u{FE0F}'));
        // Two input characters, each followed by one selector.
        assert_eq!(out.chars().count(), 4);
    }

    #[test]
    fn variation_selector_invalid_falls_back_to_fe0f() {
        let out = variation_selector_pad("X", 'a');
        assert!(out.contains('\u{FE0F}'), "fallback selector: {out:?}");
    }

    #[test]
    fn variation_selector_accepts_supplementary_range() {
        let out = variation_selector_pad("X", '\u{E0100}');
        assert!(out.contains('\u{E0100}'));
    }

    #[test]
    fn variation_selector_supplementary_varies_per_position() {
        let out = variation_selector_supplementary_pad("AB");
        let selectors: Vec<char> = out
            .chars()
            .filter(|c| (0xE0100..=0xE01EF).contains(&(*c as u32)))
            .collect();
        assert_eq!(selectors.len(), 2);
        assert_ne!(
            selectors[0], selectors[1],
            "selectors must differ per position"
        );
    }

    #[test]
    fn ligature_encode_replaces_known_digraphs() {
        let out = ligature_encode("effect official offload");
        assert!(out.contains('\u{FB00}'), "ff → ff in 'effect': {out:?}");
        assert!(out.contains('\u{FB03}'), "ffi → ffi in 'official': {out:?}");
        assert!(out.contains('\u{FB04}'), "ffl → ffl in 'offload': {out:?}");
    }

    #[test]
    fn ligature_encode_prefers_longest_match() {
        let out = ligature_encode("ffi");
        assert_eq!(out, "\u{FB03}");
        assert!(!out.contains('\u{FB00}'));
    }

    #[test]
    fn ligature_encode_passes_unmatched_chars() {
        let out = ligature_encode("axyz");
        assert_eq!(out, "axyz");
    }

    #[test]
    fn ligature_encode_handles_empty() {
        assert_eq!(ligature_encode(""), "");
    }

    #[test]
    fn circled_letter_uppercase_and_lowercase() {
        let out = circled_letter_encode("Aa");
        assert!(out.contains('\u{24B6}'), "A → Ⓐ: {out:?}");
        assert!(out.contains('\u{24D0}'), "a → ⓐ: {out:?}");
    }

    #[test]
    fn circled_letter_preserves_punctuation() {
        let out = circled_letter_encode("A'B");
        assert!(out.contains('\''), "quote preserved: {out:?}");
    }

    #[test]
    fn parenthesized_letter_uppercase_and_lowercase() {
        let out = parenthesized_letter_encode("Bb");
        assert!(out.contains('\u{1F111}'), "B → 🄑: {out:?}");
        assert!(out.contains('\u{249D}'), "b → ⒝: {out:?}");
    }

    #[test]
    fn circled_and_parenthesized_produce_different_bytes() {
        let raw = "SELECT";
        let circled = circled_letter_encode(raw);
        let parens = parenthesized_letter_encode(raw);
        assert_ne!(
            circled, parens,
            "rotation partners must produce distinct byte streams"
        );
    }

    #[test]
    fn soft_hyphen_inject_between_each_pair() {
        let out = soft_hyphen_inject("ABC");
        let count = out.chars().filter(|&c| c == '\u{00AD}').count();
        assert_eq!(count, 2, "soft hyphen between each pair: {out:?}");
    }

    #[test]
    fn soft_hyphen_inject_empty_is_empty() {
        assert_eq!(soft_hyphen_inject(""), "");
    }

    #[test]
    fn soft_hyphen_inject_single_char_unchanged() {
        assert_eq!(soft_hyphen_inject("A"), "A");
    }

    #[test]
    fn word_joiner_wraps_both_ends() {
        let out = word_joiner_wrap("AB");
        let count = out.chars().filter(|&c| c == '\u{2060}').count();
        assert_eq!(count, 3, "wrap with joiner at each boundary: {out:?}");
    }

    #[test]
    fn all_encoders_preserve_utf8_validity() {
        // `String` is always valid UTF-8, so the only runtime property left to
        // check here is that no encoder swallows a non-empty payload.
        let payload = "' OR 1=1 -- SELECT * FROM users";
        for (i, enc) in ALL_ENCODERS.iter().enumerate() {
            let out = enc(payload);
            assert!(
                !out.is_empty(),
                "encoder #{i} produced empty on non-empty input"
            );
        }
    }

    #[test]
    fn all_encoders_are_deterministic() {
        let payload = "SELECT' OR 1=1";
        for enc in ALL_ENCODERS.iter() {
            assert_eq!(enc(payload), enc(payload), "encoder must be deterministic");
        }
    }

    #[test]
    fn all_encoders_handle_empty_input() {
        for enc in ALL_ENCODERS.iter() {
            let out = enc("");
            // `word_joiner_wrap` legitimately emits a lone joiner for "".
            assert!(out.len() < 8, "empty input must produce ~empty output");
        }
    }

    #[test]
    fn invisible_encoder_names_match_pub_fns() {
        assert_eq!(INVISIBLE_ENCODER_NAMES.len(), 8);
        // The name table and the encoder table must grow together.
        assert_eq!(INVISIBLE_ENCODER_NAMES.len(), ALL_ENCODERS.len());
        for name in INVISIBLE_ENCODER_NAMES {
            assert!(!name.is_empty());
            assert!(
                name.chars().all(|c| c.is_ascii_lowercase() || c == '_'),
                "encoder names must be snake_case: {name}"
            );
        }
    }

    #[test]
    fn adversarial_large_input_does_not_panic() {
        let big = "A".repeat(10_000);
        let _ = tag_char_encode(&big);
        let _ = variation_selector_pad(&big, '\u{FE0F}');
        let _ = variation_selector_supplementary_pad(&big);
        let _ = ligature_encode(&big);
        let _ = circled_letter_encode(&big);
        let _ = parenthesized_letter_encode(&big);
        let _ = soft_hyphen_inject(&big);
        let _ = word_joiner_wrap(&big);
    }

    #[test]
    fn unicode_input_round_trip_safe() {
        let payload = "Ä' OR ñ=1 -- 日本";
        // Every encoder only rewrites ASCII or inserts characters, so each
        // non-ASCII character of the payload must survive in every output.
        for (i, enc) in ALL_ENCODERS.iter().enumerate() {
            let out = enc(payload);
            for expected in ['Ä', 'ñ', '日', '本'] {
                assert!(
                    out.contains(expected),
                    "encoder #{i} dropped {expected:?}: {out:?}"
                );
            }
        }
    }
}