zalgo_codec/
lib.rs

1//! This crate lets you convert an ASCII text string into a single unicode grapheme cluster and back.
2//! It also provides a procedural macro that lets you embed such a grapheme cluster and decode it into source code at compile time.  
3//! This lets you reach new lows in the field of self-documenting code.
4//!
5//! The encoded string will be ~2 times larger than the original in terms of bytes.
6//!
7//! Additionally the crate provides a function to encode Python code and wrap the result in a decoder that
8//! decodes and executes the encoded string, retaining the functionality of the original code.
9//!
10//! There are two ways of interacting with the codec.
11//! The first one is to call the encoding and decoding functions directly,
12//! and the second one is to use the [`ZalgoString`] wrapper type.
13//!
14//! # Examples
15//!
16//! Encode a string to a grapheme cluster with [`zalgo_encode`]:
17//! ```
18//! # use zalgo_codec::{EncodeError, zalgo_encode};
19//! let s = "Zalgo";
20//! let encoded = zalgo_encode(s)?;
//! assert_eq!(encoded, "E\u{33a}\u{341}\u{34c}\u{347}\u{34f}");
22//! # Ok::<(), EncodeError>(())
23//! ```
24//! Decode the grapheme cluster back into a string with [`zalgo_decode`]:
25//! ```
26//! # use zalgo_codec::{zalgo_decode, DecodeError};
27//! # extern crate alloc;
//! let encoded = "E\u{33a}\u{341}\u{34c}\u{347}\u{34f}";
29//! let s = zalgo_decode(encoded)?;
30//! assert_eq!(s, "Zalgo");
31//! # Ok::<(), DecodeError>(())
32//! ```
33//! The [`ZalgoString`] type can be used to encode a string and handle the result in various ways:
34//! ```
35//! # use zalgo_codec::{EncodeError, ZalgoString};
36//! let s = "Zalgo";
37//! let zstr = ZalgoString::new(s)?;
//! assert_eq!(zstr, "E\u{33a}\u{341}\u{34c}\u{347}\u{34f}");
39//! assert_eq!(zstr.len(), 2 * s.len() + 1);
40//! assert_eq!(zstr.decoded_len(), s.len());
41//! assert_eq!(zstr.bytes().next(), Some(69));
42//! assert_eq!(zstr.decoded_chars().next_back(), Some('o'));
43//! # Ok::<(), EncodeError>(())
44//! ```
45//! Encode Rust source code and embed it in your program with the [`zalgo_embed!`] proc-macro:
46//! ```
47//! # #[cfg(feature = "macro")]
48//! # {
49//! # use zalgo_codec::zalgo_embed;
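//! // This grapheme cluster was made by encoding "fn add(x: i32, y: i32) -> i32 {x + y}"
//! zalgo_embed!("E\u{346}\u{34e}\u{300}\u{341}\u{344}\u{344}\u{308}\u{358}\u{31a}\u{300}\u{349}\u{313}\u{312}\u{30c}\u{300}\u{359}\u{31a}\u{300}\u{349}\u{313}\u{312}\u{309}\u{300}\u{30d}\u{31e}\u{300}\u{349}\u{313}\u{312}\u{300}\u{35b}\u{358}\u{300}\u{30b}\u{300}\u{359}\u{35d}");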
52//!
53//! // The `add` function is now available
54//! assert_eq!(add(10, 20), 30);
55//! # }
56//! ```
57//!
58//! # Feature flags
59//!
60//! `std`: enables [`EncodeError`] and [`DecodeError`] to capture a [`Backtrace`](std::backtrace::Backtrace).
61//! If this feature is not enabled the library is `no_std` compatible, but still uses the `alloc` crate.
62//!
63//! `serde`: derives the `Serialize` and `Deserialize` traits from [`serde`](https://docs.rs/serde) for [`ZalgoString`].
64//!
65//! `rkyv`: derives the `Serialize`, `Deserialize`, and `Archive` traits from [`rkyv`](https://docs.rs/rkyv) for [`ZalgoString`].
66//!
67//! `macro` *(enabled by default)*: exports the procedural macros [`zalgo_embed!`] and [`zalgofy!`].
68//!   
69//! # Explanation
70//!
71//! Characters U+0300–U+036F are the combining characters for unicode Latin.
72//! The fun thing about combining characters is that you can add as many of these characters
73//! as you like to the original character and it does not create any new symbols,
74//! it only adds symbols on top of the character. It's supposed to be used in order to
75//! create characters such as `á` by taking a normal `a` and adding another character
76//! to give it the mark (U+301, in this case). Fun fact: Unicode doesn't specify
77//! any limit on the number of these characters.
//! Conveniently, this gives us 112 different characters we can map to,
//! which is more than enough to cover the ASCII character range 0x20 -> 0x7F, i.e. all the printable characters plus DEL.
80//! The only issue is that we can't have new lines in this system, so to fix that,
81//! we can simply map 0x7F (DEL) to 0x0A (LF).
82//! This can be represented as `(CHARACTER - 11) % 133 - 21`, and decoded with `(CHARACTER + 22) % 133 + 10`.  
83//!
84//! <details><summary><b>Full conversion table</b></summary>
85//!
86//! | ASCII character | Encoded |  
87//! |---|---|  
88//! | A | U+321 |  
89//! | B | U+322 |  
90//! | C | U+323 |  
91//! | D | U+324 |  
92//! | E | U+325 |  
93//! | F | U+326 |  
94//! | G | U+327 |  
95//! | H | U+328 |  
96//! | I | U+329 |  
97//! | J | U+32A |  
98//! | K | U+32B |  
99//! | L | U+32C |  
100//! | M | U+32D |  
101//! | N | U+32E |  
102//! | O | U+32F |  
103//! | P | U+330 |  
104//! | Q | U+331 |  
105//! | R | U+332 |  
106//! | S | U+333 |  
107//! | T | U+334 |  
108//! | U | U+335 |  
109//! | V | U+336 |  
110//! | W | U+337 |  
111//! | X | U+338 |  
112//! | Y | U+339 |  
113//! | Z | U+33A |  
114//! | a | U+341 |  
115//! | b | U+342 |  
116//! | c | U+343 |  
117//! | d | U+344 |  
118//! | e | U+345 |  
119//! | f | U+346 |  
120//! | g | U+347 |  
121//! | h | U+348 |  
122//! | i | U+349 |  
123//! | j | U+34A |  
124//! | k | U+34B |  
125//! | l | U+34C |  
126//! | m | U+34D |  
127//! | n | U+34E |  
128//! | o | U+34F |  
129//! | p | U+350 |  
130//! | q | U+351 |  
131//! | r | U+352 |  
132//! | s | U+353 |  
133//! | t | U+354 |  
134//! | u | U+355 |  
135//! | v | U+356 |  
136//! | w | U+357 |  
137//! | x | U+358 |  
138//! | y | U+359 |  
139//! | z | U+35A |  
140//! | 1 | U+311 |  
141//! | 2 | U+312 |  
142//! | 3 | U+313 |  
143//! | 4 | U+314 |  
144//! | 5 | U+315 |  
145//! | 6 | U+316 |  
146//! | 7 | U+317 |  
147//! | 8 | U+318 |  
148//! | 9 | U+319 |  
149//! | 0 | U+310 |  
150//! |   | U+300 |  
151//! | ! | U+301 |  
152//! | " | U+302 |  
153//! | # | U+303 |  
154//! | $ | U+304 |  
155//! | % | U+305 |  
156//! | & | U+306 |  
157//! | ' | U+307 |  
158//! | ( | U+308 |  
159//! | ) | U+309 |  
160//! | * | U+30A |  
161//! | + | U+30B |  
162//! | , | U+30C |  
163//! | - | U+30D |  
164//! | \ | U+33C |  
165//! | . | U+30E |  
166//! | / | U+30F |  
167//! | : | U+31A |  
168//! | ; | U+31B |  
169//! | < | U+31C |  
170//! | = | U+31D |  
171//! | > | U+31E |  
172//! | ? | U+31F |  
173//! | @ | U+320 |  
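//! | [ | U+33B |  
//! | ] | U+33D |  
//! | ^ | U+33E |  
//! | _ | U+33F |  
//! | \` | U+340 |  
//! | { | U+35B |  
//! | \| | U+35C |  
//! | } | U+35D |  
//! | ~ | U+35E |  
//! | \n| U+36F |  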
175//!
176//! </details>
177//!
178//! # Experiment with the codec
179//!
180//! There is an executable available for experimenting with the codec on text and files.
181//! It can also be used to generate grapheme clusters from source code for use with [`zalgo_embed!`].
182//! It can be installed with `cargo install zalgo-codec --features binary`.
183//! You can optionally enable the `gui` feature during installation to include a rudimentary GUI mode for the program.
184
185#![no_std]
186#![cfg_attr(docsrs, feature(doc_auto_cfg))]
187
188#[cfg(feature = "std")]
189extern crate std;
190
191pub use zalgo_codec_common::{
192    zalgo_decode, zalgo_encode, zalgo_string, zalgo_wrap_python, DecodeError, EncodeError,
193    ZalgoString,
194};
195
196#[cfg(feature = "macro")]
197pub use zalgo_codec_macro::{zalgo_embed, zalgofy};
198
199#[cfg(test)]
200mod tests {
201    extern crate alloc;
202
203    use super::*;
204    use alloc::string::String;
205    use core::str;
206    use rand::{
207        distr::{Distribution, SampleString},
208        seq::IndexedRandom,
209        Rng,
210    };
211    use unicode_segmentation::UnicodeSegmentation;
212
213    struct PrintableAsciiAndNewline;
214
215    impl Distribution<char> for PrintableAsciiAndNewline {
216        fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> char {
217            *b" !\"#$%&'()*,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVXYZ[\\]^_`abcdefghijklmnopqrstuvxyz{|}~\n".choose(rng).unwrap() as char
218        }
219    }
220
221    impl SampleString for PrintableAsciiAndNewline {
222        fn append_string<R: Rng + ?Sized>(&self, rng: &mut R, string: &mut String, len: usize) {
223            string.reserve(len);
224            for _ in 0..len {
225                string.push(self.sample(rng));
226            }
227        }
228    }
229
230    #[cfg(feature = "macro")]
231    #[test]
232    fn test_embed_function() {
233        let code = "fn add(x: i32, y: i32) -> i32 {x + y}";
234
235        let encoded = zalgo_encode(code).unwrap();
236        assert_eq!(encoded, "E͎͉͙͉̞͉͙͆̀́̈́̈́̈̀̓̒̌̀̀̓̒̉̀̍̀̓̒̀͛̀̋̀͘̚̚͘͝");
237
238        zalgo_embed!("E͎͉͙͉̞͉͙͆̀́̈́̈́̈̀̓̒̌̀̀̓̒̉̀̍̀̓̒̀͛̀̋̀͘̚̚͘͝");
239
240        // Now the `add` function is available
241        assert_eq!(add(10, 20), 30)
242    }
243
244    #[cfg(feature = "macro")]
245    #[test]
246    fn test_embed_expression() {
247        let x = 20;
248        let y = -10;
249
250        let expr = "x + y";
251
252        let encoded = zalgo_encode(expr).unwrap();
253
254        assert_eq!(encoded, "È͙̋̀͘");
255
256        // It works on expressions, too!
257        let z = zalgo_embed!("È͙̋̀͘");
258        assert_eq!(z, x + y);
259    }
260
261    #[test]
262    fn verify() {
263        const TEST_STRING_1: &str = "the greatest adventure is going to bed";
264        let out_string = str::from_utf8(b"E\xcd\x94\xcd\x88\xcd\x85\xcc\x80\xcd\x87\xcd\x92\xcd\x85\xcd\x81\xcd\x94\xcd\x85\xcd\x93\xcd\x94\xcc\x80\xcd\x81\xcd\x84\xcd\x96\xcd\x85\xcd\x8e\xcd\x94\xcd\x95\xcd\x92\xcd\x85\xcc\x80\xcd\x89\xcd\x93\xcc\x80\xcd\x87\xcd\x8f\xcd\x89\xcd\x8e\xcd\x87\xcc\x80\xcd\x94\xcd\x8f\xcc\x80\xcd\x82\xcd\x85\xcd\x84").unwrap();
265        assert_eq!(zalgo_encode(TEST_STRING_1).unwrap(), out_string);
266
267        const TEST_STRING_2: &str =
268            "I'll have you know I graduated top of my class in the Navy Seals";
269        assert_eq!(
270            zalgo_decode(&zalgo_encode(TEST_STRING_2).unwrap()).unwrap(),
271            TEST_STRING_2
272        );
273
274        const ASCII_CHAR_TABLE: &str = r##"ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvxyz1234567890 !"#$%&'()*+,-\./:;<=>?@"##;
275        assert_eq!(
276            zalgo_decode(&zalgo_encode(ASCII_CHAR_TABLE).unwrap()).unwrap(),
277            ASCII_CHAR_TABLE
278        );
279
280        // Checking that randomly generated alphanumeric strings are encoded in a lossless fashion, and that they contain a single grapheme cluster
281        for _ in 0..100 {
282            let s = PrintableAsciiAndNewline.sample_string(&mut rand::rng(), 100);
283            let encoded = zalgo_encode(&s).unwrap();
284            assert_eq!(zalgo_decode(&encoded).unwrap(), s);
285            assert_eq!(encoded.as_str().graphemes(true).count(), 1)
286        }
287    }
288
289    #[test]
290    fn newlines() {
291        assert_eq!(&zalgo_encode("\n").unwrap(), "Eͯ",);
292        const TEST_STRING: &str = "The next sentence is true.\nThe previous sentence is false.";
293        assert_eq!(
294            zalgo_decode(&zalgo_encode(TEST_STRING).unwrap()).unwrap(),
295            TEST_STRING,
296        );
297    }
298
299    #[test]
300    fn check_errors() {
301        assert!(zalgo_encode("We got the Ä Ö Å, you aint got the Ä Ö Å").is_err());
302        assert!(zalgo_encode("\t").is_err());
303        assert!(zalgo_encode("\r").is_err());
304        assert!(zalgo_encode("\0").is_err());
305    }
306
307    #[cfg(feature = "macro")]
308    #[test]
309    fn check_zalgofy() {
310        const ZS: &str = zalgofy!("Zalgo");
311        assert_eq!(zalgo_decode(ZS).unwrap(), "Zalgo");
312    }
313}