cipherstash_core/
string.rs

1use lazy_static::lazy_static;
2use num_bigint::{BigUint, ToBigUint};
3use regex::Regex;
4use thiserror::Error;
5
6#[derive(Error, Debug)]
7pub enum OrderiseStringError {
8    #[error("Can only order strings that are pure ASCII")]
9    NotAscii,
10}
11
12pub fn orderise_string(s: &str) -> Result<Vec<u64>, OrderiseStringError> {
13    if !s.is_ascii() {
14        return Err(OrderiseStringError::NotAscii);
15    }
16
17    lazy_static! {
18        static ref NON_ALPHANUMERIC_OR_SPACE_REGEX: Regex =
19            Regex::new("[^a-z0-9[:space:]]+").unwrap();
20    }
21
22    lazy_static! {
23        static ref SPACE_REGEX: Regex = Regex::new("[[:space:]]+").unwrap();
24    }
25
26    lazy_static! {
27        static ref DIGIT_REGEX: Regex = Regex::new("[0-9]").unwrap();
28    }
29
30    // This all very much relies on ASCII character numbering.  A copy of `ascii`(7)
31    // up on a convenient terminal may assist in understanding what's going
32    // on here.
33
34    // First up, let's transmogrify the string we were given into one that only contains
35    // a controlled subset of characters, that we can easily map into a smaller numeric
36    // space.
37
38    let mut s = s.to_lowercase();
39
40    // Any group of rando characters sort at the end.
41    s = NON_ALPHANUMERIC_OR_SPACE_REGEX
42        .replace_all(&s, "~")
43        .to_string();
44
45    // Any amount of whitespace comes immediately after letters.
46    s = SPACE_REGEX.replace_all(&s, "{").to_string();
47
48    // Numbers come after spaces.
49    s = DIGIT_REGEX.replace_all(&s, "|").to_string();
50
51    // Next, we turn that string of characters into a "packed" number that represents the
52    // whole string, but in a more compact form than would be used if each character took
53    // up the full seven or eight bits used by regular ASCII.
54    let mut n = s
55        .bytes()
56        // 'a' => 1, 'b' => 2, ..., 'z' => 27, '{' => 28, '|' => 29,
57        // '}' => 30 (unused), '~' => 31.  0 is kept as "no character" so
58        // that short strings sort before longer ones.
59        .map(|c| BigUint::from(c - 96))
60        // Turn the whole thing into one giant number, with each character
61        // occupying five bits of said number.
62        .fold(0.to_biguint().unwrap(), |i, c| (i << 5) + c);
63
64    // Thirdly, we need to turn the number into one whose in-memory representation
65    // has a length in bits that is a multiple of 64.  This is to ensure that
66    // the first character has the most-significant bits possible, so it
67    // sorts the highest.
68    n <<= 64 - (s.len() * 5) % 64;
69
70    let mut terms = Vec::new();
71
72    let two_pow_64 = 2.to_biguint().unwrap().pow(64);
73
74    // And now, semi-finally, we can turn all that gigantic mess into a vec of u64 terms.
75    while n > 0.to_biguint().unwrap() {
76        // Unwrapping is fine here because we'll end up with a number that
77        // fits into a u64.
78        let term = (&n % &two_pow_64).try_into().unwrap();
79
80        terms.insert(0, term);
81
82        n >>= 64;
83    }
84
85    // Only six ORE ciphertexts can fit into the database.
86    terms.truncate(6);
87
88    Ok(terms)
89}
90
91#[cfg(test)]
92mod tests {
93    use super::*;
94    use serde::Deserialize;
95    use std::{fs, path::Path};
96
97    #[test]
98    fn test_orderise_string_non_ascii() {
99        let result = orderise_string("JalapeƱo");
100
101        assert!(matches!(result, Err(OrderiseStringError::NotAscii)));
102
103        let message = result.err().unwrap().to_string();
104
105        assert_eq!(message, "Can only order strings that are pure ASCII")
106    }
107
108    #[test]
109    // Asserts that output from orderise_string matches output from the ruby client.
110    // Tests against a file that contains a snapshot of results from the ruby client.
111    fn test_orderise_string_gives_same_output_as_ruby_clint() {
112        #[derive(Deserialize, Debug)]
113        struct TestCase {
114            input: String,
115            output: Vec<u64>,
116        }
117
118        let case_file_path =
119            Path::new(env!("CARGO_MANIFEST_DIR")).join("./orderise_string_test_cases.json");
120        let json_str = fs::read_to_string(case_file_path).expect("couldn't read test case file");
121        let test_cases: Vec<TestCase> =
122            serde_json::from_str(&json_str).expect("couldn't parse test cases");
123
124        for test_case in test_cases {
125            let result = orderise_string(&test_case.input);
126
127            assert!(
128                result.is_ok(),
129                "Expected orderise_string to succeed given {:?}, but got error: {:?}",
130                &test_case.input,
131                result
132            );
133
134            assert_eq!(
135                result.unwrap(),
136                test_case.output,
137                "\n orderise_string didn't match for input: {:?}",
138                test_case.input
139            );
140        }
141    }
142}