cipher_utils/
frequency.rs

1use itertools::Itertools;
2
3// Re import self just for readability, i.e., `frequency::of()` vs just `of()`.
4use crate::frequency;
5
6/// Returns the frequencies of each letter of the English alphabet as a map between
7/// characters and percentage of words they appear in. The returned map will include both
8/// lowercase and uppercase characters, with the lowercase and uppercase variant of each
9/// letter having the same frequency value. To get a specific subset, use `Frequency::english_lowercase()`
10/// or `Frequency::english_uppercase()`.
11///
12/// # Performance
13/// This is `O(1)`.
14///
15/// # Returns
16/// A map of letters and their frequencies.
17pub fn english() -> &'static std::collections::HashMap<char, f64> {
18    &ENGLISH_FREQUENCY
19}
20
21/// Returns the frequencies of each letter of the English alphabet as a map between
22/// characters and percentage of words they appear in. The returned map will include
23/// only lowercase characters. To get a different subset, use `Frequency::english_uppercase()`
24/// or `Frequency::english()` for both.
25///
26/// # Performance
27/// This is `O(1)`.
28///
29/// # Returns
30/// A map of letters and their frequencies.
31pub fn english_lowercase() -> &'static std::collections::HashMap<char, f64> {
32    &ENGLISH_LOWERCASE_FREQUENCY
33}
34
35/// Returns the frequencies of each letter of the English alphabet as a map between
36/// characters and percentage of words they appear in. The returned map will include
37/// only uppercase characters. To get a different subset, use `Frequency::english_lowercase()`
38/// or `Frequency::english()` for both.
39///
40/// # Performance
41/// This is `O(1)`.
42///
43/// # Returns
44/// A map of letters and their frequencies.
45pub fn english_uppercase() -> &'static std::collections::HashMap<char, f64> {
46    &ENGLISH_UPPERCASE_FREQUENCY
47}
48
49/// Returns a frequency map of the given text. The returned map maps characters to
50/// the percent of the entire string that the character makes up. To get the counts of each character,
51/// use `frequency::counts()`. This is also case-insensitive; The case-sensitive version is
52/// `frequency::of_cased`.
53///
54/// # Performance
55/// This is `O(n)`.
56///
57/// # Returns
58/// A map of characters and the percentage of the string they make up.
59pub fn of(text: &str) -> std::collections::HashMap<char, f64> {
60    frequency::counts(text)
61        .into_iter()
62        .map(|(character, count)| (character, count as f64 / text.len() as f64))
63        .collect()
64}
65
66/// Returns a frequency map of the given text. The returned map maps characters to
67/// the percent of the entire string that the character makes up. To get the counts of each character,
68/// use `frequency::counts()`. This is also case-sensitive; The case-insensitive version is
69/// `frequency::of`.
70///
71/// # Performance
72/// This is `O(n)`.
73///
74/// # Returns
75/// A map of characters and the percentage of the string they make up.
76pub fn of_cased(text: &str) -> std::collections::HashMap<char, f64> {
77    frequency::cased_counts(text)
78        .into_iter()
79        .map(|(character, count)| (character, count as f64 / text.len() as f64))
80        .collect()
81}
82
83/// Returns a frequency map of the given text. The turned map maps characters to the number of
84/// times they appear in the given string. To get a frequency map that maps characters to percentages,
85/// use `Frequency::of()`.
86///
87/// This function treats uppercase and lowercase as identical, and the returned map contains both for each character.
88/// Use `Frequency::cased_counts()` to retrieve a map that's case-sensitive.
89///
90/// # Performance
91/// This is `O(n)`.
92///
93/// # Returns
94/// A map of characters and the number of times they appear in the given string.
95pub fn counts(text: &str) -> std::collections::HashMap<char, usize> {
96    text.to_lowercase().chars().counts()
97}
98
99/// Returns a frequency map of the given text. The turned map maps characters to the number of
100/// times they appear in the given string. To get a frequency map that maps characters to percentages,
101/// use `frequency::of_cased()`.
102///
103/// This function treats uppercase and lowercase as different, and the returned map contains mappings for both
104/// that are present. Use `frequency::counts()` to retrieve a map that's case-insensitive.
105///
106/// # Performance
107/// This is `O(n)`.
108///
109/// # Returns
110/// A map of characters and the number of times they appear in the given string.
111pub fn cased_counts(text: &str) -> std::collections::HashMap<char, usize> {
112    text.chars().counts()
113}
114
115/// Converts each character in the given text to the character that has the closest frequency in the English alphabet.
116/// This will not reuse characters, i.e., if the closest frequency to 'B' is 'E' and the closest frequency to 'C' is
117/// also 'E', once 'B' is mapped to 'E', 'C' cannot be mapped to 'E' and will be mapped to something else.
118///
119/// # Parameters
120/// - `text` - The text to map to English frequencies
121///
122/// # Returns
123/// The mapped text to English frequencies
124pub fn mapped_to_english(text: &str) -> String {
125    let mut available_frequencies = ENGLISH_LOWERCASE_FREQUENCY.clone();
126    let character_frequencies = frequency::of(text);
127    let mut character_map = std::collections::HashMap::new();
128    text.chars()
129        .map(|character| {
130            *character_map.entry(character).or_insert_with(|| {
131                let new_character = available_frequencies
132                    .iter()
133                    .map(|english| (*english.0, (english.1 - character_frequencies.get(&character).unwrap()).abs()))
134                    .min_by(|first, other| first.1.total_cmp(&other.1))
135                    .unwrap()
136                    .0;
137                available_frequencies.remove(&new_character);
138                new_character
139            })
140        })
141        .collect()
142}
143
144/// Returns the English character whose frequency is closest to the given frequency percentage.
145///
146/// # Parameters
147/// - `frequency` - The frequency to get the closest character of. This should be a small number for
148/// accurate results, i.e., around the range `0.00074 - 0.127`
149pub fn closest_english_letter(frequency: f64) -> char {
150    ENGLISH_LOWERCASE_FREQUENCY
151        .iter()
152        .map(|(letter, english_frequency)| (*letter, (english_frequency - frequency).abs()))
153        .min_by(|first, other| first.1.total_cmp(&other.1))
154        .unwrap()
155        .0
156}
157
158/// Returns a "score" in `(0, 1]` that describes how well the given text's letter frequencies fit the same distribution
159/// as standard English. A higher score (closer to 1) indicates the text's frequency is closer to English.
160///
161/// Note that this only scores the distribution itself, not the actual letter frequencies. For example, a simple monoalphabetic
162/// substitution cipher would get an almost perfect score, since the frequency distribution is unchanged from the plaintext.
163///
164/// # Parameters
165/// - `text` - The text to get the distribution score of.
166///
167/// # Returns
168/// The frequency distribution fitness score, in `(0, 1]`.
169pub fn distribution_score(text: &str) -> f64 {
170    let frequency_map = frequency::of(text);
171    let frequencies = frequency_map.iter().map(|item| item.1).sorted_by(|item, other| item.total_cmp(other)).rev();
172    let english_frequencies = ENGLISH_LOWERCASE_FREQUENCY.values().sorted_by(|item, other| item.total_cmp(other)).rev();
173    let mut differences = Vec::new();
174    for (frequency, english_frequency) in frequencies.zip(english_frequencies) {
175        differences.push(1. - (frequency - english_frequency).abs() / 0.99926);
176    }
177
178    differences.iter().fold(0., |accumulator, current| accumulator + current) / differences.len() as f64
179}
180
181pub fn character_score(text: &str) -> f64 {
182    let scores = frequency::of(text)
183        .into_iter()
184        .filter_map(|(character, frequency)| {
185            ENGLISH_FREQUENCY
186                .get(&character)
187                .map(|english_frequency| 1. - (frequency - english_frequency).abs() / 0.99926)
188        })
189        .collect::<Vec<_>>();
190    scores.iter().fold(0., |accumulator, current| accumulator + current) / scores.len() as f64
191}
192
193lazy_static::lazy_static! {
194    static ref ENGLISH_LOWERCASE_FREQUENCY: std::collections::HashMap<char, f64> = std::collections::HashMap::from([
195        ('a', 0.082),
196        ('b', 0.015),
197        ('c', 0.028),
198        ('d', 0.043),
199        ('e', 0.127),
200        ('f', 0.022),
201        ('g', 0.020),
202        ('h', 0.061),
203        ('i', 0.070),
204        ('j', 0.0015),
205        ('k', 0.0077),
206        ('l', 0.040),
207        ('m', 0.024),
208        ('n', 0.067),
209        ('o', 0.075),
210        ('p', 0.019),
211        ('q', 0.00095),
212        ('r', 0.060),
213        ('s', 0.063),
214        ('t', 0.091),
215        ('u', 0.028),
216        ('v', 0.0098),
217        ('w', 0.024),
218        ('x', 0.0015),
219        ('y', 0.020),
220        ('z', 0.00074),
221    ]);
222    static ref ENGLISH_UPPERCASE_FREQUENCY: std::collections::HashMap<char, f64> = std::collections::HashMap::from([
223        ('A', 0.082),
224        ('B', 0.015),
225        ('C', 0.028),
226        ('D', 0.043),
227        ('E', 0.127),
228        ('F', 0.022),
229        ('G', 0.020),
230        ('H', 0.061),
231        ('I', 0.070),
232        ('J', 0.0015),
233        ('K', 0.0077),
234        ('L', 0.040),
235        ('M', 0.024),
236        ('N', 0.067),
237        ('O', 0.075),
238        ('P', 0.019),
239        ('Q', 0.00095),
240        ('R', 0.060),
241        ('S', 0.063),
242        ('T', 0.091),
243        ('U', 0.028),
244        ('V', 0.0098),
245        ('W', 0.024),
246        ('X', 0.0015),
247        ('Y', 0.020),
248        ('Z', 0.00074)
249    ]);
250    static ref ENGLISH_FREQUENCY: std::collections::HashMap<char, f64> = std::collections::HashMap::from([
251        ('a', 0.082),
252        ('b', 0.015),
253        ('c', 0.028),
254        ('d', 0.043),
255        ('e', 0.127),
256        ('f', 0.022),
257        ('g', 0.020),
258        ('h', 0.061),
259        ('i', 0.070),
260        ('j', 0.0015),
261        ('k', 0.0077),
262        ('l', 0.040),
263        ('m', 0.024),
264        ('n', 0.067),
265        ('o', 0.075),
266        ('p', 0.019),
267        ('q', 0.00095),
268        ('r', 0.060),
269        ('s', 0.063),
270        ('t', 0.091),
271        ('u', 0.028),
272        ('v', 0.0098),
273        ('w', 0.024),
274        ('x', 0.0015),
275        ('y', 0.020),
276        ('z', 0.00074),
277        ('A', 0.082),
278        ('B', 0.015),
279        ('C', 0.028),
280        ('D', 0.043),
281        ('E', 0.127),
282        ('F', 0.022),
283        ('G', 0.020),
284        ('H', 0.061),
285        ('I', 0.070),
286        ('J', 0.0015),
287        ('K', 0.0077),
288        ('L', 0.040),
289        ('M', 0.024),
290        ('N', 0.067),
291        ('O', 0.075),
292        ('P', 0.019),
293        ('Q', 0.00095),
294        ('R', 0.060),
295        ('S', 0.063),
296        ('T', 0.091),
297        ('U', 0.028),
298        ('V', 0.0098),
299        ('W', 0.024),
300        ('X', 0.0015),
301        ('Y', 0.020),
302        ('Z', 0.00074)
303    ]);
304}