rand_utf8/
lib.rs

1#![no_std]
2#![deny(unsafe_code)]
3#![deny(missing_docs)]
4#![deny(warnings)]
5//! Random utf8 utility. This crate is `#![no_std]` but requires `alloc`.
6//!
7//! ### Example
8//!
9//! ```rust
10//! # use rand::SeedableRng;
11//! let mut rng = rand::rngs::SmallRng::seed_from_u64(0);
12//! let my_str = rand_utf8::rand_utf8(&mut rng, 32);
13//! assert_eq!(32, my_str.as_bytes().len());
14//! ```
15
16extern crate alloc;
17
18use alloc::boxed::Box;
19use alloc::collections::VecDeque;
20use alloc::string::String;
21use alloc::vec::Vec;
22
23struct CharUtf8Kind {
24    buf: [u8; 32],
25    chars: VecDeque<char>,
26}
27
28impl CharUtf8Kind {
29    pub fn new() -> Self {
30        Self {
31            buf: [0; 32],
32            chars: VecDeque::with_capacity(32),
33        }
34    }
35
36    pub fn next<R: rand::Rng>(&mut self, rng: &mut R) -> char {
37        if self.chars.is_empty() {
38            rng.fill(&mut self.buf);
39            for c in String::from_utf8_lossy(&self.buf).chars() {
40                if c as u32 > 0 && c != char::REPLACEMENT_CHARACTER {
41                    self.chars.push_back(c);
42                }
43            }
44        }
45        match self.chars.pop_front() {
46            None => self.next(rng),
47            Some(c) => c,
48        }
49    }
50}
51
52struct CharU32Kind;
53
54impl CharU32Kind {
55    pub fn new() -> Self {
56        Self
57    }
58
59    pub fn next<R: rand::Rng>(&mut self, rng: &mut R) -> char {
60        loop {
61            let c = rng.random_range(1..=0x110000);
62            if let Some(c) = char::from_u32(c) {
63                return c;
64            }
65        }
66    }
67}
68
69/// Generate a valid random unicode string, targeting well distributed
70/// utf8 bytes.
71/// rand::distributions::DistString produces random u32 code points,
72/// but the majority of these code points produce 4 utf8 bytes each of which
73/// are > 128, resulting in poor distribution.
74/// This function mixes in random valid utf8 bytes < 128 to fix this issue.
75/// Bytes may not be very well distributed if len < 8.
76pub fn rand_utf8<R: rand::Rng>(rng: &mut R, len: usize) -> Box<str> {
77    let mut chars = Vec::with_capacity(len);
78    let mut byte_count = 0;
79    let mut utf8_kind = CharUtf8Kind::new();
80    let mut u32_kind = CharU32Kind::new();
81
82    while byte_count < len {
83        let kind = if len - byte_count < 4 {
84            // if we're nearing the end, we need the smaller utf8 kind
85            0
86        } else {
87            // 0, 1, 2, and 3 will give us the smaller utf8 kind
88            // 4 will give us the larger u32 kind
89            rng.random_range(0..=4)
90        };
91
92        if kind < 4 {
93            // do the smaller utf8 kind generation which tends to make
94            // single utf8 bytes < 128
95            let c = utf8_kind.next(rng);
96            let c_len = c.len_utf8();
97            if byte_count + c_len > len {
98                continue;
99            }
100            byte_count += c_len;
101            chars.push(c);
102        } else {
103            // do the larger u32 kind generation which tends to make
104            // 4 byte utf8 blocks with all bytes > 128
105            let c = u32_kind.next(rng);
106            byte_count += c.len_utf8();
107            chars.push(c);
108        }
109    }
110
111    use rand::seq::SliceRandom;
112    chars.shuffle(rng);
113
114    String::from_iter(chars.iter()).into_boxed_str()
115}
116
#[cfg(test)]
mod tests {
    use super::*;

    /// Every requested length from 8 through 99 must come back as exactly
    /// that many utf8 bytes.
    #[test]
    fn correct_size() {
        use rand::SeedableRng;
        let mut rng = rand::rngs::SmallRng::seed_from_u64(0);

        for wanted in 8..100 {
            let generated = rand_utf8(&mut rng, wanted);
            assert_eq!(wanted, generated.len());
        }
    }

    /// Add one to the histogram slot of every byte in `text`.
    fn tally(histogram: &mut [u32; 256], text: &str) {
        text.bytes()
            .for_each(|b| histogram[usize::from(b)] += 1);
    }

    /// Print per-byte statistics for `distribution` and, when `do_assert`
    /// is set, check each byte's closeness to the mean against fixed
    /// thresholds.
    fn validate_distribution(do_assert: bool, distribution: &[u32; 256]) {
        // min / max are seeded from slot 1 and then folded over every slot.
        let seed = distribution[1];
        let min = distribution.iter().copied().fold(seed, u32::min);
        let max = distribution.iter().copied().fold(seed, u32::max);

        let total = distribution
            .iter()
            .fold(0.0_f64, |acc, &n| acc + f64::from(n));
        let avg = total / distribution.len() as f64;

        libc_print::libc_println!("min: {}, max: {}, mean: {}", min, max, avg,);

        let mut score_sum = 0.0;

        for (byte, &raw) in distribution.iter().enumerate() {
            let count = f64::from(raw);

            // Ratio of the smaller to the larger of (count, mean): 1.0 means
            // the byte occurs exactly as often as the mean.
            let (smaller, larger) = if count > avg {
                (avg, count)
            } else {
                (count, avg)
            };
            let dif = smaller / larger;

            score_sum += dif;

            libc_print::libc_println!("{:03}: {:04} {:0.2}", byte, count, dif);

            if !do_assert {
                continue;
            }

            // this is pretty arbitrary, but if we tweak the algorithm,
            // and it breaks the tests, at least we'll have to
            // consciously change this:
            match byte {
                1..=191 => assert!(dif > 0.5),
                194..=244 => assert!(dif > 0.06),
                _ => {}
            }
        }

        libc_print::libc_println!("-- score_sum: {:0.2} --", score_sum);
    }

    /// Build a byte histogram with `f(1024, 32)` and validate it.
    fn distribution_test<F>(do_assert: bool, f: F)
    where
        F: FnOnce(usize, usize) -> [u32; 256],
    {
        validate_distribution(do_assert, &f(1024, 32));
    }

    /// Print the byte distribution of strings sampled straight from `rand`
    /// (not asserted), then print and assert the distribution of
    /// `rand_utf8`.
    #[test]
    fn distribution() {
        libc_print::libc_println!("# rand::distributions::DistString");

        distribution_test(false, |count, len| {
            use rand::distr::SampleString;
            use rand::SeedableRng;
            let mut rng = rand::rngs::SmallRng::seed_from_u64(2);

            let mut histogram = [0_u32; 256];
            for _ in 0..count {
                let sample = rand::distr::StandardUniform.sample_string(&mut rng, len);
                tally(&mut histogram, &sample);
            }
            histogram
        });

        libc_print::libc_println!("# rand_utf8");

        distribution_test(true, |count, len| {
            use rand::SeedableRng;
            let mut rng = rand::rngs::SmallRng::seed_from_u64(1);

            let mut histogram = [0_u32; 256];
            for _ in 0..count {
                let sample = rand_utf8(&mut rng, len);
                tally(&mut histogram, &sample);
            }
            histogram
        });
    }
}