datasynth_core/distributions/
line_item.rs1use rand::prelude::*;
13use rand_chacha::ChaCha8Rng;
14use serde::{Deserialize, Serialize};
15
16#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct LineItemDistributionConfig {
21 pub two_items: f64,
23 pub three_items: f64,
25 pub four_items: f64,
27 pub five_items: f64,
29 pub six_items: f64,
31 pub seven_items: f64,
33 pub eight_items: f64,
35 pub nine_items: f64,
37 pub ten_to_ninety_nine: f64,
39 pub hundred_to_nine_ninety_nine: f64,
41 pub thousand_plus: f64,
43}
44
45impl Default for LineItemDistributionConfig {
46 fn default() -> Self {
47 Self {
49 two_items: 0.6068,
50 three_items: 0.0577,
51 four_items: 0.1663,
52 five_items: 0.0306,
53 six_items: 0.0332,
54 seven_items: 0.0113,
55 eight_items: 0.0188,
56 nine_items: 0.0042,
57 ten_to_ninety_nine: 0.0633,
58 hundred_to_nine_ninety_nine: 0.0076,
59 thousand_plus: 0.0002,
60 }
61 }
62}
63
64impl LineItemDistributionConfig {
65 pub fn validate(&self) -> Result<(), String> {
67 let sum = self.two_items
68 + self.three_items
69 + self.four_items
70 + self.five_items
71 + self.six_items
72 + self.seven_items
73 + self.eight_items
74 + self.nine_items
75 + self.ten_to_ninety_nine
76 + self.hundred_to_nine_ninety_nine
77 + self.thousand_plus;
78
79 if (sum - 1.0).abs() > 0.01 {
80 return Err(format!(
81 "Line item distribution probabilities sum to {}, expected ~1.0",
82 sum
83 ));
84 }
85 Ok(())
86 }
87
88 fn cumulative(&self) -> [f64; 11] {
90 let mut cum = [0.0; 11];
91 cum[0] = self.two_items;
92 cum[1] = cum[0] + self.three_items;
93 cum[2] = cum[1] + self.four_items;
94 cum[3] = cum[2] + self.five_items;
95 cum[4] = cum[3] + self.six_items;
96 cum[5] = cum[4] + self.seven_items;
97 cum[6] = cum[5] + self.eight_items;
98 cum[7] = cum[6] + self.nine_items;
99 cum[8] = cum[7] + self.ten_to_ninety_nine;
100 cum[9] = cum[8] + self.hundred_to_nine_ninety_nine;
101 cum[10] = cum[9] + self.thousand_plus;
102 cum
103 }
104}
105
106#[derive(Debug, Clone, Serialize, Deserialize)]
108pub struct EvenOddDistributionConfig {
109 pub even: f64,
111 pub odd: f64,
113}
114
115impl Default for EvenOddDistributionConfig {
116 fn default() -> Self {
117 Self {
119 even: 0.88,
120 odd: 0.12,
121 }
122 }
123}
124
125#[derive(Debug, Clone, Serialize, Deserialize)]
127pub struct DebitCreditDistributionConfig {
128 pub equal: f64,
130 pub more_debit: f64,
132 pub more_credit: f64,
134}
135
136impl Default for DebitCreditDistributionConfig {
137 fn default() -> Self {
138 Self {
140 equal: 0.82,
141 more_debit: 0.07,
142 more_credit: 0.11,
143 }
144 }
145}
146
147pub struct LineItemSampler {
152 rng: ChaCha8Rng,
154 even_odd_config: EvenOddDistributionConfig,
156 debit_credit_config: DebitCreditDistributionConfig,
158 cumulative: [f64; 11],
160}
161
162impl LineItemSampler {
163 pub fn new(seed: u64) -> Self {
165 let line_config = LineItemDistributionConfig::default();
166 let cumulative = line_config.cumulative();
167
168 Self {
169 rng: ChaCha8Rng::seed_from_u64(seed),
170 even_odd_config: EvenOddDistributionConfig::default(),
171 debit_credit_config: DebitCreditDistributionConfig::default(),
172 cumulative,
173 }
174 }
175
176 pub fn with_config(
178 seed: u64,
179 line_config: LineItemDistributionConfig,
180 even_odd_config: EvenOddDistributionConfig,
181 debit_credit_config: DebitCreditDistributionConfig,
182 ) -> Self {
183 let cumulative = line_config.cumulative();
184
185 Self {
186 rng: ChaCha8Rng::seed_from_u64(seed),
187 even_odd_config,
188 debit_credit_config,
189 cumulative,
190 }
191 }
192
193 pub fn sample_count(&mut self) -> usize {
195 let p: f64 = self.rng.random();
196
197 if p < self.cumulative[0] {
199 2
200 } else if p < self.cumulative[1] {
201 3
202 } else if p < self.cumulative[2] {
203 4
204 } else if p < self.cumulative[3] {
205 5
206 } else if p < self.cumulative[4] {
207 6
208 } else if p < self.cumulative[5] {
209 7
210 } else if p < self.cumulative[6] {
211 8
212 } else if p < self.cumulative[7] {
213 9
214 } else if p < self.cumulative[8] {
215 self.rng.random_range(10..100)
217 } else if p < self.cumulative[9] {
218 self.rng.random_range(100..1000)
220 } else {
221 self.rng.random_range(1000..10000)
223 }
224 }
225
226 pub fn sample_even(&mut self) -> bool {
228 self.rng.random::<f64>() < self.even_odd_config.even
229 }
230
231 pub fn sample_count_with_parity(&mut self) -> usize {
236 let base_count = self.sample_count();
237 let should_be_even = self.sample_even();
238
239 let is_even = base_count.is_multiple_of(2);
241 if should_be_even != is_even {
242 if base_count <= 2 {
244 base_count + 1
246 } else if self.rng.random::<bool>() {
247 base_count + 1
249 } else {
250 base_count - 1
252 }
253 } else {
254 base_count
255 }
256 }
257
258 pub fn sample_debit_credit_type(&mut self) -> DebitCreditSplit {
260 let p: f64 = self.rng.random();
261
262 if p < self.debit_credit_config.equal {
263 DebitCreditSplit::Equal
264 } else if p < self.debit_credit_config.equal + self.debit_credit_config.more_debit {
265 DebitCreditSplit::MoreDebit
266 } else {
267 DebitCreditSplit::MoreCredit
268 }
269 }
270
271 pub fn sample(&mut self) -> LineItemSpec {
273 let total_count = self.sample_count_with_parity();
274 let split_type = self.sample_debit_credit_type();
275
276 let (debit_count, credit_count) = match split_type {
277 DebitCreditSplit::Equal => {
278 let half = total_count / 2;
279 (half, total_count - half)
280 }
281 DebitCreditSplit::MoreDebit => {
282 let debit = (total_count as f64 * 0.6).round() as usize;
284 let debit = debit.max(1).min(total_count - 1);
285 (debit, total_count - debit)
286 }
287 DebitCreditSplit::MoreCredit => {
288 let credit = (total_count as f64 * 0.6).round() as usize;
290 let credit = credit.max(1).min(total_count - 1);
291 (total_count - credit, credit)
292 }
293 };
294
295 LineItemSpec {
296 total_count,
297 debit_count,
298 credit_count,
299 split_type,
300 }
301 }
302
303 pub fn reset(&mut self, seed: u64) {
305 self.rng = ChaCha8Rng::seed_from_u64(seed);
306 }
307}
308
/// How an entry's lines are divided between the debit and credit sides.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DebitCreditSplit {
    /// Debit and credit sides get the same number of lines (or differ by one
    /// when the total is odd).
    Equal,
    /// The debit side gets the larger share of the lines.
    MoreDebit,
    /// The credit side gets the larger share of the lines.
    MoreCredit,
}
319
320#[derive(Debug, Clone)]
322pub struct LineItemSpec {
323 pub total_count: usize,
325 pub debit_count: usize,
327 pub credit_count: usize,
329 pub split_type: DebitCreditSplit,
331}
332
333impl LineItemSpec {
334 pub fn is_valid(&self) -> bool {
336 self.total_count >= 2
337 && self.debit_count >= 1
338 && self.credit_count >= 1
339 && self.debit_count + self.credit_count == self.total_count
340 }
341}
342
#[cfg(test)]
// Kept as in the original; the tests below only use `unwrap_or`.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// The built-in line-count weights must pass their own validation.
    #[test]
    fn test_default_config_valid() {
        assert!(LineItemDistributionConfig::default().validate().is_ok());
    }

    /// Two samplers built from the same seed must produce the same counts.
    #[test]
    fn test_sampler_determinism() {
        let mut first = LineItemSampler::new(42);
        let mut second = LineItemSampler::new(42);

        for _ in 0..100 {
            assert_eq!(first.sample_count(), second.sample_count());
        }
    }

    /// Over 100,000 draws, the shares of 2-item and 4-item counts must land
    /// near their configured weights.
    #[test]
    fn test_sampler_distribution() {
        let mut sampler = LineItemSampler::new(42);
        let sample_size = 100_000;

        let mut counts: std::collections::HashMap<usize, i32> = std::collections::HashMap::new();
        for _ in 0..sample_size {
            *counts.entry(sampler.sample_count()).or_insert(0) += 1;
        }

        // Fraction of all draws that produced exactly `n` line items.
        let share = |n: usize| *counts.get(&n).unwrap_or(&0) as f64 / sample_size as f64;

        let two_count = share(2);
        assert!(
            two_count > 0.55 && two_count < 0.65,
            "Expected ~60% 2-item entries, got {}%",
            two_count * 100.0
        );

        let four_count = share(4);
        assert!(
            four_count > 0.13 && four_count < 0.20,
            "Expected ~16% 4-item entries, got {}%",
            four_count * 100.0
        );
    }

    /// Every sampled specification must satisfy `LineItemSpec::is_valid`.
    #[test]
    fn test_line_item_spec_valid() {
        let mut sampler = LineItemSampler::new(42);

        for _ in 0..1000 {
            let spec = sampler.sample();
            assert!(spec.is_valid(), "Invalid spec: {:?}", spec);
        }
    }
}