datasynth_core/distributions/
line_item.rs1use rand::prelude::*;
13use rand_chacha::ChaCha8Rng;
14use serde::{Deserialize, Serialize};
15
16#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct LineItemDistributionConfig {
21 pub two_items: f64,
23 pub three_items: f64,
25 pub four_items: f64,
27 pub five_items: f64,
29 pub six_items: f64,
31 pub seven_items: f64,
33 pub eight_items: f64,
35 pub nine_items: f64,
37 pub ten_to_ninety_nine: f64,
39 pub hundred_to_nine_ninety_nine: f64,
41 pub thousand_plus: f64,
43}
44
45impl Default for LineItemDistributionConfig {
46 fn default() -> Self {
47 Self {
49 two_items: 0.6068,
50 three_items: 0.0577,
51 four_items: 0.1663,
52 five_items: 0.0306,
53 six_items: 0.0332,
54 seven_items: 0.0113,
55 eight_items: 0.0188,
56 nine_items: 0.0042,
57 ten_to_ninety_nine: 0.0633,
58 hundred_to_nine_ninety_nine: 0.0076,
59 thousand_plus: 0.0002,
60 }
61 }
62}
63
64impl LineItemDistributionConfig {
65 pub fn validate(&self) -> Result<(), String> {
67 let sum = self.two_items
68 + self.three_items
69 + self.four_items
70 + self.five_items
71 + self.six_items
72 + self.seven_items
73 + self.eight_items
74 + self.nine_items
75 + self.ten_to_ninety_nine
76 + self.hundred_to_nine_ninety_nine
77 + self.thousand_plus;
78
79 if (sum - 1.0).abs() > 0.01 {
80 return Err(format!(
81 "Line item distribution probabilities sum to {}, expected ~1.0",
82 sum
83 ));
84 }
85 Ok(())
86 }
87
88 fn cumulative(&self) -> [f64; 11] {
90 let mut cum = [0.0; 11];
91 cum[0] = self.two_items;
92 cum[1] = cum[0] + self.three_items;
93 cum[2] = cum[1] + self.four_items;
94 cum[3] = cum[2] + self.five_items;
95 cum[4] = cum[3] + self.six_items;
96 cum[5] = cum[4] + self.seven_items;
97 cum[6] = cum[5] + self.eight_items;
98 cum[7] = cum[6] + self.nine_items;
99 cum[8] = cum[7] + self.ten_to_ninety_nine;
100 cum[9] = cum[8] + self.hundred_to_nine_ninety_nine;
101 cum[10] = cum[9] + self.thousand_plus;
102 cum
103 }
104}
105
106#[derive(Debug, Clone, Serialize, Deserialize)]
108pub struct EvenOddDistributionConfig {
109 pub even: f64,
111 pub odd: f64,
113}
114
115impl Default for EvenOddDistributionConfig {
116 fn default() -> Self {
117 Self {
119 even: 0.88,
120 odd: 0.12,
121 }
122 }
123}
124
125#[derive(Debug, Clone, Serialize, Deserialize)]
127pub struct DebitCreditDistributionConfig {
128 pub equal: f64,
130 pub more_debit: f64,
132 pub more_credit: f64,
134}
135
136impl Default for DebitCreditDistributionConfig {
137 fn default() -> Self {
138 Self {
140 equal: 0.82,
141 more_debit: 0.07,
142 more_credit: 0.11,
143 }
144 }
145}
146
147pub struct LineItemSampler {
152 rng: ChaCha8Rng,
154 #[allow(dead_code)]
156 line_config: LineItemDistributionConfig,
157 even_odd_config: EvenOddDistributionConfig,
159 debit_credit_config: DebitCreditDistributionConfig,
161 cumulative: [f64; 11],
163}
164
165impl LineItemSampler {
166 pub fn new(seed: u64) -> Self {
168 let line_config = LineItemDistributionConfig::default();
169 let cumulative = line_config.cumulative();
170
171 Self {
172 rng: ChaCha8Rng::seed_from_u64(seed),
173 line_config,
174 even_odd_config: EvenOddDistributionConfig::default(),
175 debit_credit_config: DebitCreditDistributionConfig::default(),
176 cumulative,
177 }
178 }
179
180 pub fn with_config(
182 seed: u64,
183 line_config: LineItemDistributionConfig,
184 even_odd_config: EvenOddDistributionConfig,
185 debit_credit_config: DebitCreditDistributionConfig,
186 ) -> Self {
187 let cumulative = line_config.cumulative();
188
189 Self {
190 rng: ChaCha8Rng::seed_from_u64(seed),
191 line_config,
192 even_odd_config,
193 debit_credit_config,
194 cumulative,
195 }
196 }
197
198 pub fn sample_count(&mut self) -> usize {
200 let p: f64 = self.rng.gen();
201
202 if p < self.cumulative[0] {
204 2
205 } else if p < self.cumulative[1] {
206 3
207 } else if p < self.cumulative[2] {
208 4
209 } else if p < self.cumulative[3] {
210 5
211 } else if p < self.cumulative[4] {
212 6
213 } else if p < self.cumulative[5] {
214 7
215 } else if p < self.cumulative[6] {
216 8
217 } else if p < self.cumulative[7] {
218 9
219 } else if p < self.cumulative[8] {
220 self.rng.gen_range(10..100)
222 } else if p < self.cumulative[9] {
223 self.rng.gen_range(100..1000)
225 } else {
226 self.rng.gen_range(1000..10000)
228 }
229 }
230
231 pub fn sample_even(&mut self) -> bool {
233 self.rng.gen::<f64>() < self.even_odd_config.even
234 }
235
236 pub fn sample_count_with_parity(&mut self) -> usize {
241 let base_count = self.sample_count();
242 let should_be_even = self.sample_even();
243
244 let is_even = base_count % 2 == 0;
246 if should_be_even != is_even {
247 if base_count <= 2 {
249 base_count + 1
251 } else if self.rng.gen::<bool>() {
252 base_count + 1
254 } else {
255 base_count - 1
257 }
258 } else {
259 base_count
260 }
261 }
262
263 pub fn sample_debit_credit_type(&mut self) -> DebitCreditSplit {
265 let p: f64 = self.rng.gen();
266
267 if p < self.debit_credit_config.equal {
268 DebitCreditSplit::Equal
269 } else if p < self.debit_credit_config.equal + self.debit_credit_config.more_debit {
270 DebitCreditSplit::MoreDebit
271 } else {
272 DebitCreditSplit::MoreCredit
273 }
274 }
275
276 pub fn sample(&mut self) -> LineItemSpec {
278 let total_count = self.sample_count_with_parity();
279 let split_type = self.sample_debit_credit_type();
280
281 let (debit_count, credit_count) = match split_type {
282 DebitCreditSplit::Equal => {
283 let half = total_count / 2;
284 (half, total_count - half)
285 }
286 DebitCreditSplit::MoreDebit => {
287 let debit = (total_count as f64 * 0.6).round() as usize;
289 let debit = debit.max(1).min(total_count - 1);
290 (debit, total_count - debit)
291 }
292 DebitCreditSplit::MoreCredit => {
293 let credit = (total_count as f64 * 0.6).round() as usize;
295 let credit = credit.max(1).min(total_count - 1);
296 (total_count - credit, credit)
297 }
298 };
299
300 LineItemSpec {
301 total_count,
302 debit_count,
303 credit_count,
304 split_type,
305 }
306 }
307
308 pub fn reset(&mut self, seed: u64) {
310 self.rng = ChaCha8Rng::seed_from_u64(seed);
311 }
312}
313
/// How an entry's line items are divided between the debit and credit sides.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DebitCreditSplit {
    /// Both sides get half of the lines (rounded down for the debit side).
    Equal,
    /// The debit side is targeted to receive roughly 60% of the lines.
    MoreDebit,
    /// The credit side is targeted to receive roughly 60% of the lines.
    MoreCredit,
}
324
325#[derive(Debug, Clone)]
327pub struct LineItemSpec {
328 pub total_count: usize,
330 pub debit_count: usize,
332 pub credit_count: usize,
334 pub split_type: DebitCreditSplit,
336}
337
338impl LineItemSpec {
339 pub fn is_valid(&self) -> bool {
341 self.total_count >= 2
342 && self.debit_count >= 1
343 && self.credit_count >= 1
344 && self.debit_count + self.credit_count == self.total_count
345 }
346}
347
#[cfg(test)]
mod tests {
    use super::*;

    /// The built-in line-item distribution must pass its own validation.
    #[test]
    fn test_default_config_valid() {
        assert!(LineItemDistributionConfig::default().validate().is_ok());
    }

    /// Two samplers built from the same seed must produce the same counts.
    #[test]
    fn test_sampler_determinism() {
        let mut first = LineItemSampler::new(42);
        let mut second = LineItemSampler::new(42);

        for _ in 0..100 {
            assert_eq!(first.sample_count(), second.sample_count());
        }
    }

    /// Observed frequencies of the two most common counts must land near the
    /// configured probabilities.
    #[test]
    fn test_sampler_distribution() {
        let mut sampler = LineItemSampler::new(42);
        let sample_size = 100_000;

        let mut counts = std::collections::HashMap::new();
        for _ in 0..sample_size {
            *counts.entry(sampler.sample_count()).or_insert(0) += 1;
        }

        // Share of all draws that produced exactly `items` line items.
        let frequency =
            |items: usize| *counts.get(&items).unwrap_or(&0) as f64 / sample_size as f64;

        let two_share = frequency(2);
        assert!(
            two_share > 0.55 && two_share < 0.65,
            "Expected ~60% 2-item entries, got {}%",
            two_share * 100.0
        );

        let four_share = frequency(4);
        assert!(
            four_share > 0.13 && four_share < 0.20,
            "Expected ~16% 4-item entries, got {}%",
            four_share * 100.0
        );
    }

    /// Every sampled spec must satisfy `LineItemSpec::is_valid`.
    #[test]
    fn test_line_item_spec_valid() {
        let mut sampler = LineItemSampler::new(42);

        for spec in (0..1000).map(|_| sampler.sample()) {
            assert!(spec.is_valid(), "Invalid spec: {:?}", spec);
        }
    }
}