datasynth-generators 2.4.0

50+ data generators covering GL, P2P, O2C, S2C, HR, manufacturing, audit, tax, treasury, and ESG
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
//! Expense report generator for the Hire-to-Retire (H2R) process.
//!
//! Generates employee expense reports with realistic line items across categories
//! (travel, meals, lodging, transportation, etc.), policy violation detection,
//! and approval workflow statuses.

use chrono::{Datelike, NaiveDate};
use datasynth_config::schema::ExpenseConfig;
use datasynth_core::models::{ExpenseCategory, ExpenseLineItem, ExpenseReport, ExpenseStatus};
use datasynth_core::utils::{sample_decimal_range, seeded_rng};
use datasynth_core::uuid_factory::{DeterministicUuidFactory, GeneratorType};
use rand::prelude::*;
use rand_chacha::ChaCha8Rng;
use rust_decimal::Decimal;
use smallvec::SmallVec;
use std::collections::HashMap;
use tracing::debug;

/// Generates [`ExpenseReport`] records for employees over a period.
///
/// All randomness comes from a single seeded RNG and two deterministic UUID
/// factories, so two generators built with the same seed and driven with the
/// same inputs produce the same reports.
pub struct ExpenseReportGenerator {
    /// Seeded random source used for every sampling decision in the generator.
    rng: ChaCha8Rng,
    /// Source of report-level IDs (`ExpenseReport::report_id`).
    uuid_factory: DeterministicUuidFactory,
    /// Source of line-item IDs (`ExpenseLineItem::item_id`); created with
    /// sub-discriminator `1` so it is a separate factory from `uuid_factory`.
    item_uuid_factory: DeterministicUuidFactory,
    /// Stored configuration used by `generate_from_config`; supplies the
    /// submission rate and the policy violation rate.
    config: ExpenseConfig,
    /// Pool of real employee IDs for `approved_by` references; when empty,
    /// placeholder manager IDs are fabricated instead.
    employee_ids_pool: Vec<String>,
    /// Pool of real cost center IDs; when empty, placeholder cost center IDs
    /// are fabricated instead.
    cost_center_ids_pool: Vec<String>,
    /// Mapping of employee_id → employee_name for denormalization (DS-011).
    employee_names: HashMap<String, String>,
    /// Optional country pack for locale-aware generation (set via
    /// `set_country_pack`); `generate` takes the report currency from its
    /// locale default currency when that value is non-empty.
    country_pack: Option<datasynth_core::CountryPack>,
}

impl ExpenseReportGenerator {
    /// Create a new expense report generator with default configuration.
    pub fn new(seed: u64) -> Self {
        Self {
            rng: seeded_rng(seed, 0),
            uuid_factory: DeterministicUuidFactory::new(seed, GeneratorType::ExpenseReport),
            item_uuid_factory: DeterministicUuidFactory::with_sub_discriminator(
                seed,
                GeneratorType::ExpenseReport,
                1,
            ),
            config: ExpenseConfig::default(),
            employee_ids_pool: Vec::new(),
            cost_center_ids_pool: Vec::new(),
            employee_names: HashMap::new(),
            country_pack: None,
        }
    }

    /// Build a generator seeded with `seed` that stores the supplied `config`.
    ///
    /// The RNG and both UUID factories are derived from `seed`. The ID pools
    /// and the employee-name map start empty and no country pack is attached;
    /// use the `with_*` builders and `set_country_pack` to provide them.
    pub fn with_config(seed: u64, config: ExpenseConfig) -> Self {
        let rng = seeded_rng(seed, 0);
        let uuid_factory = DeterministicUuidFactory::new(seed, GeneratorType::ExpenseReport);
        // Line items get their own factory (sub-discriminator 1), presumably so
        // their IDs never coincide with report IDs from the same seed.
        let item_uuid_factory = DeterministicUuidFactory::with_sub_discriminator(
            seed,
            GeneratorType::ExpenseReport,
            1,
        );

        Self {
            rng,
            uuid_factory,
            item_uuid_factory,
            config,
            employee_ids_pool: Vec::new(),
            cost_center_ids_pool: Vec::new(),
            employee_names: HashMap::new(),
            country_pack: None,
        }
    }

    /// Attach a country pack for locale-aware generation.
    ///
    /// Once a pack is set, `generate` uses the pack's
    /// `locale.default_currency` as the report currency, falling back to
    /// `"USD"` when that value is empty.  Without a pack the currency is
    /// always `"USD"`.  Any previously attached pack is replaced.
    pub fn set_country_pack(&mut self, pack: datasynth_core::CountryPack) {
        let attached = Some(pack);
        self.country_pack = attached;
    }

    /// Provide ID pools so generated references point at real entities.
    ///
    /// With a non-empty `employee_ids` pool, `approved_by` is drawn from it;
    /// with a non-empty `cost_center_ids` pool, `cost_center` is drawn from
    /// it.  An empty pool makes the generator fall back to fabricated
    /// placeholder IDs for that field.  Both pools are replaced wholesale.
    pub fn with_pools(mut self, employee_ids: Vec<String>, cost_center_ids: Vec<String>) -> Self {
        (self.employee_ids_pool, self.cost_center_ids_pool) = (employee_ids, cost_center_ids);
        self
    }

    /// Set the employee name mapping for denormalization (DS-011).
    ///
    /// Maps employee IDs to their display names so that generated expense
    /// reports include the employee name for graph export convenience.
    pub fn with_employee_names(mut self, names: HashMap<String, String>) -> Self {
        self.employee_names = names;
        self
    }

    /// Generate expense reports driven by the configuration stored on `self`.
    ///
    /// Forwards to `generate` with a copy of `self.config`, so the submission
    /// rate and policy violation rate come from the stored configuration and
    /// the currency from the attached country pack (if any).
    pub fn generate_from_config(
        &mut self,
        employee_ids: &[String],
        period_start: NaiveDate,
        period_end: NaiveDate,
    ) -> Vec<ExpenseReport> {
        // `generate` borrows `self` mutably, so it cannot also be handed a
        // reference into `self.config`; work from a snapshot instead.
        let stored_config = self.config.clone();
        self.generate(employee_ids, period_start, period_end, &stored_config)
    }

    /// Generate expense reports for employees over the given period.
    ///
    /// The report currency is the attached country pack's
    /// `locale.default_currency` when a pack is set and that value is
    /// non-empty; otherwise it is `"USD"`.  Everything else is delegated to
    /// `generate_with_currency`.
    ///
    /// # Arguments
    ///
    /// * `employee_ids` - Slice of employee identifiers
    /// * `period_start` - Start of the period (inclusive)
    /// * `period_end` - End of the period (inclusive)
    /// * `config` - Expense management configuration (used instead of the
    ///   stored config)
    pub fn generate(
        &mut self,
        employee_ids: &[String],
        period_start: NaiveDate,
        period_end: NaiveDate,
        config: &ExpenseConfig,
    ) -> Vec<ExpenseReport> {
        let currency = match &self.country_pack {
            Some(pack) if !pack.locale.default_currency.is_empty() => {
                pack.locale.default_currency.clone()
            }
            _ => "USD".to_string(),
        };

        self.generate_with_currency(employee_ids, period_start, period_end, config, &currency)
    }

    /// Generate expense reports with a specific company currency.
    ///
    /// The period is walked one calendar month at a time.  The first chunk
    /// starts at `period_start` (even if that is mid-month) and every chunk
    /// ends at the month's last day or `period_end`, whichever comes first.
    /// Within each chunk every employee independently submits one report with
    /// probability `config.submission_rate`.
    ///
    /// `config.submission_rate` is limited to `[0, 1]` before use: values
    /// above 1 mean "always submit" and negative values mean "never submit".
    ///
    /// Returns the reports in generation order (month by month, employees in
    /// slice order).  An empty result is returned when `period_start` is after
    /// `period_end`.
    pub fn generate_with_currency(
        &mut self,
        employee_ids: &[String],
        period_start: NaiveDate,
        period_end: NaiveDate,
        config: &ExpenseConfig,
        currency: &str,
    ) -> Vec<ExpenseReport> {
        debug!(employee_count = employee_ids.len(), %period_start, %period_end, currency, "Generating expense reports");

        // `random_bool` panics for probabilities outside [0, 1], so bound the
        // configured rate on both sides.  A NaN rate keeps its previous
        // treatment: `f64::min` resolves it to 1.0.
        let configured_rate = config.submission_rate;
        let submit_probability = if configured_rate < 0.0 {
            0.0
        } else {
            configured_rate.min(1.0)
        };

        let mut reports = Vec::new();

        // Iterate over each month in the period
        let mut current_month_start = period_start;
        while current_month_start <= period_end {
            let month_end = self.month_end(current_month_start).min(period_end);

            for employee_id in employee_ids {
                // Only submission_rate fraction of employees submit per month
                if self.rng.random_bool(submit_probability) {
                    let report = self.generate_report(
                        employee_id,
                        current_month_start,
                        month_end,
                        config,
                        currency,
                    );
                    reports.push(report);
                }
            }

            // Advance to next month.  `next_month_start` falls back to its
            // input when the next month is not representable; stop in that
            // case instead of looping on the same month forever.
            let following_month_start = self.next_month_start(current_month_start);
            if following_month_start <= current_month_start {
                break;
            }
            current_month_start = following_month_start;
        }

        reports
    }

    /// Generate a single expense report for an employee within a date range.
    ///
    /// The report contains 1–5 line items produced by `generate_line_item`
    /// for `period_start..=period_end`, is submitted 0–5 days after its latest
    /// line item, and receives a randomly drawn status.  Approver and approval
    /// date are filled only for `Approved`/`Paid` reports, the payment date
    /// only for `Paid` reports.  Each line item independently triggers a
    /// policy violation with probability `config.policy_violation_rate`
    /// (limited to `[0, 1]`; negative values mean "never").
    ///
    /// The sequence of RNG draws in this function determines the output for a
    /// given seed, so the order of the sampling steps must be kept stable.
    fn generate_report(
        &mut self,
        employee_id: &str,
        period_start: NaiveDate,
        period_end: NaiveDate,
        config: &ExpenseConfig,
        currency: &str,
    ) -> ExpenseReport {
        let report_id = self.uuid_factory.next().to_string();

        // 1-5 line items per report
        let item_count = self.rng.random_range(1..=5);
        let mut line_items = SmallVec::with_capacity(item_count);
        let mut total_amount = Decimal::ZERO;

        for _ in 0..item_count {
            let item = self.generate_line_item(period_start, period_end, currency);
            total_amount += item.amount;
            line_items.push(item);
        }

        // Submission date: usually within a few days after the last expense
        let max_expense_date = line_items
            .iter()
            .map(|li: &ExpenseLineItem| li.date)
            .max()
            .unwrap_or(period_end);
        let submission_lag = self.rng.random_range(0..=5);
        let submission_date = max_expense_date + chrono::Duration::days(submission_lag);

        // Trip/purpose descriptions
        let descriptions = [
            "Client site visit",
            "Conference attendance",
            "Team offsite meeting",
            "Customer presentation",
            "Training workshop",
            "Quarterly review travel",
            "Sales meeting",
            "Project kickoff",
        ];
        let description = descriptions[self.rng.random_range(0..descriptions.len())].to_string();

        // Status distribution: 70% Approved, 10% Paid, 10% Submitted, 5% Rejected, 5% Draft
        let status_roll: f64 = self.rng.random();
        let status = if status_roll < 0.70 {
            ExpenseStatus::Approved
        } else if status_roll < 0.80 {
            ExpenseStatus::Paid
        } else if status_roll < 0.90 {
            ExpenseStatus::Submitted
        } else if status_roll < 0.95 {
            ExpenseStatus::Rejected
        } else {
            ExpenseStatus::Draft
        };

        // Only reports that passed approval carry an approver and approval date.
        let passed_approval = matches!(status, ExpenseStatus::Approved | ExpenseStatus::Paid);

        let approved_by = if passed_approval {
            if !self.employee_ids_pool.is_empty() {
                let idx = self.rng.random_range(0..self.employee_ids_pool.len());
                Some(self.employee_ids_pool[idx].clone())
            } else {
                // No real employees supplied: fabricate a placeholder manager ID.
                Some(format!("MGR-{:04}", self.rng.random_range(1..=100)))
            }
        } else {
            None
        };

        let approved_date = if passed_approval {
            let approval_lag = self.rng.random_range(1..=7);
            Some(submission_date + chrono::Duration::days(approval_lag))
        } else {
            None
        };

        // Payment follows approval by 3-14 days.
        let paid_date = if status == ExpenseStatus::Paid {
            approved_date.map(|ad| ad + chrono::Duration::days(self.rng.random_range(3..=14)))
        } else {
            None
        };

        // Cost center and department
        let cost_center = if self.rng.random_bool(0.70) {
            if !self.cost_center_ids_pool.is_empty() {
                let idx = self.rng.random_range(0..self.cost_center_ids_pool.len());
                Some(self.cost_center_ids_pool[idx].clone())
            } else {
                // No real cost centers supplied: fabricate a placeholder ID.
                Some(format!("CC-{:03}", self.rng.random_range(100..=500)))
            }
        } else {
            None
        };

        let department = if self.rng.random_bool(0.80) {
            let departments = [
                "Engineering",
                "Sales",
                "Marketing",
                "Finance",
                "HR",
                "Operations",
                "Legal",
                "IT",
                "Executive",
            ];
            Some(departments[self.rng.random_range(0..departments.len())].to_string())
        } else {
            None
        };

        // Policy violations: based on config.policy_violation_rate per line item.
        // `random_bool` panics for probabilities outside [0, 1], so bound the
        // configured rate on both sides.  A NaN rate keeps its previous
        // treatment: `f64::min` resolves it to 1.0.
        let configured_violation_rate = config.policy_violation_rate;
        let violation_probability = if configured_violation_rate < 0.0 {
            0.0
        } else {
            configured_violation_rate.min(1.0)
        };
        let mut policy_violations = Vec::new();
        for item in &line_items {
            if self.rng.random_bool(violation_probability) {
                let violation = self.pick_violation(item);
                policy_violations.push(violation);
            }
        }

        ExpenseReport {
            report_id,
            employee_id: employee_id.to_string(),
            submission_date,
            description,
            status,
            total_amount,
            currency: currency.to_string(),
            line_items,
            approved_by,
            approved_date,
            paid_date,
            cost_center,
            department,
            policy_violations,
            employee_name: self.employee_names.get(employee_id).cloned(),
        }
    }

    /// Generate a single expense line item with a random category and amount.
    ///
    /// The category, its amount bounds, description and optional merchant come
    /// from `pick_category`.  The amount is sampled via `sample_decimal_range`
    /// between those bounds and rounded to two decimal places.  The expense
    /// date is drawn from `period_start..=period_end` (both ends inclusive),
    /// and a receipt is marked as attached 85% of the time.
    fn generate_line_item(
        &mut self,
        period_start: NaiveDate,
        period_end: NaiveDate,
        currency: &str,
    ) -> ExpenseLineItem {
        let item_id = self.item_uuid_factory.next().to_string();

        // Pick a category and generate an appropriate amount range
        let (category, amount_min, amount_max, desc, merchant) = self.pick_category();

        let amount = sample_decimal_range(
            &mut self.rng,
            Decimal::from_f64_retain(amount_min).unwrap_or(Decimal::ONE),
            Decimal::from_f64_retain(amount_max).unwrap_or(Decimal::ONE),
        )
        .round_dp(2);

        // Date within the period.  The offset range below is inclusive, so a
        // zero-day span (period_start == period_end) is already a valid range
        // and must stay at offset 0; flooring the span at 1 would allow a date
        // one day past `period_end`.  An inverted period is treated like a
        // zero-day span.
        let days_in_period = (period_end - period_start).num_days().max(0);
        let offset = self.rng.random_range(0..=days_in_period);
        let date = period_start + chrono::Duration::days(offset);

        // Receipt attached: 85% of the time
        let receipt_attached = self.rng.random_bool(0.85);

        ExpenseLineItem {
            item_id,
            category,
            date,
            amount,
            currency: currency.to_string(),
            description: desc,
            receipt_attached,
            merchant,
        }
    }

    /// Choose an expense category together with its amount bounds, description
    /// and optional merchant.
    ///
    /// Returns `(category, amount_min, amount_max, description, merchant)`.
    /// Category weights: Travel 20%, Meals 20%, Lodging 15%, Transportation
    /// 15%, Office 10%, Entertainment 8%, Training 7%, Other 5%.  Only the
    /// first four categories draw a merchant at random; Office always uses a
    /// fixed merchant and the remaining categories have none.
    fn pick_category(&mut self) -> (ExpenseCategory, f64, f64, String, Option<String>) {
        let roll: f64 = self.rng.random();

        // Draws one name from a merchant list.  It is only invoked by the
        // categories that have a list, so those are the only ones that
        // consume a second random number.
        let mut random_merchant = |names: &[&str]| -> Option<String> {
            let pick = self.rng.random_range(0..names.len());
            Some(names[pick].to_string())
        };

        match roll {
            r if r < 0.20 => (
                ExpenseCategory::Travel,
                200.0,
                2000.0,
                "Airfare - business travel".to_string(),
                random_merchant(&[
                    "Delta Airlines",
                    "United Airlines",
                    "American Airlines",
                    "Southwest",
                ]),
            ),
            r if r < 0.40 => (
                ExpenseCategory::Meals,
                20.0,
                100.0,
                "Business meal".to_string(),
                random_merchant(&[
                    "Restaurant ABC",
                    "Cafe Express",
                    "Business Lunch Co",
                    "Steakhouse Prime",
                    "Sushi Palace",
                ]),
            ),
            r if r < 0.55 => (
                ExpenseCategory::Lodging,
                100.0,
                500.0,
                "Hotel accommodation".to_string(),
                random_merchant(&["Marriott", "Hilton", "Hyatt", "Holiday Inn", "Best Western"]),
            ),
            r if r < 0.70 => (
                ExpenseCategory::Transportation,
                10.0,
                200.0,
                "Ground transportation".to_string(),
                random_merchant(&["Uber", "Lyft", "Hertz", "Enterprise", "Airport Parking"]),
            ),
            r if r < 0.80 => (
                ExpenseCategory::Office,
                15.0,
                300.0,
                "Office supplies".to_string(),
                Some("Office Depot".to_string()),
            ),
            r if r < 0.88 => (
                ExpenseCategory::Entertainment,
                50.0,
                500.0,
                "Client entertainment".to_string(),
                None,
            ),
            r if r < 0.95 => (
                ExpenseCategory::Training,
                100.0,
                1500.0,
                "Professional development".to_string(),
                None,
            ),
            _ => (
                ExpenseCategory::Other,
                10.0,
                200.0,
                "Miscellaneous expense".to_string(),
                None,
            ),
        }
    }

    /// Pick a policy violation description that fits the line item's category.
    ///
    /// Meals, Travel and Lodging each have their own set of three messages;
    /// every other category shares a generic set.  One message is chosen at
    /// random from the applicable set.
    fn pick_violation(&mut self, item: &ExpenseLineItem) -> String {
        let candidates: &[&str] = match item.category {
            ExpenseCategory::Meals => &[
                "Exceeds daily meal limit",
                "Alcohol included without approval",
                "Missing itemized receipt",
            ],
            ExpenseCategory::Travel => &[
                "Booked outside preferred vendor",
                "Class upgrade not pre-approved",
                "Booking made less than 7 days in advance",
            ],
            ExpenseCategory::Lodging => &[
                "Exceeds nightly rate limit",
                "Extended stay without approval",
                "Non-preferred hotel chain",
            ],
            _ => &[
                "Missing receipt",
                "Insufficient business justification",
                "Exceeds category spending limit",
            ],
        };

        let chosen = self.rng.random_range(0..candidates.len());
        candidates[chosen].to_string()
    }

    /// Get the last day of the month for a given date.
    ///
    /// Computed as the day before the first day of the following month.  If
    /// that date cannot be constructed (the following month lies outside the
    /// range `NaiveDate` can represent), `date` itself is returned, so the
    /// result is never earlier than the input.
    fn month_end(&self, date: NaiveDate) -> NaiveDate {
        let (year, month) = if date.month() == 12 {
            (date.year() + 1, 1)
        } else {
            (date.year(), date.month() + 1)
        };
        // Step back a day only from a successfully built "first of next
        // month"; stepping back from the fallback would yield the day
        // *before* `date`, which is not a month end.
        NaiveDate::from_ymd_opt(year, month, 1)
            .and_then(|first_of_next_month| first_of_next_month.pred_opt())
            .unwrap_or(date)
    }

    /// Return the first day of the month following `date`.
    ///
    /// December rolls over to January of the next year.  If the resulting
    /// date cannot be constructed, `date` is returned unchanged.
    fn next_month_start(&self, date: NaiveDate) -> NaiveDate {
        let (target_year, target_month) = match date.month() {
            12 => (date.year() + 1, 1),
            current => (date.year(), current + 1),
        };
        let first_of_next = NaiveDate::from_ymd_opt(target_year, target_month, 1);
        first_of_next.unwrap_or(date)
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Ten sequential, zero-padded employee IDs: `EMP-0001` through `EMP-0010`.
    fn test_employee_ids() -> Vec<String> {
        let mut ids = Vec::with_capacity(10);
        for n in 1..=10 {
            ids.push(format!("EMP-{:04}", n));
        }
        ids
    }

    /// One month of generation yields well-formed reports whose totals match
    /// their line items.
    #[test]
    fn test_basic_expense_generation() {
        let employees = test_employee_ids();
        let config = ExpenseConfig::default();
        let january_first = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
        let january_last = NaiveDate::from_ymd_opt(2024, 1, 31).unwrap();

        let mut generator = ExpenseReportGenerator::new(42);
        let reports = generator.generate(&employees, january_first, january_last, &config);

        // Each employee submits at most once per month, so a single month can
        // never produce more reports than employees.  (The default submission
        // rate is presumably ~30%, i.e. roughly 3 reports for 10 employees.)
        assert!(!reports.is_empty());
        assert!(
            reports.len() <= employees.len(),
            "Should not exceed employee count for a single month"
        );

        for report in reports.iter() {
            assert!(!report.report_id.is_empty());
            assert!(!report.employee_id.is_empty());
            assert!(report.total_amount > Decimal::ZERO);
            assert!(!report.line_items.is_empty());
            assert!(report.line_items.len() <= 5);

            // The report total must be exactly the sum of its line items.
            let line_sum = report
                .line_items
                .iter()
                .fold(Decimal::ZERO, |running, li| running + li.amount);
            assert_eq!(report.total_amount, line_sum);

            for item in report.line_items.iter() {
                assert!(!item.item_id.is_empty());
                assert!(item.amount > Decimal::ZERO);
            }
        }
    }

    /// Two generators built from the same seed produce matching reports.
    #[test]
    fn test_deterministic_expenses() {
        let employees = test_employee_ids();
        let config = ExpenseConfig::default();
        let period_start = NaiveDate::from_ymd_opt(2024, 3, 1).unwrap();
        let period_end = NaiveDate::from_ymd_opt(2024, 3, 31).unwrap();

        // Every run starts from a fresh generator with the same seed.
        let run_once = || {
            let mut generator = ExpenseReportGenerator::new(42);
            generator.generate(&employees, period_start, period_end, &config)
        };
        let first_run = run_once();
        let second_run = run_once();

        assert_eq!(first_run.len(), second_run.len());
        for (left, right) in first_run.iter().zip(second_run.iter()) {
            assert_eq!(left.report_id, right.report_id);
            assert_eq!(left.employee_id, right.employee_id);
            assert_eq!(left.total_amount, right.total_amount);
            assert_eq!(left.status, right.status);
            assert_eq!(left.line_items.len(), right.line_items.len());
        }
    }

    /// A larger sample contains approved reports, at least one other status,
    /// and at least one policy violation.
    #[test]
    fn test_expense_status_and_violations() {
        // 30 employees over six months give a broad enough sample.
        let employees: Vec<String> = (1..=30).map(|i| format!("EMP-{:04}", i)).collect();
        let period_start = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
        let period_end = NaiveDate::from_ymd_opt(2024, 6, 30).unwrap();
        let config = ExpenseConfig::default();

        let mut generator = ExpenseReportGenerator::new(99);
        let reports = generator.generate(&employees, period_start, period_end, &config);

        assert!(
            reports.len() > 10,
            "Expected multiple reports, got {}",
            reports.len()
        );

        // Number of reports carrying a given status.
        let count_with = |wanted: ExpenseStatus| -> usize {
            reports.iter().filter(|r| r.status == wanted).count()
        };
        let approved = count_with(ExpenseStatus::Approved);
        let paid = count_with(ExpenseStatus::Paid);
        let submitted = count_with(ExpenseStatus::Submitted);
        let rejected = count_with(ExpenseStatus::Rejected);
        let draft = count_with(ExpenseStatus::Draft);

        // Approved is the most likely status, so some must be present.
        assert!(approved > 0, "Expected at least some approved reports");
        // The sample should not consist of approved reports only.
        assert!(
            paid + submitted + rejected + draft > 0,
            "Expected a mix of statuses beyond approved"
        );

        // At least one report somewhere must carry a policy violation.
        let mut total_violations: usize = 0;
        for report in &reports {
            total_violations += report.policy_violations.len();
        }
        assert!(
            total_violations > 0,
            "Expected at least some policy violations across {} reports",
            reports.len()
        );
    }

    /// Attaching a default country pack still yields valid reports.
    #[test]
    fn test_country_pack_does_not_break_generation() {
        let employees = test_employee_ids();
        let config = ExpenseConfig::default();
        let period_start = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
        let period_end = NaiveDate::from_ymd_opt(2024, 1, 31).unwrap();

        let mut generator = ExpenseReportGenerator::new(42);
        // A default country pack should not alter basic generation behaviour.
        generator.set_country_pack(datasynth_core::CountryPack::default());

        let reports = generator.generate(&employees, period_start, period_end, &config);

        assert!(!reports.is_empty());
        reports.iter().for_each(|report| {
            assert!(!report.report_id.is_empty());
            assert!(report.total_amount > Decimal::ZERO);
        });
    }
}