datasynth_eval/statistical/
temporal.rs1use crate::error::{EvalError, EvalResult};
7use chrono::{Datelike, NaiveDate, Weekday};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10
/// Expected month-end activity spike: average daily volume in the last days
/// of a month, expressed as a multiple of the overall daily average.
pub const MONTH_END_SPIKE: f64 = 2.5;
/// Expected quarter-end activity spike (multiple of the overall daily average).
pub const QUARTER_END_SPIKE: f64 = 4.0;
/// Expected year-end activity spike (multiple of the overall daily average).
pub const YEAR_END_SPIKE: f64 = 6.0;
/// Expected fraction of entries posted on weekends (Saturday/Sunday).
pub const WEEKEND_RATIO: f64 = 0.10;
16
/// Results of analyzing the temporal (posting-date) distribution of a set of
/// entries: date range, period-end spikes, weekday pattern, and an overall
/// pass/fail verdict.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemporalAnalysis {
    /// Number of entries analyzed.
    pub sample_size: usize,
    /// Earliest posting date in the sample.
    pub start_date: NaiveDate,
    /// Latest posting date in the sample.
    pub end_date: NaiveDate,
    /// Inclusive number of calendar days between start and end dates.
    pub days_spanned: i64,
    /// Pearson correlation between observed daily volume and the expected
    /// synthetic business pattern (weekday weights + period-end spikes).
    pub pattern_correlation: f64,
    /// Observed month-end daily volume as a multiple of the daily average.
    pub month_end_spike: f64,
    /// Observed quarter-end daily volume as a multiple of the daily average.
    pub quarter_end_spike: f64,
    /// Observed year-end daily volume as a multiple of the daily average.
    pub year_end_spike: f64,
    /// Fraction of entries posted on Saturday or Sunday.
    pub weekend_ratio: f64,
    /// Fraction of entries per weekday name ("Monday".."Sunday").
    pub day_of_week_distribution: HashMap<String, f64>,
    /// Pearson correlation of the observed weekday histogram against
    /// `DAY_WEIGHTS`.
    pub day_of_week_correlation: f64,
    /// Entry counts keyed by calendar month (1-12), aggregated across years.
    pub monthly_distribution: HashMap<u32, usize>,
    /// True when `pattern_correlation >= 0.5` and the weekend ratio is within
    /// 0.15 of `WEEKEND_RATIO`.
    pub passes: bool,
}
47
/// Minimal input record for temporal analysis: a single entry's posting date.
#[derive(Debug, Clone)]
pub struct TemporalEntry {
    /// Calendar date on which the entry was posted.
    pub posting_date: NaiveDate,
}
54
/// Expected relative posting volume per weekday, indexed Monday (0) through
/// Sunday (6) to match `Weekday::num_days_from_monday`: heavier early-week
/// activity, near-zero weekends.
const DAY_WEIGHTS: [f64; 7] = [
    1.3,  // Monday
    1.1,  // Tuesday
    1.0,  // Wednesday
    1.0,  // Thursday
    0.85, // Friday
    0.05, // Saturday
    0.05, // Sunday
];
65
/// Analyzer that evaluates how realistically a set of posting dates follows
/// typical business temporal patterns.
pub struct TemporalAnalyzer {
    // NOTE(review): set by `with_industry_seasonality` but never read by
    // `analyze` in this file — confirm whether seasonality analysis is
    // implemented elsewhere or pending.
    analyze_industry_seasonality: bool,
}
71
72impl TemporalAnalyzer {
73 pub fn new() -> Self {
75 Self {
76 analyze_industry_seasonality: false,
77 }
78 }
79
80 pub fn with_industry_seasonality(mut self) -> Self {
82 self.analyze_industry_seasonality = true;
83 self
84 }
85
86 pub fn analyze(&self, entries: &[TemporalEntry]) -> EvalResult<TemporalAnalysis> {
88 let n = entries.len();
89 if n < 10 {
90 return Err(EvalError::InsufficientData {
91 required: 10,
92 actual: n,
93 });
94 }
95
96 let dates: Vec<NaiveDate> = entries.iter().map(|e| e.posting_date).collect();
98 let start_date = *dates.iter().min().unwrap();
99 let end_date = *dates.iter().max().unwrap();
100 let days_spanned = (end_date - start_date).num_days() + 1;
101
102 let mut daily_counts: HashMap<NaiveDate, usize> = HashMap::new();
104 for entry in entries {
105 *daily_counts.entry(entry.posting_date).or_insert(0) += 1;
106 }
107
108 let avg_daily = n as f64 / days_spanned as f64;
110
111 let month_end_spike = self.calculate_month_end_spike(&daily_counts, avg_daily);
113
114 let quarter_end_spike = self.calculate_quarter_end_spike(&daily_counts, avg_daily);
116
117 let year_end_spike = self.calculate_year_end_spike(&daily_counts, avg_daily);
119
120 let weekend_count = entries
122 .iter()
123 .filter(|e| {
124 let weekday = e.posting_date.weekday();
125 weekday == Weekday::Sat || weekday == Weekday::Sun
126 })
127 .count();
128 let weekend_ratio = weekend_count as f64 / n as f64;
129
130 let mut dow_counts = [0usize; 7];
132 for entry in entries {
133 let idx = entry.posting_date.weekday().num_days_from_monday() as usize;
134 dow_counts[idx] += 1;
135 }
136 let total_dow: usize = dow_counts.iter().sum();
137 let mut day_of_week_distribution = HashMap::new();
138 let weekdays = [
139 "Monday",
140 "Tuesday",
141 "Wednesday",
142 "Thursday",
143 "Friday",
144 "Saturday",
145 "Sunday",
146 ];
147 for (i, name) in weekdays.iter().enumerate() {
148 day_of_week_distribution
149 .insert(name.to_string(), dow_counts[i] as f64 / total_dow as f64);
150 }
151
152 let day_of_week_correlation = self.calculate_dow_correlation(&dow_counts);
154
155 let mut monthly_distribution: HashMap<u32, usize> = HashMap::new();
157 for entry in entries {
158 *monthly_distribution
159 .entry(entry.posting_date.month())
160 .or_insert(0) += 1;
161 }
162
163 let pattern_correlation =
165 self.calculate_pattern_correlation(&daily_counts, start_date, end_date, avg_daily);
166
167 let passes = pattern_correlation >= 0.5 && (weekend_ratio - WEEKEND_RATIO).abs() < 0.15;
169
170 Ok(TemporalAnalysis {
171 sample_size: n,
172 start_date,
173 end_date,
174 days_spanned,
175 pattern_correlation,
176 month_end_spike,
177 quarter_end_spike,
178 year_end_spike,
179 weekend_ratio,
180 day_of_week_distribution,
181 day_of_week_correlation,
182 monthly_distribution,
183 passes,
184 })
185 }
186
187 fn calculate_month_end_spike(
189 &self,
190 daily_counts: &HashMap<NaiveDate, usize>,
191 avg_daily: f64,
192 ) -> f64 {
193 if avg_daily <= 0.0 {
194 return 1.0;
195 }
196
197 let month_end_dates: Vec<&NaiveDate> = daily_counts
198 .keys()
199 .filter(|d| self.is_month_end(**d))
200 .collect();
201
202 if month_end_dates.is_empty() {
203 return 1.0;
204 }
205
206 let month_end_total: usize = month_end_dates
207 .iter()
208 .filter_map(|d| daily_counts.get(*d))
209 .sum();
210 let month_end_avg = month_end_total as f64 / month_end_dates.len() as f64;
211
212 month_end_avg / avg_daily
213 }
214
215 fn calculate_quarter_end_spike(
217 &self,
218 daily_counts: &HashMap<NaiveDate, usize>,
219 avg_daily: f64,
220 ) -> f64 {
221 if avg_daily <= 0.0 {
222 return 1.0;
223 }
224
225 let quarter_end_dates: Vec<&NaiveDate> = daily_counts
226 .keys()
227 .filter(|d| self.is_quarter_end(**d))
228 .collect();
229
230 if quarter_end_dates.is_empty() {
231 return 1.0;
232 }
233
234 let quarter_end_total: usize = quarter_end_dates
235 .iter()
236 .filter_map(|d| daily_counts.get(*d))
237 .sum();
238 let quarter_end_avg = quarter_end_total as f64 / quarter_end_dates.len() as f64;
239
240 quarter_end_avg / avg_daily
241 }
242
243 fn calculate_year_end_spike(
245 &self,
246 daily_counts: &HashMap<NaiveDate, usize>,
247 avg_daily: f64,
248 ) -> f64 {
249 if avg_daily <= 0.0 {
250 return 1.0;
251 }
252
253 let year_end_dates: Vec<&NaiveDate> = daily_counts
254 .keys()
255 .filter(|d| self.is_year_end(**d))
256 .collect();
257
258 if year_end_dates.is_empty() {
259 return 1.0;
260 }
261
262 let year_end_total: usize = year_end_dates
263 .iter()
264 .filter_map(|d| daily_counts.get(*d))
265 .sum();
266 let year_end_avg = year_end_total as f64 / year_end_dates.len() as f64;
267
268 year_end_avg / avg_daily
269 }
270
271 fn is_month_end(&self, date: NaiveDate) -> bool {
273 let next_month = if date.month() == 12 {
274 NaiveDate::from_ymd_opt(date.year() + 1, 1, 1)
275 } else {
276 NaiveDate::from_ymd_opt(date.year(), date.month() + 1, 1)
277 };
278 if let Some(next) = next_month {
279 let days_to_end = (next - date).num_days();
280 days_to_end <= 5
281 } else {
282 false
283 }
284 }
285
286 fn is_quarter_end(&self, date: NaiveDate) -> bool {
288 let quarter_end_months = [3, 6, 9, 12];
289 quarter_end_months.contains(&date.month()) && self.is_month_end(date)
290 }
291
292 fn is_year_end(&self, date: NaiveDate) -> bool {
294 date.month() == 12 && self.is_month_end(date)
295 }
296
297 fn calculate_dow_correlation(&self, observed: &[usize; 7]) -> f64 {
299 let total: usize = observed.iter().sum();
300 if total == 0 {
301 return 0.0;
302 }
303
304 let observed_norm: Vec<f64> = observed.iter().map(|&c| c as f64 / total as f64).collect();
306
307 let total_weight: f64 = DAY_WEIGHTS.iter().sum();
309 let expected_norm: Vec<f64> = DAY_WEIGHTS.iter().map(|&w| w / total_weight).collect();
310
311 let mean_obs = observed_norm.iter().sum::<f64>() / 7.0;
313 let mean_exp = expected_norm.iter().sum::<f64>() / 7.0;
314
315 let numerator: f64 = (0..7)
316 .map(|i| (observed_norm[i] - mean_obs) * (expected_norm[i] - mean_exp))
317 .sum();
318
319 let var_obs: f64 = observed_norm.iter().map(|o| (o - mean_obs).powi(2)).sum();
320 let var_exp: f64 = expected_norm.iter().map(|e| (e - mean_exp).powi(2)).sum();
321
322 let denominator = (var_obs * var_exp).sqrt();
323
324 if denominator > 0.0 {
325 numerator / denominator
326 } else {
327 0.0
328 }
329 }
330
331 fn calculate_pattern_correlation(
333 &self,
334 daily_counts: &HashMap<NaiveDate, usize>,
335 start_date: NaiveDate,
336 end_date: NaiveDate,
337 avg_daily: f64,
338 ) -> f64 {
339 let mut expected: Vec<f64> = Vec::new();
341 let mut observed: Vec<f64> = Vec::new();
342
343 let mut current = start_date;
344 while current <= end_date {
345 let mut multiplier = 1.0;
346
347 let weekday = current.weekday();
349 if weekday == Weekday::Sat || weekday == Weekday::Sun {
350 multiplier *= 0.1;
351 } else {
352 let dow_idx = weekday.num_days_from_monday() as usize;
354 multiplier *= DAY_WEIGHTS[dow_idx] / 1.0;
355 }
356
357 if self.is_month_end(current) {
359 multiplier *= MONTH_END_SPIKE / 2.5;
360 }
361
362 if self.is_year_end(current) {
364 multiplier *= YEAR_END_SPIKE / MONTH_END_SPIKE;
365 } else if self.is_quarter_end(current) {
366 multiplier *= QUARTER_END_SPIKE / MONTH_END_SPIKE;
367 }
368
369 expected.push(avg_daily * multiplier);
370 observed.push(*daily_counts.get(¤t).unwrap_or(&0) as f64);
371
372 current = current.succ_opt().unwrap_or(current);
373 }
374
375 if expected.is_empty() {
377 return 0.0;
378 }
379
380 let n = expected.len() as f64;
381 let mean_exp = expected.iter().sum::<f64>() / n;
382 let mean_obs = observed.iter().sum::<f64>() / n;
383
384 let numerator: f64 = expected
385 .iter()
386 .zip(observed.iter())
387 .map(|(e, o)| (e - mean_exp) * (o - mean_obs))
388 .sum();
389
390 let var_exp: f64 = expected.iter().map(|e| (e - mean_exp).powi(2)).sum();
391 let var_obs: f64 = observed.iter().map(|o| (o - mean_obs).powi(2)).sum();
392
393 let denominator = (var_exp * var_obs).sqrt();
394
395 if denominator > 0.0 {
396 numerator / denominator
397 } else {
398 0.0
399 }
400 }
401}
402
403impl Default for TemporalAnalyzer {
404 fn default() -> Self {
405 Self::new()
406 }
407}
408
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps plain dates into `TemporalEntry` fixtures for the analyzer.
    fn create_entries(dates: Vec<NaiveDate>) -> Vec<TemporalEntry> {
        let mut entries = Vec::with_capacity(dates.len());
        for posting_date in dates {
            entries.push(TemporalEntry { posting_date });
        }
        entries
    }

    #[test]
    fn test_temporal_analysis_basic() {
        // Two full business weeks in January 2024 (Mon 15 - Fri 19, Mon 22 - Fri 26).
        let dates: Vec<NaiveDate> = (15..=19)
            .chain(22..=26)
            .map(|day| NaiveDate::from_ymd_opt(2024, 1, day).unwrap())
            .collect();
        let entries = create_entries(dates);

        let result = TemporalAnalyzer::new().analyze(&entries).unwrap();

        assert_eq!(result.sample_size, 10);
        assert!(!result.day_of_week_distribution.is_empty());
    }

    #[test]
    fn test_weekend_ratio() {
        // Ten consecutive days, so the sample includes at least one weekend.
        let entries: Vec<TemporalEntry> = (1..=10)
            .map(|day| TemporalEntry {
                posting_date: NaiveDate::from_ymd_opt(2024, 1, day).unwrap(),
            })
            .collect();

        let result = TemporalAnalyzer::new().analyze(&entries).unwrap();

        // The ratio is a proportion and must stay in [0, 1].
        assert!(result.weekend_ratio >= 0.0);
        assert!(result.weekend_ratio <= 1.0);
    }

    #[test]
    fn test_insufficient_data() {
        // A single entry is below the 10-entry minimum.
        let only_one = create_entries(vec![NaiveDate::from_ymd_opt(2024, 1, 1).unwrap()]);
        let outcome = TemporalAnalyzer::new().analyze(&only_one);
        assert!(matches!(outcome, Err(EvalError::InsufficientData { .. })));
    }
}