// shodh_memory/ab_testing.rs

// Statistical coefficients require full precision for accuracy
#![allow(clippy::excessive_precision)]

//! A/B Testing Infrastructure for Relevance Scoring
//!
//! Provides a rigorous experimentation framework for comparing different
//! relevance scoring configurations. Supports:
//!
//! - Multiple concurrent experiments
//! - Consistent user assignment (the same user always gets the same variant)
//! - Statistical significance testing (chi-squared, confidence intervals)
//! - Metric tracking (impressions, clicks, success rate, latency)
//! - Automatic winner detection with a configurable significance threshold
//!
//! # Example
//!
//! ```ignore
//! let manager = ABTestManager::new();
//!
//! // Create a test comparing semantic vs entity weight emphasis
//! let test = ABTest::builder("semantic_vs_entity")
//!     .with_control(LearnedWeights::default())
//!     .with_treatment(LearnedWeights {
//!         semantic: 0.5,
//!         entity: 0.25,
//!         ..Default::default()
//!     })
//!     .with_traffic_split(0.5)
//!     .build();
//!
//! manager.create_test(test)?;
//!
//! // Get variant for a user
//! let variant = manager.get_variant("test_id", "user_123")?;
//!
//! // Record metrics
//! manager.record_impression("test_id", "user_123")?;
//! manager.record_click("test_id", "user_123", memory_id)?;
//!
//! // Check results
//! let results = manager.analyze_test("test_id")?;
//! if results.is_significant {
//!     println!("Winner: {:?}", results.winner);
//! }
//! ```

use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::Arc;

use chrono::{DateTime, Duration, Utc};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use uuid::Uuid;

use crate::relevance::LearnedWeights;

// =============================================================================
// CONSTANTS
// =============================================================================

/// Default significance level (p < 0.05)
pub const DEFAULT_SIGNIFICANCE_LEVEL: f64 = 0.05;

/// Minimum sample size (impressions per variant) before statistical
/// analysis is considered valid
pub const MIN_SAMPLE_SIZE: u64 = 100;

/// Default traffic split (50/50 — fraction of traffic sent to treatment)
pub const DEFAULT_TRAFFIC_SPLIT: f32 = 0.5;

/// Chi-squared critical values for different significance levels (df=1)
/// p=0.05 -> 3.841, p=0.01 -> 6.635, p=0.001 -> 10.828
/// NOTE(review): despite the `_0001` suffix, the last constant is the
/// critical value for p=0.001, per the table above — confirm the naming.
const CHI_SQUARED_CRITICAL_005: f64 = 3.841;
const CHI_SQUARED_CRITICAL_001: f64 = 6.635;
const CHI_SQUARED_CRITICAL_0001: f64 = 10.828;

/// Sample Ratio Mismatch threshold (5% deviation triggers warning)
const SRM_THRESHOLD: f64 = 0.05;

/// Minimum effect size (Cohen's h) for practical significance
const MIN_PRACTICAL_EFFECT_SIZE: f64 = 0.1;

// =============================================================================
// ADVANCED STATISTICAL TYPES
// =============================================================================

/// Bayesian analysis results comparing treatment against control.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BayesianAnalysis {
    /// Probability that treatment is better than control (0-1)
    pub prob_treatment_better: f64,
    /// Probability that control is better than treatment (0-1)
    pub prob_control_better: f64,
    /// Expected lift of treatment over control
    pub expected_lift: f64,
    /// Credible interval for treatment effect (95%)
    pub credible_interval: (f64, f64),
    /// Risk of choosing treatment if it's actually worse
    pub risk_treatment: f64,
    /// Risk of choosing control if treatment is actually better
    pub risk_control: f64,
}

/// Effect size metrics for the treatment/control comparison.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EffectSize {
    /// Cohen's h for proportions (0.2 = small, 0.5 = medium, 0.8 = large)
    pub cohens_h: f64,
    /// Interpretation of effect size
    pub interpretation: EffectSizeInterpretation,
    /// Relative risk (treatment rate / control rate)
    pub relative_risk: f64,
    /// Odds ratio
    pub odds_ratio: f64,
    /// Number needed to treat (NNT) - how many users to see one additional success
    pub nnt: f64,
}

/// Qualitative interpretation of an effect size (Cohen's conventions).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum EffectSizeInterpretation {
    Negligible,
    Small,
    Medium,
    Large,
}

128impl std::fmt::Display for EffectSizeInterpretation {
129    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
130        match self {
131            Self::Negligible => write!(f, "negligible"),
132            Self::Small => write!(f, "small"),
133            Self::Medium => write!(f, "medium"),
134            Self::Large => write!(f, "large"),
135        }
136    }
137}
138
/// Sample Ratio Mismatch detection result
///
/// SRM means the observed control/treatment split deviates from the
/// configured traffic split, which usually indicates a data-quality or
/// assignment bug rather than a real effect.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SRMCheck {
    /// Whether SRM is detected (data quality issue)
    pub srm_detected: bool,
    /// Expected ratio based on traffic split
    pub expected_ratio: f64,
    /// Observed ratio
    pub observed_ratio: f64,
    /// Chi-squared statistic for SRM test
    pub chi_squared: f64,
    /// P-value for SRM test
    pub p_value: f64,
    /// Severity of the mismatch
    pub severity: SRMSeverity,
}

/// Severity of sample ratio mismatch
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SRMSeverity {
    None,
    Warning,
    Critical,
}

/// Sequential testing state for valid early stopping
///
/// Tracks alpha spending across repeated interim analyses so that looking
/// at the data multiple times does not inflate the false-positive rate.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SequentialTest {
    /// Current analysis number (1, 2, 3, ...)
    pub analysis_number: u32,
    /// Total planned analyses
    pub planned_analyses: u32,
    /// Alpha spent so far
    pub alpha_spent: f64,
    /// Current significance threshold (adjusted for multiple looks)
    pub current_alpha: f64,
    /// Can we stop early?
    pub can_stop_early: bool,
    /// Reason for stopping (if applicable)
    pub stop_reason: Option<String>,
}

/// Guardrail metric that must not degrade
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GuardrailMetric {
    /// Name of the metric
    pub name: String,
    /// Baseline value (control)
    pub baseline: f64,
    /// Current value (treatment)
    pub current: f64,
    /// Maximum allowed degradation (e.g., 0.05 = 5%)
    pub max_degradation: f64,
    /// Is the guardrail breached?
    pub is_breached: bool,
    /// P-value for degradation test
    pub degradation_p_value: f64,
}

/// Multi-Armed Bandit state for adaptive allocation
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BanditState {
    /// Algorithm type
    pub algorithm: BanditAlgorithm,
    /// Alpha parameter for each arm (successes + 1)
    pub alphas: Vec<f64>,
    /// Beta parameter for each arm (failures + 1)
    pub betas: Vec<f64>,
    /// Current allocation probabilities
    pub allocation_probs: Vec<f64>,
    /// Total reward collected
    pub total_reward: f64,
    /// Regret estimate
    pub estimated_regret: f64,
}

/// Bandit algorithm type
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum BanditAlgorithm {
    /// Thompson Sampling (Bayesian)
    ThompsonSampling,
    /// Upper Confidence Bound
    UCB1,
    /// Epsilon-greedy
    EpsilonGreedy,
}

// =============================================================================
// CORE TYPES
// =============================================================================

/// Variant in an A/B test
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ABTestVariant {
    /// Control group (baseline/existing behavior)
    Control,
    /// Treatment group (new behavior being tested)
    Treatment,
}

239impl ABTestVariant {
240    pub fn as_str(&self) -> &'static str {
241        match self {
242            ABTestVariant::Control => "control",
243            ABTestVariant::Treatment => "treatment",
244        }
245    }
246}
247
/// Status of an A/B test
///
/// Lifecycle: Draft -> Running -> (Paused <-> Running) -> Completed ->
/// Archived. Transitions are enforced by the methods on [`ABTest`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ABTestStatus {
    /// Test is being configured, not yet active
    Draft,
    /// Test is actively running and collecting data
    Running,
    /// Test is paused (no new assignments, still tracking existing)
    Paused,
    /// Test has concluded (winner determined or manually stopped)
    Completed,
    /// Test was archived (historical record)
    Archived,
}

/// Metrics tracked for each variant
///
/// Raw accumulators; derived rates (CTR, success rate, averages) are
/// computed on demand by the methods on this type.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct VariantMetrics {
    /// Number of times this variant was shown
    pub impressions: u64,
    /// Number of times user interacted positively (clicked/used memory)
    pub clicks: u64,
    /// Number of explicit positive feedback signals
    pub positive_feedback: u64,
    /// Number of explicit negative feedback signals
    pub negative_feedback: u64,
    /// Sum of relevance scores for computing average
    pub total_relevance_score: f64,
    /// Sum of latencies in microseconds
    pub total_latency_us: u64,
    /// Number of latency samples
    pub latency_samples: u64,
    /// Unique users in this variant
    pub unique_users: u64,
    /// Memory IDs that received clicks (for analysis)
    /// NOTE(review): grows without bound over a test's lifetime.
    pub clicked_memory_ids: Vec<Uuid>,
}

286impl VariantMetrics {
287    /// Click-through rate (CTR)
288    pub fn ctr(&self) -> f64 {
289        if self.impressions == 0 {
290            0.0
291        } else {
292            self.clicks as f64 / self.impressions as f64
293        }
294    }
295
296    /// Success rate (positive / (positive + negative))
297    pub fn success_rate(&self) -> f64 {
298        let total = self.positive_feedback + self.negative_feedback;
299        if total == 0 {
300            0.0
301        } else {
302            self.positive_feedback as f64 / total as f64
303        }
304    }
305
306    /// Average relevance score
307    pub fn avg_relevance_score(&self) -> f64 {
308        if self.impressions == 0 {
309            0.0
310        } else {
311            self.total_relevance_score / self.impressions as f64
312        }
313    }
314
315    /// Average latency in milliseconds
316    pub fn avg_latency_ms(&self) -> f64 {
317        if self.latency_samples == 0 {
318            0.0
319        } else {
320            (self.total_latency_us as f64 / self.latency_samples as f64) / 1000.0
321        }
322    }
323
324    /// Conversion rate per unique user
325    pub fn conversion_rate(&self) -> f64 {
326        if self.unique_users == 0 {
327            0.0
328        } else {
329            self.clicks as f64 / self.unique_users as f64
330        }
331    }
332}
333
/// Configuration for an A/B test
///
/// Immutable once the test is running; built via [`ABTestBuilder`] or
/// [`Default`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestConfig {
    /// Unique identifier for the test
    pub id: String,
    /// Human-readable name
    pub name: String,
    /// Description of what's being tested
    pub description: String,
    /// Weights for control group
    pub control_weights: LearnedWeights,
    /// Weights for treatment group
    pub treatment_weights: LearnedWeights,
    /// Fraction of traffic to send to treatment (0.0-1.0)
    pub traffic_split: f32,
    /// Significance level for statistical tests (default 0.05)
    pub significance_level: f64,
    /// Minimum impressions per variant before declaring winner
    pub min_impressions: u64,
    /// Maximum duration before auto-completing; `None` = no time limit
    pub max_duration_hours: Option<u64>,
    /// Tags for categorization
    pub tags: Vec<String>,
}

359impl Default for ABTestConfig {
360    fn default() -> Self {
361        Self {
362            id: Uuid::new_v4().to_string(),
363            name: String::new(),
364            description: String::new(),
365            control_weights: LearnedWeights::default(),
366            treatment_weights: LearnedWeights::default(),
367            traffic_split: DEFAULT_TRAFFIC_SPLIT,
368            significance_level: DEFAULT_SIGNIFICANCE_LEVEL,
369            min_impressions: MIN_SAMPLE_SIZE,
370            max_duration_hours: Some(168), // 1 week default
371            tags: Vec::new(),
372        }
373    }
374}
375
/// An A/B test experiment
///
/// Holds the immutable configuration, the lifecycle state/timestamps, the
/// per-variant metric accumulators, and the in-memory user-to-variant map.
///
/// NOTE(review): both `id` and `user_assignments` are `#[serde(skip)]`, so
/// a serde round-trip loses the cached id (left empty/default) and all
/// sticky assignments. Assignment is deterministic by hash, so returned
/// variants stay stable after a reload, but `unique_users` would be
/// re-counted — confirm this is intended.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTest {
    /// Unique test identifier (shortcut to config.id)
    #[serde(skip)]
    pub id: String,
    /// Configuration
    pub config: ABTestConfig,
    /// Current status
    pub status: ABTestStatus,
    /// When the test was created
    pub created_at: DateTime<Utc>,
    /// When the test started running
    pub started_at: Option<DateTime<Utc>>,
    /// When the test completed
    pub completed_at: Option<DateTime<Utc>>,
    /// Metrics for control group
    pub control_metrics: VariantMetrics,
    /// Metrics for treatment group
    pub treatment_metrics: VariantMetrics,
    /// User assignments (user_id -> variant); rebuilt lazily after deserialization
    #[serde(skip)]
    user_assignments: HashMap<String, ABTestVariant>,
}

401impl ABTest {
402    /// Create a new A/B test builder
403    pub fn builder(name: &str) -> ABTestBuilder {
404        ABTestBuilder::new(name)
405    }
406
407    /// Create from config
408    pub fn from_config(config: ABTestConfig) -> Self {
409        let id = config.id.clone();
410        Self {
411            id,
412            config,
413            status: ABTestStatus::Draft,
414            created_at: Utc::now(),
415            started_at: None,
416            completed_at: None,
417            control_metrics: VariantMetrics::default(),
418            treatment_metrics: VariantMetrics::default(),
419            user_assignments: HashMap::new(),
420        }
421    }
422
423    /// Get variant for a user (consistent assignment)
424    pub fn get_variant(&mut self, user_id: &str) -> ABTestVariant {
425        // Check if user already assigned
426        if let Some(&variant) = self.user_assignments.get(user_id) {
427            return variant;
428        }
429
430        // Consistent hashing for new users
431        let variant = self.assign_variant(user_id);
432
433        // Track unique users
434        match variant {
435            ABTestVariant::Control => self.control_metrics.unique_users += 1,
436            ABTestVariant::Treatment => self.treatment_metrics.unique_users += 1,
437        }
438
439        self.user_assignments.insert(user_id.to_string(), variant);
440        variant
441    }
442
443    /// Assign variant using consistent hashing
444    fn assign_variant(&self, user_id: &str) -> ABTestVariant {
445        // Hash user_id + test_id for consistent assignment
446        let mut hasher = std::collections::hash_map::DefaultHasher::new();
447        user_id.hash(&mut hasher);
448        self.config.id.hash(&mut hasher);
449        let hash = hasher.finish();
450
451        // Convert to 0.0-1.0 range
452        let bucket = (hash % 10000) as f32 / 10000.0;
453
454        if bucket < self.config.traffic_split {
455            ABTestVariant::Treatment
456        } else {
457            ABTestVariant::Control
458        }
459    }
460
461    /// Get weights for a variant
462    pub fn get_weights(&self, variant: ABTestVariant) -> &LearnedWeights {
463        match variant {
464            ABTestVariant::Control => &self.config.control_weights,
465            ABTestVariant::Treatment => &self.config.treatment_weights,
466        }
467    }
468
469    /// Get metrics for a variant
470    pub fn get_metrics(&self, variant: ABTestVariant) -> &VariantMetrics {
471        match variant {
472            ABTestVariant::Control => &self.control_metrics,
473            ABTestVariant::Treatment => &self.treatment_metrics,
474        }
475    }
476
477    /// Get mutable metrics for a variant
478    fn get_metrics_mut(&mut self, variant: ABTestVariant) -> &mut VariantMetrics {
479        match variant {
480            ABTestVariant::Control => &mut self.control_metrics,
481            ABTestVariant::Treatment => &mut self.treatment_metrics,
482        }
483    }
484
485    /// Record an impression
486    pub fn record_impression(&mut self, user_id: &str, relevance_score: f64, latency_us: u64) {
487        let variant = self.get_variant(user_id);
488        let metrics = self.get_metrics_mut(variant);
489        metrics.impressions += 1;
490        metrics.total_relevance_score += relevance_score;
491        metrics.total_latency_us += latency_us;
492        metrics.latency_samples += 1;
493    }
494
495    /// Record a click/interaction
496    pub fn record_click(&mut self, user_id: &str, memory_id: Uuid) {
497        let variant = self.get_variant(user_id);
498        let metrics = self.get_metrics_mut(variant);
499        metrics.clicks += 1;
500        metrics.clicked_memory_ids.push(memory_id);
501    }
502
503    /// Record explicit feedback
504    pub fn record_feedback(&mut self, user_id: &str, positive: bool) {
505        let variant = self.get_variant(user_id);
506        let metrics = self.get_metrics_mut(variant);
507        if positive {
508            metrics.positive_feedback += 1;
509        } else {
510            metrics.negative_feedback += 1;
511        }
512    }
513
514    /// Check if test has enough data for analysis
515    pub fn has_sufficient_data(&self) -> bool {
516        self.control_metrics.impressions >= self.config.min_impressions
517            && self.treatment_metrics.impressions >= self.config.min_impressions
518    }
519
520    /// Check if test has exceeded max duration
521    pub fn is_expired(&self) -> bool {
522        if let (Some(started), Some(max_hours)) = (self.started_at, self.config.max_duration_hours)
523        {
524            let elapsed = Utc::now().signed_duration_since(started);
525            elapsed > Duration::hours(max_hours as i64)
526        } else {
527            false
528        }
529    }
530
531    /// Start the test
532    pub fn start(&mut self) {
533        if self.status == ABTestStatus::Draft {
534            self.status = ABTestStatus::Running;
535            self.started_at = Some(Utc::now());
536        }
537    }
538
539    /// Pause the test
540    pub fn pause(&mut self) {
541        if self.status == ABTestStatus::Running {
542            self.status = ABTestStatus::Paused;
543        }
544    }
545
546    /// Resume the test
547    pub fn resume(&mut self) {
548        if self.status == ABTestStatus::Paused {
549            self.status = ABTestStatus::Running;
550        }
551    }
552
553    /// Complete the test
554    pub fn complete(&mut self) {
555        if self.status == ABTestStatus::Running || self.status == ABTestStatus::Paused {
556            self.status = ABTestStatus::Completed;
557            self.completed_at = Some(Utc::now());
558        }
559    }
560
561    /// Archive the test
562    pub fn archive(&mut self) {
563        self.status = ABTestStatus::Archived;
564    }
565}
566
/// Builder for creating A/B tests
///
/// Accumulates an [`ABTestConfig`] via chained `with_*` setters; `build`
/// finalizes it into a `Draft` [`ABTest`].
pub struct ABTestBuilder {
    config: ABTestConfig,
}

572impl ABTestBuilder {
573    pub fn new(name: &str) -> Self {
574        Self {
575            config: ABTestConfig {
576                name: name.to_string(),
577                ..Default::default()
578            },
579        }
580    }
581
582    pub fn with_id(mut self, id: &str) -> Self {
583        self.config.id = id.to_string();
584        self
585    }
586
587    pub fn with_description(mut self, description: &str) -> Self {
588        self.config.description = description.to_string();
589        self
590    }
591
592    pub fn with_control(mut self, weights: LearnedWeights) -> Self {
593        self.config.control_weights = weights;
594        self
595    }
596
597    pub fn with_treatment(mut self, weights: LearnedWeights) -> Self {
598        self.config.treatment_weights = weights;
599        self
600    }
601
602    pub fn with_traffic_split(mut self, split: f32) -> Self {
603        self.config.traffic_split = split.clamp(0.0, 1.0);
604        self
605    }
606
607    pub fn with_significance_level(mut self, level: f64) -> Self {
608        self.config.significance_level = level.clamp(0.001, 0.1);
609        self
610    }
611
612    pub fn with_min_impressions(mut self, min: u64) -> Self {
613        self.config.min_impressions = min.max(MIN_SAMPLE_SIZE);
614        self
615    }
616
617    pub fn with_max_duration_hours(mut self, hours: u64) -> Self {
618        self.config.max_duration_hours = Some(hours);
619        self
620    }
621
622    pub fn with_tags(mut self, tags: Vec<String>) -> Self {
623        self.config.tags = tags;
624        self
625    }
626
627    pub fn build(self) -> ABTest {
628        ABTest::from_config(self.config)
629    }
630}
631
// =============================================================================
// STATISTICAL ANALYSIS
// =============================================================================

/// Results of statistical analysis
///
/// Produced by [`ABTestAnalyzer::analyze`]; a snapshot, not live data.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestResults {
    /// Test ID
    pub test_id: String,
    /// Whether the result is statistically significant
    pub is_significant: bool,
    /// Confidence level achieved (1 - p-value)
    pub confidence_level: f64,
    /// Chi-squared statistic
    pub chi_squared: f64,
    /// P-value
    pub p_value: f64,
    /// Winning variant (if significant)
    pub winner: Option<ABTestVariant>,
    /// Relative improvement of treatment over control, in percent
    /// (negative when treatment underperforms)
    pub relative_improvement: f64,
    /// Control group CTR
    pub control_ctr: f64,
    /// Treatment group CTR
    pub treatment_ctr: f64,
    /// Control group success rate
    pub control_success_rate: f64,
    /// Treatment group success rate
    pub treatment_success_rate: f64,
    /// 95% confidence interval for treatment effect (difference in CTR)
    pub confidence_interval: (f64, f64),
    /// Recommendations based on results
    pub recommendations: Vec<String>,
    /// Analysis timestamp
    pub analyzed_at: DateTime<Utc>,
}

/// Statistical analyzer for A/B tests (stateless; all methods are associated functions)
pub struct ABTestAnalyzer;

672impl ABTestAnalyzer {
    /// Analyze an A/B test and return results
    ///
    /// Runs a two-proportion chi-squared test on click-through rates,
    /// declares a winner only when the p-value clears the configured
    /// significance level AND both variants have reached `min_impressions`,
    /// and attaches a 95% confidence interval for the CTR difference plus
    /// human-readable recommendations.
    pub fn analyze(test: &ABTest) -> ABTestResults {
        let control = &test.control_metrics;
        let treatment = &test.treatment_metrics;

        // Calculate CTRs
        let control_ctr = control.ctr();
        let treatment_ctr = treatment.ctr();

        // Calculate success rates
        let control_success = control.success_rate();
        let treatment_success = treatment.success_rate();

        // Chi-squared test for CTR difference
        let (chi_squared, p_value) = Self::chi_squared_test(
            control.impressions,
            control.clicks,
            treatment.impressions,
            treatment.clicks,
        );

        // Significant only when the p-value clears the threshold AND both
        // arms have enough impressions to trust the test at all.
        let is_significant = p_value < test.config.significance_level
            && control.impressions >= test.config.min_impressions
            && treatment.impressions >= test.config.min_impressions;

        // Winner is whichever arm has the higher CTR, but only when the
        // difference is significant.
        let winner = if is_significant {
            if treatment_ctr > control_ctr {
                Some(ABTestVariant::Treatment)
            } else {
                Some(ABTestVariant::Control)
            }
        } else {
            None
        };

        // Relative improvement in percent; 0 when control CTR is zero to
        // avoid dividing by zero.
        let relative_improvement = if control_ctr > 0.0 {
            (treatment_ctr - control_ctr) / control_ctr * 100.0
        } else {
            0.0
        };

        // Calculate confidence interval for treatment effect
        let confidence_interval = Self::calculate_confidence_interval(
            control.impressions,
            control.clicks,
            treatment.impressions,
            treatment.clicks,
        );

        // Generate recommendations
        let recommendations = Self::generate_recommendations(
            test,
            is_significant,
            winner,
            relative_improvement,
            &confidence_interval,
        );

        ABTestResults {
            test_id: test.config.id.clone(),
            is_significant,
            // NOTE(review): 1 - p is reported as "confidence_level"; this is
            // a common informal reading, not a formal confidence coefficient.
            confidence_level: 1.0 - p_value,
            chi_squared,
            p_value,
            winner,
            relative_improvement,
            control_ctr,
            treatment_ctr,
            control_success_rate: control_success,
            treatment_success_rate: treatment_success,
            confidence_interval,
            recommendations,
            analyzed_at: Utc::now(),
        }
    }

752    /// Chi-squared test for comparing two proportions
753    ///
754    /// Tests H0: p1 = p2 (no difference in conversion rates)
755    /// Returns (chi_squared_statistic, p_value)
756    fn chi_squared_test(n1: u64, x1: u64, n2: u64, x2: u64) -> (f64, f64) {
757        if n1 == 0 || n2 == 0 {
758            return (0.0, 1.0);
759        }
760
761        let n1 = n1 as f64;
762        let x1 = x1 as f64;
763        let n2 = n2 as f64;
764        let x2 = x2 as f64;
765
766        // Pooled proportion
767        let p_pooled = (x1 + x2) / (n1 + n2);
768
769        // Expected values under null hypothesis
770        let e1_success = n1 * p_pooled;
771        let e1_failure = n1 * (1.0 - p_pooled);
772        let e2_success = n2 * p_pooled;
773        let e2_failure = n2 * (1.0 - p_pooled);
774
775        // Avoid division by zero
776        if e1_success < 5.0 || e1_failure < 5.0 || e2_success < 5.0 || e2_failure < 5.0 {
777            // Sample size too small for chi-squared approximation
778            return (0.0, 1.0);
779        }
780
781        // Chi-squared statistic
782        let chi_squared = (x1 - e1_success).powi(2) / e1_success
783            + ((n1 - x1) - e1_failure).powi(2) / e1_failure
784            + (x2 - e2_success).powi(2) / e2_success
785            + ((n2 - x2) - e2_failure).powi(2) / e2_failure;
786
787        // P-value approximation (df=1)
788        let p_value = Self::chi_squared_p_value(chi_squared);
789
790        (chi_squared, p_value)
791    }
792
793    /// Approximate p-value for chi-squared distribution with df=1
794    fn chi_squared_p_value(chi_squared: f64) -> f64 {
795        if chi_squared <= 0.0 {
796            return 1.0;
797        }
798
799        // Use lookup table for common critical values
800        if chi_squared >= CHI_SQUARED_CRITICAL_0001 {
801            0.0001
802        } else if chi_squared >= CHI_SQUARED_CRITICAL_001 {
803            // Interpolate between 0.001 and 0.0001
804            let ratio = (chi_squared - CHI_SQUARED_CRITICAL_001)
805                / (CHI_SQUARED_CRITICAL_0001 - CHI_SQUARED_CRITICAL_001);
806            0.001 - ratio * 0.0009
807        } else if chi_squared >= CHI_SQUARED_CRITICAL_005 {
808            // Interpolate between 0.05 and 0.001
809            let ratio = (chi_squared - CHI_SQUARED_CRITICAL_005)
810                / (CHI_SQUARED_CRITICAL_001 - CHI_SQUARED_CRITICAL_005);
811            0.05 - ratio * 0.049
812        } else {
813            // Below 0.05 significance
814            // Rough approximation: p ≈ exp(-chi_squared/2) for small values
815            0.05 + (1.0 - chi_squared / CHI_SQUARED_CRITICAL_005) * 0.95
816        }
817    }
818
819    /// Calculate 95% confidence interval for treatment effect
820    fn calculate_confidence_interval(n1: u64, x1: u64, n2: u64, x2: u64) -> (f64, f64) {
821        if n1 == 0 || n2 == 0 {
822            return (0.0, 0.0);
823        }
824
825        let p1 = x1 as f64 / n1 as f64;
826        let p2 = x2 as f64 / n2 as f64;
827        let diff = p2 - p1;
828
829        // Standard error of difference
830        let se = ((p1 * (1.0 - p1) / n1 as f64) + (p2 * (1.0 - p2) / n2 as f64)).sqrt();
831
832        // 95% CI: diff ± 1.96 * SE
833        let margin = 1.96 * se;
834        (diff - margin, diff + margin)
835    }
836
837    /// Generate actionable recommendations
838    fn generate_recommendations(
839        test: &ABTest,
840        is_significant: bool,
841        winner: Option<ABTestVariant>,
842        relative_improvement: f64,
843        confidence_interval: &(f64, f64),
844    ) -> Vec<String> {
845        let mut recommendations = Vec::new();
846
847        let total_impressions =
848            test.control_metrics.impressions + test.treatment_metrics.impressions;
849
850        // Check sample size
851        if total_impressions < MIN_SAMPLE_SIZE * 2 {
852            recommendations.push(format!(
853                "Insufficient data: {} impressions collected, need at least {} for reliable analysis",
854                total_impressions,
855                MIN_SAMPLE_SIZE * 2
856            ));
857            return recommendations;
858        }
859
860        if is_significant {
861            match winner {
862                Some(ABTestVariant::Treatment) => {
863                    recommendations.push(format!(
864                        "Treatment variant wins with {relative_improvement:.1}% relative improvement"
865                    ));
866                    recommendations
867                        .push("Recommendation: Deploy treatment weights to production".to_string());
868
869                    if relative_improvement > 20.0 {
870                        recommendations.push(
871                            "Strong effect detected - consider investigating what drove the improvement".to_string()
872                        );
873                    }
874                }
875                Some(ABTestVariant::Control) => {
876                    recommendations.push(format!(
877                        "Control variant wins - treatment performed {:.1}% worse",
878                        -relative_improvement
879                    ));
880                    recommendations.push(
881                        "Recommendation: Keep current weights, do not deploy treatment".to_string(),
882                    );
883                }
884                None => {}
885            }
886        } else {
887            recommendations.push("No statistically significant difference detected".to_string());
888
889            // Check if close to significance
890            let (ci_low, ci_high) = *confidence_interval;
891            if ci_low < 0.0 && ci_high > 0.0 {
892                recommendations.push(
893                    "Confidence interval includes zero - effect may be negligible".to_string(),
894                );
895            }
896
897            // Suggest more data
898            let current_power = Self::estimate_power(test);
899            if current_power < 0.8 {
900                let needed = Self::estimate_needed_sample_size(test, 0.8);
901                recommendations.push(format!(
902                    "Current statistical power: {:.1}%. Need ~{} more impressions per variant for 80% power",
903                    current_power * 100.0,
904                    needed
905                ));
906            }
907        }
908
909        // Check for data quality issues
910        if test.control_metrics.latency_samples > 0 && test.treatment_metrics.latency_samples > 0 {
911            let control_latency = test.control_metrics.avg_latency_ms();
912            let treatment_latency = test.treatment_metrics.avg_latency_ms();
913            let latency_diff = (treatment_latency - control_latency) / control_latency * 100.0;
914
915            if latency_diff.abs() > 20.0 {
916                recommendations.push(format!(
917                    "Warning: Latency differs by {latency_diff:.1}% between variants - may affect user behavior"
918                ));
919            }
920        }
921
922        recommendations
923    }
924
925    /// Estimate statistical power of current test
926    fn estimate_power(test: &ABTest) -> f64 {
927        let n1 = test.control_metrics.impressions as f64;
928        let n2 = test.treatment_metrics.impressions as f64;
929        let p1 = test.control_metrics.ctr();
930        let p2 = test.treatment_metrics.ctr();
931
932        if n1 == 0.0 || n2 == 0.0 || p1 == 0.0 {
933            return 0.0;
934        }
935
936        // Effect size (Cohen's h)
937        let h = 2.0 * ((p2.sqrt()).asin() - (p1.sqrt()).asin());
938
939        // Pooled sample size effect
940        let n_eff = 2.0 / (1.0 / n1 + 1.0 / n2);
941
942        // Approximate power (simplified)
943        let z = h * (n_eff / 2.0).sqrt();
944        let power = 0.5 * (1.0 + Self::erf(z / 2.0_f64.sqrt()));
945
946        power.clamp(0.0, 1.0)
947    }
948
949    /// Error function approximation
950    fn erf(x: f64) -> f64 {
951        let a1 = 0.254829592;
952        let a2 = -0.284496736;
953        let a3 = 1.421413741;
954        let a4 = -1.453152027;
955        let a5 = 1.061405429;
956        let p = 0.3275911;
957
958        let sign = if x < 0.0 { -1.0 } else { 1.0 };
959        let x = x.abs();
960
961        let t = 1.0 / (1.0 + p * x);
962        let y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * (-x * x).exp();
963
964        sign * y
965    }
966
967    /// Estimate sample size needed for desired power
968    fn estimate_needed_sample_size(test: &ABTest, target_power: f64) -> u64 {
969        let p1 = test.control_metrics.ctr();
970        let p2 = test.treatment_metrics.ctr();
971
972        if p1 == 0.0 || p2 == 0.0 || (p2 - p1).abs() < 0.001 {
973            return 10000; // Default large number
974        }
975
976        // Effect size
977        let effect = (p2 - p1).abs();
978        let pooled_p = (p1 + p2) / 2.0;
979        let pooled_var = pooled_p * (1.0 - pooled_p);
980
981        // Z-scores for alpha=0.05 and target power
982        let z_alpha = 1.96;
983        let z_beta = Self::inverse_normal_cdf(target_power);
984
985        // Sample size formula
986        let n = 2.0 * pooled_var * (z_alpha + z_beta).powi(2) / effect.powi(2);
987
988        n.ceil() as u64
989    }
990
991    /// Inverse normal CDF approximation
992    fn inverse_normal_cdf(p: f64) -> f64 {
993        // Rational approximation
994        let a = [
995            -3.969683028665376e+01,
996            2.209460984245205e+02,
997            -2.759285104469687e+02,
998            1.383577518672690e+02,
999            -3.066479806614716e+01,
1000            2.506628277459239e+00,
1001        ];
1002        let b = [
1003            -5.447609879822406e+01,
1004            1.615858368580409e+02,
1005            -1.556989798598866e+02,
1006            6.680131188771972e+01,
1007            -1.328068155288572e+01,
1008        ];
1009        let c = [
1010            -7.784894002430293e-03,
1011            -3.223964580411365e-01,
1012            -2.400758277161838e+00,
1013            -2.549732539343734e+00,
1014            4.374664141464968e+00,
1015            2.938163982698783e+00,
1016        ];
1017        let d = [
1018            7.784695709041462e-03,
1019            3.224671290700398e-01,
1020            2.445134137142996e+00,
1021            3.754408661907416e+00,
1022        ];
1023
1024        let p_low = 0.02425;
1025        let p_high = 1.0 - p_low;
1026
1027        if p < p_low {
1028            let q = (-2.0 * p.ln()).sqrt();
1029            (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5])
1030                / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1.0)
1031        } else if p <= p_high {
1032            let q = p - 0.5;
1033            let r = q * q;
1034            (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q
1035                / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1.0)
1036        } else {
1037            let q = (-2.0 * (1.0 - p).ln()).sqrt();
1038            -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5])
1039                / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1.0)
1040        }
1041    }
1042
1043    // =========================================================================
1044    // ADVANCED ANALYSIS METHODS
1045    // =========================================================================
1046
    /// Perform Bayesian analysis using Beta-Binomial model
    ///
    /// Returns probability that treatment is better, expected lift, and credible intervals.
    ///
    /// Each arm's CTR gets a Beta posterior under a Jeffreys prior, and a
    /// 10,000-draw Monte Carlo simulation compares the two. The RNG seed is
    /// fixed, so the analysis is deterministic for a given set of metrics.
    pub fn bayesian_analysis(test: &ABTest) -> BayesianAnalysis {
        let control = &test.control_metrics;
        let treatment = &test.treatment_metrics;

        // Beta posterior parameters (Jeffreys prior: alpha=0.5, beta=0.5);
        // clicks count as successes, non-clicked impressions as failures
        let alpha_c = control.clicks as f64 + 0.5;
        let beta_c = (control.impressions as f64 - control.clicks as f64) + 0.5;
        let alpha_t = treatment.clicks as f64 + 0.5;
        let beta_t = (treatment.impressions as f64 - treatment.clicks as f64) + 0.5;

        // Monte Carlo simulation for probability of being better
        let n_samples = 10000;
        let mut treatment_wins = 0;
        let mut lift_sum = 0.0;
        let mut lifts = Vec::with_capacity(n_samples);

        // Deterministic pseudo-randomness via a linear congruential generator;
        // the fixed seed keeps repeated analyses reproducible
        let mut seed = 12345u64;
        let lcg = |s: &mut u64| -> f64 {
            *s = s.wrapping_mul(6364136223846793005).wrapping_add(1);
            (*s as f64) / (u64::MAX as f64)
        };

        for _ in 0..n_samples {
            // Draw one plausible CTR per arm from its Beta posterior
            // (gamma-ratio sampling inside beta_sample)
            let p_c = Self::beta_sample(alpha_c, beta_c, &mut seed, &lcg);
            let p_t = Self::beta_sample(alpha_t, beta_t, &mut seed, &lcg);

            if p_t > p_c {
                treatment_wins += 1;
            }

            // Relative lift of treatment over control for this draw
            let lift = if p_c > 0.0 { (p_t - p_c) / p_c } else { 0.0 };
            lift_sum += lift;
            lifts.push(lift);
        }

        // Sort so quantiles can be read off by index below
        lifts.sort_by(|a, b| a.total_cmp(b));

        let prob_treatment_better = treatment_wins as f64 / n_samples as f64;
        let expected_lift = lift_sum / n_samples as f64;

        // 95% credible interval (2.5th and 97.5th percentiles of lift draws)
        let ci_low = lifts[(n_samples as f64 * 0.025) as usize];
        let ci_high = lifts[(n_samples as f64 * 0.975) as usize];

        // Expected loss (risk): average magnitude of the downside if the
        // corresponding variant is chosen and turns out to be the wrong call
        let risk_treatment =
            lifts.iter().filter(|&&l| l < 0.0).map(|l| -l).sum::<f64>() / n_samples as f64;
        let risk_control = lifts.iter().filter(|&&l| l > 0.0).sum::<f64>() / n_samples as f64;

        BayesianAnalysis {
            prob_treatment_better,
            prob_control_better: 1.0 - prob_treatment_better,
            expected_lift,
            credible_interval: (ci_low, ci_high),
            risk_treatment,
            risk_control,
        }
    }
1110
1111    /// Sample from Beta distribution using inverse transform sampling
1112    fn beta_sample(alpha: f64, beta: f64, seed: &mut u64, lcg: &impl Fn(&mut u64) -> f64) -> f64 {
1113        // Use ratio of gamma samples for Beta
1114        let gamma_a = Self::gamma_sample(alpha, seed, lcg);
1115        let gamma_b = Self::gamma_sample(beta, seed, lcg);
1116        gamma_a / (gamma_a + gamma_b)
1117    }
1118
    /// Sample from Gamma distribution using Marsaglia and Tsang's method
    ///
    /// Expects `alpha > 0`. Consumes values from `lcg` (advancing `seed`),
    /// so repeated calls from the same starting seed are deterministic.
    fn gamma_sample(alpha: f64, seed: &mut u64, lcg: &impl Fn(&mut u64) -> f64) -> f64 {
        if alpha < 1.0 {
            // Boost identity: Gamma(alpha) = Gamma(alpha + 1) * U^(1/alpha)
            return Self::gamma_sample(alpha + 1.0, seed, lcg) * lcg(seed).powf(1.0 / alpha);
        }

        // Marsaglia-Tsang constants for alpha >= 1
        let d = alpha - 1.0 / 3.0;
        let c = 1.0 / (9.0 * d).sqrt();

        // Rejection loop: cheap squeeze test first, exact log test second
        loop {
            let x = Self::normal_sample(seed, lcg);
            let v = (1.0 + c * x).powi(3);
            if v > 0.0 {
                let u = lcg(seed);
                if u < 1.0 - 0.0331 * x.powi(4) || u.ln() < 0.5 * x.powi(2) + d * (1.0 - v + v.ln())
                {
                    return d * v;
                }
            }
        }
    }
1141
1142    /// Sample from standard normal using Box-Muller transform
1143    fn normal_sample(seed: &mut u64, lcg: &impl Fn(&mut u64) -> f64) -> f64 {
1144        let u1 = lcg(seed);
1145        let u2 = lcg(seed);
1146        (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos()
1147    }
1148
1149    /// Calculate effect size metrics (Cohen's h, relative risk, odds ratio, NNT)
1150    pub fn calculate_effect_size(test: &ABTest) -> EffectSize {
1151        let p1 = test.control_metrics.ctr();
1152        let p2 = test.treatment_metrics.ctr();
1153
1154        // Cohen's h for proportions
1155        let phi1 = 2.0 * p1.sqrt().asin();
1156        let phi2 = 2.0 * p2.sqrt().asin();
1157        let cohens_h = (phi2 - phi1).abs();
1158
1159        // Interpretation based on Cohen's conventions
1160        let interpretation = if cohens_h < 0.2 {
1161            EffectSizeInterpretation::Negligible
1162        } else if cohens_h < 0.5 {
1163            EffectSizeInterpretation::Small
1164        } else if cohens_h < 0.8 {
1165            EffectSizeInterpretation::Medium
1166        } else {
1167            EffectSizeInterpretation::Large
1168        };
1169
1170        // Relative risk
1171        let relative_risk = if p1 > 0.0 { p2 / p1 } else { 0.0 };
1172
1173        // Odds ratio
1174        let odds_c = if p1 < 1.0 {
1175            p1 / (1.0 - p1)
1176        } else {
1177            f64::INFINITY
1178        };
1179        let odds_t = if p2 < 1.0 {
1180            p2 / (1.0 - p2)
1181        } else {
1182            f64::INFINITY
1183        };
1184        let odds_ratio = if odds_c > 0.0 && odds_c.is_finite() {
1185            odds_t / odds_c
1186        } else {
1187            0.0
1188        };
1189
1190        // Number needed to treat
1191        let ard = (p2 - p1).abs(); // Absolute risk difference
1192        let nnt = if ard > 0.0 { 1.0 / ard } else { f64::INFINITY };
1193
1194        EffectSize {
1195            cohens_h,
1196            interpretation,
1197            relative_risk,
1198            odds_ratio,
1199            nnt,
1200        }
1201    }
1202
1203    /// Check for Sample Ratio Mismatch (data quality issue)
1204    ///
1205    /// SRM occurs when the observed traffic split differs from expected,
1206    /// indicating a bug in randomization or data collection
1207    pub fn check_srm(test: &ABTest) -> SRMCheck {
1208        let expected_ratio = test.config.traffic_split as f64;
1209        let total = test.control_metrics.impressions + test.treatment_metrics.impressions;
1210
1211        if total == 0 {
1212            return SRMCheck {
1213                srm_detected: false,
1214                expected_ratio,
1215                observed_ratio: 0.5,
1216                chi_squared: 0.0,
1217                p_value: 1.0,
1218                severity: SRMSeverity::None,
1219            };
1220        }
1221
1222        let observed_ratio = test.treatment_metrics.impressions as f64 / total as f64;
1223
1224        // Expected counts
1225        let expected_control = total as f64 * (1.0 - expected_ratio);
1226        let expected_treatment = total as f64 * expected_ratio;
1227
1228        // Chi-squared test for SRM
1229        let chi_sq = (test.control_metrics.impressions as f64 - expected_control).powi(2)
1230            / expected_control
1231            + (test.treatment_metrics.impressions as f64 - expected_treatment).powi(2)
1232                / expected_treatment;
1233
1234        let p_value = Self::chi_squared_p_value(chi_sq);
1235
1236        // Determine severity
1237        let deviation = (observed_ratio - expected_ratio).abs();
1238        let severity = if p_value > 0.01 {
1239            SRMSeverity::None
1240        } else if deviation < SRM_THRESHOLD {
1241            SRMSeverity::Warning
1242        } else {
1243            SRMSeverity::Critical
1244        };
1245
1246        SRMCheck {
1247            srm_detected: p_value < 0.01,
1248            expected_ratio,
1249            observed_ratio,
1250            chi_squared: chi_sq,
1251            p_value,
1252            severity,
1253        }
1254    }
1255
    /// Sequential testing with O'Brien-Fleming alpha spending
    ///
    /// Allows valid early stopping while controlling Type I error.
    ///
    /// `analysis_number` is the 1-based index of the current interim look and
    /// `planned_analyses` the total number of planned looks.
    /// NOTE(review): both are assumed to be >= 1 — a zero value divides by
    /// zero below; confirm callers uphold this.
    pub fn sequential_analysis(
        test: &ABTest,
        analysis_number: u32,
        planned_analyses: u32,
    ) -> SequentialTest {
        // Information fraction: how far through the planned looks we are
        let fraction = analysis_number as f64 / planned_analyses as f64;

        // O'Brien-Fleming alpha spending function
        // Spends very little alpha early, more as test progresses
        let alpha = test.config.significance_level;
        let alpha_spent = 2.0
            * (1.0
                - Self::normal_cdf(Self::inverse_normal_cdf(1.0 - alpha / 2.0) / fraction.sqrt()));

        // Current significance threshold
        // NOTE(review): dividing spent alpha evenly across looks is a
        // simplification of the exact O'Brien-Fleming boundary
        let current_alpha = alpha_spent / analysis_number as f64;

        // Perform test at current threshold
        let (_, p_value) = Self::chi_squared_test(
            test.control_metrics.impressions,
            test.control_metrics.clicks,
            test.treatment_metrics.impressions,
            test.treatment_metrics.clicks,
        );

        // Early stop requires significance at the adjusted threshold plus a
        // minimum amount of data (half the configured minimum per arm)
        let can_stop_early = p_value < current_alpha
            && test.control_metrics.impressions >= test.config.min_impressions / 2
            && test.treatment_metrics.impressions >= test.config.min_impressions / 2;

        // Label the stop as futility (negligible effect) or efficacy
        let stop_reason = if can_stop_early {
            let effect = Self::calculate_effect_size(test);
            if effect.interpretation == EffectSizeInterpretation::Negligible {
                Some("Futility: Effect size too small to be practically significant".to_string())
            } else {
                Some(format!(
                    "Efficacy: Significant result with {} effect",
                    effect.interpretation
                ))
            }
        } else {
            None
        };

        SequentialTest {
            analysis_number,
            planned_analyses,
            alpha_spent,
            current_alpha,
            can_stop_early,
            stop_reason,
        }
    }
1311
1312    /// Normal CDF approximation
1313    fn normal_cdf(x: f64) -> f64 {
1314        0.5 * (1.0 + Self::erf(x / 2.0_f64.sqrt()))
1315    }
1316
    /// Comprehensive analysis combining all methods
    ///
    /// Returns actionable insights focused on what matters for users:
    /// - Should we ship this change?
    /// - Is the effect meaningful (not just statistically significant)?
    /// - Are there data quality issues?
    /// - What's the risk of making the wrong decision?
    ///
    /// NOTE(review): the sequential analysis is evaluated at a fixed look
    /// 1 of 5 — confirm this matches the intended interim-analysis schedule.
    pub fn comprehensive_analysis(test: &ABTest) -> ComprehensiveAnalysis {
        // Run every analysis method on the same snapshot of the test
        let frequentist = Self::analyze(test);
        let bayesian = Self::bayesian_analysis(test);
        let effect_size = Self::calculate_effect_size(test);
        let srm = Self::check_srm(test);
        let sequential = Self::sequential_analysis(test, 1, 5);

        // Decision logic: combine statistical and practical significance
        let is_practically_significant = effect_size.cohens_h >= MIN_PRACTICAL_EFFECT_SIZE;
        let has_data_quality_issues = srm.srm_detected;
        let high_confidence =
            bayesian.prob_treatment_better > 0.95 || bayesian.prob_control_better > 0.95;
        let low_risk = bayesian.risk_treatment < 0.01 || bayesian.risk_control < 0.01;

        // Ship decision: every gate must pass AND treatment must be the winner
        let should_ship = frequentist.is_significant
            && is_practically_significant
            && !has_data_quality_issues
            && high_confidence
            && low_risk
            && frequentist.winner == Some(ABTestVariant::Treatment);

        // Generate user-focused insights
        let mut insights = Vec::new();

        // Primary insight: ship / don't ship / keep testing
        if should_ship {
            insights.push(format!(
                "✅ SHIP IT: Treatment is {:.1}% better with {:.1}% confidence and {} effect size",
                bayesian.expected_lift * 100.0,
                bayesian.prob_treatment_better * 100.0,
                effect_size.interpretation
            ));
        } else if frequentist.winner == Some(ABTestVariant::Control) && frequentist.is_significant {
            insights.push(format!(
                "❌ DO NOT SHIP: Control is {:.1}% better. Treatment would hurt users.",
                -bayesian.expected_lift * 100.0
            ));
        } else {
            insights.push("⏳ KEEP TESTING: Not enough evidence to make a decision".to_string());
        }

        // Explain why the decision gates failed, where they did
        if !frequentist.is_significant {
            insights.push(format!(
                "📊 p-value = {:.4} (need < {:.2})",
                frequentist.p_value, test.config.significance_level
            ));
        }

        if !is_practically_significant {
            insights.push(format!(
                "📏 Effect is {} (Cohen's h = {:.3}) - may not matter to users",
                effect_size.interpretation, effect_size.cohens_h
            ));
        }

        if has_data_quality_issues {
            insights.push(format!(
                "⚠️ DATA QUALITY: Sample ratio mismatch detected ({:.1}% vs expected {:.1}%)",
                srm.observed_ratio * 100.0,
                srm.expected_ratio * 100.0
            ));
        }

        // Risk assessment (expected loss from shipping the treatment)
        if bayesian.risk_treatment > 0.01 {
            insights.push(format!(
                "🎲 Risk if shipping treatment: {:.2}% expected loss",
                bayesian.risk_treatment * 100.0
            ));
        }

        // User impact, expressed as number-needed-to-treat
        if effect_size.nnt.is_finite() && effect_size.nnt < 1000.0 {
            insights.push(format!(
                "👥 Impact: 1 in {:.0} users will benefit from this change",
                effect_size.nnt
            ));
        }

        ComprehensiveAnalysis {
            frequentist,
            bayesian,
            effect_size,
            srm,
            sequential,
            should_ship,
            is_practically_significant,
            insights,
        }
    }
1416}
1417
/// Comprehensive analysis result combining all statistical methods
///
/// Produced by [`ABTestAnalyzer::comprehensive_analysis`]; bundles the
/// frequentist, Bayesian, effect-size, SRM, and sequential views together
/// with a single ship/no-ship recommendation and human-readable insights.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComprehensiveAnalysis {
    /// Frequentist analysis results (p-value, significance, winner)
    pub frequentist: ABTestResults,
    /// Bayesian analysis results (win probability, lift, credible interval, risk)
    pub bayesian: BayesianAnalysis,
    /// Effect size metrics (Cohen's h, relative risk, odds ratio, NNT)
    pub effect_size: EffectSize,
    /// Sample ratio mismatch check
    pub srm: SRMCheck,
    /// Sequential testing state
    pub sequential: SequentialTest,
    /// Final recommendation: should we ship?
    pub should_ship: bool,
    /// Is the effect practically significant (not just statistically)?
    pub is_practically_significant: bool,
    /// User-focused insights and recommendations
    pub insights: Vec<String>,
}
1438
1439// =============================================================================
1440// TEST MANAGER
1441// =============================================================================
1442
/// Manager for multiple A/B tests
///
/// All state lives behind `parking_lot` `RwLock`s, so every method takes
/// `&self`; the manager can be shared across threads (e.g. inside an `Arc`).
pub struct ABTestManager {
    /// Active tests by ID
    tests: Arc<RwLock<HashMap<String, ABTest>>>,
    /// Archived tests (for historical analysis)
    archived: Arc<RwLock<Vec<ABTest>>>,
}
1450
impl Default for ABTestManager {
    /// Equivalent to [`ABTestManager::new`]: an empty manager.
    fn default() -> Self {
        Self::new()
    }
}
1456
1457impl ABTestManager {
1458    /// Create a new test manager
1459    pub fn new() -> Self {
1460        Self {
1461            tests: Arc::new(RwLock::new(HashMap::new())),
1462            archived: Arc::new(RwLock::new(Vec::new())),
1463        }
1464    }
1465
1466    /// Create a new A/B test
1467    pub fn create_test(&self, test: ABTest) -> Result<String, ABTestError> {
1468        let id = test.config.id.clone();
1469
1470        let mut tests = self.tests.write();
1471        if tests.contains_key(&id) {
1472            return Err(ABTestError::TestAlreadyExists(id));
1473        }
1474
1475        tests.insert(id.clone(), test);
1476        Ok(id)
1477    }
1478
1479    /// Get a test by ID
1480    pub fn get_test(&self, test_id: &str) -> Option<ABTest> {
1481        self.tests.read().get(test_id).cloned()
1482    }
1483
1484    /// List all active tests
1485    pub fn list_tests(&self) -> Vec<ABTest> {
1486        self.tests.read().values().cloned().collect()
1487    }
1488
1489    /// List tests by status
1490    pub fn list_tests_by_status(&self, status: ABTestStatus) -> Vec<ABTest> {
1491        self.tests
1492            .read()
1493            .values()
1494            .filter(|t| t.status == status)
1495            .cloned()
1496            .collect()
1497    }
1498
1499    /// Start a test
1500    pub fn start_test(&self, test_id: &str) -> Result<(), ABTestError> {
1501        let mut tests = self.tests.write();
1502        let test = tests
1503            .get_mut(test_id)
1504            .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1505
1506        if test.status != ABTestStatus::Draft {
1507            return Err(ABTestError::InvalidState(format!(
1508                "Cannot start test in {:?} state",
1509                test.status
1510            )));
1511        }
1512
1513        test.start();
1514        Ok(())
1515    }
1516
1517    /// Pause a test
1518    pub fn pause_test(&self, test_id: &str) -> Result<(), ABTestError> {
1519        let mut tests = self.tests.write();
1520        let test = tests
1521            .get_mut(test_id)
1522            .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1523
1524        test.pause();
1525        Ok(())
1526    }
1527
1528    /// Resume a test
1529    pub fn resume_test(&self, test_id: &str) -> Result<(), ABTestError> {
1530        let mut tests = self.tests.write();
1531        let test = tests
1532            .get_mut(test_id)
1533            .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1534
1535        test.resume();
1536        Ok(())
1537    }
1538
1539    /// Complete a test
1540    pub fn complete_test(&self, test_id: &str) -> Result<ABTestResults, ABTestError> {
1541        let results = {
1542            let tests = self.tests.read();
1543            let test = tests
1544                .get(test_id)
1545                .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1546
1547            ABTestAnalyzer::analyze(test)
1548        };
1549
1550        let mut tests = self.tests.write();
1551        if let Some(test) = tests.get_mut(test_id) {
1552            test.complete();
1553        }
1554
1555        Ok(results)
1556    }
1557
1558    /// Archive a test (move to archived storage)
1559    pub fn archive_test(&self, test_id: &str) -> Result<(), ABTestError> {
1560        let mut tests = self.tests.write();
1561        let mut test = tests
1562            .remove(test_id)
1563            .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1564
1565        test.archive();
1566        self.archived.write().push(test);
1567
1568        Ok(())
1569    }
1570
1571    /// Delete a test (permanent)
1572    pub fn delete_test(&self, test_id: &str) -> Result<(), ABTestError> {
1573        let mut tests = self.tests.write();
1574        tests
1575            .remove(test_id)
1576            .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1577        Ok(())
1578    }
1579
1580    /// Get variant for a user in a specific test
1581    pub fn get_variant(&self, test_id: &str, user_id: &str) -> Result<ABTestVariant, ABTestError> {
1582        let mut tests = self.tests.write();
1583        let test = tests
1584            .get_mut(test_id)
1585            .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1586
1587        if test.status != ABTestStatus::Running {
1588            return Err(ABTestError::TestNotRunning(test_id.to_string()));
1589        }
1590
1591        Ok(test.get_variant(user_id))
1592    }
1593
1594    /// Get weights for a user (handles test assignment)
1595    pub fn get_weights_for_user(
1596        &self,
1597        test_id: &str,
1598        user_id: &str,
1599    ) -> Result<LearnedWeights, ABTestError> {
1600        let mut tests = self.tests.write();
1601        let test = tests
1602            .get_mut(test_id)
1603            .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1604
1605        if test.status != ABTestStatus::Running {
1606            return Err(ABTestError::TestNotRunning(test_id.to_string()));
1607        }
1608
1609        let variant = test.get_variant(user_id);
1610        Ok(test.get_weights(variant).clone())
1611    }
1612
1613    /// Record an impression
1614    pub fn record_impression(
1615        &self,
1616        test_id: &str,
1617        user_id: &str,
1618        relevance_score: f64,
1619        latency_us: u64,
1620    ) -> Result<(), ABTestError> {
1621        let mut tests = self.tests.write();
1622        let test = tests
1623            .get_mut(test_id)
1624            .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1625
1626        if test.status != ABTestStatus::Running {
1627            return Err(ABTestError::TestNotRunning(test_id.to_string()));
1628        }
1629
1630        test.record_impression(user_id, relevance_score, latency_us);
1631        Ok(())
1632    }
1633
1634    /// Record a click
1635    pub fn record_click(
1636        &self,
1637        test_id: &str,
1638        user_id: &str,
1639        memory_id: Uuid,
1640    ) -> Result<(), ABTestError> {
1641        let mut tests = self.tests.write();
1642        let test = tests
1643            .get_mut(test_id)
1644            .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1645
1646        if test.status != ABTestStatus::Running {
1647            return Err(ABTestError::TestNotRunning(test_id.to_string()));
1648        }
1649
1650        test.record_click(user_id, memory_id);
1651        Ok(())
1652    }
1653
1654    /// Record explicit feedback
1655    pub fn record_feedback(
1656        &self,
1657        test_id: &str,
1658        user_id: &str,
1659        positive: bool,
1660    ) -> Result<(), ABTestError> {
1661        let mut tests = self.tests.write();
1662        let test = tests
1663            .get_mut(test_id)
1664            .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1665
1666        if test.status != ABTestStatus::Running {
1667            return Err(ABTestError::TestNotRunning(test_id.to_string()));
1668        }
1669
1670        test.record_feedback(user_id, positive);
1671        Ok(())
1672    }
1673
1674    /// Analyze a test
1675    pub fn analyze_test(&self, test_id: &str) -> Result<ABTestResults, ABTestError> {
1676        let tests = self.tests.read();
1677        let test = tests
1678            .get(test_id)
1679            .ok_or_else(|| ABTestError::TestNotFound(test_id.to_string()))?;
1680
1681        Ok(ABTestAnalyzer::analyze(test))
1682    }
1683
1684    /// Get all archived tests
1685    pub fn list_archived(&self) -> Vec<ABTest> {
1686        self.archived.read().clone()
1687    }
1688
1689    /// Check and auto-complete expired tests
1690    pub fn check_expired_tests(&self) -> Vec<String> {
1691        let mut expired = Vec::new();
1692
1693        let mut tests = self.tests.write();
1694        for (id, test) in tests.iter_mut() {
1695            if test.status == ABTestStatus::Running && test.is_expired() {
1696                test.complete();
1697                expired.push(id.clone());
1698            }
1699        }
1700
1701        expired
1702    }
1703
1704    /// Get summary of all tests
1705    pub fn summary(&self) -> ABTestManagerSummary {
1706        let tests = self.tests.read();
1707        let archived = self.archived.read();
1708
1709        let mut draft = 0;
1710        let mut running = 0;
1711        let mut paused = 0;
1712        let mut completed = 0;
1713
1714        for test in tests.values() {
1715            match test.status {
1716                ABTestStatus::Draft => draft += 1,
1717                ABTestStatus::Running => running += 1,
1718                ABTestStatus::Paused => paused += 1,
1719                ABTestStatus::Completed => completed += 1,
1720                ABTestStatus::Archived => {}
1721            }
1722        }
1723
1724        ABTestManagerSummary {
1725            total_active: tests.len(),
1726            draft,
1727            running,
1728            paused,
1729            completed,
1730            archived: archived.len(),
1731        }
1732    }
1733}
1734
/// Summary of test manager state
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ABTestManagerSummary {
    /// Number of non-archived tests currently held by the manager
    pub total_active: usize,
    /// Tests created but not yet started
    pub draft: usize,
    /// Tests currently collecting data
    pub running: usize,
    /// Tests temporarily paused
    pub paused: usize,
    /// Tests completed but not yet archived
    pub completed: usize,
    /// Tests moved to archived storage
    pub archived: usize,
}
1745
1746// =============================================================================
1747// ERRORS
1748// =============================================================================
1749
/// Errors from A/B testing operations
#[derive(Debug, Clone, thiserror::Error)]
pub enum ABTestError {
    /// No test with the given id is registered with the manager.
    #[error("Test not found: {0}")]
    TestNotFound(String),

    /// A test with the given id already exists; ids must be unique.
    #[error("Test already exists: {0}")]
    TestAlreadyExists(String),

    /// The operation requires the test to be in the `Running` state.
    #[error("Test is not running: {0}")]
    TestNotRunning(String),

    /// The requested state transition is invalid for the test's current state.
    #[error("Invalid state: {0}")]
    InvalidState(String),

    /// Not enough recorded data to perform the requested analysis.
    #[error("Insufficient data for analysis")]
    InsufficientData,
}
1768
1769// =============================================================================
1770// TESTS
1771// =============================================================================
1772
1773#[cfg(test)]
1774mod tests {
1775    use super::*;
1776
1777    #[test]
1778    fn test_variant_assignment_consistency() {
1779        let mut test = ABTest::builder("test").with_traffic_split(0.5).build();
1780
1781        // Same user should always get same variant
1782        let user = "user_123";
1783        let variant1 = test.get_variant(user);
1784        let variant2 = test.get_variant(user);
1785        let variant3 = test.get_variant(user);
1786
1787        assert_eq!(variant1, variant2);
1788        assert_eq!(variant2, variant3);
1789    }
1790
1791    #[test]
1792    fn test_traffic_split() {
1793        let mut test = ABTest::builder("test").with_traffic_split(0.5).build();
1794
1795        let mut _control_count = 0;
1796        let mut treatment_count = 0;
1797
1798        // Assign many users
1799        for i in 0..1000 {
1800            let user = format!("user_{i}");
1801            match test.get_variant(&user) {
1802                ABTestVariant::Control => _control_count += 1,
1803                ABTestVariant::Treatment => treatment_count += 1,
1804            }
1805        }
1806
1807        // Should be roughly 50/50 (within 10% tolerance)
1808        let ratio = treatment_count as f64 / 1000.0;
1809        assert!(ratio > 0.4 && ratio < 0.6, "Ratio was {ratio}");
1810    }
1811
1812    #[test]
1813    fn test_metrics_tracking() {
1814        let mut test = ABTest::builder("test").build();
1815        test.start();
1816
1817        // Record some metrics
1818        test.record_impression("user_1", 0.8, 5000);
1819        test.record_impression("user_1", 0.7, 4000);
1820        test.record_click("user_1", Uuid::new_v4());
1821        test.record_feedback("user_1", true);
1822
1823        let variant = test.get_variant("user_1");
1824        let metrics = test.get_metrics(variant);
1825
1826        assert_eq!(metrics.impressions, 2);
1827        assert_eq!(metrics.clicks, 1);
1828        assert_eq!(metrics.positive_feedback, 1);
1829        assert_eq!(metrics.unique_users, 1);
1830        assert!((metrics.ctr() - 0.5).abs() < 0.001);
1831    }
1832
1833    #[test]
1834    fn test_chi_squared_significant() {
1835        // Clear difference: 10% vs 20% CTR with large sample
1836        let (chi_sq, p_value) = ABTestAnalyzer::chi_squared_test(
1837            1000, 100, // Control: 10% CTR
1838            1000, 200, // Treatment: 20% CTR
1839        );
1840
1841        assert!(chi_sq > CHI_SQUARED_CRITICAL_005);
1842        assert!(p_value < 0.05);
1843    }
1844
1845    #[test]
1846    fn test_chi_squared_not_significant() {
1847        // Small difference with small sample
1848        let (chi_sq, p_value) = ABTestAnalyzer::chi_squared_test(
1849            50, 5, // Control: 10% CTR
1850            50, 6, // Treatment: 12% CTR
1851        );
1852
1853        // Should not be significant (sample too small)
1854        assert!(p_value > 0.05 || chi_sq < CHI_SQUARED_CRITICAL_005);
1855    }
1856
1857    #[test]
1858    fn test_confidence_interval() {
1859        let (low, high) = ABTestAnalyzer::calculate_confidence_interval(
1860            1000, 100, // 10% CTR
1861            1000, 150, // 15% CTR
1862        );
1863
1864        // Difference is 5%, CI should contain it
1865        assert!(low < 0.05);
1866        assert!(high > 0.05);
1867        // And CI should not include 0 (significant difference)
1868        assert!(low > 0.0 || high < 0.0 || (low < 0.0 && high > 0.0));
1869    }
1870
1871    #[test]
1872    fn test_manager_lifecycle() {
1873        let manager = ABTestManager::new();
1874
1875        // Create test
1876        let test = ABTest::builder("test_lifecycle")
1877            .with_description("Test lifecycle management")
1878            .build();
1879
1880        let id = manager.create_test(test).unwrap();
1881
1882        // Start test
1883        manager.start_test(&id).unwrap();
1884        let test = manager.get_test(&id).unwrap();
1885        assert_eq!(test.status, ABTestStatus::Running);
1886
1887        // Record some data
1888        manager.record_impression(&id, "user_1", 0.8, 5000).unwrap();
1889        manager.record_click(&id, "user_1", Uuid::new_v4()).unwrap();
1890
1891        // Analyze
1892        let results = manager.analyze_test(&id).unwrap();
1893        assert!(!results.is_significant); // Not enough data
1894
1895        // Complete
1896        manager.complete_test(&id).unwrap();
1897        let test = manager.get_test(&id).unwrap();
1898        assert_eq!(test.status, ABTestStatus::Completed);
1899
1900        // Archive
1901        manager.archive_test(&id).unwrap();
1902        assert!(manager.get_test(&id).is_none());
1903        assert_eq!(manager.list_archived().len(), 1);
1904    }
1905
1906    #[test]
1907    fn test_learned_weights_integration() {
1908        let control = LearnedWeights::default();
1909        let mut treatment = LearnedWeights {
1910            semantic: 0.6,
1911            entity: 0.2,
1912            ..Default::default()
1913        };
1914        treatment.normalize();
1915
1916        let test = ABTest::builder("weights_test")
1917            .with_control(control.clone())
1918            .with_treatment(treatment.clone())
1919            .build();
1920
1921        assert_eq!(
1922            test.get_weights(ABTestVariant::Control).semantic,
1923            control.semantic
1924        );
1925        assert_eq!(
1926            test.get_weights(ABTestVariant::Treatment).semantic,
1927            treatment.semantic
1928        );
1929    }
1930
1931    #[test]
1932    fn test_ctr_calculation() {
1933        let mut metrics = VariantMetrics::default();
1934
1935        assert_eq!(metrics.ctr(), 0.0); // No impressions
1936
1937        metrics.impressions = 100;
1938        metrics.clicks = 10;
1939
1940        assert!((metrics.ctr() - 0.1).abs() < 0.001);
1941    }
1942
1943    #[test]
1944    fn test_success_rate_calculation() {
1945        let mut metrics = VariantMetrics::default();
1946
1947        assert_eq!(metrics.success_rate(), 0.0); // No feedback
1948
1949        metrics.positive_feedback = 8;
1950        metrics.negative_feedback = 2;
1951
1952        assert!((metrics.success_rate() - 0.8).abs() < 0.001);
1953    }
1954
1955    #[test]
1956    fn test_power_estimation() {
1957        let mut test = ABTest::builder("power_test").build();
1958
1959        // Add significant data
1960        for i in 0..500 {
1961            let user = format!("control_{i}");
1962            test.user_assignments
1963                .insert(user.clone(), ABTestVariant::Control);
1964            test.control_metrics.impressions += 1;
1965            test.control_metrics.unique_users += 1;
1966            if i % 10 == 0 {
1967                // 10% CTR
1968                test.control_metrics.clicks += 1;
1969            }
1970        }
1971
1972        for i in 0..500 {
1973            let user = format!("treatment_{i}");
1974            test.user_assignments
1975                .insert(user.clone(), ABTestVariant::Treatment);
1976            test.treatment_metrics.impressions += 1;
1977            test.treatment_metrics.unique_users += 1;
1978            if i % 5 == 0 {
1979                // 20% CTR
1980                test.treatment_metrics.clicks += 1;
1981            }
1982        }
1983
1984        let power = ABTestAnalyzer::estimate_power(&test);
1985        assert!(power > 0.5, "Power was {power}"); // Should have decent power with this effect size
1986    }
1987
1988    #[test]
1989    fn test_manager_summary() {
1990        let manager = ABTestManager::new();
1991
1992        // Create tests in different states
1993        let test1 = ABTest::builder("draft_test").build();
1994        manager.create_test(test1).unwrap();
1995
1996        let test2 = ABTest::builder("running_test").build();
1997        let id2 = manager.create_test(test2).unwrap();
1998        manager.start_test(&id2).unwrap();
1999
2000        let summary = manager.summary();
2001        assert_eq!(summary.total_active, 2);
2002        assert_eq!(summary.draft, 1);
2003        assert_eq!(summary.running, 1);
2004    }
2005
2006    #[test]
2007    fn test_recommendations_generation() {
2008        let mut test = ABTest::builder("recommendations_test")
2009            .with_min_impressions(100)
2010            .build();
2011
2012        // Add data showing treatment wins
2013        test.control_metrics.impressions = 1000;
2014        test.control_metrics.clicks = 100; // 10% CTR
2015        test.treatment_metrics.impressions = 1000;
2016        test.treatment_metrics.clicks = 200; // 20% CTR
2017
2018        let results = ABTestAnalyzer::analyze(&test);
2019
2020        assert!(results.is_significant);
2021        assert_eq!(results.winner, Some(ABTestVariant::Treatment));
2022        assert!(!results.recommendations.is_empty());
2023        assert!(results
2024            .recommendations
2025            .iter()
2026            .any(|r| r.contains("Treatment")));
2027    }
2028
    /// Demonstration test: prints worked statistical results for three
    /// scenarios (clear winner, underpowered sample, full analysis) and
    /// asserts the full-analysis scenario declares Treatment the winner.
    /// Left as a doc-only block: the printed strings ARE the demo output.
    #[test]
    fn test_ab_demo_with_numbers() {
        println!("\n========================================");
        println!("       A/B TESTING DEMO WITH NUMBERS");
        println!("========================================\n");

        // Scenario 1: Clear winner
        println!("📊 SCENARIO 1: Clear Winner (Treatment significantly better)");
        println!("   Control:   1000 impressions, 100 clicks (10.0% CTR)");
        println!("   Treatment: 1000 impressions, 200 clicks (20.0% CTR)");

        let (chi_sq, p_value) = ABTestAnalyzer::chi_squared_test(1000, 100, 1000, 200);
        let (ci_low, ci_high) = ABTestAnalyzer::calculate_confidence_interval(1000, 100, 1000, 200);

        println!("\n   RESULTS:");
        println!("   ├─ Chi-squared statistic: {chi_sq:.4}");
        println!("   ├─ P-value: {p_value:.6}");
        let significant = if p_value < 0.05 { "YES ✓" } else { "NO ✗" };
        println!("   ├─ Significant (p < 0.05): {significant}");
        println!("   ├─ 95% Confidence Interval: ({ci_low:.4}, {ci_high:.4})");
        // Relative improvement of a 10% → 20% CTR move: (0.20-0.10)/0.10 = 100%.
        let improvement = ((0.20 - 0.10) / 0.10) * 100.0;
        println!("   └─ Relative improvement: {improvement:.1}%");

        // Scenario 2: No significant difference
        println!("\n📊 SCENARIO 2: No Significant Difference (Sample too small)");
        println!("   Control:   50 impressions, 5 clicks (10.0% CTR)");
        println!("   Treatment: 50 impressions, 6 clicks (12.0% CTR)");

        let (chi_sq2, p_value2) = ABTestAnalyzer::chi_squared_test(50, 5, 50, 6);
        let (ci_low2, ci_high2) = ABTestAnalyzer::calculate_confidence_interval(50, 5, 50, 6);

        println!("\n   RESULTS:");
        println!("   ├─ Chi-squared statistic: {chi_sq2:.4}");
        println!("   ├─ P-value: {p_value2:.6}");
        let significant2 = if p_value2 < 0.05 { "YES ✓" } else { "NO ✗" };
        println!("   ├─ Significant (p < 0.05): {significant2}");
        println!("   ├─ 95% Confidence Interval: ({ci_low2:.4}, {ci_high2:.4})");
        // A CI straddling zero means the observed effect may be noise.
        let ci_includes_zero = if ci_low2 < 0.0 && ci_high2 > 0.0 {
            "YES"
        } else {
            "NO"
        };
        println!("   └─ CI includes 0: {ci_includes_zero} (effect may be due to chance)");

        // Scenario 3: Full analysis with recommendations
        println!("\n📊 SCENARIO 3: Full Analysis with Recommendations");
        let mut test = ABTest::builder("semantic_weight_test")
            .with_min_impressions(100)
            .build();

        // Metrics are written directly into the test's variant counters
        // rather than replayed event-by-event.
        test.control_metrics.impressions = 5000;
        test.control_metrics.clicks = 500; // 10% CTR
        test.control_metrics.positive_feedback = 400;
        test.control_metrics.negative_feedback = 50;

        test.treatment_metrics.impressions = 5000;
        test.treatment_metrics.clicks = 750; // 15% CTR
        test.treatment_metrics.positive_feedback = 600;
        test.treatment_metrics.negative_feedback = 30;

        let results = ABTestAnalyzer::analyze(&test);

        println!("   Test: Comparing semantic weight emphasis");
        println!("   Control:   5000 impressions, 500 clicks (10.0% CTR)");
        println!("   Treatment: 5000 impressions, 750 clicks (15.0% CTR)");
        println!("\n   STATISTICAL RESULTS:");
        println!("   ├─ Chi-squared: {:.4}", results.chi_squared);
        println!("   ├─ P-value: {:.8}", results.p_value);
        println!(
            "   ├─ Confidence Level: {:.2}%",
            results.confidence_level * 100.0
        );
        println!(
            "   ├─ Significant: {}",
            if results.is_significant {
                "YES ✓"
            } else {
                "NO ✗"
            }
        );
        println!("   ├─ Winner: {:?}", results.winner);
        println!(
            "   ├─ Relative Improvement: {:.2}%",
            results.relative_improvement
        );
        println!("   ├─ Control CTR: {:.2}%", results.control_ctr * 100.0);
        println!("   ├─ Treatment CTR: {:.2}%", results.treatment_ctr * 100.0);
        println!(
            "   └─ 95% CI: ({:.4}, {:.4})",
            results.confidence_interval.0, results.confidence_interval.1
        );

        println!("\n   RECOMMENDATIONS:");
        for (i, rec) in results.recommendations.iter().enumerate() {
            println!("   {}. {}", i + 1, rec);
        }

        println!("\n========================================");
        println!("        END OF A/B TESTING DEMO");
        println!("========================================\n");

        // Assertions to make sure the test still works
        assert!(results.is_significant);
        assert_eq!(results.winner, Some(ABTestVariant::Treatment));
    }
2134
    /// End-to-end demo: scores a synthetic memory corpus under old vs new
    /// relevance weights, simulates user sessions with a deterministic LCG,
    /// and runs the comprehensive (frequentist + Bayesian + effect-size +
    /// SRM + sequential) analysis. Doc-only block: the control and treatment
    /// arms draw alternately from a SINGLE shared RNG stream, so statement
    /// order here is behavior — do not restructure casually.
    #[test]
    fn test_comprehensive_analysis_demo() {
        println!("\n========================================");
        println!("    COMPREHENSIVE A/B ANALYSIS DEMO");
        println!("    (Dynamic Weight-Based Simulation)");
        println!("========================================\n");

        // =================================================================
        // DYNAMIC SIMULATION: Compare old vs new relevance weights
        // =================================================================

        // Control: Old weights (pre-CTX-3) - no momentum amplification, no access_count, no graph_strength
        let control_weights = LearnedWeights {
            semantic: 0.35,
            entity: 0.30,
            tag: 0.10,
            importance: 0.10,
            momentum: 0.15,      // Old: lower momentum weight, no amplification
            access_count: 0.0,   // Old: not used
            graph_strength: 0.0, // Old: not used
            update_count: 0,
            last_updated: None,
        };

        // Treatment: New weights (CTX-3) - momentum amplification, access_count, graph_strength
        let treatment_weights = LearnedWeights::default(); // Uses current optimized defaults

        println!("📊 WEIGHT COMPARISON:");
        println!("   Control (old):   semantic={:.2}, entity={:.2}, momentum={:.2}, access={:.2}, graph={:.2}",
            control_weights.semantic, control_weights.entity, control_weights.momentum,
            control_weights.access_count, control_weights.graph_strength);
        println!("   Treatment (new): semantic={:.2}, entity={:.2}, momentum={:.2}, access={:.2}, graph={:.2}\n",
            treatment_weights.semantic, treatment_weights.entity, treatment_weights.momentum,
            treatment_weights.access_count, treatment_weights.graph_strength);

        // Generate synthetic memory corpus with varying characteristics
        // Each tuple: (semantic, entity, tag, importance, momentum_ema, access_count, graph_strength, is_truly_relevant)
        // The key insight: some memories LOOK good (high semantic/entity) but have poor track record
        // Treatment should deprioritize these based on momentum/access/graph signals
        #[allow(clippy::type_complexity)]
        let memory_corpus: Vec<(f32, f32, f32, f32, f32, u32, f32, bool)> = vec![
            // === HIGH-VALUE: Good signals + good track record ===
            (0.8, 0.7, 0.5, 0.8, 0.9, 15, 0.9, true), // Consistently helpful, frequently accessed
            (0.7, 0.8, 0.6, 0.7, 0.8, 12, 0.85, true), // Strong entity match, proven value
            (0.9, 0.6, 0.4, 0.9, 0.7, 10, 0.8, true), // High semantic, good track record
            (0.6, 0.9, 0.7, 0.6, 0.85, 8, 0.75, true), // Entity-heavy, reliable
            // === TRAPS: Look good but misleading (control will surface these, treatment won't) ===
            (0.95, 0.9, 0.8, 0.9, -0.6, 1, 0.15, false), // BEST semantic/entity but terrible momentum
            (0.9, 0.85, 0.7, 0.85, -0.4, 0, 0.1, false), // High scores, never accessed, weak graph
            (0.88, 0.82, 0.6, 0.8, -0.5, 1, 0.2, false), // Looks great, proven misleading
            (0.85, 0.88, 0.75, 0.82, -0.3, 2, 0.25, false), // Strong traditional signals, poor history
            // === MEDIUM: Mixed signals ===
            (0.7, 0.5, 0.3, 0.6, 0.4, 4, 0.5, true), // Decent, somewhat proven
            (0.5, 0.6, 0.4, 0.5, 0.3, 3, 0.45, true), // Moderate all around
            (0.6, 0.55, 0.35, 0.55, 0.35, 3, 0.4, true), // Average
            // === LOW-VALUE: Poor across the board ===
            (0.4, 0.3, 0.2, 0.4, 0.1, 1, 0.2, false), // Low everything
            (0.35, 0.4, 0.25, 0.35, -0.1, 1, 0.15, false), // Below average
        ];

        // Score all memories with both weight sets and rank them
        let mut control_ranked: Vec<(usize, f32, bool)> = memory_corpus
            .iter()
            .enumerate()
            .map(|(idx, &(sem, ent, tag, imp, mom, acc, graph, relevant))| {
                let score = control_weights.fuse_scores_full(sem, ent, tag, imp, mom, acc, graph);
                (idx, score, relevant)
            })
            .collect();

        let mut treatment_ranked: Vec<(usize, f32, bool)> = memory_corpus
            .iter()
            .enumerate()
            .map(|(idx, &(sem, ent, tag, imp, mom, acc, graph, relevant))| {
                let score = treatment_weights.fuse_scores_full(sem, ent, tag, imp, mom, acc, graph);
                (idx, score, relevant)
            })
            .collect();

        // Sort by score descending
        // (total_cmp gives a total order over f32, so NaN cannot panic here.)
        control_ranked.sort_by(|a, b| b.1.total_cmp(&a.1));
        treatment_ranked.sort_by(|a, b| b.1.total_cmp(&a.1));

        println!("🔍 RANKING COMPARISON (top 8):");
        println!("   Control ranking:");
        for (rank, (idx, score, relevant)) in control_ranked.iter().take(8).enumerate() {
            let status = if *relevant {
                "✓ relevant"
            } else {
                "✗ TRAP"
            };
            println!(
                "      #{}: memory[{}] score={:.3} {}",
                rank + 1,
                idx,
                score,
                status
            );
        }
        println!("   Treatment ranking:");
        for (rank, (idx, score, relevant)) in treatment_ranked.iter().take(8).enumerate() {
            let status = if *relevant {
                "✓ relevant"
            } else {
                "✗ TRAP"
            };
            println!(
                "      #{}: memory[{}] score={:.3} {}",
                rank + 1,
                idx,
                score,
                status
            );
        }
        println!();

        // Simulate sessions: Claude uses surfaced memories, user gives feedback
        // Model: trap in context → probability of bad outcome (negative feedback)
        // Success = user gives positive feedback (Claude's action was helpful)
        let num_sessions = 1000;
        let memories_surfaced = 5;

        // Count relevant vs trap in top K for each variant
        let control_top_k: Vec<bool> = control_ranked
            .iter()
            .take(memories_surfaced)
            .map(|x| x.2)
            .collect();
        let treatment_top_k: Vec<bool> = treatment_ranked
            .iter()
            .take(memories_surfaced)
            .map(|x| x.2)
            .collect();

        let control_relevant_count = control_top_k.iter().filter(|&&r| r).count();
        let treatment_relevant_count = treatment_top_k.iter().filter(|&&r| r).count();
        let control_trap_count = memories_surfaced - control_relevant_count;
        let treatment_trap_count = memories_surfaced - treatment_relevant_count;

        // Probability of bad outcome = trap_ratio (each trap has chance to mislead)
        let control_trap_ratio = control_trap_count as f32 / memories_surfaced as f32;
        let treatment_trap_ratio = treatment_trap_count as f32 / memories_surfaced as f32;

        println!("📈 CONTEXT QUALITY (top {memories_surfaced}):");
        let control_trap_pct = control_trap_ratio * 100.0;
        let treatment_trap_pct = treatment_trap_ratio * 100.0;
        println!(
            "   Control:   {control_relevant_count} relevant, {control_trap_count} traps ({control_trap_pct:.0}% trap ratio)"
        );
        println!(
            "   Treatment: {treatment_relevant_count} relevant, {treatment_trap_count} traps ({treatment_trap_pct:.0}% trap ratio)\n"
        );

        // Deterministic seeding for reproducibility (LCG PRNG)
        // NOTE: both arms consume draws from this single stream in a fixed
        // control-then-treatment order per session.
        let mut rng_state: u64 = 42;
        let next_rand = |state: &mut u64| -> f32 {
            *state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
            // Use upper 32 bits for better distribution, divide by 2^32 for [0, 1) range
            ((*state >> 32) as f32) / (0x1_0000_0000_u64 as f32)
        };

        let mut control_positive = 0u64;
        let mut control_negative = 0u64;
        let mut treatment_positive = 0u64;
        let mut treatment_negative = 0u64;

        for _session in 0..num_sessions {
            // Control: Claude uses context, user responds
            // Bad outcome probability = trap_ratio (trap misleads Claude → bad action)
            if next_rand(&mut rng_state) < control_trap_ratio {
                control_negative += 1; // Trap caused bad outcome
            } else {
                control_positive += 1; // Good outcome
            }

            // Treatment: same model
            if next_rand(&mut rng_state) < treatment_trap_ratio {
                treatment_negative += 1;
            } else {
                treatment_positive += 1;
            }
        }

        // For NNT calculation: impressions = sessions, clicks = successful sessions
        // CTR = success rate = positive / total
        let num_impressions = num_sessions as u64;
        let control_clicks = control_positive; // Success = click
        let treatment_clicks = treatment_positive;

        // Build test with dynamic results
        let mut test = ABTest::builder("relevance_weights_experiment")
            .with_description(
                "CTX-3: Quality over quantity - momentum, access_count, graph_strength",
            )
            .with_control(control_weights)
            .with_treatment(treatment_weights)
            .with_min_impressions(100)
            .with_traffic_split(0.5)
            .build();

        test.control_metrics.impressions = num_impressions;
        test.control_metrics.clicks = control_clicks;
        test.control_metrics.unique_users = (num_impressions as f64 * 0.85) as u64;
        test.control_metrics.positive_feedback = control_positive;
        test.control_metrics.negative_feedback = control_negative;

        test.treatment_metrics.impressions = num_impressions;
        test.treatment_metrics.clicks = treatment_clicks;
        test.treatment_metrics.unique_users = (num_impressions as f64 * 0.85) as u64;
        test.treatment_metrics.positive_feedback = treatment_positive;
        test.treatment_metrics.negative_feedback = treatment_negative;

        let control_ctr = (control_clicks as f64 / num_impressions as f64) * 100.0;
        let treatment_ctr = (treatment_clicks as f64 / num_impressions as f64) * 100.0;

        let analysis = ABTestAnalyzer::comprehensive_analysis(&test);

        println!("📊 DYNAMIC SIMULATION RESULTS:");
        println!(
            "   ├─ Control:   {num_impressions} impressions, {control_clicks} clicks ({control_ctr:.1}% CTR)"
        );
        println!("   │            positive={control_positive}, negative={control_negative}");
        println!(
            "   └─ Treatment: {num_impressions} impressions, {treatment_clicks} clicks ({treatment_ctr:.1}% CTR)"
        );
        println!("                 positive={treatment_positive}, negative={treatment_negative}\n");

        println!("🔬 FREQUENTIST ANALYSIS:");
        let chi_sq = analysis.frequentist.chi_squared;
        let p_val = analysis.frequentist.p_value;
        println!("   ├─ Chi-squared: {chi_sq:.4}");
        println!("   ├─ P-value: {p_val:.6}");
        let significant = if analysis.frequentist.is_significant {
            "YES ✓"
        } else {
            "NO ✗"
        };
        println!("   ├─ Significant: {significant}");
        let winner = &analysis.frequentist.winner;
        println!("   └─ Winner: {winner:?}\n");

        println!("🎲 BAYESIAN ANALYSIS:");
        let prob_treat = analysis.bayesian.prob_treatment_better * 100.0;
        let exp_lift = analysis.bayesian.expected_lift * 100.0;
        let ci_lo = analysis.bayesian.credible_interval.0 * 100.0;
        let ci_hi = analysis.bayesian.credible_interval.1 * 100.0;
        let risk_treat = analysis.bayesian.risk_treatment * 100.0;
        let risk_ctrl = analysis.bayesian.risk_control * 100.0;
        println!("   ├─ P(Treatment better): {prob_treat:.2}%");
        println!("   ├─ Expected lift: {exp_lift:.2}%");
        println!("   ├─ 95% Credible Interval: ({ci_lo:.2}%, {ci_hi:.2}%)");
        println!("   ├─ Risk if shipping treatment: {risk_treat:.3}%");
        println!("   └─ Risk if keeping control: {risk_ctrl:.3}%\n");

        println!("📏 EFFECT SIZE:");
        let cohens_h = analysis.effect_size.cohens_h;
        let interpretation = &analysis.effect_size.interpretation;
        let rel_risk = analysis.effect_size.relative_risk;
        let odds_ratio = analysis.effect_size.odds_ratio;
        let nnt = analysis.effect_size.nnt;
        println!("   ├─ Cohen's h: {cohens_h:.4}");
        println!("   ├─ Interpretation: {interpretation}");
        println!("   ├─ Relative Risk: {rel_risk:.2}x");
        println!("   ├─ Odds Ratio: {odds_ratio:.2}");
        if nnt.is_finite() {
            println!("   └─ NNT (Number Needed to Treat): {nnt:.0}\n");
        } else {
            println!("   └─ NNT: N/A (no effect)\n");
        }

        println!("⚖️ DATA QUALITY (SRM Check):");
        let expected_ratio = analysis.srm.expected_ratio * 100.0;
        let observed_ratio = analysis.srm.observed_ratio * 100.0;
        println!("   ├─ Expected ratio: {expected_ratio:.1}%");
        println!("   ├─ Observed ratio: {observed_ratio:.1}%");
        let srm_detected = if analysis.srm.srm_detected {
            "YES ⚠️"
        } else {
            "NO ✓"
        };
        println!("   ├─ SRM Detected: {srm_detected}");
        let severity = &analysis.srm.severity;
        println!("   └─ Severity: {severity:?}\n");

        println!("📈 SEQUENTIAL TESTING:");
        let analysis_num = analysis.sequential.analysis_number;
        let planned = analysis.sequential.planned_analyses;
        let alpha_spent = analysis.sequential.alpha_spent;
        println!("   ├─ Analysis #{analysis_num} of {planned}");
        println!("   ├─ Alpha spent: {alpha_spent:.4}");
        let current_alpha = analysis.sequential.current_alpha;
        println!("   ├─ Current threshold: {current_alpha:.4}");
        let can_stop = if analysis.sequential.can_stop_early {
            "YES ✓"
        } else {
            "NO - Continue testing"
        };
        println!("   └─ Can stop early: {can_stop}\n");

        println!("═══════════════════════════════════════");
        println!("🎯 FINAL DECISION:");
        println!("═══════════════════════════════════════");
        let should_ship = if analysis.should_ship {
            "YES ✅"
        } else {
            "NO ❌"
        };
        println!("   Should ship: {should_ship}");
        let practically_sig = if analysis.is_practically_significant {
            "YES"
        } else {
            "NO"
        };
        println!("   Practically significant: {practically_sig}");
        println!("\n📋 USER-FOCUSED INSIGHTS:");
        for insight in &analysis.insights {
            println!("   • {insight}");
        }

        // Calculate and display NNT
        // (Recomputed locally from the CTRs for display; the analyzer's own
        // NNT lives in analysis.effect_size.nnt above.)
        let ard = (treatment_ctr - control_ctr) / 100.0; // Absolute Risk Difference
        let nnt = if ard > 0.0 { 1.0 / ard } else { f64::INFINITY };
        let ctr_diff = treatment_ctr - control_ctr;
        println!("\n🎯 KEY METRIC:");
        println!(
            "   ├─ CTR Improvement: {control_ctr:.1}% → {treatment_ctr:.1}% (+{ctr_diff:.1}%)"
        );
        let ard_pct = ard * 100.0;
        println!("   ├─ ARD (Absolute Risk Difference): {ard_pct:.2}%");
        if nnt.is_finite() && nnt < 100.0 {
            println!("   └─ NNT (Number Needed to Treat): {nnt:.0}");
            println!("      (1 in {nnt:.0} users benefit from treatment)");
        } else {
            println!("   └─ NNT: N/A (no significant improvement)");
        }

        println!("\n========================================");
        println!("     END OF COMPREHENSIVE ANALYSIS");
        println!("========================================\n");

        // Assertions - verify treatment outperforms control
        assert!(
            treatment_clicks >= control_clicks,
            "Treatment ({treatment_clicks} clicks) should outperform Control ({control_clicks} clicks)"
        );
        assert!(
            treatment_ctr >= control_ctr,
            "Treatment CTR ({treatment_ctr:.1}%) should be >= Control CTR ({control_ctr:.1}%)"
        );
        // Treatment should have better positive/negative ratio
        let control_quality = if control_negative > 0 {
            control_positive as f64 / control_negative as f64
        } else {
            control_positive as f64
        };
        let treatment_quality = if treatment_negative > 0 {
            treatment_positive as f64 / treatment_negative as f64
        } else {
            treatment_positive as f64
        };
        assert!(
            treatment_quality >= control_quality * 0.9, // Allow 10% tolerance
            "Treatment quality ratio ({treatment_quality:.2}) should be >= Control ({control_quality:.2})"
        );
        assert!(!analysis.insights.is_empty());
    }
2501
2502    #[test]
2503    fn test_bayesian_analysis() {
2504        let mut test = ABTest::builder("bayesian_test").build();
2505
2506        test.control_metrics.impressions = 1000;
2507        test.control_metrics.clicks = 100; // 10%
2508        test.treatment_metrics.impressions = 1000;
2509        test.treatment_metrics.clicks = 150; // 15%
2510
2511        let bayesian = ABTestAnalyzer::bayesian_analysis(&test);
2512
2513        // Treatment should have high probability of being better
2514        assert!(bayesian.prob_treatment_better > 0.9);
2515        assert!(bayesian.expected_lift > 0.0);
2516        // Credible interval should not include large negative values
2517        assert!(bayesian.credible_interval.0 > -0.5);
2518    }
2519
2520    #[test]
2521    fn test_effect_size_calculation() {
2522        let mut test = ABTest::builder("effect_test").build();
2523
2524        test.control_metrics.impressions = 1000;
2525        test.control_metrics.clicks = 100; // 10%
2526        test.treatment_metrics.impressions = 1000;
2527        test.treatment_metrics.clicks = 200; // 20%
2528
2529        let effect = ABTestAnalyzer::calculate_effect_size(&test);
2530
2531        // 10% to 20% should be a small-to-medium effect
2532        assert!(effect.cohens_h > 0.2);
2533        assert!(effect.relative_risk > 1.5);
2534        // NNT should be 10 (1/(0.2-0.1))
2535        assert!((effect.nnt - 10.0).abs() < 0.5);
2536    }
2537
2538    #[test]
2539    fn test_srm_detection() {
2540        let mut test = ABTest::builder("srm_test").with_traffic_split(0.5).build();
2541
2542        // Severe SRM: expected 50/50, got 70/30
2543        test.control_metrics.impressions = 700;
2544        test.treatment_metrics.impressions = 300;
2545
2546        let srm = ABTestAnalyzer::check_srm(&test);
2547
2548        // Should detect SRM
2549        assert!(srm.srm_detected);
2550        assert_eq!(srm.severity, SRMSeverity::Critical);
2551    }
2552
2553    #[test]
2554    fn test_sequential_analysis() {
2555        let mut test = ABTest::builder("sequential_test")
2556            .with_min_impressions(100)
2557            .build();
2558
2559        test.control_metrics.impressions = 500;
2560        test.control_metrics.clicks = 25; // 5%
2561        test.treatment_metrics.impressions = 500;
2562        test.treatment_metrics.clicks = 75; // 15%
2563
2564        // Early look (analysis 1 of 5)
2565        let seq = ABTestAnalyzer::sequential_analysis(&test, 1, 5);
2566
2567        assert_eq!(seq.analysis_number, 1);
2568        assert_eq!(seq.planned_analyses, 5);
2569        // O'Brien-Fleming is very conservative early
2570        assert!(seq.alpha_spent < 0.01);
2571    }
2572}