single_rust 0.5.8

Single-cell analysis in Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
//! # Highly Variable Gene Detection
//!
//! This module implements methods for identifying highly variable genes (HVGs) in single-cell RNA-seq data.
//! HVGs are genes that exhibit high variability across cells beyond what would be expected from technical noise,
//! and are crucial for downstream analysis like dimensionality reduction and clustering.
//!
//! ## Why Highly Variable Genes?
//!
//! Single-cell datasets typically contain 20,000+ genes, but most show little biological variation:
//! - **Technical noise**: Sequencing depth, amplification bias, dropout events
//! - **Housekeeping genes**: Consistently expressed across cell types
//! - **Lowly expressed genes**: Dominated by technical variation
//!
//! By focusing on highly variable genes, we:
//! - Reduce computational burden and noise
//! - Highlight biologically relevant variation
//! - Improve clustering and visualization quality
//! - Focus on genes driving cell-to-cell differences
//!
//! ## Implemented Methods
//!
//! ### 1. Seurat Method (Default)
//! Based on Satija lab's approach, models the mean-variance relationship using binning:
//! - Bins genes by expression level
//! - Calculates expected dispersion for each bin
//! - Identifies genes with higher-than-expected dispersion
//! - **Best for**: Most single-cell datasets, widely validated
//!
//! ### 2. SVR Method  
//! Uses Support Vector Regression to model mean-variance relationship:
//! - Fits SVR to log(mean) vs log(variance)
//! - Identifies genes with high residuals from the trend
//! - **Best for**: Datasets with complex mean-variance relationships
//!
//! ### 3. Cell Ranger Method
//! 10x Genomics approach (not yet implemented):
//! - Similar to Seurat but with different normalization
//! - **Best for**: Datasets processed with Cell Ranger pipeline
//!
//! ## Usage Examples
//!
//! ```rust,ignore
//! use single_rust::memory::processing::hvg::compute_highly_variable_genes;
//! use single_rust::shared::processing::HVGParams;
//!
//! // Basic usage with defaults (Seurat method)
//! compute_highly_variable_genes(&adata, None)?;
//!
//! // Custom parameters
//! let params = HVGParams {
//!     n_top_genes: Some(2000),     // Select top 2000 most variable genes
//!     min_mean: 0.0125,            // Minimum mean expression
//!     max_mean: 3.0,               // Maximum mean expression  
//!     min_dispersion: 0.5,         // Minimum normalized dispersion
//!     n_bins: 20,                  // Number of expression bins
//!     flavor: FlavorType::Seurat,  // Method to use
//! };
//! compute_highly_variable_genes(&adata, Some(params))?;
//!
//! // Access results
//! let var_df = adata.var().get_data();
//! let hvg_mask = var_df.column("highly_variable")?.bool()?;
//! let hvg_genes: Vec<_> = hvg_mask.into_iter()
//!     .enumerate()
//!     .filter_map(|(i, is_hvg)| if is_hvg.unwrap_or(false) { Some(i) } else { None })
//!     .collect();
//! ```
//!
//! ## Output Columns
//!
//! Results are stored in `adata.var()` with method-specific columns:
//!
//! ### Seurat Method
//! - `means`: Log1p mean expression per gene
//! - `dispersions`: Log dispersion (variance/mean)
//! - `dispersions_norm`: Normalized dispersion scores
//! - `highly_variable`: Boolean mask for selected genes
//!
//! ### SVR Method  
//! - `means`: Mean expression per gene
//! - `variances`: Variance per gene
//! - `residuals`: Residuals from SVR fit
//! - `residuals_standardized`: Standardized residuals
//! - `mean_variance_trend`: Predicted variance from SVR
//! - `highly_variable`: Boolean mask for selected genes
//!
//! ## Parameter Guidelines
//!
//! ### Gene Count Selection
//! - **1000-2000 genes**: Good for initial exploration
//! - **2000-5000 genes**: Standard for most analyses  
//! - **5000+ genes**: May include noise, use with caution
//!
//! ### Expression Filters
//! - `min_mean`: Remove lowly expressed genes (default: 0.0125)
//! - `max_mean`: Remove highly expressed genes (default: 3.0)
//! - Adjust based on your normalization and log-transformation
//!
//! ### Method Selection
//! - **Seurat**: Robust, widely used, good default choice
//! - **SVR**: Better for complex datasets, more computationally intensive
//! - **Cell Ranger**: Use if data processed with 10x Cell Ranger

use crate::shared::processing::{fit_svr, standardize_log_form_vec, FlavorType, HVGParams};
use crate::{ComputeSum, ComputeVariance};
use anndata_memory::{IMAnnData, IMArrayElement};
use polars::prelude::Column;
use single_utilities::types::Direction;

/// Compute highly variable genes using the specified method and parameters.
///
/// This is the main entry point for HVG detection. It analyzes the mean-variance relationship
/// in the expression data to identify genes with biological variability above technical noise.
///
/// ## Algorithm Overview
///
/// 1. **Calculate Statistics**: Compute mean and variance for each gene
/// 2. **Model Relationship**: Fit mean-variance relationship using chosen method
/// 3. **Identify Outliers**: Find genes with higher variability than expected
/// 4. **Apply Filters**: Filter by expression level and dispersion thresholds
/// 5. **Store Results**: Add columns to `adata.var()` with results
///
/// ## Parameters
///
/// * `adata` - AnnData object containing expression data
/// * `params` - HVG parameters (uses defaults if None)
///
/// ## Method Details
///
/// - **Seurat**: Bins genes by expression, normalizes dispersion within bins
/// - **SVR**: Fits support vector regression to model mean-variance trend
/// - **Cell Ranger**: (Not implemented) 10x Genomics approach
///
/// ## Results Storage
///
/// All results are stored as new columns in `adata.var()`. The exact columns
/// depend on the method used, but all methods add a `highly_variable` boolean column.
///
/// ## Examples
///
/// ```rust,ignore
/// // Use defaults (Seurat method, top 2000 genes)
/// compute_highly_variable_genes(&adata, None)?;
///
/// // Custom SVR analysis
/// let params = HVGParams {
///     flavor: FlavorType::SVR,
///     n_top_genes: Some(3000),
///     min_mean: 0.01,
///     max_mean: 4.0,
///     min_dispersion: 0.3,
///     n_bins: 25,
/// };
/// compute_highly_variable_genes(&adata, Some(params))?;
/// ```
pub fn compute_highly_variable_genes(
    adata: &IMAnnData,
    params: Option<HVGParams>,
) -> anyhow::Result<()> {
    let params = params.unwrap_or_default();
    let x = adata.x();

    match params.flavor {
        FlavorType::Seurat => compute_seurat_hvg(adata, &x, params),
        FlavorType::CellRanger => compute_cell_ranger_hvg(adata, &x, params),
        FlavorType::SVR => compute_svr_hvg(adata, &x, params),
    }
}

/// Post-process dispersion statistics for Seurat method to handle edge cases.
///
/// Handles bins with single genes where standard deviation cannot be computed.
/// Following the original Seurat implementation, single-gene bins get special treatment
/// to ensure they can still contribute to HVG selection.
///
/// ## Parameters
/// * `bin_means` - Mean dispersion values for each bin
/// * `bin_stds` - Standard deviation of dispersions for each bin
///
/// ## Behavior
/// For bins with NaN standard deviation (single gene bins):
/// - Sets std = mean and mean = 0
/// - This results in normalized dispersion = 1 for these genes
fn postprocess_seurat_dispersions(
    bin_means: &mut [f64],
    bin_stds: &mut [f64],
) -> anyhow::Result<()> {
    // This matches Python's _postprocess_dispersions_seurat
    for i in 0..bin_means.len() {
        if bin_stds[i].is_nan() {
            // For single-gene bins, set std = mean and mean = 0
            // This effectively sets normalized dispersion to 1
            bin_stds[i] = bin_means[i];
            bin_means[i] = 0.0;
        }
    }
    Ok(())
}

/// Create equal-width bins for gene expression levels.
///
/// Divides the expression range into equal-width bins for the Seurat method.
/// This allows modeling of mean-variance relationship by expression level,
/// as highly expressed genes naturally have different variance characteristics.
///
/// ## Parameters
/// * `log_means` - Log-transformed mean expression values
/// * `n_bins` - Number of bins to create
///
/// ## Returns
/// * Bin assignment for each gene (0-based indices)
/// * Bin edge values for reference
///
/// ## Algorithm
/// 1. Find min/max of valid (finite) expression values
/// 2. Create equal-width intervals between min and max
/// 3. Assign each gene to appropriate bin
/// 4. Handle edge cases (NaN values, exact maximum)
fn equal_width_binning(log_means: &[f64], n_bins: usize) -> anyhow::Result<(Vec<usize>, Vec<f64>)> {
    // Find min and max (excluding NaN)
    let mut valid_means: Vec<f64> = log_means
        .iter()
        .filter(|x| x.is_finite())
        .copied()
        .collect();

    if valid_means.is_empty() {
        return Err(anyhow::anyhow!("No valid mean values found"));
    }

    valid_means.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let min_mean = valid_means[0];
    let max_mean = valid_means[valid_means.len() - 1];

    // Create equal-width bins
    let bin_width = (max_mean - min_mean) / n_bins as f64;
    let mut bin_edges = vec![0.0; n_bins + 1];

    for (i, edge) in bin_edges.iter_mut().enumerate().take(n_bins + 1) {
        *edge = min_mean + (i as f64) * bin_width;
    }

    // Make sure the last edge includes the maximum value
    bin_edges[n_bins] = max_mean + 1e-10;

    // Assign each gene to a bin
    let mut bin_indices = vec![0; log_means.len()];

    for (i, &mean) in log_means.iter().enumerate() {
        if !mean.is_finite() {
            bin_indices[i] = 0; // Assign NaN/inf to first bin
            continue;
        }

        // Find which bin this value belongs to
        let mut bin_idx = 0;
        for j in 0..n_bins {
            if mean >= bin_edges[j] && mean < bin_edges[j + 1] {
                bin_idx = j;
                break;
            }
        }

        // Handle edge case where mean == max_mean
        if mean == max_mean {
            bin_idx = n_bins - 1;
        }

        bin_indices[i] = bin_idx;
    }

    Ok((bin_indices, bin_edges))
}

/// Calculate mean and standard deviation of dispersions within each expression bin.
///
/// For the Seurat method, this establishes the expected mean-variance relationship
/// by computing statistics within bins of similar expression levels.
///
/// ## Parameters
/// * `log_dispersions` - Log-transformed dispersion values (variance/mean)
/// * `bin_indices` - Bin assignment for each gene
/// * `n_bins` - Total number of bins
///
/// ## Returns
/// * Mean dispersion for each bin
/// * Standard deviation of dispersion for each bin
///
/// ## Special Cases
/// - Empty bins: Both mean and std set to NaN
/// - Single-gene bins: Mean = gene value, std = NaN (handled by postprocessing)
/// - Uses sample standard deviation (n-1 denominator)
fn calculate_bin_stats(
    log_dispersions: &[f64],
    bin_indices: &[usize],
    n_bins: usize,
) -> anyhow::Result<(Vec<f64>, Vec<f64>)> {
    let mut bin_values: Vec<Vec<f64>> = vec![Vec::new(); n_bins];

    // Collect values for each bin (excluding NaN)
    for (i, &bin_idx) in bin_indices.iter().enumerate() {
        let disp = log_dispersions[i];
        if !disp.is_nan() && bin_idx < n_bins {
            bin_values[bin_idx].push(disp);
        }
    }

    let mut bin_means = vec![0.0; n_bins];
    let mut bin_stds = vec![0.0; n_bins];

    for bin_idx in 0..n_bins {
        let values = &bin_values[bin_idx];

        if values.is_empty() {
            bin_means[bin_idx] = f64::NAN;
            bin_stds[bin_idx] = f64::NAN;
        } else if values.len() == 1 {
            // Single gene in bin - Python sets std to NaN
            bin_means[bin_idx] = values[0];
            bin_stds[bin_idx] = f64::NAN;
        } else {
            // Calculate mean
            let mean = values.iter().sum::<f64>() / values.len() as f64;
            bin_means[bin_idx] = mean;

            // Calculate standard deviation
            let variance =
                values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (values.len() - 1) as f64;

            bin_stds[bin_idx] = variance.sqrt();
        }
    }

    Ok((bin_means, bin_stds))
}

/// Normalize dispersions by subtracting expected values and dividing by standard deviation.
///
/// This creates z-scores for dispersion values, allowing identification of genes
/// with unusually high variability relative to genes with similar expression levels.
///
/// ## Parameters
/// * `log_dispersions` - Log-transformed dispersion values
/// * `bin_indices` - Bin assignment for each gene  
/// * `bin_means` - Expected dispersion for each bin
/// * `bin_stds` - Standard deviation of dispersion for each bin
///
/// ## Returns
/// * Normalized dispersion scores (z-scores)
///
/// ## Formula
/// `normalized_dispersion = (observed_dispersion - expected_dispersion) / std_dispersion`
///
/// Higher values indicate genes that are more variable than expected.
fn normalize_dispersions(
    log_dispersions: &[f64],
    bin_indices: &[usize],
    bin_means: &[f64],
    bin_stds: &[f64],
) -> anyhow::Result<Vec<f64>> {
    let mut normalized_dispersions = vec![0.0; log_dispersions.len()];

    for (i, &disp) in log_dispersions.iter().enumerate() {
        let bin_idx = bin_indices[i];

        if bin_idx >= bin_means.len() {
            normalized_dispersions[i] = f64::NAN;
            continue;
        }

        let mean = bin_means[bin_idx];
        let std = bin_stds[bin_idx];

        if disp.is_nan() || mean.is_nan() || std.is_nan() || std == 0.0 {
            normalized_dispersions[i] = f64::NAN;
        } else {
            normalized_dispersions[i] = (disp - mean) / std;
        }
    }

    Ok(normalized_dispersions)
}

/// Select highly variable genes based on dispersion scores and expression filters.
///
/// Final step of HVG detection that applies thresholds to identify the most
/// biologically relevant genes. Supports both top-N selection and threshold-based selection.
///
/// ## Parameters
/// * `log_means` - Log-transformed mean expression values
/// * `dispersion_norm` - Normalized dispersion scores
/// * `n_top_genes` - Optional: select this many top genes by dispersion
/// * `min_mean` - Minimum mean expression threshold
/// * `max_mean` - Maximum mean expression threshold  
/// * `min_dispersion` - Minimum normalized dispersion threshold
///
/// ## Returns
/// * Boolean mask indicating highly variable genes
///
/// ## Selection Logic
/// - If `n_top_genes` specified: Select top N genes by normalized dispersion
/// - Otherwise: Apply all threshold filters (mean and dispersion)
/// - Genes with NaN dispersions are handled appropriately for each mode
fn subset_genes(
    log_means: &[f64], // These are already log-transformed
    dispersion_norm: &[f64],
    n_top_genes: Option<usize>,
    min_mean: f64,
    max_mean: f64,
    min_dispersion: f64,
) -> anyhow::Result<Vec<bool>> {
    let mut highly_variable = vec![false; log_means.len()];

    if let Some(n_top) = n_top_genes {
        // Python's approach for n_top_genes:
        // 1. First, remove NaN values to compute threshold
        let non_nan_dispersions: Vec<f64> = dispersion_norm
            .iter()
            .filter(|&&d| !d.is_nan())
            .copied()
            .collect();

        if non_nan_dispersions.is_empty() {
            return Ok(highly_variable);
        }

        // Find the nth highest value
        let n_to_select = n_top.min(non_nan_dispersions.len());
        let mut sorted_dispersions = non_nan_dispersions.clone();
        sorted_dispersions.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));

        let threshold = sorted_dispersions[n_to_select - 1];

        // 2. Now apply threshold to nan_to_num version (NaN → -inf)
        for i in 0..dispersion_norm.len() {
            let disp_value = if dispersion_norm[i].is_nan() {
                f64::NEG_INFINITY // Python uses -inf for NaN in final selection
            } else {
                dispersion_norm[i]
            };

            if disp_value >= threshold {
                highly_variable[i] = true;
            }
        }
    } else {
        // Original cutoff-based selection
        // Python applies nan_to_num (NaN → 0) before checking bounds
        let clean_dispersions: Vec<f64> = dispersion_norm
            .iter()
            .map(|&d| if d.is_nan() { 0.0 } else { d })
            .collect();

        // Apply mean filters
        let valid_by_mean: Vec<bool> = log_means
            .iter()
            .map(|&log_mean| log_mean > min_mean && log_mean < max_mean)
            .collect();

        for i in 0..log_means.len() {
            highly_variable[i] = valid_by_mean[i] && clean_dispersions[i] > min_dispersion;
        }
    }

    Ok(highly_variable)
}

/// Compute highly variable genes using the Seurat method.
///
/// The Seurat method models the mean-variance relationship by:
/// 1. Calculating dispersion (variance/mean) for each gene
/// 2. Binning genes by expression level  
/// 3. Computing expected dispersion within each bin
/// 4. Normalizing dispersions to z-scores
/// 5. Selecting genes with high normalized dispersion
///
/// ## Method Details
/// - Uses log1p transformation of means for binning
/// - Handles zero dispersions by setting to NaN
/// - Applies equal-width binning strategy
/// - Stores log-transformed values for consistency with scanpy
///
/// ## Parameters
/// * `adata` - AnnData object with expression data
/// * `x` - Expression matrix
/// * `params` - HVG parameters including thresholds and bin count
///
/// ## Output Columns
/// - `means`: Log1p mean expression
/// - `dispersions`: Log dispersion values
/// - `dispersions_norm`: Z-score normalized dispersions
/// - `highly_variable`: Boolean selection mask
fn compute_seurat_hvg(
    adata: &IMAnnData,
    x: &IMArrayElement,
    params: HVGParams,
) -> anyhow::Result<()> {
    let n_obs = adata.n_obs();

    // Calculate means from raw counts
    let raw_means: Vec<f64> = x
        .sum_whole(&Direction::COLUMN)?
        .iter()
        .map(|sum: &f64| sum / n_obs as f64)
        .collect();

    let variances: Vec<f64> = x.variance_whole::<u32, f64>(&Direction::COLUMN)?;

    // Calculate dispersions with proper handling of zero means
    let dispersions: Vec<f64> = raw_means
        .iter()
        .zip(variances.iter())
        .map(|(&mean, &var)| {
            let safe_mean = if mean > 1e-12 { mean } else { 1e-12 };
            var / safe_mean
        })
        .collect();

    // For Seurat flavor, use log1p of means for binning and storage
    // This matches what Python does AFTER reverting log normalization
    let log1p_means: Vec<f64> = raw_means.iter().map(|&x| (x + 1.0).ln()).collect();

    // Log dispersions with NaN for zero dispersions (matching Python)
    let log_dispersions: Vec<f64> = dispersions
        .iter()
        .map(|&x| {
            if x > 0.0 {
                x.ln()
            } else {
                f64::NAN // Python sets dispersion[dispersion == 0] = np.nan
            }
        })
        .collect();

    let n_bins = params.n_bins;

    // Use equal-width binning on log1p_means (like Python's pd.cut)
    let (bin_indices, _) = equal_width_binning(&log1p_means, n_bins)?;

    // Calculate mean and std for each bin
    let (mut bin_means, mut bin_stds) =
        calculate_bin_stats(&log_dispersions, &bin_indices, n_bins)?;

    // Handle single-gene bins (like Python's _postprocess_dispersions_seurat)
    postprocess_seurat_dispersions(&mut bin_means, &mut bin_stds)?;

    // Normalize dispersions
    let normalized_dispersions =
        normalize_dispersions(&log_dispersions, &bin_indices, &bin_means, &bin_stds)?;

    // Select highly variable genes using raw means for filtering
    let highly_variable = subset_genes(
        &log1p_means, // Pass log-transformed means
        &normalized_dispersions,
        params.n_top_genes,
        params.min_mean,
        params.max_mean,
        params.min_dispersion,
    )?;

    // Store results - IMPORTANT: Store log1p means to match Python
    let mut var_df = adata.var().get_data();
    var_df.with_column(Column::new("means".into(), log1p_means))?; // Store log1p means like Python
    var_df.with_column(Column::new("dispersions".into(), log_dispersions))?; // Store log dispersions
    var_df.with_column(Column::new(
        "dispersions_norm".into(),
        normalized_dispersions,
    ))?;
    var_df.with_column(Column::new("highly_variable".into(), highly_variable))?;

    adata.var().set_data(var_df)
}

/// Placeholder for Cell Ranger highly variable gene detection.
///
/// The Cell Ranger method follows a similar approach to Seurat but with
/// modifications specific to the 10x Genomics processing pipeline.
/// This implementation is planned for future development.
///
/// ## TODO
/// - Implement Cell Ranger-specific normalization
/// - Add proper mean-variance modeling for 10x data
/// - Include Cell Ranger-specific parameter handling
fn compute_cell_ranger_hvg(
    _adata: &IMAnnData,
    _x: &IMArrayElement,
    _params: HVGParams,
) -> anyhow::Result<()> {
    todo!("Cell Ranger flavor is not implemented yet!")
}

/// Compute highly variable genes using Support Vector Regression (SVR).
///
/// The SVR method uses machine learning to model the mean-variance relationship:
/// 1. Computes log(mean) and log(variance) for each gene
/// 2. Fits SVR model to predict variance from mean
/// 3. Calculates residuals (observed - predicted variance)
/// 4. Standardizes residuals to identify outliers
/// 5. Selects genes with high standardized residuals
///
/// ## Advantages
/// - More flexible modeling of mean-variance relationship
/// - Can capture non-linear trends
/// - Less sensitive to binning artifacts
///
/// ## Disadvantages  
/// - More computationally intensive
/// - Requires tuning of SVR hyperparameters
/// - May be less stable with small datasets
///
/// ## Parameters
/// * `adata` - AnnData object with expression data
/// * `x` - Expression matrix
/// * `params` - HVG parameters including selection criteria
///
/// ## Output Columns
/// - `means`: Mean expression per gene
/// - `variances`: Variance per gene
/// - `residuals`: SVR residuals (observed - predicted variance)
/// - `residuals_standardized`: Z-score standardized residuals  
/// - `mean_variance_trend`: Predicted variance from SVR model
/// - `highly_variable`: Boolean selection mask
fn compute_svr_hvg(adata: &IMAnnData, x: &IMArrayElement, params: HVGParams) -> anyhow::Result<()> {
    let n_obs = adata.n_obs();
    let means: Vec<f64> = x
        .sum_whole(&Direction::COLUMN)?
        .iter()
        .map(|sum: &f64| sum / n_obs as f64)
        .collect();

    let variances: Vec<f64> = x.variance_whole::<u32, f64>(&Direction::COLUMN)?;

    let log_means: Vec<f64> = means.iter().map(|&x| x.ln()).collect();
    let log_variances: Vec<f64> = variances.iter().map(|&x| x.ln()).collect();

    let (residuals, y_pred) = fit_svr(&log_means, &log_variances)?;
    let standardized_results = standardize_log_form_vec(&residuals);

    let mut highly_variable = vec![false; means.len()];
    if let Some(n_top) = params.n_top_genes {
        let mut indices: Vec<usize> = (0..standardized_results.len()).collect();
        indices.sort_by(|&a, &b| {
            standardized_results[b]
                .partial_cmp(&standardized_results[a])
                .unwrap()
        });
        for &idx in indices.iter().take(n_top) {
            if means[idx] >= params.min_mean && means[idx] <= params.max_mean {
                highly_variable[idx] = true;
            }
        }
    } else {
        for i in 0..means.len() {
            highly_variable[i] = means[i] >= params.min_mean
                && means[i] <= params.max_mean
                && standardized_results[i] > params.min_dispersion;
        }
    }

    let mut var_df = adata.var().get_data();
    var_df.with_column(Column::new("means".into(), means))?;
    var_df.with_column(Column::new("variances".into(), variances))?;
    var_df.with_column(Column::new("residuals".into(), residuals))?;
    var_df.with_column(Column::new("highly_variable".into(), highly_variable))?;
    var_df.with_column(Column::new(
        "residuals_standardized".into(),
        standardized_results,
    ))?;
    var_df.with_column(Column::new("mean_variance_trend".into(), y_pred))?;

    adata.var().set_data(var_df)
}