ferrotorch-core 0.6.2

Core tensor and autograd engine for ferrotorch — PyTorch in Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
//! Masked tensors — `torch.masked.MaskedTensor` analog.
//!
//! A [`MaskedTensor`] pairs a data tensor with a boolean mask, where mask
//! entries indicate which positions are "valid". Reductions, arithmetic,
//! and `to_tensor` / `filled` all honour the mask.
//!
//! # Mask convention
//!
//! Matches `torch.masked.MaskedTensor`: `mask[i] == true` means the value
//! is valid (use it); `mask[i] == false` means the value is masked out
//! (ignored by reductions, replaced by `fill_value` when materialised).
//! This is the **opposite** of NumPy's `numpy.ma`, which uses
//! `mask=True` to mean "invalid". Helpers below translate at the
//! boundary when delegating to [`ferray_ma`].
//!
//! # GPU discipline
//!
//! No silent CPU↔GPU round trips. Reductions (`masked_sum` / `masked_mean` /
//! `masked_min` / `masked_max`) lower to on-device kernels for f32/f64
//! (#597 / #627). The constructors `masked_invalid` / `masked_equal` compute
//! their boolean predicate ON-DEVICE for f32/f64 CUDA inputs via
//! `GpuBackend::isfinite_mask` / `ne_scalar_mask` (#1545); only the resulting
//! boolean mask is read back to populate the host-resident `Vec<bool>` (the
//! mask is host-side BY DESIGN — this is a one-way readback of the freshly
//! computed predicate, not a round trip of the value data, which never leaves
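//! the device). `masked_where` takes a host `&[bool]` condition and is
//! device-agnostic. For other dtypes (bf16/f16), `masked_invalid` /
//! `masked_equal` take the host walk on CPU inputs and return
//! `NotImplementedOnCuda` on CUDA inputs.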
//!
//! ## REQ status (per `.design/ferrotorch-core/masked.md`)
//!
//! | REQ | Status | Evidence |
//! |---|---|---|
//! | REQ-1 | SHIPPED | `MaskedTensor::new` at `masked.rs:60`; consumer: re-export `ferrotorch_core::MaskedTensor` at `lib.rs:167` |
//! | REQ-2 | SHIPPED | `MaskedTensor::from_data` at `masked.rs:78`; consumer: re-export at `lib.rs:167` |
//! | REQ-3 | SHIPPED | `with_fill_value` at `masked.rs:84`; consumer: re-export at `lib.rs:167` |
//! | REQ-4 | SHIPPED | `filled`/`to_tensor` at `masked.rs:131,143`; consumer: re-export at `lib.rs:167` |
//! | REQ-5 | SHIPPED | `masked_sum`/`masked_mean`/`masked_min`/`masked_max`/`masked_count` at `masked.rs:200,275,322,330,419`; consumer: re-export at `lib.rs:167-170` |
//! | REQ-6 | SHIPPED | `masked_where`/`masked_invalid`/`masked_equal` (`masked_invalid`/`masked_equal` in `masked.rs`); consumer: re-export at `lib.rs`. GPU predicate masks for `masked_invalid`/`masked_equal` (f32/f64) via `GpuBackend::isfinite_mask`/`ne_scalar_mask` (#1545); consumer: those constructors' CUDA branches in `masked.rs` |
//! | REQ-7 | SHIPPED | `to_ferray` at `masked.rs:165`; consumer: `to_ferray_round_trip_mean_matches_inhouse` pins the bridge |

use ferray_core::{Array as FerrayArray, IxDyn as FerrayIxDyn};
use ferray_ma::masked_array::MaskedArray;

use crate::dtype::Float;
use crate::error::{FerrotorchError, FerrotorchResult};
use crate::storage::TensorStorage;
use crate::tensor::Tensor;

// ---------------------------------------------------------------------------
// MaskedTensor
// ---------------------------------------------------------------------------

/// A tensor paired with a boolean mask.
///
/// `mask[i] == true` means the entry is **valid**; `false` means it is
/// **masked out**. This matches `torch.masked.MaskedTensor`.
///
/// `fill_value` is substituted for masked entries when
/// [`to_tensor`](Self::to_tensor) / [`filled`](Self::filled) is called.
/// Defaults to zero.
#[derive(Clone, Debug)]
pub struct MaskedTensor<T: Float> {
    /// The wrapped values, including those at masked-out positions.
    data: Tensor<T>,
    /// Length equals `data.numel()`. Stored flat in C-order to match the
    /// underlying tensor layout.
    mask: Vec<bool>,
    /// Value substituted at masked-out positions by `filled` / `to_tensor`.
    fill_value: T,
}

impl<T: Float> MaskedTensor<T> {
    /// Pair `data` with a validity `mask`.
    ///
    /// The mask follows the torch convention: `mask[i] == true` marks entry
    /// `i` as VALID. The fill value starts at zero; override it with
    /// [`Self::with_fill_value`]. No device check is performed here, so both
    /// CPU and CUDA tensors are accepted.
    ///
    /// # Errors
    ///
    /// Returns [`FerrotorchError::ShapeMismatch`] when `mask.len()` differs
    /// from `data.numel()`.
    pub fn new(data: Tensor<T>, mask: Vec<bool>) -> FerrotorchResult<Self> {
        if mask.len() == data.numel() {
            return Ok(Self {
                data,
                mask,
                fill_value: <T as num_traits::Zero>::zero(),
            });
        }
        Err(FerrotorchError::ShapeMismatch {
            message: format!(
                "MaskedTensor::new: mask length {} != data numel {}",
                mask.len(),
                data.numel()
            ),
        })
    }

    /// Wrap `data` with an all-`true` mask, i.e. every entry is valid.
    pub fn from_data(data: Tensor<T>) -> FerrotorchResult<Self> {
        let all_valid = vec![true; data.numel()];
        Self::new(data, all_valid)
    }

    /// Builder-style setter for the value that [`Self::filled`] /
    /// [`Self::to_tensor`] substitute at masked-out positions.
    pub fn with_fill_value(self, fill_value: T) -> Self {
        let mut updated = self;
        updated.fill_value = fill_value;
        updated
    }

    /// Borrow the wrapped data tensor; masked-out values are still present.
    #[inline]
    pub fn data(&self) -> &Tensor<T> {
        &self.data
    }

    /// Borrow the validity mask (`true` = valid, `false` = masked out).
    #[inline]
    pub fn mask(&self) -> &[bool] {
        self.mask.as_slice()
    }

    /// The value substituted for masked-out entries on materialisation.
    #[inline]
    pub fn fill_value(&self) -> T {
        self.fill_value
    }

    /// Shape of the wrapped data tensor.
    #[inline]
    pub fn shape(&self) -> &[usize] {
        self.data.shape()
    }

    /// Element count of the wrapped data tensor (valid and masked alike).
    #[inline]
    pub fn numel(&self) -> usize {
        self.data.numel()
    }

    /// How many mask entries are `true` (valid).
    pub fn count_valid(&self) -> usize {
        self.mask.iter().copied().filter(|valid| *valid).count()
    }

    /// How many mask entries are `false` (masked out).
    pub fn count_masked(&self) -> usize {
        self.mask.len() - self.count_valid()
    }

    /// Produce a plain `Tensor<T>` in which every masked-out position holds
    /// `fill_value` and every valid position holds the original value.
    ///
    /// The result is built from a host vector (`TensorStorage::cpu`) with the
    /// same shape as the wrapped data.
    pub fn filled(&self) -> FerrotorchResult<Tensor<T>> {
        let values = self.data.data_vec()?;
        let mut out = Vec::with_capacity(values.len().min(self.mask.len()));
        for (value, keep) in values.iter().zip(&self.mask) {
            out.push(if *keep { *value } else { self.fill_value });
        }
        Tensor::from_storage(TensorStorage::cpu(out), self.data.shape().to_vec(), false)
    }

    /// Same as [`Self::filled`]; named to mirror `torch.Tensor`.
    #[inline]
    pub fn to_tensor(&self) -> FerrotorchResult<Tensor<T>> {
        self.filled()
    }
}

// ---------------------------------------------------------------------------
// ferray-ma bridge
//
// ferray-ma's MaskedArray uses NumPy semantics (mask=true means INVALID).
// We invert at the boundary so internal callers see the torch convention
// (mask=true means VALID).
// ---------------------------------------------------------------------------

impl<T: Float> MaskedTensor<T> {
    /// Convert to a `ferray_ma::MaskedArray<U, IxDyn>` for delegating to
    /// ferray-ma's wider op surface (var/std, masked sort, ufunc support,
    /// etc.). Element type is generic over `U: Float + Element` because
    /// ferray-ma's bound is more restrictive than ferrotorch's `Float`
    /// trait — typical choices are `f32` or `f64`.
    ///
    /// Inverts the mask to match NumPy semantics (`true` = invalid)
    /// since ferrotorch uses the torch convention (`true` = valid).
    ///
    /// `op` is only used as a prefix for error messages.
    ///
    /// # Errors
    ///
    /// * Propagates any error from reading the data (`data_vec`).
    /// * [`FerrotorchError::InvalidArgument`] if an element cannot be
    ///   converted to `U` (previously this panicked), or if ferray-ma rejects
    ///   the data/mask pair.
    /// * [`FerrotorchError::Ferray`] if either ferray array cannot be built.
    pub fn to_ferray<U>(&self, op: &'static str) -> FerrotorchResult<MaskedArray<U, FerrayIxDyn>>
    where
        U: ferray_core::Element + Copy + num_traits::Float + 'static,
    {
        // Route every element through f64; a failed cast becomes an error
        // rather than a panic, since this function already returns a Result.
        let data_u = self
            .data
            .data_vec()?
            .into_iter()
            .map(|v| {
                v.to_f64()
                    .and_then(|f| U::from(f))
                    .ok_or_else(|| FerrotorchError::InvalidArgument {
                        message: format!(
                            "{op}: element is not representable in the target element type"
                        ),
                    })
            })
            .collect::<FerrotorchResult<Vec<U>>>()?;
        let arr =
            FerrayArray::<U, FerrayIxDyn>::from_vec(FerrayIxDyn::new(self.data.shape()), data_u)
                .map_err(FerrotorchError::Ferray)?;
        // Invert mask: ferrotorch true=valid → numpy true=invalid.
        let inv: Vec<bool> = self.mask.iter().map(|&v| !v).collect();
        let mask_arr =
            FerrayArray::<bool, FerrayIxDyn>::from_vec(FerrayIxDyn::new(self.data.shape()), inv)
                .map_err(FerrotorchError::Ferray)?;
        MaskedArray::new(arr, mask_arr).map_err(|e| FerrotorchError::InvalidArgument {
            message: format!("{op}: {e}"),
        })
    }
}

// ---------------------------------------------------------------------------
// Reductions (sum / mean / min / max / count)
// ---------------------------------------------------------------------------

/// Sum of the valid entries of `mt`, returned as a 0-d tensor.
///
/// Masked-out entries are ignored, mirroring
/// `torch.masked.MaskedTensor.sum()`. If nothing is valid the CPU path
/// yields zero (the accumulator's starting value).
///
/// Dispatch:
/// * CUDA data with `f32`/`f64` elements → [`masked_sum_gpu`] (#597).
/// * CUDA data with any other element type →
///   [`FerrotorchError::NotImplementedOnCuda`].
/// * Otherwise → a single host pass over data and mask.
pub fn masked_sum<T: Float>(mt: &MaskedTensor<T>) -> FerrotorchResult<Tensor<T>> {
    if mt.data.is_cuda() {
        return if is_f32::<T>() || is_f64::<T>() {
            masked_sum_gpu(mt)
        } else {
            Err(FerrotorchError::NotImplementedOnCuda { op: "masked_sum" })
        };
    }
    // Host path: accumulate only the entries whose mask bit is set.
    let values = mt.data.data_vec()?;
    let mut total = <T as num_traits::Zero>::zero();
    values
        .iter()
        .zip(&mt.mask)
        .filter(|&(_, &valid)| valid)
        .for_each(|(&v, _)| total += v);
    Tensor::from_storage(TensorStorage::cpu(vec![total]), vec![], false)
}

/// Device-side masked sum: multiply the data by a 0/1 indicator tensor built
/// from the mask, then reduce-sum the product. Returns a 0-d GPU tensor.
///
/// Only reached for `f32`/`f64`; the `f64` backend entry points are used
/// whenever `is_f32::<T>()` is false.
fn masked_sum_gpu<T: Float>(mt: &MaskedTensor<T>) -> FerrotorchResult<Tensor<T>> {
    let indicator: Tensor<T> =
        mask_as_float_tensor(&mt.mask, mt.data.shape(), mt.data.device())?;
    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    let len = mt.data.numel();
    // #1658: a narrowed-offset CUDA data tensor must be normalised to a packed
    // offset-0 buffer before the elementwise mul reads element 0. The
    // indicator is freshly built at offset 0, so only the data needs this.
    let packed = mt.data.contiguous()?;
    let total_h = if is_f32::<T>() {
        let product_h = backend.mul_f32(packed.gpu_handle()?, indicator.gpu_handle()?)?;
        backend.sum_f32(&product_h, len)?
    } else {
        let product_h = backend.mul_f64(packed.gpu_handle()?, indicator.gpu_handle()?)?;
        backend.sum_f64(&product_h, len)?
    };
    Tensor::from_storage(TensorStorage::gpu(total_h), vec![], false)
}

/// Turn a bool mask into a `Tensor<T>` of ones (for `true`) and zeros (for
/// `false`) with the given `shape`.
///
/// The tensor is always assembled on the host first; it is moved to `device`
/// only when `device` is a CUDA device.
fn mask_as_float_tensor<T: Float>(
    mask: &[bool],
    shape: &[usize],
    device: crate::device::Device,
) -> FerrotorchResult<Tensor<T>> {
    let one = T::from(1.0).unwrap();
    let zero = <T as num_traits::Zero>::zero();
    let mut indicator = Vec::with_capacity(mask.len());
    for &valid in mask {
        indicator.push(if valid { one } else { zero });
    }
    let host = Tensor::from_storage(TensorStorage::cpu(indicator), shape.to_vec(), false)?;
    if !device.is_cuda() {
        return Ok(host);
    }
    host.to(device)
}

/// Helper: does `T` have the same size as `f32` (4 bytes)?
///
/// NOTE(review): this is a size test, not a type-identity test — it presumes
/// `f32` is the only 4-byte `Float` implementor; confirm against the trait.
#[inline]
fn is_f32<T: Float>() -> bool {
    core::mem::size_of::<T>() == core::mem::size_of::<f32>()
}

/// Helper: does `T` have the same size as `f64` (8 bytes)?
///
/// NOTE(review): this is a size test, not a type-identity test — it presumes
/// `f64` is the only 8-byte `Float` implementor; confirm against the trait.
#[inline]
fn is_f64<T: Float>() -> bool {
    core::mem::size_of::<T>() == core::mem::size_of::<f64>()
}

/// Mean of the valid entries of `mt`, returned as a 0-d tensor.
///
/// When no entry is valid the result is `NaN`.
///
/// Dispatch:
/// * CUDA data with `f32`/`f64` elements → [`masked_mean_gpu`] (#597).
/// * CUDA data with any other element type →
///   [`FerrotorchError::NotImplementedOnCuda`].
/// * Otherwise → a single host pass accumulating sum and valid count.
pub fn masked_mean<T: Float>(mt: &MaskedTensor<T>) -> FerrotorchResult<Tensor<T>> {
    if mt.data.is_cuda() {
        return if is_f32::<T>() || is_f64::<T>() {
            masked_mean_gpu(mt)
        } else {
            Err(FerrotorchError::NotImplementedOnCuda { op: "masked_mean" })
        };
    }
    let values = mt.data.data_vec()?;
    let mut total = <T as num_traits::Zero>::zero();
    let mut n_valid: usize = 0;
    for (v, valid) in values.iter().zip(&mt.mask) {
        if *valid {
            total += *v;
            n_valid += 1;
        }
    }
    let mean = match n_valid {
        0 => T::from(f64::NAN).unwrap(),
        n => total / T::from(n as f64).unwrap(),
    };
    Tensor::from_storage(TensorStorage::cpu(vec![mean]), vec![], false)
}

/// CUDA path for [`masked_mean`].
///
/// The valid count comes from the host-side mask. If it is zero the result
/// is `NaN` and no GPU work is issued. Otherwise the masked sum is computed
/// via [`masked_sum_gpu`], its single element is copied back to the host, and
/// the division happens there. In both cases the returned 0-d tensor is a CPU
/// tensor.
fn masked_mean_gpu<T: Float>(mt: &MaskedTensor<T>) -> FerrotorchResult<Tensor<T>> {
    let n_valid = mt.count_valid();
    let mean = if n_valid == 0 {
        // All-masked → NaN, without touching the device.
        T::from(f64::NAN).unwrap()
    } else {
        let device_sum = masked_sum_gpu(mt)?;
        // Only the single reduced element is read back.
        let host_sum = device_sum.cpu()?;
        let total = host_sum.data()?[0];
        total / T::from(n_valid as f64).unwrap()
    };
    Tensor::from_storage(TensorStorage::cpu(vec![mean]), vec![], false)
}

/// Minimum over the valid entries of `mt`; a 0-d tensor (NaN if everything
/// is masked).
///
/// CUDA data with `f32`/`f64` elements goes through [`masked_extremum_gpu`]
/// (the backend's `masked_min_*` entry points, #627). Every other case —
/// CPU data, or CUDA data of another dtype — goes through
/// [`masked_extremum_cpu`], which walks the data on the host.
pub fn masked_min<T: Float>(mt: &MaskedTensor<T>) -> FerrotorchResult<Tensor<T>> {
    let on_device = mt.data.is_cuda() && (is_f32::<T>() || is_f64::<T>());
    if on_device {
        masked_extremum_gpu(mt, true)
    } else {
        masked_extremum_cpu(mt, true)
    }
}

/// Maximum over the valid entries of `mt`; a 0-d tensor (NaN if everything
/// is masked).
///
/// Uses the same dispatch as [`masked_min`]: [`masked_extremum_gpu`] for
/// CUDA `f32`/`f64` data, [`masked_extremum_cpu`] otherwise.
pub fn masked_max<T: Float>(mt: &MaskedTensor<T>) -> FerrotorchResult<Tensor<T>> {
    let on_device = mt.data.is_cuda() && (is_f32::<T>() || is_f64::<T>());
    if on_device {
        masked_extremum_gpu(mt, false)
    } else {
        masked_extremum_cpu(mt, false)
    }
}

/// Host implementation of masked min (`pick_min == true`) / max
/// (`pick_min == false`): one pass over data and mask.
///
/// The first valid entry seeds the running best; afterwards a candidate
/// replaces it only when it compares strictly smaller (min) or strictly
/// larger (max). With no valid entry the result is `NaN`. The 0-d result is
/// moved to the data's device when that device is CUDA.
fn masked_extremum_cpu<T: Float>(
    mt: &MaskedTensor<T>,
    pick_min: bool,
) -> FerrotorchResult<Tensor<T>> {
    let device = mt.data.device();
    let values = mt.data.data_vec()?;
    let mut best: Option<T> = None;
    for (&candidate, &valid) in values.iter().zip(&mt.mask) {
        if !valid {
            continue;
        }
        let replace = match best {
            None => true,
            Some(current) => {
                if pick_min {
                    candidate < current
                } else {
                    candidate > current
                }
            }
        };
        if replace {
            best = Some(candidate);
        }
    }
    let result = match best {
        Some(found) => found,
        None => T::from(f64::NAN).unwrap(),
    };
    let host = Tensor::from_storage(TensorStorage::cpu(vec![result]), vec![], false)?;
    if !device.is_cuda() {
        return Ok(host);
    }
    host.to(device)
}

/// GPU lowering via the **fused** masked-reduce kernel (#627).
///
/// Hands `(data, mask_f)` to the backend's `masked_min_*` / `masked_max_*`
/// entry points (`pick_min` selects which), so no intermediate `prod` /
/// `filled` buffers are built here. Only called for `f32`/`f64`; the `f64`
/// entry points are used whenever `is_f32::<T>()` is false.
///
/// The result is always a 0-d tensor on the data's device. When every entry
/// is masked the result is `NaN`; it is built on the host and then moved to
/// the data's device, so the result device does not depend on the mask
/// contents (consistent with [`masked_extremum_cpu`], which also moves its
/// result to the data's device).
fn masked_extremum_gpu<T: Float>(
    mt: &MaskedTensor<T>,
    pick_min: bool,
) -> FerrotorchResult<Tensor<T>> {
    let device = mt.data.device();

    // All-masked → NaN. Skip the reduction, but keep the result on the same
    // device as every other path of this function.
    if mt.count_valid() == 0 {
        let nan = T::from(f64::NAN).unwrap();
        let host = Tensor::from_storage(TensorStorage::cpu(vec![nan]), vec![], false)?;
        return host.to(device);
    }

    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    let numel = mt.data.numel();

    // Build the [0/1] float mask on device. The mask is a boolean Vec on the
    // host side, so it has to be uploaded once per call.
    let mask_t: Tensor<T> = mask_as_float_tensor(&mt.mask, mt.data.shape(), device)?;

    // #1658: normalise a narrowed-offset CUDA data tensor to a packed offset-0
    // buffer before the fused masked reduce reads element 0. The mask tensor is
    // freshly built offset-0, so only the data side needs it.
    let data = mt.data.contiguous()?;
    let result_h = match (pick_min, is_f32::<T>()) {
        (true, true) => backend.masked_min_f32(data.gpu_handle()?, mask_t.gpu_handle()?, numel)?,
        (true, false) => {
            backend.masked_min_f64(data.gpu_handle()?, mask_t.gpu_handle()?, numel)?
        }
        (false, true) => {
            backend.masked_max_f32(data.gpu_handle()?, mask_t.gpu_handle()?, numel)?
        }
        (false, false) => {
            backend.masked_max_f64(data.gpu_handle()?, mask_t.gpu_handle()?, numel)?
        }
    };

    Tensor::from_storage(TensorStorage::gpu(result_h), vec![], false)
}

/// Count of valid (unmasked) entries, converted to `T` and returned as a
/// 0-d CPU tensor.
pub fn masked_count<T: Float>(mt: &MaskedTensor<T>) -> FerrotorchResult<Tensor<T>> {
    let valid = T::from(mt.count_valid() as f64).unwrap();
    Tensor::from_storage(TensorStorage::cpu(vec![valid]), vec![], false)
}

// ---------------------------------------------------------------------------
// Constructors mirroring numpy.ma / torch.masked
// ---------------------------------------------------------------------------

/// Build a [`MaskedTensor`] that hides every position where `condition` is
/// `true` (the `numpy.ma.masked_where` reading of the argument).
///
/// Because the stored mask uses the torch convention (`true` = valid), the
/// stored mask is the element-wise negation of `condition`.
///
/// # Errors
///
/// Returns [`FerrotorchError::ShapeMismatch`] when `condition.len()` differs
/// from `data.numel()`.
pub fn masked_where<T: Float>(
    data: Tensor<T>,
    condition: &[bool],
) -> FerrotorchResult<MaskedTensor<T>> {
    if condition.len() == data.numel() {
        let mut valid = Vec::with_capacity(condition.len());
        valid.extend(condition.iter().map(|hide| !*hide));
        return MaskedTensor::new(data, valid);
    }
    Err(FerrotorchError::ShapeMismatch {
        message: format!(
            "masked_where: condition length {} != data numel {}",
            condition.len(),
            data.numel()
        ),
    })
}

/// Mask out non-finite entries (NaN, ±∞), in the spirit of
/// `numpy.ma.masked_invalid`. The stored mask is `true` exactly where the
/// value is finite.
///
/// Dispatch:
/// * CPU data (any dtype) → host walk testing each value via `to_f64()`.
/// * CUDA data with `f32`/`f64` elements → the predicate is computed by
///   `GpuBackend::isfinite_mask` (#1545) and only the resulting mask is read
///   back through [`predicate_mask_gpu`]; the original `data` is stored in
///   the result unchanged.
/// * CUDA data with any other element type →
///   [`FerrotorchError::NotImplementedOnCuda`].
pub fn masked_invalid<T: Float>(data: Tensor<T>) -> FerrotorchResult<MaskedTensor<T>> {
    if !data.is_cuda() {
        // mask=true means VALID, so finite -> true.
        let finite: Vec<bool> = data
            .data_vec()?
            .iter()
            .map(|v| v.to_f64().unwrap().is_finite())
            .collect();
        return MaskedTensor::new(data, finite);
    }
    if !(is_f32::<T>() || is_f64::<T>()) {
        return Err(FerrotorchError::NotImplementedOnCuda {
            op: "masked_invalid",
        });
    }
    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    // #1658: a narrowed-offset CUDA view is normalised to a packed offset-0
    // buffer before the isfinite predicate reads element 0. The mask comes
    // back in logical (offset-honouring) order, the same order as `data`'s
    // own `data_vec`, which is why the original `data` can be stored as-is.
    let packed = data.contiguous()?;
    let mask_h = backend.isfinite_mask(packed.gpu_handle()?)?;
    let mask = predicate_mask_gpu(backend, &mask_h, data.numel())?;
    MaskedTensor::new(data, mask)
}

/// Mask out entries equal to `value`, in the spirit of
/// `numpy.ma.masked_equal`. The stored mask is `true` exactly where
/// `v != value`.
///
/// Dispatch:
/// * CPU data (any dtype) → host walk comparing each element with `value`.
/// * CUDA data with `f32`/`f64` elements → the `!= value` predicate is
///   computed by `GpuBackend::ne_scalar_mask` (#1545) and only the mask is
///   read back through [`predicate_mask_gpu`].
/// * CUDA data with any other element type →
///   [`FerrotorchError::NotImplementedOnCuda`].
///
/// # Errors
///
/// On the CUDA path, [`FerrotorchError::InvalidArgument`] if `value` cannot
/// be converted to `f64`.
pub fn masked_equal<T: Float + PartialEq>(
    data: Tensor<T>,
    value: T,
) -> FerrotorchResult<MaskedTensor<T>> {
    if !data.is_cuda() {
        let differs: Vec<bool> = data.data_vec()?.iter().map(|&v| v != value).collect();
        return MaskedTensor::new(data, differs);
    }
    if !(is_f32::<T>() || is_f64::<T>()) {
        return Err(FerrotorchError::NotImplementedOnCuda { op: "masked_equal" });
    }
    let backend = crate::gpu_dispatch::gpu_backend().ok_or(FerrotorchError::DeviceUnavailable)?;
    let Some(value_f) = value.to_f64() else {
        return Err(FerrotorchError::InvalidArgument {
            message: "masked_equal: value not representable as f64".into(),
        });
    };
    // #1658: a narrowed-offset CUDA view is normalised to a packed offset-0
    // buffer before the `!= value` predicate reads element 0. The mask comes
    // back in logical order, matching `data`'s `data_vec` order, so the
    // original `data` is stored unchanged.
    let packed = data.contiguous()?;
    let mask_h = backend.ne_scalar_mask(packed.gpu_handle()?, value_f)?;
    let mask = predicate_mask_gpu(backend, &mask_h, data.numel())?;
    MaskedTensor::new(data, mask)
}

/// Copy a device-resident predicate buffer (one byte per element) back to
/// the host and convert it into the `Vec<bool>` that backs a
/// [`MaskedTensor`].
///
/// Only the predicate bytes are transferred here; the value tensor is not
/// touched by this function. Every byte is mapped through `b != 0`, so any
/// nonzero byte becomes `true` rather than an invalid `bool` bit pattern.
///
/// At most `numel` leading bytes are used. Per #1659 the buffer read back
/// may be longer than the tensor's logical element count (the pool rounds
/// allocations up), in which case the tail beyond `numel` is discarded; when
/// the buffer is exactly `numel` long the truncation is a no-op.
fn predicate_mask_gpu(
    backend: &dyn crate::gpu_dispatch::GpuBackend,
    mask_h: &crate::gpu_dispatch::GpuBufferHandle,
    numel: usize,
) -> FerrotorchResult<Vec<bool>> {
    let raw = backend.gpu_to_cpu(mask_h)?;
    let mut mask: Vec<bool> = Vec::new();
    for byte in raw.iter().take(numel) {
        mask.push(*byte != 0);
    }
    Ok(mask)
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::creation::tensor;

    /// Build an `f64` tensor with the given flat data and shape from host
    /// storage.
    fn t(data: &[f64], shape: &[usize]) -> Tensor<f64> {
        Tensor::from_storage(TensorStorage::cpu(data.to_vec()), shape.to_vec(), false).unwrap()
    }

    /// Absolute-tolerance float comparison.
    fn close(a: f64, b: f64, tol: f64) -> bool {
        (a - b).abs() < tol
    }

    // ----- Construction --------------------------------------------------

    #[test]
    fn new_with_matching_mask() {
        let d = t(&[1.0, 2.0, 3.0], &[3]);
        let m = MaskedTensor::new(d, vec![true, false, true]).unwrap();
        assert_eq!(m.shape(), &[3]);
        assert_eq!(m.numel(), 3);
        assert_eq!(m.count_valid(), 2);
        assert_eq!(m.count_masked(), 1);
    }

    #[test]
    fn new_rejects_mask_length_mismatch() {
        let d = t(&[1.0, 2.0, 3.0], &[3]);
        let err = MaskedTensor::new(d, vec![true, false]).unwrap_err();
        assert!(matches!(err, FerrotorchError::ShapeMismatch { .. }));
    }

    #[test]
    fn from_data_marks_all_valid() {
        let d = t(&[1.0, 2.0, 3.0], &[3]);
        let m = MaskedTensor::from_data(d).unwrap();
        assert_eq!(m.count_valid(), 3);
        assert_eq!(m.count_masked(), 0);
    }

    // ----- masked_where (numpy-style) ------------------------------------

    #[test]
    fn masked_where_inverts_condition() {
        // condition=[F, T, F, T] → mask=[T, F, T, F] (i.e. positions 1 and 3
        // are masked OUT in torch convention).
        let d = t(&[10.0, 20.0, 30.0, 40.0], &[4]);
        let mt = masked_where(d, &[false, true, false, true]).unwrap();
        assert_eq!(mt.mask(), &[true, false, true, false]);
        assert_eq!(mt.count_valid(), 2);
    }

    // ----- masked_invalid ------------------------------------------------

    #[test]
    fn masked_invalid_masks_nan() {
        let d = t(&[1.0, f64::NAN, 3.0, f64::INFINITY], &[4]);
        let mt = masked_invalid(d).unwrap();
        // 1.0 finite → valid; NaN → invalid; 3.0 finite → valid; inf → invalid
        assert_eq!(mt.mask(), &[true, false, true, false]);
    }

    // ----- masked_equal --------------------------------------------------

    #[test]
    fn masked_equal_masks_matching() {
        let d = t(&[1.0, 5.0, 5.0, 2.0], &[4]);
        let mt = masked_equal(d, 5.0).unwrap();
        // 5.0 → masked OUT; others → valid.
        assert_eq!(mt.mask(), &[true, false, false, true]);
    }

    // ----- Reductions ----------------------------------------------------

    #[test]
    fn masked_sum_skips_masked_entries() {
        let d = t(&[1.0, 2.0, 3.0, 4.0, 5.0], &[5]);
        // Mask out 2 and 4: valid = 1, 3, 5 → sum 9.
        let mt = MaskedTensor::new(d, vec![true, false, true, false, true]).unwrap();
        let s = masked_sum(&mt).unwrap();
        assert!(close(s.data().unwrap()[0], 9.0, 1e-12));
    }

    #[test]
    fn masked_mean_divides_by_valid_count() {
        let d = t(&[10.0, 0.0, 30.0, 0.0, 50.0], &[5]);
        // valid: 10, 30, 50 → mean 30
        let mt = MaskedTensor::new(d, vec![true, false, true, false, true]).unwrap();
        let r = masked_mean(&mt).unwrap();
        assert!(close(r.data().unwrap()[0], 30.0, 1e-12));
    }

    #[test]
    fn masked_mean_all_masked_returns_nan() {
        let d = t(&[1.0, 2.0, 3.0], &[3]);
        let mt = MaskedTensor::new(d, vec![false, false, false]).unwrap();
        let r = masked_mean(&mt).unwrap();
        assert!(r.data().unwrap()[0].is_nan());
    }

    #[test]
    fn masked_min_max_skip_masked() {
        let d = t(&[5.0, 1.0, 9.0, 2.0], &[4]);
        // Mask out the 9.0 (max) and 1.0 (min) → among valids: 5.0, 2.0
        // min=2.0, max=5.0
        let mt = MaskedTensor::new(d, vec![true, false, false, true]).unwrap();
        assert!(close(
            masked_min(&mt).unwrap().data().unwrap()[0],
            2.0,
            1e-12
        ));
        assert!(close(
            masked_max(&mt).unwrap().data().unwrap()[0],
            5.0,
            1e-12
        ));
    }

    #[test]
    // reason: masked_count returns an integer count cast to float; 3 is
    // exactly representable, so equality (not epsilon) is the right check.
    #[allow(clippy::float_cmp)]
    fn masked_count_returns_valid_count() {
        let d = t(&[1.0, 2.0, 3.0, 4.0], &[4]);
        let mt = MaskedTensor::new(d, vec![true, false, true, true]).unwrap();
        let c = masked_count(&mt).unwrap();
        assert_eq!(c.data().unwrap()[0], 3.0);
    }

    // ----- filled / to_tensor --------------------------------------------

    #[test]
    fn filled_substitutes_default_zero() {
        let d = t(&[1.0, 2.0, 3.0], &[3]);
        let mt = MaskedTensor::new(d, vec![true, false, true]).unwrap();
        let f = mt.filled().unwrap();
        assert_eq!(f.data().unwrap(), &[1.0, 0.0, 3.0]);
    }

    #[test]
    fn filled_uses_fill_value() {
        let d = t(&[1.0, 2.0, 3.0], &[3]);
        let mt = MaskedTensor::new(d, vec![true, false, true])
            .unwrap()
            .with_fill_value(-99.0);
        let f = mt.filled().unwrap();
        assert_eq!(f.data().unwrap(), &[1.0, -99.0, 3.0]);
    }

    #[test]
    fn to_tensor_is_alias_for_filled() {
        let d = t(&[1.0, 2.0, 3.0], &[3]);
        let mt = MaskedTensor::new(d, vec![true, false, true]).unwrap();
        let a = mt.filled().unwrap();
        let b = mt.to_tensor().unwrap();
        assert_eq!(a.data().unwrap(), b.data().unwrap());
    }

    // ----- ferray-ma bridge ----------------------------------------------

    #[test]
    fn to_ferray_round_trip_mean_matches_inhouse() {
        // Cross-check our in-house masked_mean against ferray-ma's
        // MaskedArray::mean() to confirm the mask-inversion bridge is
        // semantically correct.
        let d = t(&[2.0, 4.0, 6.0, 8.0], &[4]);
        let mt = MaskedTensor::new(d, vec![true, false, true, false]).unwrap();
        let inhouse = masked_mean(&mt).unwrap().data().unwrap()[0];
        // Build ferray-ma view via our internal bridge.
        let ferray_ma_view: MaskedArray<f64, FerrayIxDyn> = mt.to_ferray("test").unwrap();
        let ferray_mean = ferray_ma_view.mean().unwrap();
        assert!(close(inhouse, ferray_mean, 1e-12));
        // Sanity: in-house value matches the closed form (2 + 6) / 2 = 4.
        assert!(close(inhouse, 4.0, 1e-12));
    }

    // ----- GPU discipline -------------------------------------------------

    #[test]
    fn constructors_accept_cpu_tensors() {
        // Sanity: every constructor path is reachable for a CPU input.
        let d = tensor(&[1.0_f64, 2.0, 3.0]).unwrap();
        assert!(MaskedTensor::from_data(d.clone()).is_ok());
        assert!(masked_where(d.clone(), &[false, true, false]).is_ok());
        assert!(masked_invalid(d.clone()).is_ok());
        assert!(masked_equal(d, 2.0).is_ok());
    }

    // -------------------------------------------------------------------
    // #616: masked_min/max accept GPU inputs. Only the CPU branch
    // (`masked_extremum_cpu`) is exercised here; CUDA f32/f64 inputs
    // dispatch to the separate `masked_extremum_gpu` path, which these
    // tests do not cover.
    // -------------------------------------------------------------------

    #[test]
    fn masked_min_max_match_cpu_definition() {
        let d = tensor(&[1.0_f64, -3.0, 5.0, 7.0]).unwrap();
        // mask: [valid, masked, valid, masked] -> visible = {1.0, 5.0}
        let mt = MaskedTensor::new(d, vec![true, false, true, false]).unwrap();
        assert_eq!(masked_min(&mt).unwrap().data().unwrap(), &[1.0]);
        assert_eq!(masked_max(&mt).unwrap().data().unwrap(), &[5.0]);
    }

    #[test]
    fn masked_min_max_all_masked_returns_nan() {
        let d = tensor(&[1.0_f64, 2.0]).unwrap();
        let mt = MaskedTensor::new(d, vec![false, false]).unwrap();
        assert!(masked_min(&mt).unwrap().data().unwrap()[0].is_nan());
        assert!(masked_max(&mt).unwrap().data().unwrap()[0].is_nan());
    }
}