use std::num::NonZeroUsize;
use crate::Real;
use crate::distance::haversine_distance;
use crate::error::KrigingError;
use crate::geo_dataset::GeoDataset;
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct PositiveReal(Real);
impl PositiveReal {
pub fn try_new(x: Real) -> Result<Self, KrigingError> {
if x > 0.0 && x.is_finite() {
Ok(Self(x))
} else {
Err(KrigingError::FittingError(
"value must be finite and positive".to_string(),
))
}
}
#[inline]
pub fn get(self) -> Real {
self.0
}
}
impl std::ops::Deref for PositiveReal {
type Target = Real;
#[inline]
fn deref(&self) -> &Real {
&self.0
}
}
#[derive(Debug, Clone)]
pub struct EmpiricalVariogram {
pub distances: Vec<Real>,
pub semivariances: Vec<Real>,
pub n_pairs: Vec<usize>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum EmpiricalEstimator {
#[default]
Classical,
CressieHawkins,
}
#[derive(Debug, Clone)]
pub struct VariogramConfig {
pub max_distance: Option<PositiveReal>,
pub n_bins: NonZeroUsize,
pub estimator: EmpiricalEstimator,
}
impl Default for VariogramConfig {
fn default() -> Self {
Self {
max_distance: None,
n_bins: NonZeroUsize::new(12).expect("12 != 0"),
estimator: EmpiricalEstimator::Classical,
}
}
}
pub fn compute_empirical_variogram(
dataset: &GeoDataset,
config: &VariogramConfig,
) -> Result<EmpiricalVariogram, KrigingError> {
let coords = dataset.coords();
let values = dataset.values();
let n = coords.len();
let n_bins = config.n_bins.get();
let mut dist_sums = vec![0.0 as Real; n_bins];
let mut value_sums = vec![0.0 as Real; n_bins];
let mut counts = vec![0usize; n_bins];
let robust = matches!(config.estimator, EmpiricalEstimator::CressieHawkins);
let accumulate_pair = |values_i: Real,
values_j: Real,
bin: usize,
d: Real,
dist_sums: &mut [Real],
value_sums: &mut [Real],
counts: &mut [usize]| {
let dz = (values_i - values_j).abs();
let g = if robust { dz.sqrt() } else { 0.5 * dz * dz };
dist_sums[bin] += d;
value_sums[bin] += g;
counts[bin] += 1;
};
if let Some(max_dist) = config.max_distance {
let max_dist = max_dist.get();
let bin_width = max_dist / n_bins as Real;
for i in 0..n {
for j in (i + 1)..n {
let d = haversine_distance(coords[i], coords[j]);
if d > max_dist {
continue;
}
let mut bin = (d / bin_width).floor() as usize;
if bin >= n_bins {
bin = n_bins - 1;
}
accumulate_pair(
values[i],
values[j],
bin,
d,
&mut dist_sums,
&mut value_sums,
&mut counts,
);
}
}
} else {
let mut max_observed: Real = 0.0;
for i in 0..n {
for j in (i + 1)..n {
let d = haversine_distance(coords[i], coords[j]);
if d > max_observed {
max_observed = d;
}
}
}
if max_observed <= 0.0 {
return Err(KrigingError::FittingError(
"max distance must be positive".to_string(),
));
}
let bin_width = max_observed / n_bins as Real;
for i in 0..n {
for j in (i + 1)..n {
let d = haversine_distance(coords[i], coords[j]);
let mut bin = (d / bin_width).floor() as usize;
if bin >= n_bins {
bin = n_bins - 1;
}
accumulate_pair(
values[i],
values[j],
bin,
d,
&mut dist_sums,
&mut value_sums,
&mut counts,
);
}
}
}
let mut distances = Vec::new();
let mut semivariances = Vec::new();
let mut n_pairs = Vec::new();
for i in 0..n_bins {
if counts[i] == 0 {
continue;
}
let n_i = counts[i] as Real;
let g = if robust {
let mean_sqrt = value_sums[i] / n_i;
let numer = mean_sqrt.powi(4);
let denom = 0.457 + 0.494 / n_i + 0.045 / (n_i * n_i);
0.5 * numer / denom
} else {
value_sums[i] / n_i
};
distances.push(dist_sums[i] / n_i);
semivariances.push(g);
n_pairs.push(counts[i]);
}
if distances.is_empty() {
return Err(KrigingError::FittingError(
"no pairs in selected distance range".to_string(),
));
}
Ok(EmpiricalVariogram {
distances,
semivariances,
n_pairs,
})
}
pub fn compute_empirical_variogram_binomial_calibrated(
dataset: &GeoDataset,
per_site_logit_variance: &[Real],
config: &VariogramConfig,
rel_weight_eps: Real,
) -> Result<EmpiricalVariogram, KrigingError> {
if !matches!(config.estimator, EmpiricalEstimator::Classical) {
return Err(KrigingError::FittingError(
"binomial_calibrated variogram: use Estimator::Classical only; Cressie-Hawkins is not implemented for this path".to_string(),
));
}
let n = dataset.len();
if per_site_logit_variance.len() != n {
return Err(KrigingError::FittingError(
"per_site_logitVariance must have same length as dataset values".to_string(),
));
}
for &v in per_site_logit_variance {
if !v.is_finite() || v < 0.0 {
return Err(KrigingError::FittingError(
"per-site logit variance must be finite and non-negative".to_string(),
));
}
}
let coords = dataset.coords();
let values = dataset.values();
let n_bins = config.n_bins.get();
let mut dist_weighted: Vec<Real> = vec![0.0; n_bins];
let mut gamma_weighted: Vec<Real> = vec![0.0; n_bins];
let mut weight_sums: Vec<Real> = vec![0.0; n_bins];
let mut counts: Vec<usize> = vec![0usize; n_bins];
let (distances, semivariances, n_pairs) = {
if let Some(max_dist) = config.max_distance {
let max_d = max_dist.get();
let bin_width = max_d / n_bins as Real;
for i in 0..n {
for j in (i + 1)..n {
let d = haversine_distance(coords[i], coords[j]);
if d > max_d {
continue;
}
let mut bin = (d / bin_width).floor() as usize;
if bin >= n_bins {
bin = n_bins - 1;
}
let vi = per_site_logit_variance[i];
let vj = per_site_logit_variance[j];
let w = 1.0 / (0.5 * (vi + vj) + rel_weight_eps);
if !w.is_finite() {
continue;
}
let half_sq = 0.5 * (values[i] - values[j]) * (values[i] - values[j]);
let half_noise = 0.5 * (vi + vj);
let g = (half_sq - half_noise).max(0.0);
dist_weighted[bin] += w * d;
gamma_weighted[bin] += w * g;
weight_sums[bin] += w;
counts[bin] += 1;
}
}
} else {
let mut max_observed: Real = 0.0;
for i in 0..n {
for j in (i + 1)..n {
let d = haversine_distance(coords[i], coords[j]);
if d > max_observed {
max_observed = d;
}
}
}
if max_observed <= 0.0 {
return Err(KrigingError::FittingError(
"max distance must be positive".to_string(),
));
}
let bin_width = max_observed / n_bins as Real;
for i in 0..n {
for j in (i + 1)..n {
let d = haversine_distance(coords[i], coords[j]);
let mut bin = (d / bin_width).floor() as usize;
if bin >= n_bins {
bin = n_bins - 1;
}
let vi = per_site_logit_variance[i];
let vj = per_site_logit_variance[j];
let w = 1.0 / (0.5 * (vi + vj) + rel_weight_eps);
if !w.is_finite() {
continue;
}
let half_sq = 0.5 * (values[i] - values[j]) * (values[i] - values[j]);
let half_noise = 0.5 * (vi + vj);
let g = (half_sq - half_noise).max(0.0);
dist_weighted[bin] += w * d;
gamma_weighted[bin] += w * g;
weight_sums[bin] += w;
counts[bin] += 1;
}
}
}
let mut distances = Vec::new();
let mut semivariances = Vec::new();
let mut n_pairs = Vec::new();
for i in 0..n_bins {
if counts[i] == 0 || weight_sums[i] <= 0.0 {
continue;
}
let wsum = weight_sums[i];
distances.push(dist_weighted[i] / wsum);
semivariances.push(gamma_weighted[i] / wsum);
n_pairs.push(counts[i]);
}
(distances, semivariances, n_pairs)
};
if distances.is_empty() {
return Err(KrigingError::FittingError(
"no pairs in selected distance range (binomial calibrated variogram)".to_string(),
));
}
Ok(EmpiricalVariogram {
distances,
semivariances,
n_pairs,
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::distance::GeoCoord;
#[test]
fn empirical_variogram_has_non_empty_bins() {
let coords = vec![
GeoCoord::try_new(0.0, 0.0).unwrap(),
GeoCoord::try_new(0.0, 1.0).unwrap(),
GeoCoord::try_new(1.0, 0.0).unwrap(),
GeoCoord::try_new(1.0, 1.0).unwrap(),
];
let values = vec![1.0, 2.0, 2.0, 3.0];
let dataset = GeoDataset::new(coords, values).unwrap();
let out = compute_empirical_variogram(
&dataset,
&VariogramConfig {
max_distance: None,
n_bins: NonZeroUsize::new(6).unwrap(),
estimator: EmpiricalEstimator::Classical,
},
)
.expect("empirical variogram should compute");
assert!(!out.distances.is_empty());
assert_eq!(out.distances.len(), out.semivariances.len());
assert_eq!(out.distances.len(), out.n_pairs.len());
}
#[test]
fn cressie_hawkins_is_robust_to_a_single_outlier() {
let mut coords: Vec<GeoCoord> = Vec::new();
let mut values: Vec<Real> = Vec::new();
for i in 0..6 {
for j in 0..6 {
coords.push(GeoCoord::try_new(i as Real * 0.1, j as Real * 0.1).unwrap());
values.push((i + j) as Real); }
}
*values.last_mut().unwrap() = 1000.0;
let dataset = GeoDataset::new(coords, values).unwrap();
let classical = compute_empirical_variogram(
&dataset,
&VariogramConfig {
estimator: EmpiricalEstimator::Classical,
n_bins: NonZeroUsize::new(5).unwrap(),
..Default::default()
},
)
.expect("classical");
let robust = compute_empirical_variogram(
&dataset,
&VariogramConfig {
estimator: EmpiricalEstimator::CressieHawkins,
n_bins: NonZeroUsize::new(5).unwrap(),
..Default::default()
},
)
.expect("robust");
assert_eq!(classical.semivariances.len(), robust.semivariances.len());
let max_classical = classical
.semivariances
.iter()
.copied()
.fold(0.0 as Real, Real::max);
let max_robust = robust
.semivariances
.iter()
.copied()
.fold(0.0 as Real, Real::max);
assert!(
max_robust < max_classical * 0.5,
"robust max={max_robust} should be much smaller than classical max={max_classical}"
);
}
#[test]
fn empirical_variogram_preserves_pair_accounting_with_fixed_max_distance() {
let coords = vec![
GeoCoord::try_new(0.0, 0.0).unwrap(),
GeoCoord::try_new(0.0, 0.5).unwrap(),
GeoCoord::try_new(0.5, 0.0).unwrap(),
GeoCoord::try_new(0.5, 0.5).unwrap(),
GeoCoord::try_new(1.0, 1.0).unwrap(),
];
let values = vec![1.0, 1.5, 2.0, 2.5, 3.0];
let max_distance = PositiveReal::try_new(500.0).unwrap();
let dataset = GeoDataset::new(coords, values).unwrap();
let out = compute_empirical_variogram(
&dataset,
&VariogramConfig {
max_distance: Some(max_distance),
n_bins: NonZeroUsize::new(8).unwrap(),
estimator: EmpiricalEstimator::Classical,
},
)
.expect("empirical variogram should compute");
let mut expected_pair_count = 0usize;
let coords = dataset.coords();
for i in 0..coords.len() {
for j in (i + 1)..coords.len() {
if haversine_distance(coords[i], coords[j]) <= max_distance.get() {
expected_pair_count += 1;
}
}
}
let observed_pair_count = out.n_pairs.iter().sum::<usize>();
assert_eq!(observed_pair_count, expected_pair_count);
assert_eq!(out.distances.len(), out.semivariances.len());
assert_eq!(out.distances.len(), out.n_pairs.len());
assert!(
out.distances
.iter()
.all(|d| *d >= 0.0 && *d <= max_distance.get())
);
assert!(out.semivariances.iter().all(|g| g.is_finite() && *g >= 0.0));
}
}