use scirs2_core::ndarray::ArrayView2;
use scirs2_core::random::{seeded_rng, Rng, RngExt};
use crate::error::{SpatialError, SpatialResult};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanModel {
Bernoulli,
Poisson,
}
#[derive(Debug, Clone)]
pub struct ScanStatisticConfig {
pub model: ScanModel,
pub max_population_fraction: f64,
pub n_monte_carlo: usize,
pub seed: u64,
pub max_secondary_clusters: usize,
}
impl Default for ScanStatisticConfig {
fn default() -> Self {
Self {
model: ScanModel::Poisson,
max_population_fraction: 0.5,
n_monte_carlo: 999,
seed: 12345,
max_secondary_clusters: 5,
}
}
}
#[derive(Debug, Clone)]
pub struct ScanCluster {
pub center_index: usize,
pub radius: f64,
pub llr: f64,
pub p_value: f64,
pub member_indices: Vec<usize>,
pub observed_inside: f64,
pub expected_inside: f64,
}
#[derive(Debug, Clone)]
pub struct ScanResult {
pub primary_cluster: ScanCluster,
pub secondary_clusters: Vec<ScanCluster>,
}
pub fn kulldorff_scan(
coordinates: &ArrayView2<f64>,
cases: &[f64],
population: &[f64],
config: &ScanStatisticConfig,
) -> SpatialResult<ScanResult> {
let n = coordinates.nrows();
if n < 3 {
return Err(SpatialError::ValueError(
"Need at least 3 locations".to_string(),
));
}
if coordinates.ncols() < 2 {
return Err(SpatialError::DimensionError(
"Coordinates must be 2D".to_string(),
));
}
if cases.len() != n || population.len() != n {
return Err(SpatialError::DimensionError(
"cases and population must have length n".to_string(),
));
}
if config.max_population_fraction <= 0.0 || config.max_population_fraction > 1.0 {
return Err(SpatialError::ValueError(
"max_population_fraction must be in (0, 1]".to_string(),
));
}
let total_cases: f64 = cases.iter().sum();
let total_population: f64 = population.iter().sum();
if total_cases <= 0.0 || total_population <= 0.0 {
return Err(SpatialError::ValueError(
"Total cases and population must be positive".to_string(),
));
}
let max_pop = config.max_population_fraction * total_population;
let distances = precompute_distances(coordinates, n);
let sorted_neighbours = build_sorted_neighbours(&distances, n);
let (best_llr, best_center, best_radius, best_members, best_obs, best_exp) = find_best_window(
&sorted_neighbours,
&distances,
cases,
population,
total_cases,
total_population,
max_pop,
config.model,
n,
);
let mc_p = monte_carlo_p_value(
&sorted_neighbours,
&distances,
population,
total_cases,
total_population,
max_pop,
config.model,
best_llr,
config.n_monte_carlo,
config.seed,
n,
);
let primary = ScanCluster {
center_index: best_center,
radius: best_radius,
llr: best_llr,
p_value: mc_p,
member_indices: best_members.clone(),
observed_inside: best_obs,
expected_inside: best_exp,
};
let secondary = find_secondary_clusters(
&sorted_neighbours,
&distances,
cases,
population,
total_cases,
total_population,
max_pop,
config.model,
&best_members,
config.max_secondary_clusters,
n,
);
Ok(ScanResult {
primary_cluster: primary,
secondary_clusters: secondary,
})
}
fn precompute_distances(coordinates: &ArrayView2<f64>, n: usize) -> Vec<Vec<f64>> {
let mut dists = vec![vec![0.0; n]; n];
for i in 0..n {
for j in (i + 1)..n {
let dx = coordinates[[i, 0]] - coordinates[[j, 0]];
let dy = coordinates[[i, 1]] - coordinates[[j, 1]];
let d = (dx * dx + dy * dy).sqrt();
dists[i][j] = d;
dists[j][i] = d;
}
}
dists
}
fn build_sorted_neighbours(distances: &[Vec<f64>], n: usize) -> Vec<Vec<usize>> {
let mut sorted = Vec::with_capacity(n);
for i in 0..n {
let mut neighbours: Vec<usize> = (0..n).collect();
neighbours.sort_by(|&a, &b| {
distances[i][a]
.partial_cmp(&distances[i][b])
.unwrap_or(std::cmp::Ordering::Equal)
});
sorted.push(neighbours);
}
sorted
}
fn compute_llr(
obs_in: f64,
exp_in: f64,
total_cases: f64,
total_expected: f64,
model: ScanModel,
) -> f64 {
let obs_out = total_cases - obs_in;
let exp_out = total_expected - exp_in;
match model {
ScanModel::Poisson => {
if exp_in <= 0.0 || exp_out <= 0.0 || obs_in <= 0.0 {
return 0.0;
}
let rate_in = obs_in / exp_in;
let rate_out = if obs_out > 0.0 && exp_out > 0.0 {
obs_out / exp_out
} else {
0.0
};
if rate_in <= rate_out {
return 0.0;
}
let mut llr = obs_in * (obs_in / exp_in).ln();
if obs_out > 0.0 && exp_out > 0.0 {
llr += obs_out * (obs_out / exp_out).ln();
}
if total_cases > 0.0 && total_expected > 0.0 {
llr -= total_cases * (total_cases / total_expected).ln();
}
llr.max(0.0)
}
ScanModel::Bernoulli => {
if exp_in <= 0.0 || exp_out <= 0.0 {
return 0.0;
}
let p_in = obs_in / exp_in;
let p_out = if exp_out > 0.0 {
obs_out / exp_out
} else {
0.0
};
if p_in <= p_out || p_in <= 0.0 || p_in >= 1.0 {
return 0.0;
}
let p_in_c = p_in.clamp(1e-15, 1.0 - 1e-15);
let p_out_c = p_out.clamp(1e-15, 1.0 - 1e-15);
let p_total = total_cases / total_expected;
let p_total_c = p_total.clamp(1e-15, 1.0 - 1e-15);
let llr = obs_in * (p_in_c / p_total_c).ln()
+ (exp_in - obs_in) * ((1.0 - p_in_c) / (1.0 - p_total_c)).ln()
+ obs_out * (p_out_c / p_total_c).ln()
+ (exp_out - obs_out) * ((1.0 - p_out_c) / (1.0 - p_total_c)).ln();
llr.max(0.0)
}
}
}
#[allow(clippy::too_many_arguments)]
fn find_best_window(
sorted_neighbours: &[Vec<usize>],
distances: &[Vec<f64>],
cases: &[f64],
population: &[f64],
total_cases: f64,
total_population: f64,
max_pop: f64,
model: ScanModel,
n: usize,
) -> (f64, usize, f64, Vec<usize>, f64, f64) {
let mut best_llr = 0.0;
let mut best_center = 0;
let mut best_radius = 0.0;
let mut best_members: Vec<usize> = Vec::new();
let mut best_obs = 0.0;
let mut best_exp = 0.0;
for i in 0..n {
let mut cum_cases = 0.0;
let mut cum_pop = 0.0;
let mut members: Vec<usize> = Vec::new();
for &j in &sorted_neighbours[i] {
cum_cases += cases[j];
cum_pop += population[j];
members.push(j);
if cum_pop > max_pop {
break;
}
let exp_in = match model {
ScanModel::Poisson => cum_pop * total_cases / total_population,
ScanModel::Bernoulli => cum_pop,
};
let total_expected = match model {
ScanModel::Poisson => total_population * total_cases / total_population,
ScanModel::Bernoulli => total_population,
};
let llr = compute_llr(cum_cases, exp_in, total_cases, total_expected, model);
if llr > best_llr {
best_llr = llr;
best_center = i;
best_radius = distances[i][j];
best_members = members.clone();
best_obs = cum_cases;
best_exp = exp_in;
}
}
}
(
best_llr,
best_center,
best_radius,
best_members,
best_obs,
best_exp,
)
}
#[allow(clippy::too_many_arguments)]
fn monte_carlo_p_value(
sorted_neighbours: &[Vec<usize>],
distances: &[Vec<f64>],
population: &[f64],
total_cases: f64,
total_population: f64,
max_pop: f64,
model: ScanModel,
observed_llr: f64,
n_simulations: usize,
seed: u64,
n: usize,
) -> f64 {
let mut rng = seeded_rng(seed);
let mut count_ge = 0usize;
for _sim in 0..n_simulations {
let sim_cases = generate_null_cases(&mut rng, population, total_cases, model, n);
let (sim_best_llr, _, _, _, _, _) = find_best_window(
sorted_neighbours,
distances,
&sim_cases,
population,
total_cases,
total_population,
max_pop,
model,
n,
);
if sim_best_llr >= observed_llr {
count_ge += 1;
}
}
(count_ge as f64 + 1.0) / (n_simulations as f64 + 1.0)
}
fn generate_null_cases<R: Rng + ?Sized>(
rng: &mut R,
population: &[f64],
total_cases: f64,
model: ScanModel,
n: usize,
) -> Vec<f64> {
match model {
ScanModel::Poisson => {
let total_pop: f64 = population.iter().sum();
let mut sim_cases = vec![0.0; n];
let mut remaining = total_cases as usize;
for i in 0..n {
if remaining == 0 {
break;
}
let prob = population[i] / total_pop;
let expected = remaining as f64 * prob;
let allocated = poisson_sample(rng, expected).min(remaining as f64);
sim_cases[i] = allocated;
remaining = remaining.saturating_sub(allocated as usize);
}
while remaining > 0 {
let idx = rng.random_range(0..n);
sim_cases[idx] += 1.0;
remaining -= 1;
}
sim_cases
}
ScanModel::Bernoulli => {
let total_pop: f64 = population.iter().sum();
let p = total_cases / total_pop;
let mut sim_cases = vec![0.0; n];
for i in 0..n {
let trials = population[i] as usize;
let mut count = 0.0;
for _ in 0..trials {
if rng.random::<f64>() < p {
count += 1.0;
}
}
sim_cases[i] = count;
}
sim_cases
}
}
}
fn poisson_sample<R: Rng + ?Sized>(rng: &mut R, lambda: f64) -> f64 {
if lambda <= 0.0 {
return 0.0;
}
let l = (-lambda).exp();
let mut k: f64 = 0.0;
let mut p: f64 = 1.0;
loop {
k += 1.0;
let u: f64 = rng.random::<f64>();
p *= u;
if p < l {
break;
}
}
if k - 1.0 > 0.0 {
k - 1.0
} else {
0.0
}
}
#[allow(clippy::too_many_arguments)]
fn find_secondary_clusters(
sorted_neighbours: &[Vec<usize>],
distances: &[Vec<f64>],
cases: &[f64],
population: &[f64],
total_cases: f64,
total_population: f64,
max_pop: f64,
model: ScanModel,
primary_members: &[usize],
max_secondary: usize,
n: usize,
) -> Vec<ScanCluster> {
let mut candidates: Vec<(f64, usize, f64, Vec<usize>, f64, f64)> = Vec::new();
for i in 0..n {
if primary_members.contains(&i) {
continue;
}
let mut cum_cases = 0.0;
let mut cum_pop = 0.0;
let mut members: Vec<usize> = Vec::new();
for &j in &sorted_neighbours[i] {
if primary_members.contains(&j) {
continue;
}
cum_cases += cases[j];
cum_pop += population[j];
members.push(j);
if cum_pop > max_pop {
break;
}
let exp_in = match model {
ScanModel::Poisson => cum_pop * total_cases / total_population,
ScanModel::Bernoulli => cum_pop,
};
let total_expected = match model {
ScanModel::Poisson => total_cases,
ScanModel::Bernoulli => total_population,
};
let llr = compute_llr(cum_cases, exp_in, total_cases, total_expected, model);
if llr > 0.0 {
candidates.push((llr, i, distances[i][j], members.clone(), cum_cases, exp_in));
}
}
}
candidates.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
let mut result = Vec::new();
let mut used_indices: Vec<usize> = primary_members.to_vec();
for (llr, center, radius, members, obs, exp) in candidates {
if result.len() >= max_secondary {
break;
}
let overlaps = members.iter().any(|m| used_indices.contains(m));
if overlaps {
continue;
}
used_indices.extend_from_slice(&members);
result.push(ScanCluster {
center_index: center,
radius,
llr,
p_value: 1.0, member_indices: members,
observed_inside: obs,
expected_inside: exp,
});
}
result
}
#[cfg(test)]
mod tests {
use super::*;
use scirs2_core::ndarray::array;
#[test]
fn test_planted_poisson_cluster() {
let coords = array![
[0.0, 0.0],
[1.0, 0.0],
[0.0, 1.0],
[1.0, 1.0], [5.0, 0.0],
[6.0, 0.0],
[5.0, 1.0],
[6.0, 1.0],
[3.0, 3.0],
[4.0, 4.0],
];
let cases = [20.0, 18.0, 22.0, 19.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0];
let population = [100.0; 10];
let config = ScanStatisticConfig {
model: ScanModel::Poisson,
max_population_fraction: 0.5,
n_monte_carlo: 99,
seed: 42,
max_secondary_clusters: 3,
};
let result =
kulldorff_scan(&coords.view(), &cases, &population, &config).expect("scan failed");
assert!(
result.primary_cluster.llr > 0.0,
"Primary cluster LLR should be positive"
);
assert!(
result.primary_cluster.center_index < 4,
"Primary cluster should be centred in the high-rate area, got index {}",
result.primary_cluster.center_index
);
assert!(
result.primary_cluster.p_value < 0.5,
"p-value should be < 0.5, got {}",
result.primary_cluster.p_value
);
}
#[test]
fn test_planted_bernoulli_cluster() {
let coords = array![
[0.0, 0.0],
[0.5, 0.0],
[0.0, 0.5],
[5.0, 5.0],
[5.5, 5.0],
[5.0, 5.5],
];
let cases = [9.0, 8.0, 10.0, 1.0, 2.0, 1.0];
let population = [10.0, 10.0, 10.0, 10.0, 10.0, 10.0];
let config = ScanStatisticConfig {
model: ScanModel::Bernoulli,
max_population_fraction: 0.5,
n_monte_carlo: 99,
seed: 123,
max_secondary_clusters: 2,
};
let result =
kulldorff_scan(&coords.view(), &cases, &population, &config).expect("bernoulli scan");
assert!(result.primary_cluster.llr > 0.0);
let centre = result.primary_cluster.center_index;
assert!(
centre < 3,
"Expected cluster centre in [0,3), got {}",
centre
);
}
#[test]
fn test_no_cluster_uniform() {
let coords = array![
[0.0, 0.0],
[1.0, 0.0],
[2.0, 0.0],
[0.0, 1.0],
[1.0, 1.0],
[2.0, 1.0],
];
let cases = [5.0, 5.0, 5.0, 5.0, 5.0, 5.0];
let population = [100.0; 6];
let config = ScanStatisticConfig {
model: ScanModel::Poisson,
n_monte_carlo: 99,
seed: 77,
..Default::default()
};
let result =
kulldorff_scan(&coords.view(), &cases, &population, &config).expect("uniform scan");
assert!(
result.primary_cluster.p_value > 0.05,
"p-value should be > 0.05 for uniform data, got {}",
result.primary_cluster.p_value
);
}
#[test]
fn test_scan_errors() {
let coords = array![[0.0, 0.0], [1.0, 0.0]]; let cases = [1.0, 1.0];
let population = [10.0, 10.0];
let config = ScanStatisticConfig::default();
assert!(kulldorff_scan(&coords.view(), &cases, &population, &config).is_err());
}
#[test]
fn test_scan_secondary_clusters() {
let coords = array![
[0.0, 0.0],
[0.1, 0.0],
[0.0, 0.1],
[10.0, 10.0],
[10.1, 10.0],
[10.0, 10.1],
[5.0, 5.0], [5.1, 5.0],
[5.0, 5.1],
];
let cases = [15.0, 14.0, 16.0, 12.0, 13.0, 14.0, 1.0, 1.0, 1.0];
let population = [20.0; 9];
let config = ScanStatisticConfig {
model: ScanModel::Poisson,
n_monte_carlo: 49,
seed: 999,
max_secondary_clusters: 3,
..Default::default()
};
let result =
kulldorff_scan(&coords.view(), &cases, &population, &config).expect("secondary scan");
assert!(result.primary_cluster.llr > 0.0);
}
}