use polars::prelude::*;
use std::collections::HashMap;
/// Everything collected for a single planet name while scanning the rows of
/// an exoplanet table (see `build_planet_aggregates`).
#[derive(Default, Debug)]
struct PlanetAggregate {
    /// How many rows reported each discovery-method string.
    discovery_methods: HashMap<String, usize>,
    /// Finite `pl_rade` values seen for the planet.
    radii: Vec<f64>,
    /// Finite, non-negative `pl_orbper` values seen for the planet.
    orbital_periods: Vec<f64>,
    /// `disc_year` values seen for the planet.
    discovery_years: Vec<i32>,
}
/// One bucket of the effective-temperature histogram produced by
/// `temperature_distribution`.
#[derive(Clone, Debug)]
pub struct TemperatureBin {
    /// Human-readable label of the form `"<min>-<max>K"`.
    pub range: String,
    /// Lower edge of the bin.
    pub min_temp: f64,
    /// Upper edge of the bin.
    pub max_temp: f64,
    /// Number of values that fell into the bin.
    pub star_count: u32,
    /// `star_count` expressed as a percentage of the rows in the column.
    pub percentage: f64,
}
/// Per-decade summary row produced by `discovery_timeline`.
#[derive(Clone, Debug)]
pub struct DecadeData {
    /// First year of the decade (the discovery year rounded down to a multiple of 10).
    pub decade: i32,
    /// Number of rows attributed to this decade.
    pub stars_discovered: u32,
    /// Per-method counts; `discovery_timeline` leaves this map empty.
    pub discovery_methods: HashMap<String, u32>,
    /// Median of the temperatures collected for the decade, if any.
    pub median_temp: Option<f64>,
}
/// Catalogue-identifier coverage figures produced by `catalog_crossmatch`.
///
/// Every `*_match_rate` is a percentage of the table's rows.
#[derive(Clone, Debug)]
pub struct CatalogStats {
    /// Number of rows in the analysed table.
    pub total_stars: u32,
    /// Percentage of rows with a non-null `hd_name`.
    pub hd_match_rate: f64,
    /// Percentage of rows with a non-null `hip_name`.
    pub hip_match_rate: f64,
    /// Percentage of rows with a non-null `tic_id`.
    pub tic_match_rate: f64,
    /// Percentage of rows with a non-null `gaia_dr2_id`.
    pub gaia_dr2_match_rate: f64,
    /// Percentage of rows with a non-null `gaia_dr3_id`.
    pub gaia_dr3_match_rate: f64,
    /// Catalogue-by-catalogue overlap counts; `catalog_crossmatch` leaves it empty.
    pub cross_match_matrix: Vec<Vec<u32>>,
}
/// Result of `photometric_statistics`.
#[derive(Clone, Debug)]
pub struct PhotometricStats {
    /// Summary statistics keyed by band label (for example `"V"` or `"Gaia"`).
    pub band_stats: HashMap<String, BandStats>,
    /// Colour-index values keyed by index name (for example `"B-V"`).
    pub color_indices: HashMap<String, f64>,
}
/// Summary statistics for one magnitude column, as built by `compute_band_stats`.
#[derive(Clone, Debug)]
pub struct BandStats {
    /// Identifier of the band; `compute_band_stats` stores the source column name here.
    pub band_name: String,
    /// Number of entries the statistics cover.
    pub count: u32,
    /// Arithmetic mean of the magnitudes.
    pub mean_mag: f64,
    /// Median of the magnitudes.
    pub median_mag: f64,
    /// Standard deviation of the magnitudes (computed with `ddof = 0`).
    pub std_mag: f64,
    /// Smallest magnitude.
    pub min_mag: f64,
    /// Largest magnitude.
    pub max_mag: f64,
}
/// Buckets the `st_teff` column into seven 1000 K wide bins covering
/// 3000 K through 10000 K (both ends inclusive).
///
/// # Returns
///
/// One [`TemperatureBin`] per bin, ordered from coolest to hottest. The
/// vector is empty when `st_teff` is missing or is not a `Float64` series.
///
/// `percentage` is relative to the number of rows in the column (nulls and
/// out-of-range values are part of the denominator) and is `0.0` for an empty
/// column rather than `NaN`.
pub fn temperature_distribution(df: &DataFrame) -> Vec<TemperatureBin> {
    const BIN_COUNT: usize = 7;
    const MIN_TEMP: f64 = 3000.0;
    const BIN_WIDTH: f64 = 1000.0;
    const MAX_TEMP: f64 = MIN_TEMP + BIN_WIDTH * BIN_COUNT as f64;

    let Ok(st_teff_col) = df.column("st_teff") else {
        return Vec::new();
    };
    let Some(st_teff_series) = st_teff_col.as_series() else {
        return Vec::new();
    };
    let Ok(st_teff_data) = st_teff_series.f64() else {
        return Vec::new();
    };

    let total_rows = st_teff_data.len();
    let mut bin_counts = [0u32; BIN_COUNT];
    for temp in st_teff_data.into_iter().flatten() {
        // NaN fails the range check and is skipped.
        if (MIN_TEMP..=MAX_TEMP).contains(&temp) {
            // A value sitting exactly on the upper edge would index one past
            // the end; it belongs to the last bin instead of being dropped.
            let bin_index = (((temp - MIN_TEMP) / BIN_WIDTH) as usize).min(BIN_COUNT - 1);
            bin_counts[bin_index] += 1;
        }
    }

    bin_counts
        .iter()
        .enumerate()
        .map(|(i, &count)| {
            let min_temp = MIN_TEMP + (i as f64) * BIN_WIDTH;
            let max_temp = min_temp + BIN_WIDTH;
            // Guard the division so an empty column reports 0 % instead of NaN.
            let percentage = if total_rows == 0 {
                0.0
            } else {
                (f64::from(count) / total_rows as f64) * 100.0
            };
            TemperatureBin {
                range: format!("{:.0}-{:.0}K", min_temp, max_temp),
                min_temp,
                max_temp,
                star_count: count,
                percentage,
            }
        })
        .collect()
}
pub fn discovery_timeline(df: &DataFrame) -> Vec<DecadeData> {
if let (Ok(disc_year_col), Ok(st_teff_col), Ok(hostname_col)) = (
df.column("disc_year"),
df.column("st_teff"),
df.column("hostname"),
) && let (Some(disc_year_series), Some(st_teff_series)) =
(disc_year_col.as_series(), st_teff_col.as_series())
&& hostname_col.as_series().is_some()
&& let Ok(disc_year_data) = disc_year_series.f64()
&& let Ok(st_teff_data) = st_teff_series.f64()
{
let mut decade_map: HashMap<i32, (u32, Vec<f64>)> = HashMap::new();
for (i, opt_year) in disc_year_data.into_iter().enumerate() {
if let Some(year) = opt_year {
let decade = (year as i32 / 10) * 10;
if i < st_teff_data.len()
&& let Some(temp) = st_teff_data.get(i)
&& temp > 0.0
{
let entry =
decade_map.entry(decade).or_insert((0, Vec::new()));
entry.0 += 1;
entry.1.push(temp);
}
}
}
let mut result = Vec::new();
for (decade, (count, temps)) in decade_map {
let median_temp = if !temps.is_empty() {
let mut sorted_temps = temps;
sorted_temps.sort_by(|a, b| a.partial_cmp(b).unwrap());
let len = sorted_temps.len();
if len % 2 == 0 {
Some(
(sorted_temps[len / 2 - 1] + sorted_temps[len / 2]) / 2.0,
)
} else {
Some(sorted_temps[len / 2])
}
} else {
None
};
result.push(DecadeData {
decade,
stars_discovered: count,
discovery_methods: HashMap::new(), median_temp,
});
}
result.sort_by_key(|d| d.decade);
return result;
}
vec![]
}
/// Reports, for each external catalogue identifier column, the percentage of
/// rows that carry a non-null value.
///
/// A column that is missing (or is not a materialized series) counts as zero
/// matches. For an empty frame every rate is `0.0` rather than `NaN`.
///
/// # Returns
///
/// A [`CatalogStats`] whose `cross_match_matrix` is left empty.
pub fn catalog_crossmatch(df: &DataFrame) -> CatalogStats {
    let total_rows = df.height();

    let match_rate = |col_name: &str| -> f64 {
        // Avoid 0/0 when the table has no rows.
        if total_rows == 0 {
            return 0.0;
        }
        let matched = df
            .column(col_name)
            .ok()
            .and_then(|col| col.as_series())
            .map_or(0, |series| series.len() - series.null_count());
        (matched as f64 / total_rows as f64) * 100.0
    };

    CatalogStats {
        // Saturate instead of silently wrapping on absurdly large tables.
        total_stars: u32::try_from(total_rows).unwrap_or(u32::MAX),
        hd_match_rate: match_rate("hd_name"),
        hip_match_rate: match_rate("hip_name"),
        tic_match_rate: match_rate("tic_id"),
        gaia_dr2_match_rate: match_rate("gaia_dr2_id"),
        gaia_dr3_match_rate: match_rate("gaia_dr3_id"),
        cross_match_matrix: Vec::new(),
    }
}
pub fn photometric_statistics(df: &DataFrame) -> PhotometricStats {
let photometric_bands = vec![
("sy_vmag", "V"),
("sy_bmag", "B"),
("sy_jmag", "J"),
("sy_hmag", "H"),
("sy_kmag", "K"),
("sy_gmag", "G"),
("sy_gaiamag", "Gaia"),
("sy_kepmag", "Kepler"),
];
let mut band_stats = HashMap::new();
for (col_name, band_name) in photometric_bands {
if let Some(stats) = compute_band_stats(df, col_name) {
band_stats.insert(band_name.to_string(), stats);
}
}
let mut color_indices = HashMap::new();
if band_stats.contains_key("B") && band_stats.contains_key("V") {
color_indices.insert("B-V".to_string(), 0.0); }
if band_stats.contains_key("V") && band_stats.contains_key("K") {
color_indices.insert("V-K".to_string(), 0.0); }
PhotometricStats {
band_stats,
color_indices,
}
}
/// Summarises one magnitude column.
///
/// Returns `None` when `column` is missing, is not a `Float64` series, or
/// holds no non-null value. Only non-null entries are counted, so an all-null
/// column no longer yields a record of fabricated zeros.
///
/// `band_name` is set to the column name, and the standard deviation uses
/// `ddof = 0`.
fn compute_band_stats(df: &DataFrame, column: &str) -> Option<BandStats> {
    let magnitudes = df.column(column).ok()?.as_series()?.f64().ok()?;

    let measured = magnitudes.len() - magnitudes.null_count();
    if measured == 0 {
        return None;
    }

    Some(BandStats {
        band_name: column.to_string(),
        count: u32::try_from(measured).unwrap_or(u32::MAX),
        // With at least one non-null value these aggregates are expected to be
        // `Some`; bail out rather than invent a 0.0 if one is not.
        mean_mag: magnitudes.mean()?,
        median_mag: magnitudes.median()?,
        std_mag: magnitudes.std(0).unwrap_or(0.0),
        min_mag: magnitudes.min()?,
        max_mag: magnitudes.max()?,
    })
}
/// Returns `(distinct host names, distinct planet names)`.
///
/// Hosts are counted from the `hostname` column of `stellarhosts_df` and
/// planets from the `pl_name` column of `exoplanets_df`; see
/// `distinct_non_null_count` for how each figure is obtained.
pub fn get_total_counts(
    stellarhosts_df: &DataFrame,
    exoplanets_df: &DataFrame,
) -> (usize, usize) {
    (
        distinct_non_null_count(stellarhosts_df, "hostname"),
        distinct_non_null_count(exoplanets_df, "pl_name"),
    )
}
/// Number of distinct non-null values in `column`.
///
/// Falls back to the frame's row count when the column is missing, is not a
/// materialized series, or the distinct count cannot be computed.
fn distinct_non_null_count(df: &DataFrame, column: &str) -> usize {
    let Ok(col) = df.column(column) else {
        return df.height();
    };
    let Some(series) = col.as_series() else {
        return df.height();
    };
    match series.drop_nulls().n_unique() {
        Ok(distinct) => distinct,
        Err(_) => df.height(),
    }
}
/// Mean of the `st_teff` column.
///
/// Returns `None` when the column is missing, is not `Float64`, or has no
/// value to average.
pub fn get_avg_temperature(df: &DataFrame) -> Option<f64> {
    let temperatures = df.column("st_teff").ok()?.f64().ok()?;
    temperatures.mean()
}
/// Mean of the `sy_dist` column.
///
/// Returns `None` when the column is missing, is not `Float64`, or has no
/// value to average.
pub fn get_avg_distance(df: &DataFrame) -> Option<f64> {
    let distances = df.column("sy_dist").ok()?.f64().ok()?;
    distances.mean()
}
/// Counts planets per discovery method.
///
/// Each distinct planet name contributes once, using the method chosen by
/// `canonical_string` from the methods reported in its rows. The result is
/// ordered by descending count, then by ascending method name, and cut to at
/// most `limit` entries.
pub fn get_discovery_methods(
    df: &DataFrame,
    limit: usize,
) -> Vec<(String, usize)> {
    let planets = build_planet_aggregates(df);

    let mut tally: HashMap<String, usize> = HashMap::new();
    let canonical_methods = planets
        .values()
        .filter_map(|planet| canonical_string(&planet.discovery_methods));
    for method in canonical_methods {
        *tally.entry(method).or_default() += 1;
    }

    let mut ranked: Vec<(String, usize)> = tally.into_iter().collect();
    ranked.sort_by(|(name_a, count_a), (name_b, count_b)| {
        count_b.cmp(count_a).then_with(|| name_a.cmp(name_b))
    });
    ranked.truncate(limit);
    ranked
}
/// Counts planets per size category.
///
/// Each distinct planet name contributes once, classified by the median of
/// its recorded radii; planets without any radius are skipped. The result is
/// ordered by descending count, then by ascending category label.
pub fn get_planet_size_categories(df: &DataFrame) -> Vec<(String, usize)> {
    let planets = build_planet_aggregates(df);

    let mut tally: HashMap<String, usize> = HashMap::new();
    for radius in planets.values().filter_map(|planet| median_f64(&planet.radii)) {
        let label = match radius {
            r if r < 1.0 => "Sub-Earth (< 1 R⊕)",
            r if r < 1.5 => "Earth-like (1-1.5 R⊕)",
            r if r < 2.5 => "Super-Earth (1.5-2.5 R⊕)",
            r if r < 4.0 => "Neptune-like (2.5-4 R⊕)",
            _ => "Jupiter-like (> 4 R⊕)",
        };
        *tally.entry(label.to_string()).or_default() += 1;
    }

    let mut ranked: Vec<(String, usize)> = tally.into_iter().collect();
    ranked.sort_by(|(label_a, count_a), (label_b, count_b)| {
        count_b.cmp(count_a).then_with(|| label_a.cmp(label_b))
    });
    ranked
}
/// Counts planets per discovery year.
///
/// Each distinct planet name contributes once, using the earliest year found
/// in its rows; planets without a year are skipped. The result is ordered by
/// descending count, then by descending year string, and cut to at most
/// `limit` entries.
pub fn get_discovery_year_counts(
    df: &DataFrame,
    limit: usize,
) -> Vec<(String, usize)> {
    let planets = build_planet_aggregates(df);

    let mut tally: HashMap<String, usize> = HashMap::new();
    let earliest_years = planets
        .values()
        .filter_map(|planet| planet.discovery_years.iter().min());
    for year in earliest_years {
        *tally.entry(year.to_string()).or_default() += 1;
    }

    let mut ranked: Vec<(String, usize)> = tally.into_iter().collect();
    ranked.sort_by(|(year_a, count_a), (year_b, count_b)| {
        count_b.cmp(count_a).then_with(|| year_b.cmp(year_a))
    });
    ranked.truncate(limit);
    ranked
}
/// Counts planets per orbital-period bucket.
///
/// Each distinct planet name contributes once, classified by the median of
/// its recorded periods; planets without any period are skipped. The result
/// is ordered by descending count, then by ascending bucket label.
pub fn get_orbital_period_buckets(df: &DataFrame) -> Vec<(String, usize)> {
    let planets = build_planet_aggregates(df);

    let mut tally: HashMap<String, usize> = HashMap::new();
    for period in planets
        .values()
        .filter_map(|planet| median_f64(&planet.orbital_periods))
    {
        let label = match period {
            p if p < 1.0 => "< 1 day",
            p if p < 10.0 => "1-10 days",
            p if p < 100.0 => "10-100 days",
            p if p < 1000.0 => "100-1000 days",
            _ => "> 1000 days",
        };
        *tally.entry(label.to_string()).or_default() += 1;
    }

    let mut ranked: Vec<(String, usize)> = tally.into_iter().collect();
    ranked.sort_by(|(label_a, count_a), (label_b, count_b)| {
        count_b.cmp(count_a).then_with(|| label_a.cmp(label_b))
    });
    ranked
}
/// Collapses the rows of an exoplanet table into one [`PlanetAggregate`] per
/// planet name.
///
/// Rows whose `pl_name` is not a string are skipped, and an empty map is
/// returned when the `pl_name` column is missing. The `discoverymethod`,
/// `pl_rade`, `pl_orbper` and `disc_year` columns are all optional; only
/// finite radii and finite, non-negative periods are kept.
fn build_planet_aggregates(df: &DataFrame) -> HashMap<String, PlanetAggregate> {
    let mut by_planet: HashMap<String, PlanetAggregate> = HashMap::new();

    let Ok(names) = df.column("pl_name") else {
        return by_planet;
    };
    let methods = df.column("discoverymethod").ok();
    let radii = df.column("pl_rade").ok();
    let periods = df.column("pl_orbper").ok();
    let years = df.column("disc_year").ok();

    for row in 0..df.height() {
        let Some(name) = string_value_at(names, row) else {
            continue;
        };
        let planet = by_planet.entry(name).or_default();

        if let Some(method) = methods.and_then(|col| string_value_at(col, row)) {
            *planet.discovery_methods.entry(method).or_default() += 1;
        }

        let radius = radii
            .and_then(|col| float_value_at(col, row))
            .filter(|value| value.is_finite());
        if let Some(radius) = radius {
            planet.radii.push(radius);
        }

        let period = periods
            .and_then(|col| float_value_at(col, row))
            .filter(|value| value.is_finite() && *value >= 0.0);
        if let Some(period) = period {
            planet.orbital_periods.push(period);
        }

        if let Some(year) = years.and_then(|col| year_value_at(col, row)) {
            planet.discovery_years.push(year);
        }
    }

    by_planet
}
/// Reads the cell at `row_idx` as an owned string.
///
/// Returns `None` when the row cannot be read or the cell is not a string
/// value (nulls included).
fn string_value_at(col: &Column, row_idx: usize) -> Option<String> {
    let cell = col.get(row_idx).ok()?;
    match cell {
        AnyValue::String(text) => Some(String::from(text)),
        AnyValue::StringOwned(text) => Some(String::from(text.as_str())),
        _ => None,
    }
}
/// Reads the cell at `row_idx` as an `f64`.
///
/// Accepts 32/64-bit floats and 32/64-bit signed or unsigned integers; any
/// other cell type (nulls included) or an unreadable row gives `None`.
fn float_value_at(col: &Column, row_idx: usize) -> Option<f64> {
    let cell = col.get(row_idx).ok()?;
    let number = match cell {
        AnyValue::Float64(v) => v,
        AnyValue::Float32(v) => f64::from(v),
        AnyValue::Int64(v) => v as f64,
        AnyValue::Int32(v) => f64::from(v),
        AnyValue::UInt64(v) => v as f64,
        AnyValue::UInt32(v) => f64::from(v),
        _ => return None,
    };
    Some(number)
}
/// Reads the cell at `row_idx` as a calendar year.
///
/// Integer cells are accepted when they fit in an `i32`; finite float cells
/// are converted with an `as i32` cast. Everything else (nulls, non-finite
/// floats, other types, unreadable rows) gives `None`.
fn year_value_at(col: &Column, row_idx: usize) -> Option<i32> {
    let cell = col.get(row_idx).ok()?;
    match cell {
        AnyValue::Int32(year) => Some(year),
        AnyValue::Int64(year) => year.try_into().ok(),
        AnyValue::UInt32(year) => year.try_into().ok(),
        AnyValue::UInt64(year) => year.try_into().ok(),
        AnyValue::Float64(year) => year.is_finite().then(|| year as i32),
        AnyValue::Float32(year) => year.is_finite().then(|| year as i32),
        _ => None,
    }
}
/// Picks the representative key of a frequency map: the key with the highest
/// count, with ties going to the lexicographically smallest key.
///
/// Returns `None` for an empty map.
fn canonical_string(values: &HashMap<String, usize>) -> Option<String> {
    // Keys are unique, so this ordering is total and the minimum is the same
    // entry a full sort would put first.
    values
        .iter()
        .min_by(|(key_a, count_a), (key_b, count_b)| {
            count_b.cmp(count_a).then_with(|| key_a.cmp(key_b))
        })
        .map(|(key, _)| key.clone())
}
/// Returns the median of `values`, or `None` when the slice is empty.
///
/// For an even number of values the result is the mean of the two middle
/// ones. Values are ordered with [`f64::total_cmp`], so a NaN in the input is
/// sorted to one end instead of causing a panic.
fn median_f64(values: &[f64]) -> Option<f64> {
    if values.is_empty() {
        return None;
    }
    let mut sorted = values.to_vec();
    sorted.sort_unstable_by(f64::total_cmp);
    let mid = sorted.len() / 2;
    if sorted.len() % 2 == 1 {
        Some(sorted[mid])
    } else {
        Some((sorted[mid - 1] + sorted[mid]) / 2.0)
    }
}
#[cfg(test)]
mod tests {
    use super::{
        get_discovery_methods, get_discovery_year_counts, get_orbital_period_buckets,
        get_planet_size_categories, get_total_counts,
    };
    use polars::df;

    /// Builds the `(label, count)` rows the aggregation functions return.
    fn rows(expected: &[(&str, usize)]) -> Vec<(String, usize)> {
        expected
            .iter()
            .map(|&(label, count)| (label.to_string(), count))
            .collect()
    }

    /// Duplicate and null names must not inflate the totals.
    #[test]
    fn total_counts_use_distinct_host_and_planet_names() {
        let hosts = df!(
            "hostname" => &[Some("HD 1"), Some("HD 1"), Some("HD 2"), None]
        )
        .unwrap();
        let planets = df!(
            "pl_name" => &[Some("Planet A"), Some("Planet A"), Some("Planet B"), None]
        )
        .unwrap();

        let totals = get_total_counts(&hosts, &planets);

        assert_eq!(totals.0, 2);
        assert_eq!(totals.1, 2);
    }

    /// A planet listed several times counts once, under its most frequent method.
    #[test]
    fn discovery_methods_use_one_canonical_method_per_planet() {
        let planets = df!(
            "pl_name" => &[
                "Planet A",
                "Planet A",
                "Planet A",
                "Planet B",
                "Planet B",
                "Planet C",
            ],
            "discoverymethod" => &[
                "Transit",
                "Transit",
                "Radial Velocity",
                "Radial Velocity",
                "Radial Velocity",
                "Imaging",
            ]
        )
        .unwrap();

        let methods = get_discovery_methods(&planets, 10);

        assert_eq!(
            methods,
            rows(&[("Imaging", 1), ("Radial Velocity", 1), ("Transit", 1)])
        );
    }

    /// A planet listed several times is classified once, by its median radius.
    #[test]
    fn planet_size_categories_use_one_canonical_radius_per_planet() {
        let planets = df!(
            "pl_name" => &["Planet A", "Planet A", "Planet B", "Planet B", "Planet C"],
            "pl_rade" => &[1.2, 1.4, 3.1, 3.7, 5.2]
        )
        .unwrap();

        let categories = get_planet_size_categories(&planets);

        assert_eq!(
            categories,
            rows(&[
                ("Earth-like (1-1.5 R⊕)", 1),
                ("Jupiter-like (> 4 R⊕)", 1),
                ("Neptune-like (2.5-4 R⊕)", 1),
            ])
        );
    }

    /// A planet listed several times counts once, under its earliest year.
    #[test]
    fn discovery_years_use_earliest_year_per_planet() {
        let planets = df!(
            "pl_name" => &["Planet A", "Planet A", "Planet B", "Planet C", "Planet C"],
            "disc_year" => &[2016i32, 2018i32, 2014i32, 2021i32, 2020i32]
        )
        .unwrap();

        let years = get_discovery_year_counts(&planets, 10);

        assert_eq!(years, rows(&[("2020", 1), ("2016", 1), ("2014", 1)]));
    }

    /// A planet listed several times is bucketed once, by its median period.
    #[test]
    fn orbital_period_buckets_use_one_canonical_period_per_planet() {
        let planets = df!(
            "pl_name" => &[
                "Planet A",
                "Planet A",
                "Planet B",
                "Planet C",
                "Planet D",
                "Planet E",
            ],
            "pl_orbper" => &[0.8, 0.9, 5.0, 55.0, 500.0, 5000.0]
        )
        .unwrap();

        let buckets = get_orbital_period_buckets(&planets);

        assert_eq!(
            buckets,
            rows(&[
                ("1-10 days", 1),
                ("10-100 days", 1),
                ("100-1000 days", 1),
                ("< 1 day", 1),
                ("> 1000 days", 1),
            ])
        );
    }
}