use calamine::{Data, Reader, open_workbook_auto};
use polars::prelude::*;
use spreadsheet_ods::{Value, read_ods};
use std::fs;
use std::path::Path;
/// Description of one table axis, built from an `<AxisDef>` XML element by
/// `create_axis_def`.
// NOTE(review): the fields are only written, never read, within this file, which is
// presumably why the dead-code lint is silenced — confirm against the rest of the crate.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct AxisDef {
/// Text of the `<ScaleType>` child; empty string when the element is absent.
pub scale_type: String,
/// Text of the `<AxisName>` child; empty string when the element is absent.
pub axis_name: String,
/// `<MinScaleValue>` parsed as `u32`; 0 when missing or unparsable.
pub min_scale_value: u32,
/// `<MaxScaleValue>` parsed as `u32`; 0 when missing or unparsable.
pub max_scale_value: u32,
/// `<Increment>` parsed as `u32`; 1 when missing or unparsable.
pub increment: u32,
}
/// Per-table metadata, built from a `<MetaData>` XML element by `create_meta_data`
/// or filled with fixed placeholder values by `MortXML::from_df`.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct MetaData {
/// `<ScalingFactor>` parsed as `f64`; 1.0 when missing or unparsable.
pub scaling_factor: f64,
/// Text of `<DataType>`; empty string when absent in the XML.
pub data_type: String,
/// Text of `<Nation>`; empty string when absent in the XML.
pub nation: String,
/// Text of `<TableDescription>`; empty string when absent in the XML.
pub table_description: String,
/// One entry per `<AxisDef>` element found below `<MetaData>`.
pub axis_defs: Vec<AxisDef>,
}
/// A single table: its metadata plus the tabulated values as a polars frame.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct Table {
/// Metadata describing the table.
pub meta_data: MetaData,
/// Tabulated values. Frames built by this file use the columns `age`, `qx` or `lx`,
/// and optionally `duration`.
pub values: DataFrame,
}
/// Identification and provenance of a table, built from a `<ContentClassification>`
/// XML element by `create_content_classification` or filled with fixed placeholder
/// values by `MortXML::from_df`.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct ContentClassification {
/// `<TableIdentity>` parsed as `i32` (mandatory in XML); 0 for frames from `from_df`.
pub table_identity: i32,
/// Text of `<ProviderDomain>`; empty string when absent in the XML.
pub provider_domain: String,
/// Text of `<ProviderName>`; empty string when absent in the XML.
pub provider_name: String,
/// Text of `<TableReference>`; empty string when absent in the XML.
pub table_reference: String,
/// Text of `<ContentType>`; empty string when absent in the XML.
pub content_type: String,
/// Text of `<TableName>`; empty string when absent in the XML.
pub table_name: String,
/// Text of `<TableDescription>`; empty string when absent in the XML.
pub table_description: String,
/// Text of `<Comments>`; empty string when absent in the XML.
pub comments: String,
/// Text of every `<KeyWord>` element that has text, in document order.
pub key_words: Vec<String>,
}
/// A loaded mortality table document: one content classification plus its tables.
///
/// Every constructor in this file yields exactly one entry in `tables`
/// (`from_string` rejects any other count; `from_df` wraps a single frame).
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct MortXML {
/// Identification and provenance of the document.
pub content_classification: ContentClassification,
/// The tables contained in the document.
pub tables: Vec<Table>,
}
impl MortXML {
/// Parses an in-memory XML document into a [`MortXML`].
///
/// # Errors
/// Fails when the text is not well-formed XML, when the content classification
/// or a table cannot be built, or when the document does not contain exactly
/// one `Table` element.
pub fn from_string(xml_str: &str) -> Result<Self, Box<dyn std::error::Error>> {
    let document = roxmltree::Document::parse(xml_str)?;
    let root_element = document.root_element();
    let content_classification = create_content_classification(&root_element)?;
    let tables = create_tables(&root_element)?;
    match tables.len() {
        1 => Ok(MortXML {
            content_classification,
            tables,
        }),
        _ => Err("MortXML must contain exactly one table".into()),
    }
}
/// Loads the table with the given id from the relative path `data/t{id}.xml`.
///
/// # Errors
/// Fails when the file cannot be read or its content is rejected by
/// [`MortXML::from_string`].
pub fn from_id(id: i32) -> Result<Self, Box<dyn std::error::Error>> {
    let xml_text = fs::read_to_string(format!("data/t{id}.xml"))?;
    Self::from_string(&xml_text)
}
/// Loads a table from the XML file at `path`.
///
/// # Errors
/// Fails when the file cannot be read or its content is rejected by
/// [`MortXML::from_string`].
pub fn from_path(path: &Path) -> Result<Self, Box<dyn std::error::Error>> {
    let xml_text = fs::read_to_string(path)?;
    let parsed = Self::from_string(&xml_text)?;
    Ok(parsed)
}
/// Downloads an XML document with a blocking HTTP GET and parses it.
///
/// # Errors
/// Fails when the request or reading the response body fails, or when the body
/// is rejected by [`MortXML::from_string`].
pub fn from_url(url: &str) -> Result<Self, Box<dyn std::error::Error>> {
    let response = reqwest::blocking::get(url)?;
    let body = response.text()?;
    Self::from_string(&body)
}
/// Downloads table `id` from `https://mort.soa.org/data/t{id}.xml` and parses it.
///
/// # Errors
/// Propagates every error of [`MortXML::from_url`].
pub fn from_url_id(id: i32) -> Result<Self, Box<dyn std::error::Error>> {
    Self::from_url(&format!("https://mort.soa.org/data/t{id}.xml"))
}
/// Wraps an already-built frame in a [`MortXML`] with placeholder metadata.
///
/// The frame is validated by `_validate_df_schema` first; classification and
/// metadata are filled with fixed "local" values and no axis definitions.
///
/// # Errors
/// Fails when the frame does not satisfy the schema checks.
pub fn from_df(df: DataFrame) -> Result<Self, Box<dyn std::error::Error>> {
    Self::_validate_df_schema(&df)?;
    Ok(MortXML {
        content_classification: ContentClassification {
            table_identity: 0,
            provider_domain: String::from("local"),
            provider_name: String::from("Local DataFrame"),
            table_reference: String::from("DataFrame Table"),
            content_type: String::from("Mortality/Life table"),
            table_name: String::from("DataFrame Table"),
            table_description: String::from("Table created from DataFrame"),
            comments: String::from("No comments"),
            key_words: Vec::new(),
        },
        tables: vec![Table {
            meta_data: MetaData {
                scaling_factor: 1.0,
                data_type: String::from("Mortality Rate"),
                nation: String::from("Local"),
                table_description: String::from("Table created from DataFrame"),
                axis_defs: Vec::new(),
            },
            values: df,
        }],
    })
}
/// Builds a [`MortXML`] from one worksheet of a spreadsheet file.
///
/// The first row supplies the column names (trimmed and lower-cased). Columns
/// named `age` or `duration` are read as `u32`, every other column as `f64`.
/// The assembled frame is then passed to [`MortXML::from_df`].
///
/// # Errors
/// Fails when the workbook or sheet cannot be opened, the sheet has fewer than
/// two rows, a header or data cell is invalid, or the frame is rejected by
/// `from_df`.
pub fn from_xlsx(
    xlsx_file: &str,
    sheet_name: &str,
) -> Result<Self, Box<dyn std::error::Error>> {
    let mut workbook = open_workbook_auto(xlsx_file)
        .map_err(|e| format!("Failed to open XLSX file '{xlsx_file}': {e}"))?;
    let range = workbook
        .worksheet_range(sheet_name)
        .map_err(|e| format!("Failed to read sheet '{sheet_name}': {e}"))?;
    if range.is_empty() {
        return Err(format!("Sheet '{sheet_name}' is empty").into());
    }

    let all_rows: Vec<_> = range.rows().collect();
    if all_rows.len() < 2 {
        return Err("Sheet must contain at least a header row and one data row".into());
    }
    let (header, body) = (all_rows[0], &all_rows[1..]);
    if header.is_empty() {
        return Err("Header row is empty".into());
    }

    let column_names = header
        .iter()
        .enumerate()
        .map(|(idx, cell)| {
            Self::extract_xlsx_header_name(Some(cell), &format!("Column {}", idx + 1))
        })
        .collect::<Result<Vec<String>, _>>()?;

    let mut column_data: Vec<Vec<AnyValue>> =
        column_names.iter().map(|_| Vec::new()).collect();
    for (offset, row) in body.iter().enumerate() {
        // The header occupies sheet row 1, so the first data row is reported as row 2.
        let row_num = offset + 2;
        let cells = row.iter().zip(&column_names).zip(column_data.iter_mut());
        for ((cell, col_name), bucket) in cells {
            let parsed = match col_name.as_str() {
                "age" | "duration" => {
                    AnyValue::UInt32(Self::_parse_xlsx_u32_cell(Some(cell), row_num, col_name)?)
                }
                _ => AnyValue::Float64(Self::_parse_xlsx_f64_cell(Some(cell), row_num, col_name)?),
            };
            bucket.push(parsed);
        }
    }
    if column_data.first().map_or(true, |first| first.is_empty()) {
        return Err("No data rows found in sheet".into());
    }

    let mut columns = Vec::with_capacity(column_names.len());
    for (col_name, data) in column_names.iter().zip(&column_data) {
        let series = Series::from_any_values(col_name.as_str().into(), data, true)
            .map_err(|e| format!("Failed to create series for column '{col_name}': {e}"))?;
        columns.push(series.into_column());
    }
    let frame =
        DataFrame::new(columns).map_err(|e| format!("Failed to create DataFrame: {e}"))?;
    MortXML::from_df(frame)
}
/// Builds a [`MortXML`] from one sheet of an OpenDocument spreadsheet.
///
/// The first row supplies the column names (trimmed and lower-cased). Columns
/// named `age` or `duration` are read as `u32`, every other column as `f64`.
/// The assembled frame is then passed to [`MortXML::from_df`].
///
/// # Errors
/// Fails when the file cannot be read, the sheet does not exist, the sheet has
/// no data rows, a header or data cell is invalid, or the frame is rejected by
/// `from_df`.
pub fn from_ods(ods_file: &str, sheet_name: &str) -> Result<Self, Box<dyn std::error::Error>> {
    let workbook =
        read_ods(ods_file).map_err(|e| format!("Failed to open ODS file '{ods_file}': {e}"))?;
    let sheet = (0..workbook.num_sheets())
        .map(|i| workbook.sheet(i))
        .find(|candidate| candidate.name() == sheet_name)
        .ok_or_else(|| format!("Sheet '{sheet_name}' not found in ODS file"))?;

    // `used_grid_size` reports sizes (largest used index + 1), not the largest
    // indices, so both ranges below are exclusive. Iterating inclusively would
    // read one empty column and one empty row past the data and fail on them.
    let (row_count, col_count) = sheet.used_grid_size();
    if row_count < 2 {
        // Nothing below the header row.
        return Err(format!("Sheet '{sheet_name}' is empty").into());
    }

    let mut column_names = Vec::new();
    for col in 0..col_count {
        let col_name =
            Self::_extract_ods_header_name(sheet.value(0, col), &format!("Column {}", col + 1))?;
        column_names.push(col_name);
    }

    let mut column_data: Vec<Vec<AnyValue>> = vec![Vec::new(); column_names.len()];
    for row in 1..row_count {
        // Rows are reported 1-based, matching what a spreadsheet UI shows.
        let row_num = (row + 1) as usize;
        let cells = (0..col_count).zip(&column_names).zip(column_data.iter_mut());
        for ((col, col_name), bucket) in cells {
            let cell_value = sheet.value(row, col);
            let any_value = if col_name == "age" || col_name == "duration" {
                AnyValue::UInt32(Self::_parse_ods_u32_cell(cell_value, row_num, col_name)?)
            } else {
                AnyValue::Float64(Self::_parse_ods_f64_cell(cell_value, row_num, col_name)?)
            };
            bucket.push(any_value);
        }
    }
    if column_data.is_empty() || column_data[0].is_empty() {
        return Err("No data rows found in sheet".into());
    }

    let mut columns = Vec::with_capacity(column_names.len());
    for (col_name, data) in column_names.iter().zip(column_data.iter()) {
        let series = Series::from_any_values(col_name.as_str().into(), data, true)
            .map_err(|e| format!("Failed to create series for column '{col_name}': {e}"))?;
        columns.push(series.into_column());
    }
    let df = DataFrame::new(columns).map_err(|e| format!("Failed to create DataFrame: {e}"))?;
    MortXML::from_df(df)
}
/// Checks that a frame has the layout this module expects.
///
/// Required layout, by position:
/// 1. `age` (`u32`)
/// 2. `qx` or `lx` (`f64`)
/// 3. optional `duration` (`u32`)
///
/// Value checks: the minimum of `qx`/`lx` must not be negative, and the maximum
/// of `qx` must not exceed 1.0.
///
/// # Errors
/// Returns a descriptive error for the first rule that is violated.
fn _validate_df_schema(df: &DataFrame) -> Result<(), Box<dyn std::error::Error>> {
    if df.height() == 0 {
        return Err("DataFrame must contain at least one row of data".into());
    }
    let columns = df.get_columns();
    let num_cols = columns.len();
    if num_cols < 2 {
        return Err("DataFrame must have at least 2 columns (age and qx/lx)".into());
    }
    if num_cols > 3 {
        return Err(
            "DataFrame must have at most 3 columns (age, qx/lx, optional duration)".into(),
        );
    }

    let age_col = &columns[0];
    if age_col.name() != "age" {
        return Err(format!(
            "First column must be named 'age', found '{}'",
            age_col.name()
        )
        .into());
    }
    if !matches!(age_col.dtype(), DataType::UInt32) {
        return Err(format!(
            "First column 'age' must be u32 type, found {:?}",
            age_col.dtype()
        )
        .into());
    }

    let value_col = &columns[1];
    let value_col_name = value_col.name();
    if value_col_name != "qx" && value_col_name != "lx" {
        return Err(format!(
            "Second column must be named 'qx' or 'lx', found '{value_col_name}'"
        )
        .into());
    }
    if !matches!(value_col.dtype(), DataType::Float64) {
        return Err(format!(
            "Second column '{}' must be f64 type, found {:?}",
            value_col_name,
            value_col.dtype()
        )
        .into());
    }

    if let Some(duration_col) = columns.get(2) {
        if duration_col.name() != "duration" {
            return Err(format!(
                "Third column must be named 'duration', found '{}'",
                duration_col.name()
            )
            .into());
        }
        if !matches!(duration_col.dtype(), DataType::UInt32) {
            return Err(format!(
                "Third column 'duration' must be u32 type, found {:?}",
                duration_col.dtype()
            )
            .into());
        }
    }

    // The name was validated above, so the column is either `qx` or `lx`.
    let is_qx = value_col_name == "qx";
    let label = if is_qx {
        "Mortality rate values (qx)"
    } else {
        "Life count values (lx)"
    };
    let value_series = value_col.as_materialized_series();
    // Both kinds of values share the lower bound; a failed aggregation is skipped.
    if let Ok(Some(min_val)) = value_series.min::<f64>() {
        if min_val < 0.0 {
            return Err(
                format!("{label} must be non-negative, found minimum: {min_val}").into(),
            );
        }
    }
    // Only rates have an upper bound.
    if is_qx {
        if let Ok(Some(max_val)) = value_series.max::<f64>() {
            if max_val > 1.0 {
                return Err(format!(
                    "Mortality rate values (qx) must be ≤ 1.0, found maximum: {max_val}"
                )
                .into());
            }
        }
    }
    Ok(())
}
/// Turns an ODS header cell into a normalised column name.
///
/// Text cells are trimmed and lower-cased. `column_desc` is used only to label
/// the error message.
///
/// # Errors
/// Fails for an empty cell ("header is missing") and for any non-text cell.
fn _extract_ods_header_name(
    cell_value: &Value,
    column_desc: &str,
) -> Result<String, Box<dyn std::error::Error>> {
    if let Value::Text(raw) = cell_value {
        return Ok(raw.trim().to_lowercase());
    }
    let message = if matches!(cell_value, Value::Empty) {
        format!("{column_desc} header is missing")
    } else {
        format!("{column_desc} header must be text, found {cell_value:?}")
    };
    Err(message.into())
}
/// Reads an ODS cell as an unsigned 32-bit integer.
///
/// Numeric cells must be finite and within `0..=u32::MAX`; the fractional part
/// is dropped by the cast. Text cells are parsed after trimming surrounding
/// whitespace. `row_num` and `col_name` only label error messages.
///
/// # Errors
/// Fails for empty cells, out-of-range or non-finite numbers, unparsable text
/// and every other cell type.
fn _parse_ods_u32_cell(
    cell_value: &Value,
    row_num: usize,
    col_name: &str,
) -> Result<u32, Box<dyn std::error::Error>> {
    match cell_value {
        Value::Number(f) => {
            if f.is_nan() || f.is_infinite() || *f < 0.0 || *f > u32::MAX as f64 {
                Err(
                    format!("{col_name} value {f} at row {row_num} is invalid or out of range")
                        .into(),
                )
            } else {
                // In range after the check above; `as` truncates toward zero.
                Ok(*f as u32)
            }
        }
        // Trim first so padded text such as " 25 " is accepted.
        Value::Text(s) => s.trim().parse::<u32>().map_err(|_| {
            format!("Cannot parse {col_name} '{s}' at row {row_num} as unsigned integer").into()
        }),
        Value::Empty => Err(format!("Missing {col_name} value at row {row_num}").into()),
        other => Err(format!("Invalid {col_name} cell type {other:?} at row {row_num}").into()),
    }
}
/// Reads an ODS cell as a floating-point number.
///
/// Numeric cells are returned as-is. Empty cells and blank text yield `NaN`.
/// Other text is parsed after trimming surrounding whitespace. `row_num` and
/// `col_name` only label error messages.
///
/// # Errors
/// Fails for unparsable text and for every other cell type.
fn _parse_ods_f64_cell(
    cell_value: &Value,
    row_num: usize,
    col_name: &str,
) -> Result<f64, Box<dyn std::error::Error>> {
    match cell_value {
        Value::Number(f) => Ok(*f),
        Value::Text(s) => {
            // Parse the trimmed text: the blank check already ignores whitespace,
            // so padded numbers such as " 0.5 " must not be rejected either.
            let trimmed = s.trim();
            if trimmed.is_empty() {
                Ok(f64::NAN)
            } else {
                trimmed.parse::<f64>().map_err(|_| {
                    format!("Cannot parse {col_name} '{s}' at row {row_num} as number").into()
                })
            }
        }
        Value::Empty => Ok(f64::NAN),
        other => Err(format!("Invalid {col_name} cell type {other:?} at row {row_num}").into()),
    }
}
/// Turns a spreadsheet header cell into a normalised column name.
///
/// String cells are trimmed and lower-cased. `column_desc` is used only to
/// label the error message.
///
/// # Errors
/// Fails with "header is missing" when there is no cell or the cell is empty
/// (the same wording `_extract_ods_header_name` uses), and with "header must be
/// text" for any other non-string cell.
fn extract_xlsx_header_name(
    cell: Option<&Data>,
    column_desc: &str,
) -> Result<String, Box<dyn std::error::Error>> {
    match cell {
        Some(Data::String(s)) => Ok(s.trim().to_lowercase()),
        // An empty cell is a missing header, not a header of the wrong type.
        Some(Data::Empty) | None => Err(format!("{column_desc} header is missing").into()),
        Some(other) => {
            Err(format!("{column_desc} header must be text, found {other:?}").into())
        }
    }
}
/// Reads a spreadsheet cell as an unsigned 32-bit integer.
///
/// Integer and float cells must lie within `0..=u32::MAX` (floats must also be
/// finite; their fractional part is dropped by the cast). String cells are
/// parsed after trimming surrounding whitespace. Booleans map to 1 / 0.
/// `row_num` and `col_name` only label error messages.
///
/// # Errors
/// Fails for missing or empty cells, out-of-range or non-finite numbers,
/// unparsable text and every other cell type.
fn _parse_xlsx_u32_cell(
    cell: Option<&Data>,
    row_num: usize,
    col_name: &str,
) -> Result<u32, Box<dyn std::error::Error>> {
    match cell {
        Some(Data::Int(v)) => {
            if *v < 0 || *v > u32::MAX as i64 {
                Err(
                    format!("{col_name} value {v} at row {row_num} is out of valid range")
                        .into(),
                )
            } else {
                Ok(*v as u32)
            }
        }
        Some(Data::Float(f)) => {
            if f.is_nan() || f.is_infinite() || *f < 0.0 || *f > u32::MAX as f64 {
                Err(
                    format!("{col_name} value {f} at row {row_num} is invalid or out of range")
                        .into(),
                )
            } else {
                // In range after the check above; `as` truncates toward zero.
                Ok(*f as u32)
            }
        }
        // Trim first so padded text such as " 25 " is accepted.
        Some(Data::String(s)) => s.trim().parse::<u32>().map_err(|_| {
            format!("Cannot parse {col_name} '{s}' at row {row_num} as unsigned integer").into()
        }),
        Some(Data::Bool(b)) => Ok(if *b { 1 } else { 0 }),
        Some(Data::Empty) => Err(format!("Missing {col_name} value at row {row_num}").into()),
        Some(other) => {
            Err(format!("Invalid {col_name} cell type {other:?} at row {row_num}").into())
        }
        None => Err(format!("Missing {col_name} cell at row {row_num}").into()),
    }
}
/// Reads a spreadsheet cell as a floating-point number.
///
/// Float and integer cells are converted directly, booleans map to 1.0 / 0.0,
/// empty cells and blank text yield `NaN`, and other text is parsed after
/// trimming surrounding whitespace. `row_num` and `col_name` only label error
/// messages.
///
/// # Errors
/// Fails for a missing cell, unparsable text and every other cell type.
fn _parse_xlsx_f64_cell(
    cell: Option<&Data>,
    row_num: usize,
    col_name: &str,
) -> Result<f64, Box<dyn std::error::Error>> {
    match cell {
        Some(Data::Float(f)) => Ok(*f),
        Some(Data::Int(v)) => Ok(*v as f64),
        Some(Data::String(s)) => {
            // Parse the trimmed text: the blank check already ignores whitespace,
            // so padded numbers such as " 0.5 " must not be rejected either.
            let trimmed = s.trim();
            if trimmed.is_empty() {
                Ok(f64::NAN)
            } else {
                trimmed.parse::<f64>().map_err(|_| {
                    format!("Cannot parse {col_name} '{s}' at row {row_num} as number").into()
                })
            }
        }
        Some(Data::Bool(b)) => Ok(if *b { 1.0 } else { 0.0 }),
        Some(Data::Empty) => Ok(f64::NAN),
        Some(other) => {
            Err(format!("Invalid {col_name} cell type {other:?} at row {row_num}").into())
        }
        None => Err(format!("Missing {col_name} cell at row {row_num}").into()),
    }
}
}
/// Builds a [`ContentClassification`] from the first `ContentClassification`
/// element found at or below `root`.
///
/// `TableIdentity` is mandatory and must parse as `i32`. Every other text field
/// falls back to an empty string, and `key_words` collects the text of each
/// `KeyWord` element.
///
/// # Errors
/// Fails when the `ContentClassification` element is missing or when
/// `TableIdentity` is missing or not a valid `i32`.
fn create_content_classification(
    root: &roxmltree::Node,
) -> Result<ContentClassification, Box<dyn std::error::Error>> {
    let cc = root
        .descendants()
        .find(|n| n.tag_name().name() == "ContentClassification")
        .ok_or("ContentClassification element not found")?;

    // Text of the first element called `tag` at or below the classification node.
    let text_of = |tag: &str| {
        cc.descendants()
            .find(|n| n.tag_name().name() == tag)
            .and_then(|n| n.text())
    };
    let text_or_empty = |tag: &str| text_of(tag).unwrap_or("").to_string();

    let table_identity = text_of("TableIdentity")
        .and_then(|raw| raw.parse::<i32>().ok())
        .ok_or("TableIdentity not found or invalid")?;
    let key_words = cc
        .descendants()
        .filter(|n| n.tag_name().name() == "KeyWord")
        .filter_map(|n| n.text())
        .map(String::from)
        .collect::<Vec<String>>();

    Ok(ContentClassification {
        table_identity,
        provider_domain: text_or_empty("ProviderDomain"),
        provider_name: text_or_empty("ProviderName"),
        table_reference: text_or_empty("TableReference"),
        content_type: text_or_empty("ContentType"),
        table_name: text_or_empty("TableName"),
        table_description: text_or_empty("TableDescription"),
        comments: text_or_empty("Comments"),
        key_words,
    })
}
/// Builds one [`Table`] for every `Table` element at or below `root`, in
/// document order.
///
/// # Errors
/// Stops at, and returns, the first error reported by `create_table`.
fn create_tables(root: &roxmltree::Node) -> Result<Vec<Table>, Box<dyn std::error::Error>> {
    root.descendants()
        .filter(|n| n.tag_name().name() == "Table")
        .map(|table_node| create_table(&table_node))
        .collect()
}
/// Builds a [`Table`] from a `Table` element: metadata first, then values.
///
/// # Errors
/// Propagates the errors of `create_meta_data` and `create_values`.
fn create_table(table_node: &roxmltree::Node) -> Result<Table, Box<dyn std::error::Error>> {
    Ok(Table {
        meta_data: create_meta_data(table_node)?,
        values: create_values(table_node)?,
    })
}
fn create_meta_data(table_node: &roxmltree::Node) -> Result<MetaData, Box<dyn std::error::Error>> {
let metadata_node = table_node
.descendants()
.find(|n| n.tag_name().name() == "MetaData")
.ok_or("MetaData element not found")?;
let scaling_factor = metadata_node
.descendants()
.find(|n| n.tag_name().name() == "ScalingFactor")
.and_then(|n| n.text())
.and_then(|t| t.parse::<f64>().ok())
.unwrap_or(1.0);
let data_type = metadata_node
.descendants()
.find(|n| n.tag_name().name() == "DataType")
.and_then(|n| n.text())
.unwrap_or("")
.to_string();
let nation = metadata_node
.descendants()
.find(|n| n.tag_name().name() == "Nation")
.and_then(|n| n.text())
.unwrap_or("")
.to_string();
let table_description = metadata_node
.descendants()
.find(|n| n.tag_name().name() == "TableDescription")
.and_then(|n| n.text())
.unwrap_or("")
.to_string();
let mut axis_defs = Vec::new();
let axis_def_nodes = metadata_node
.descendants()
.filter(|n| n.tag_name().name() == "AxisDef");
for node in axis_def_nodes {
let axis_def = create_axis_def(&node)?;
axis_defs.push(axis_def);
}
let result = MetaData {
scaling_factor,
data_type,
nation,
table_description,
axis_defs,
};
Ok(result)
}
/// Builds an [`AxisDef`] from an `AxisDef` element.
///
/// Missing text fields become empty strings. `MinScaleValue` and
/// `MaxScaleValue` default to 0 and `Increment` to 1 when missing or not a
/// valid `u32`. The function currently never returns an error.
fn create_axis_def(axis_def_node: &roxmltree::Node) -> Result<AxisDef, Box<dyn std::error::Error>> {
    // Text of the first element called `tag` at or below the axis definition.
    let text_of = |tag: &str| {
        axis_def_node
            .descendants()
            .find(|n| n.tag_name().name() == tag)
            .and_then(|n| n.text())
    };
    // Same lookup, parsed as `u32`; `None` when absent or unparsable.
    let number_of = |tag: &str| text_of(tag).and_then(|raw| raw.parse::<u32>().ok());

    Ok(AxisDef {
        scale_type: text_of("ScaleType").unwrap_or("").to_string(),
        axis_name: text_of("AxisName").unwrap_or("").to_string(),
        min_scale_value: number_of("MinScaleValue").unwrap_or(0),
        max_scale_value: number_of("MaxScaleValue").unwrap_or(0),
        increment: number_of("Increment").unwrap_or(1),
    })
}
/// Collects the values of every `Axis` element at or below `table_node` into a
/// frame.
///
/// Columns, in order: `age` (only when at least one age is present), then `qx`
/// or `lx`, then `duration` (only when at least one duration is present). When
/// a `duration` column exists, rows whose duration is null are dropped.
///
/// # Errors
/// Fails when an axis contains an invalid value or the frame cannot be built.
fn create_values(table_node: &roxmltree::Node) -> Result<DataFrame, Box<dyn std::error::Error>> {
    let mut ages: Vec<Option<u32>> = Vec::new();
    let mut durations: Vec<Option<u32>> = Vec::new();
    let mut values: Vec<f64> = Vec::new();
    let axis_nodes = table_node
        .descendants()
        .filter(|n| n.tag_name().name() == "Axis");
    for node in axis_nodes {
        let (axis_ages, axis_durations, axis_values) = get_axis_values(&node)?;
        ages.extend(axis_ages);
        durations.extend(axis_durations);
        values.extend(axis_values);
    }

    let has_ages = ages.iter().any(Option::is_some);
    let has_durations = durations.iter().any(Option::is_some);

    // NOTE(review): `ContentType` is searched below the table node, while
    // `create_content_classification` reads it from `ContentClassification`.
    // If tables never carry this element the column is always named `qx` — confirm.
    let content_type = table_node
        .descendants()
        .find(|n| n.tag_name().name() == "ContentType")
        .and_then(|n| n.text())
        .unwrap_or("Mortality/Life table");
    let value_column_name = if content_type == "Life Table" {
        "lx"
    } else {
        "qx"
    };

    // The vectors are not needed afterwards, so they are moved into the series
    // instead of being cloned.
    let mut columns: Vec<Column> = Vec::with_capacity(3);
    if has_ages {
        columns.push(Series::new("age".into(), ages).into_column());
    }
    columns.push(Series::new(value_column_name.into(), values).into_column());
    if has_durations {
        columns.push(Series::new("duration".into(), durations).into_column());
    }
    let df = DataFrame::new(columns)?;

    if has_durations {
        // Keep only rows that carry a duration.
        Ok(df.lazy().filter(col("duration").is_not_null()).collect()?)
    } else {
        Ok(df)
    }
}
/// Parallel vectors produced for one axis: ages, durations and values.
type AxisValues = (Vec<Option<u32>>, Vec<Option<u32>>, Vec<f64>);

/// Extracts the `Y` values at or below one `Axis` element.
///
/// When the axis has a parsable `t` attribute, that value is the age of every
/// row and each `Y` element's `t` attribute is its duration. Otherwise each
/// `Y` element's `t` attribute is its age and the duration is `None`.
///
/// # Errors
/// Fails when a `Y` element has no text or its text, ignoring surrounding
/// whitespace, is not a valid number.
fn get_axis_values(axis_node: &roxmltree::Node) -> Result<AxisValues, Box<dyn std::error::Error>> {
    let mut ages: Vec<Option<u32>> = Vec::new();
    let mut durations: Vec<Option<u32>> = Vec::new();
    let mut values: Vec<f64> = Vec::new();
    let row_t = axis_node.attribute("t").and_then(|t| t.parse::<u32>().ok());
    let y_nodes = axis_node
        .descendants()
        .filter(|n| n.tag_name().name() == "Y");
    for node in y_nodes {
        // Trim so that pretty-printed XML with padded values is accepted.
        let value = node
            .text()
            .and_then(|t| t.trim().parse::<f64>().ok())
            .ok_or("Invalid value in Y node")?;
        let col_t = node.attribute("t").and_then(|t| t.parse::<u32>().ok());
        let (age, duration) = match row_t {
            Some(age) => (Some(age), col_t),
            None => (col_t, None),
        };
        ages.push(age);
        durations.push(duration);
        values.push(value);
    }
    Ok((ages, durations, values))
}
#[cfg(test)]
mod tests {
use super::*;
// Loads table 47 through `from_url_id` and checks that it yields rows.
// NOTE(review): this performs a real HTTP request and is not marked `#[ignore]`.
#[test]
fn test_xml_selection_table() {
    let loaded = MortXML::from_url_id(47).unwrap();
    let frame = &loaded.tables[0].values;
    println!("DataFrame: {:?}", frame.head(Some(10)));
    assert!(frame.height() > 0, "DataFrame is empty");
}
// Downloads table 912 and checks the basic shape of the result.
// Ignored by default because it needs network access.
#[test]
#[ignore]
fn test_xml_from_url_id() {
    let result = MortXML::from_url_id(912);
    assert!(result.is_ok(), "Failed to load MortXML from URL ID");
    let mort_xml = result.unwrap();
    assert!(!mort_xml.tables.is_empty(), "No tables loaded from URL ID");
    let df = &mort_xml.tables[0].values;
    assert!(df.height() > 0, "DataFrame is empty");
    // `create_values` names the value column `qx` or `lx`; a column called
    // `value` is never produced, so checking for it could not succeed.
    assert!(
        df.column("qx").is_ok() || df.column("lx").is_ok(),
        "No 'qx' or 'lx' column"
    );
    assert!(
        mort_xml.content_classification.table_identity > 0,
        "Invalid table identity"
    );
    assert!(
        !mort_xml.content_classification.table_name.is_empty(),
        "Table name is empty"
    );
    println!("Table name: {}", mort_xml.content_classification.table_name);
    println!("Number of rows: {}", df.height());
}
// Loads table 1704 from the local `data/` directory and checks its shape.
#[test]
fn test_xml_from_id() {
    let loaded = MortXML::from_id(1704);
    assert!(loaded.is_ok(), "Failed to load MortXML from id 1704");
    let table_doc = loaded.unwrap();
    assert!(!table_doc.tables.is_empty(), "No tables loaded from ID");

    let frame = &table_doc.tables[0].values;
    assert!(frame.height() > 0, "DataFrame is empty");
    assert!(frame.column("qx").is_ok(), "No 'qx' column");
    let first_value = frame.column("qx").unwrap().get(0).unwrap();
    assert!(!first_value.is_null(), "First value is missing or null");

    let classification = &table_doc.content_classification;
    assert!(classification.table_identity > 0, "Invalid table identity");
    assert!(!classification.table_name.is_empty(), "Table name is empty");

    println!("Table name: {}", classification.table_name);
    println!("Number of rows: {}", frame.height());
    println!("First value: {first_value:?}");
}
// Loads table 1704 from an explicit file path and checks its shape.
// Ignored by default.
#[test]
#[ignore]
fn test_xml_from_path() {
    use std::path::Path;
    let loaded = MortXML::from_path(Path::new("src/table_xml/t1704.xml"));
    assert!(loaded.is_ok(), "Failed to load MortXML from path");
    let table_doc = loaded.unwrap();
    assert!(!table_doc.tables.is_empty(), "No tables loaded from path");

    let frame = &table_doc.tables[0].values;
    assert!(frame.height() > 0, "DataFrame is empty");
    assert!(frame.column("qx").is_ok(), "No 'qx' column");

    let classification = &table_doc.content_classification;
    assert!(classification.table_identity > 0, "Invalid table identity");
    assert!(!classification.table_name.is_empty(), "Table name is empty");

    println!("Table name: {}", classification.table_name);
    println!("Number of rows: {}", frame.height());
}
// Loads the `female` sheet of `data/elt15.xlsx` and checks the first row.
// Ignored by default.
#[test]
#[ignore]
fn test_xml_from_xlsx() {
    let mort_xml = match MortXML::from_xlsx("data/elt15.xlsx", "female") {
        Ok(loaded) => loaded,
        Err(e) => {
            panic!("Failed to load XLSX file: {}", e);
        }
    };
    println!("✓ Successfully loaded XLSX file");
    println!(
        " Table name: {}",
        mort_xml.content_classification.table_name
    );
    println!(" Rows: {}", mort_xml.tables[0].values.height());
    println!(
        " Columns: {:?}",
        mort_xml.tables[0].values.get_column_names()
    );
    assert!(!mort_xml.tables.is_empty(), "No tables loaded from XLSX");
    assert!(mort_xml.tables[0].values.height() > 0, "DataFrame is empty");

    let frame = &mort_xml.tables[0].values;
    assert!(frame.column("age").is_ok(), "Should have 'age' column");
    assert!(frame.column("qx").is_ok(), "Should have 'qx' column");
    let ages = frame.column("age").unwrap();
    let rates = frame.column("qx").unwrap();
    assert_eq!(ages.get(0).unwrap().try_extract::<u32>().unwrap(), 0);
    let first_qx = rates.get(0).unwrap().try_extract::<f64>().unwrap();
    assert!(
        (first_qx - 0.00632).abs() < 0.0001,
        "First qx should be approximately 0.00632"
    );
    println!(
        " First age: {}",
        ages.get(0).unwrap().try_extract::<u32>().unwrap()
    );
    println!(" First qx: {:.5}", first_qx);
}
// Builds a 121-row frame (ages 0..=120) and checks that `from_df` keeps the
// data and fills in the placeholder classification and metadata.
#[test]
fn test_xml_from_df() {
    use polars::prelude::*;
    let ages: Vec<u32> = (0..121).map(|x| x as u32).collect();
    let rates: Vec<f64> = (0..121)
        .map(|age| 0.001 * (age as f64 / 80.0).exp())
        .collect();
    let frame = df! {
        "age" => ages,
        "qx" => rates,
    }
    .expect("Failed to create DataFrame");

    let outcome = MortXML::from_df(frame);
    assert!(outcome.is_ok(), "Failed to create MortXML from DataFrame");
    let mort_xml = outcome.unwrap();
    assert_eq!(mort_xml.tables.len(), 1, "Should have exactly one table");

    let table = &mort_xml.tables[0];
    assert_eq!(
        table.values.height(),
        121,
        "Should have 121 rows (ages 0-120)"
    );
    assert!(
        table.values.column("age").is_ok(),
        "Should have 'age' column"
    );
    assert!(table.values.column("qx").is_ok(), "Should have 'qx' column");

    let classification = &mort_xml.content_classification;
    assert_eq!(
        classification.table_identity, 0,
        "Should have ID 0 for local table"
    );
    assert_eq!(classification.provider_name, "Local DataFrame");
    assert_eq!(classification.table_name, "DataFrame Table");
    assert_eq!(classification.content_type, "Mortality/Life table");

    let metadata = &table.meta_data;
    assert_eq!(
        metadata.scaling_factor, 1.0,
        "Should have scaling factor 1.0"
    );
    assert_eq!(metadata.data_type, "Mortality Rate");
    assert_eq!(metadata.nation, "Local");

    let age_column = table.values.column("age").unwrap();
    let rate_column = table.values.column("qx").unwrap();

    // First row: age 0 with rate 0.001 * e^0.
    let first_age = age_column.get(0).unwrap();
    let first_value = rate_column.get(0).unwrap();
    assert_eq!(first_age.try_extract::<u32>().unwrap(), 0);
    assert!((first_value.try_extract::<f64>().unwrap() - 0.001).abs() < 1e-10);

    // Last row: age 120 with rate 0.001 * e^(120/80).
    let last_age = age_column.get(120).unwrap();
    let last_value = rate_column.get(120).unwrap();
    assert_eq!(last_age.try_extract::<u32>().unwrap(), 120);
    let expected_last_value = 0.001 * (120.0_f64 / 80.0).exp();
    assert!((last_value.try_extract::<f64>().unwrap() - expected_last_value).abs() < 1e-6);

    println!("✓ Successfully created MortXML from DataFrame");
    println!(" Table name: {}", classification.table_name);
    println!(" Rows: {}", table.values.height());
    println!(
        " First mortality rate (age 0): {:.6}",
        first_value.try_extract::<f64>().unwrap()
    );
    println!(
        " Last mortality rate (age 120): {:.6}",
        last_value.try_extract::<f64>().unwrap()
    );
}
// A two-column `age` / `qx` frame with rates inside [0, 1] is accepted.
#[test]
fn test_dataframe_schema_validation_valid_qx() {
    use polars::prelude::*;
    let frame = df! {
        "age" => [25u32, 26u32, 27u32],
        "qx" => [0.0015f64, 0.0018f64, 0.0020f64],
    }
    .expect("Failed to create DataFrame");
    assert!(
        MortXML::from_df(frame).is_ok(),
        "Valid qx DataFrame should pass validation"
    );
}
// A two-column `age` / `lx` frame with non-negative counts is accepted.
#[test]
fn test_dataframe_schema_validation_valid_lx() {
    use polars::prelude::*;
    let frame = df! {
        "age" => [25u32, 26u32, 27u32],
        "lx" => [100000.0f64, 99850.0f64, 99680.0f64],
    }
    .expect("Failed to create DataFrame");
    assert!(
        MortXML::from_df(frame).is_ok(),
        "Valid lx DataFrame should pass validation"
    );
}
// A three-column frame with a trailing `duration` column is accepted.
#[test]
fn test_dataframe_schema_validation_with_duration() {
    use polars::prelude::*;
    let frame = df! {
        "age" => [25u32, 26u32, 27u32],
        "qx" => [0.0015f64, 0.0018f64, 0.0020f64],
        "duration" => [0u32, 1u32, 2u32],
    }
    .expect("Failed to create DataFrame");
    let outcome = MortXML::from_df(frame);
    assert!(
        outcome.is_ok(),
        "Valid DataFrame with duration should pass validation"
    );
}
// A second column that is neither `qx` nor `lx` is rejected.
#[test]
fn test_dataframe_schema_validation_wrong_column_name() {
    use polars::prelude::*;
    let frame = df! {
        "age" => [25u32, 26u32, 27u32],
        "value" => [0.0015f64, 0.0018f64, 0.0020f64],
    }
    .expect("Failed to create DataFrame");
    let outcome = MortXML::from_df(frame);
    assert!(
        outcome.is_err(),
        "DataFrame with wrong column name should fail validation"
    );
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("Second column must be named 'qx' or 'lx'"));
}
// A first column that is not called `age` is rejected.
#[test]
fn test_dataframe_schema_validation_wrong_age_column_name() {
    use polars::prelude::*;
    let frame = df! {
        "years" => [25u32, 26u32, 27u32],
        "qx" => [0.0015f64, 0.0018f64, 0.0020f64],
    }
    .expect("Failed to create DataFrame");
    let outcome = MortXML::from_df(frame);
    assert!(
        outcome.is_err(),
        "DataFrame with wrong age column name should fail validation"
    );
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("First column must be named 'age'"));
}
// An `age` column stored as `f64` instead of `u32` is rejected.
#[test]
fn test_dataframe_schema_validation_wrong_data_types() {
    use polars::prelude::*;
    let frame = df! {
        "age" => [25.0f64, 26.0f64, 27.0f64],
        "qx" => [0.0015f64, 0.0018f64, 0.0020f64],
    }
    .expect("Failed to create DataFrame");
    let outcome = MortXML::from_df(frame);
    assert!(
        outcome.is_err(),
        "DataFrame with wrong age data type should fail validation"
    );
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("First column 'age' must be u32 type"));
}
// A frame with the right columns but no rows is rejected.
#[test]
fn test_dataframe_schema_validation_empty_dataframe() {
    use polars::prelude::*;
    let no_ages: Vec<u32> = Vec::new();
    let no_rates: Vec<f64> = Vec::new();
    let frame = df! {
        "age" => no_ages,
        "qx" => no_rates,
    }
    .expect("Failed to create DataFrame");
    let outcome = MortXML::from_df(frame);
    assert!(outcome.is_err(), "Empty DataFrame should fail validation");
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("DataFrame must contain at least one row of data"));
}
// Despite the name, this exercises the accepting path: the `age` column is
// `u32`, so no negative age is present in the frame.
#[test]
fn test_dataframe_schema_validation_negative_ages() {
    use polars::prelude::*;
    let frame = df! {
        "age" => [25u32, 26u32, 27u32],
        "qx" => [0.0015f64, 0.0018f64, 0.0020f64],
    }
    .expect("Failed to create DataFrame");
    let outcome = MortXML::from_df(frame);
    assert!(
        outcome.is_ok(),
        "Valid u32 DataFrame should pass validation"
    );
}
// A `qx` value above 1.0 is rejected.
#[test]
fn test_dataframe_schema_validation_invalid_qx_values() {
    use polars::prelude::*;
    let frame = df! {
        "age" => [25u32, 26u32, 27u32],
        "qx" => [0.5f64, 1.5f64, 0.8f64],
    }
    .expect("Failed to create DataFrame");
    let outcome = MortXML::from_df(frame);
    assert!(
        outcome.is_err(),
        "DataFrame with qx > 1.0 should fail validation"
    );
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("Mortality rate values (qx) must be ≤ 1.0"));
}
// A frame with a fourth column is rejected.
#[test]
fn test_dataframe_schema_validation_too_many_columns() {
    use polars::prelude::*;
    let frame = df! {
        "age" => [25u32, 26u32, 27u32],
        "qx" => [0.0015f64, 0.0018f64, 0.0020f64],
        "duration" => [0u32, 1u32, 2u32],
        "extra" => [1.0f64, 2.0f64, 3.0f64],
    }
    .expect("Failed to create DataFrame");
    let outcome = MortXML::from_df(frame);
    assert!(
        outcome.is_err(),
        "DataFrame with too many columns should fail validation"
    );
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("DataFrame must have at most 3 columns"));
}
}