/// Internal building blocks of the formula parser.
///
/// The module is declared `pub`, as is every submodule inside it. Code later
/// in this file reaches into `parse` (for `MetaBuilder`, `Parser` and `Term`),
/// `lexer` (for `Token`) and `errors` (for `ParseError`).
// NOTE(review): the submodule bodies live in separate files that are not
// visible from here; what each one contains should be confirmed there.
pub mod internal {
pub mod ast;
pub mod data_structures;
pub mod errors;
pub mod expect;
pub mod lexer;
pub mod matches;
pub mod meta_builder;
pub mod new;
pub mod next;
pub mod parse;
pub mod parse_arg;
pub mod parse_arg_list;
pub mod parse_family;
pub mod parse_formula;
pub mod parse_random_effect;
pub mod parse_response;
pub mod parse_rhs;
pub mod parse_term;
pub mod parser;
pub mod peek;
}
use internal::parse::{MetaBuilder, Parser, Term};
use serde_json::Value;
/// Parses a model formula string and returns its metadata as a JSON value.
///
/// The formula is handed to `Parser`; the parsed response, terms, intercept
/// flag and optional family are then fed through a `MetaBuilder`, whose
/// output is serialized with `serde_json::to_value`.
///
/// # Errors
///
/// Returns an error if the parser cannot be constructed, if parsing the
/// formula fails (in which case the parser's pretty-printed diagnostic is
/// also written to stderr), or if serializing the built metadata fails.
pub fn parse_formula(formula: &str) -> Result<Value, Box<dyn std::error::Error>> {
    let mut parser = Parser::new(formula)?;
    let (response, terms, parsed_intercept, family) = match parser.parse_formula() {
        Ok(parts) => parts,
        Err(err) => {
            // Print the human-readable diagnostic before handing the error back.
            eprintln!("{}", parser.pretty_error(&err));
            return Err(Box::new(err));
        }
    };

    // A zero term anywhere in the term list switches the intercept off,
    // regardless of the flag reported by the parser.
    let zero_term_present = terms.iter().any(|term| matches!(term, Term::Zero));
    let has_intercept = parsed_intercept && !zero_term_present;

    let mut builder = MetaBuilder::new();
    builder.push_response(&response);
    for term in terms {
        match term {
            Term::Column(name) => builder.push_plain_term(&name),
            Term::Function { name, args } => builder.push_function_term(&name, &args),
            Term::Interaction { left, right } => builder.push_interaction(&left, &right),
            Term::RandomEffect(random_effect) => builder.push_random_effect(&random_effect),
            // No builder call is made for these two; the intercept is passed
            // to `build` through `has_intercept` instead.
            Term::Intercept | Term::Zero => {}
        }
    }

    // The family name is the lowercased `Debug` rendering of the parsed family.
    let family_name = family.map(|fam| format!("{:?}", fam).to_lowercase());
    let meta = builder.build(formula, has_intercept, family_name);
    let json = serde_json::to_value(meta)?;
    Ok(json)
}
pub fn lex_formula(formula: &str) -> Result<Value, Box<dyn std::error::Error>> {
use crate::internal::lexer::Token;
use logos::Logos;
let mut lex = Token::lexer(formula);
let mut tokens = Vec::new();
while let Some(item) = lex.next() {
match item {
Ok(tok) => {
let slice = lex.slice();
let obj = serde_json::json!({
"token": format!("{:?}", tok),
"lexeme": slice,
});
tokens.push(obj);
}
Err(()) => {
return Err(Box::new(crate::internal::errors::ParseError::Lex(
lex.slice().to_string(),
)));
}
}
}
Ok(serde_json::Value::Array(tokens))
}
/// Tests for `parse_formula`, checking the JSON it returns: the generated
/// column list and its ordering, the intercept flag, the family name, and the
/// handling of multivariate `bind(...)` responses and invalid formulas.
#[cfg(test)]
mod tests {
    use super::*;

    /// A formula with plain and function terms keeps the implicit intercept
    /// and lists generated columns in formula order.
    #[test]
    fn test_intercept_and_formula_order_with_intercept() {
        let formula = "y ~ x + poly(x, 2) + log(z)";
        let result = parse_formula(formula).expect("Should parse successfully");
        let all_columns = result
            .get("all_generated_columns")
            .expect("Should have all_generated_columns")
            .as_array()
            .expect("Should be an array");
        assert!(
            all_columns
                .iter()
                .any(|col| col.as_str() == Some("intercept")),
            "Intercept should be present in all_generated_columns"
        );
        let expected_columns = ["y", "intercept", "x", "x_poly_1", "x_poly_2", "z_log"];
        let actual_columns: Vec<&str> = all_columns
            .iter()
            .map(|col| col.as_str().unwrap())
            .collect();
        assert_eq!(
            actual_columns, expected_columns,
            "all_generated_columns should have the correct order"
        );
        let formula_order = result
            .get("all_generated_columns_formula_order")
            .expect("Should have all_generated_columns_formula_order")
            .as_object()
            .expect("Should be an object");
        assert_eq!(formula_order.get("1").unwrap().as_str(), Some("y"));
        assert_eq!(formula_order.get("2").unwrap().as_str(), Some("intercept"));
        assert_eq!(formula_order.get("3").unwrap().as_str(), Some("x"));
        assert_eq!(formula_order.get("4").unwrap().as_str(), Some("x_poly_1"));
        assert_eq!(formula_order.get("5").unwrap().as_str(), Some("x_poly_2"));
        assert_eq!(formula_order.get("6").unwrap().as_str(), Some("z_log"));
        let metadata = result.get("metadata").expect("Should have metadata");
        assert_eq!(metadata.get("has_intercept").unwrap().as_bool(), Some(true));
    }

    /// A trailing `- 1` removes the intercept from both the column list and
    /// the metadata flag.
    #[test]
    fn test_intercept_and_formula_order_without_intercept() {
        let formula = "y ~ x + poly(x, 2) + log(z) - 1";
        let result = parse_formula(formula).expect("Should parse successfully");
        let all_columns = result
            .get("all_generated_columns")
            .expect("Should have all_generated_columns")
            .as_array()
            .expect("Should be an array");
        assert!(
            !all_columns
                .iter()
                .any(|col| col.as_str() == Some("intercept")),
            "Intercept should NOT be present when has_intercept is false"
        );
        let expected_columns = ["y", "x", "x_poly_1", "x_poly_2", "z_log"];
        let actual_columns: Vec<&str> = all_columns
            .iter()
            .map(|col| col.as_str().unwrap())
            .collect();
        assert_eq!(
            actual_columns, expected_columns,
            "all_generated_columns should have the correct order without intercept"
        );
        let formula_order = result
            .get("all_generated_columns_formula_order")
            .expect("Should have all_generated_columns_formula_order")
            .as_object()
            .expect("Should be an object");
        assert_eq!(formula_order.get("1").unwrap().as_str(), Some("y"));
        assert_eq!(formula_order.get("2").unwrap().as_str(), Some("x"));
        assert_eq!(formula_order.get("3").unwrap().as_str(), Some("x_poly_1"));
        assert_eq!(formula_order.get("4").unwrap().as_str(), Some("x_poly_2"));
        assert_eq!(formula_order.get("5").unwrap().as_str(), Some("z_log"));
        let metadata = result.get("metadata").expect("Should have metadata");
        assert_eq!(
            metadata.get("has_intercept").unwrap().as_bool(),
            Some(false)
        );
    }

    /// The simplest one-predictor formula still gets an intercept column.
    #[test]
    fn test_simple_formula_with_intercept() {
        let formula = "y ~ x";
        let result = parse_formula(formula).expect("Should parse successfully");
        let all_columns = result
            .get("all_generated_columns")
            .expect("Should have all_generated_columns")
            .as_array()
            .expect("Should be an array");
        assert!(
            all_columns
                .iter()
                .any(|col| col.as_str() == Some("intercept")),
            "Intercept should be present in simple formula"
        );
        let expected_columns = ["y", "intercept", "x"];
        let actual_columns: Vec<&str> = all_columns
            .iter()
            .map(|col| col.as_str().unwrap())
            .collect();
        assert_eq!(actual_columns, expected_columns);
        let formula_order = result
            .get("all_generated_columns_formula_order")
            .expect("Should have all_generated_columns_formula_order")
            .as_object()
            .expect("Should be an object");
        assert_eq!(formula_order.get("1").unwrap().as_str(), Some("y"));
        assert_eq!(formula_order.get("2").unwrap().as_str(), Some("intercept"));
        assert_eq!(formula_order.get("3").unwrap().as_str(), Some("x"));
    }

    /// With interactions and function terms, the intercept is still the second
    /// generated column, right after the response.
    #[test]
    fn test_complex_formula_with_intercept() {
        let formula = "y ~ x1 + x2*x3 + poly(x1, 2) + log(z)";
        let result = parse_formula(formula).expect("Should parse successfully");
        let all_columns = result
            .get("all_generated_columns")
            .expect("Should have all_generated_columns")
            .as_array()
            .expect("Should be an array");
        assert!(
            all_columns
                .iter()
                .any(|col| col.as_str() == Some("intercept")),
            "Intercept should be present in complex formula"
        );
        assert_eq!(all_columns[1].as_str(), Some("intercept"));
        let formula_order = result
            .get("all_generated_columns_formula_order")
            .expect("Should have all_generated_columns_formula_order")
            .as_object()
            .expect("Should be an object");
        assert_eq!(formula_order.get("1").unwrap().as_str(), Some("y"));
        assert_eq!(formula_order.get("2").unwrap().as_str(), Some("intercept"));
        let metadata = result.get("metadata").expect("Should have metadata");
        assert_eq!(metadata.get("has_intercept").unwrap().as_bool(), Some(true));
    }

    /// `y ~ 1` yields only the response and the intercept, and `columns`
    /// contains just the response variable.
    #[test]
    fn test_intercept_only_model() {
        let formula = "y ~ 1";
        let result = parse_formula(formula).expect("Should parse successfully");
        let all_columns = result
            .get("all_generated_columns")
            .expect("Should have all_generated_columns")
            .as_array()
            .expect("Should be an array");
        assert!(
            all_columns
                .iter()
                .any(|col| col.as_str() == Some("intercept")),
            "Intercept should be present in intercept-only model"
        );
        let expected_columns = ["y", "intercept"];
        let actual_columns: Vec<&str> = all_columns
            .iter()
            .map(|col| col.as_str().unwrap())
            .collect();
        assert_eq!(actual_columns, expected_columns);
        let formula_order = result
            .get("all_generated_columns_formula_order")
            .expect("Should have all_generated_columns_formula_order")
            .as_object()
            .expect("Should be an object");
        assert_eq!(formula_order.get("1").unwrap().as_str(), Some("y"));
        assert_eq!(formula_order.get("2").unwrap().as_str(), Some("intercept"));
        let metadata = result.get("metadata").expect("Should have metadata");
        assert_eq!(metadata.get("has_intercept").unwrap().as_bool(), Some(true));
        let columns = result
            .get("columns")
            .expect("Should have columns")
            .as_object()
            .expect("Should be an object");
        assert_eq!(
            columns.len(),
            1,
            "Should only have response variable in columns"
        );
        assert!(
            columns.contains_key("y"),
            "Should have response variable 'y'"
        );
    }

    /// An intercept-only model combined with a family clause reports both the
    /// intercept and the lowercased family name.
    #[test]
    fn test_intercept_only_model_with_family() {
        let formula = "y ~ 1, family = gaussian";
        let result = parse_formula(formula).expect("Should parse successfully");
        let all_columns = result
            .get("all_generated_columns")
            .expect("Should have all_generated_columns")
            .as_array()
            .expect("Should be an array");
        assert!(
            all_columns
                .iter()
                .any(|col| col.as_str() == Some("intercept")),
            "Intercept should be present in intercept-only model with family"
        );
        let metadata = result.get("metadata").expect("Should have metadata");
        assert_eq!(metadata.get("family").unwrap().as_str(), Some("gaussian"));
        assert_eq!(metadata.get("has_intercept").unwrap().as_bool(), Some(true));
    }

    /// `y ~ 0` produces the response column only and clears the intercept flag.
    #[test]
    fn test_no_intercept_model() {
        let formula = "y ~ 0";
        let result = parse_formula(formula).expect("Should parse successfully");
        let all_columns = result
            .get("all_generated_columns")
            .expect("Should have all_generated_columns")
            .as_array()
            .expect("Should be an array");
        assert!(
            !all_columns
                .iter()
                .any(|col| col.as_str() == Some("intercept")),
            "Intercept should NOT be present in y ~ 0 model"
        );
        let expected_columns = ["y"];
        let actual_columns: Vec<&str> = all_columns
            .iter()
            .map(|col| col.as_str().unwrap())
            .collect();
        assert_eq!(actual_columns, expected_columns);
        let formula_order = result
            .get("all_generated_columns_formula_order")
            .expect("Should have all_generated_columns_formula_order")
            .as_object()
            .expect("Should be an object");
        assert_eq!(formula_order.get("1").unwrap().as_str(), Some("y"));
        assert_eq!(
            formula_order.len(),
            1,
            "Should only have response variable in formula order"
        );
        let metadata = result.get("metadata").expect("Should have metadata");
        assert_eq!(
            metadata.get("has_intercept").unwrap().as_bool(),
            Some(false)
        );
    }

    /// `y ~ 1 - 1` is rejected with a message about the contradictory
    /// intercept syntax.
    #[test]
    fn test_invalid_intercept_syntax() {
        let formula = "y ~ 1 - 1";
        // `expect_err` both asserts the failure and yields the error to inspect.
        let err = parse_formula(formula)
            .expect_err("y ~ 1 - 1 should fail because it's contradictory syntax");
        assert!(
            err.to_string()
                .contains("cannot have both intercept term and intercept removal"),
            "Error should mention contradictory syntax"
        );
    }

    /// `y ~ 0 + 1` is rejected with a message about combining the zero term
    /// with other terms.
    #[test]
    fn test_invalid_zero_combination() {
        let formula = "y ~ 0 + 1";
        let err = parse_formula(formula)
            .expect_err("y ~ 0 + 1 should fail because 0 cannot be combined with other terms");
        assert!(
            err.to_string()
                .contains("zero term (0) cannot be combined with other terms"),
            "Error should mention zero term combination restriction"
        );
    }

    /// `bind(y1, y2)` registers both variables as responses sharing id 1 and
    /// lists them alongside the predictor and the intercept.
    #[test]
    fn test_multivariate_response_basic() {
        let formula = "bind(y1, y2) ~ x";
        let result = parse_formula(formula).expect("Should parse successfully");
        let columns = result
            .get("columns")
            .expect("Should have columns")
            .as_object()
            .expect("Should be an object");
        assert!(
            columns.contains_key("y1"),
            "Should contain y1 response variable"
        );
        assert!(
            columns.contains_key("y2"),
            "Should contain y2 response variable"
        );
        let y1_info = columns.get("y1").expect("Should have y1");
        let y1_roles = y1_info
            .get("roles")
            .expect("Should have roles")
            .as_array()
            .expect("Should be array");
        assert!(
            y1_roles.iter().any(|r| r.as_str() == Some("Response")),
            "y1 should have Response role"
        );
        let y2_info = columns.get("y2").expect("Should have y2");
        let y2_roles = y2_info
            .get("roles")
            .expect("Should have roles")
            .as_array()
            .expect("Should be array");
        assert!(
            y2_roles.iter().any(|r| r.as_str() == Some("Response")),
            "y2 should have Response role"
        );
        assert_eq!(y1_info.get("id").expect("Should have id").as_u64(), Some(1));
        assert_eq!(y2_info.get("id").expect("Should have id").as_u64(), Some(1));
        let all_columns = result
            .get("all_generated_columns")
            .expect("Should have all_generated_columns")
            .as_array()
            .expect("Should be an array");
        let column_names: Vec<&str> = all_columns
            .iter()
            .map(|col| col.as_str().unwrap())
            .collect();
        assert!(
            column_names.contains(&"y1"),
            "Should contain y1 in generated columns"
        );
        assert!(
            column_names.contains(&"y2"),
            "Should contain y2 in generated columns"
        );
        assert!(
            column_names.contains(&"x"),
            "Should contain x in generated columns"
        );
        assert!(
            column_names.contains(&"intercept"),
            "Should contain intercept in generated columns"
        );
    }

    /// Three bound responses all share id 1; the predictors that follow get
    /// ids 2 and 3.
    #[test]
    fn test_multivariate_response_three_variables() {
        let formula = "bind(y1, y2, y3) ~ x + z";
        let result = parse_formula(formula).expect("Should parse successfully");
        let columns = result
            .get("columns")
            .expect("Should have columns")
            .as_object()
            .expect("Should be an object");
        for var_name in ["y1", "y2", "y3"] {
            assert!(
                columns.contains_key(var_name),
                "Should contain {} response variable",
                var_name
            );
            // Build the panic message lazily, only if the lookup fails.
            let var_info = columns
                .get(var_name)
                .unwrap_or_else(|| panic!("Should have {}", var_name));
            let roles = var_info
                .get("roles")
                .expect("Should have roles")
                .as_array()
                .expect("Should be array");
            assert!(
                roles.iter().any(|r| r.as_str() == Some("Response")),
                "{} should have Response role",
                var_name
            );
            assert_eq!(
                var_info.get("id").expect("Should have id").as_u64(),
                Some(1)
            );
        }
        let x_info = columns.get("x").expect("Should have x");
        let z_info = columns.get("z").expect("Should have z");
        assert_eq!(x_info.get("id").expect("Should have id").as_u64(), Some(2));
        assert_eq!(z_info.get("id").expect("Should have id").as_u64(), Some(3));
    }

    /// A multivariate response combined with a family clause keeps the family
    /// name and all response/predictor columns.
    #[test]
    fn test_multivariate_response_with_family() {
        let formula = "bind(y1, y2) ~ x, family = gaussian";
        let result = parse_formula(formula).expect("Should parse successfully");
        let metadata = result.get("metadata").expect("Should have metadata");
        assert_eq!(
            metadata
                .get("family")
                .expect("Should have family")
                .as_str(),
            Some("gaussian")
        );
        let columns = result
            .get("columns")
            .expect("Should have columns")
            .as_object()
            .expect("Should be an object");
        assert!(
            columns.contains_key("y1"),
            "Should contain y1 response variable"
        );
        assert!(
            columns.contains_key("y2"),
            "Should contain y2 response variable"
        );
        assert!(
            columns.contains_key("x"),
            "Should contain x predictor variable"
        );
    }

    /// `bind()` with a single variable is rejected with a message stating the
    /// two-variable minimum.
    #[test]
    fn test_multivariate_response_invalid_single_variable() {
        let formula = "bind(y1) ~ x";
        let err = parse_formula(formula).expect_err("bind() with single variable should fail");
        assert!(
            err.to_string()
                .contains("bind() requires at least 2 variables"),
            "Error should mention bind() requires at least 2 variables"
        );
    }
}