pub mod bitmask;
pub mod column;
pub mod csv;
pub mod dataframe;
pub mod expr;
pub mod kahan;
pub mod nlp;
pub mod regex_engine;
pub mod rng;
pub mod tidyview;
use pyo3::prelude::*;
use pyo3::exceptions::{PyRuntimeError, PyValueError};
use pyo3::types::{PyDict, PyList};
#[pyclass(name = "DataFrame")]
#[derive(Clone)]
struct PyDataFrame {
inner: dataframe::DataFrame,
}
#[pymethods]
impl PyDataFrame {
#[new]
fn new(columns: &Bound<'_, PyDict>) -> PyResult<Self> {
let mut cols: Vec<(String, column::Column)> = Vec::new();
for (key, value) in columns.iter() {
let name: String = key.extract()?;
let list = value.downcast::<PyList>()?;
let col = py_list_to_column(list)?;
cols.push((name, col));
}
let df = dataframe::DataFrame::from_columns(cols)
.map_err(|e| PyValueError::new_err(format!("{}", e)))?;
Ok(PyDataFrame { inner: df })
}
fn nrows(&self) -> usize {
self.inner.nrows()
}
fn ncols(&self) -> usize {
self.inner.ncols()
}
fn column_names(&self) -> Vec<String> {
self.inner.column_names().into_iter().map(|s| s.to_string()).collect()
}
fn get_column(&self, name: &str) -> PyResult<PyObject> {
let col = self.inner.get_column(name)
.ok_or_else(|| PyValueError::new_err(format!("column `{}` not found", name)))?;
Python::with_gil(|py| column_to_py(py, col))
}
fn __repr__(&self) -> String {
format!("DataFrame(nrows={}, ncols={}, columns={:?})",
self.inner.nrows(), self.inner.ncols(), self.inner.column_names())
}
}
#[pyclass(name = "TidyView", unsendable)]
#[derive(Clone)]
struct PyTidyView {
inner: tidyview::TidyView,
}
#[pymethods]
impl PyTidyView {
#[new]
fn new(df: &PyDataFrame) -> Self {
let tv = tidyview::TidyView::new(df.inner.clone());
PyTidyView { inner: tv }
}
fn nrows(&self) -> usize {
self.inner.nrows()
}
fn ncols(&self) -> usize {
self.inner.ncols()
}
fn column_names(&self) -> Vec<String> {
self.inner.column_names().into_iter().map(|s| s.to_string()).collect()
}
fn filter_gt_int(&self, col_name: &str, value: i64) -> PyResult<Self> {
let pred = expr::binop(
expr::BinOp::Gt,
expr::col(col_name),
expr::DExpr::LitInt(value),
);
let inner = self.inner.filter(&pred)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyTidyView { inner })
}
fn filter_lt_int(&self, col_name: &str, value: i64) -> PyResult<Self> {
let pred = expr::binop(
expr::BinOp::Lt,
expr::col(col_name),
expr::DExpr::LitInt(value),
);
let inner = self.inner.filter(&pred)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyTidyView { inner })
}
fn filter_eq_int(&self, col_name: &str, value: i64) -> PyResult<Self> {
let pred = expr::binop(
expr::BinOp::Eq,
expr::col(col_name),
expr::DExpr::LitInt(value),
);
let inner = self.inner.filter(&pred)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyTidyView { inner })
}
fn filter_eq_str(&self, col_name: &str, value: &str) -> PyResult<Self> {
let pred = expr::binop(
expr::BinOp::Eq,
expr::col(col_name),
expr::DExpr::LitStr(value.to_string()),
);
let inner = self.inner.filter(&pred)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyTidyView { inner })
}
fn filter_gt_float(&self, col_name: &str, value: f64) -> PyResult<Self> {
let pred = expr::binop(
expr::BinOp::Gt,
expr::col(col_name),
expr::DExpr::LitFloat(value),
);
let inner = self.inner.filter(&pred)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyTidyView { inner })
}
fn select(&self, columns: Vec<String>) -> PyResult<Self> {
let refs: Vec<&str> = columns.iter().map(|s| s.as_str()).collect();
let inner = self.inner.select(&refs)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyTidyView { inner })
}
fn arrange(&self, col_name: &str) -> PyResult<Self> {
let keys = vec![tidyview::ArrangeKey {
col_name: col_name.to_string(),
descending: false,
}];
let inner = self.inner.arrange(&keys)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyTidyView { inner })
}
fn arrange_desc(&self, col_name: &str) -> PyResult<Self> {
let keys = vec![tidyview::ArrangeKey {
col_name: col_name.to_string(),
descending: true,
}];
let inner = self.inner.arrange(&keys)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyTidyView { inner })
}
fn slice_head(&self, n: usize) -> Self {
PyTidyView { inner: self.inner.slice_head(n) }
}
fn slice_tail(&self, n: usize) -> Self {
PyTidyView { inner: self.inner.slice_tail(n) }
}
fn slice_sample(&self, n: usize, seed: u64) -> Self {
PyTidyView { inner: self.inner.slice_sample(n, seed) }
}
fn distinct(&self, columns: Vec<String>) -> PyResult<Self> {
let refs: Vec<&str> = columns.iter().map(|s| s.as_str()).collect();
let inner = self.inner.distinct(&refs)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyTidyView { inner })
}
fn group_summarise(&self, group_cols: Vec<String>, agg_col: &str, agg_fn: &str, output_name: &str) -> PyResult<PyDataFrame> {
let refs: Vec<&str> = group_cols.iter().map(|s| s.as_str()).collect();
let agg = parse_agg(agg_fn, agg_col)?;
let grouped = self.inner.group_by(&refs)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
let result_df = grouped.summarise(&[(output_name, agg)])
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyDataFrame { inner: result_df })
}
fn inner_join(&self, other: &PyTidyView, by: Vec<String>) -> PyResult<PyDataFrame> {
let pairs: Vec<(&str, &str)> = by.iter().map(|s| (s.as_str(), s.as_str())).collect();
let result_df = self.inner.inner_join(&other.inner, &pairs)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyDataFrame { inner: result_df })
}
fn left_join(&self, other: &PyTidyView, by: Vec<String>) -> PyResult<PyDataFrame> {
let pairs: Vec<(&str, &str)> = by.iter().map(|s| (s.as_str(), s.as_str())).collect();
let result_df = self.inner.left_join(&other.inner, &pairs)
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyDataFrame { inner: result_df })
}
fn materialize(&self) -> PyResult<PyDataFrame> {
let df = self.inner.materialize()
.map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
Ok(PyDataFrame { inner: df })
}
fn __repr__(&self) -> String {
format!("TidyView(nrows={}, ncols={}, columns={:?})",
self.nrows(), self.ncols(), self.column_names())
}
}
/// Python wrapper around [`kahan::KahanAccumulator`], exported as `KahanAccumulator`.
#[pyclass(name = "KahanAccumulator")]
struct PyKahanAccumulator {
    inner: kahan::KahanAccumulator,
}

#[pymethods]
impl PyKahanAccumulator {
    /// Creates a fresh accumulator via `KahanAccumulator::new`.
    #[new]
    fn new() -> Self {
        let inner = kahan::KahanAccumulator::new();
        Self { inner }
    }

    /// Feeds a single value into the accumulator.
    fn add(&mut self, value: f64) {
        self.inner.add(value);
    }

    /// Feeds every value of a list into the accumulator.
    fn add_slice(&mut self, values: Vec<f64>) {
        self.inner.add_slice(&values);
    }

    /// Returns the accumulator's `finalize()` result.
    fn finalize(&self) -> f64 {
        self.inner.finalize()
    }

    /// Returns the accumulator's `count()` result.
    fn count(&self) -> usize {
        self.inner.count()
    }
}
/// Python wrapper around [`rng::Rng`], exported as `Rng`.
#[pyclass(name = "Rng")]
struct PyRng {
    inner: rng::Rng,
}

#[pymethods]
impl PyRng {
    /// Creates a generator via `Rng::seeded(seed)`.
    #[new]
    fn new(seed: u64) -> Self {
        let inner = rng::Rng::seeded(seed);
        Self { inner }
    }

    /// Returns the generator's next `u64`.
    fn next_u64(&mut self) -> u64 {
        self.inner.next_u64()
    }

    /// Returns the generator's next `f64`.
    fn next_f64(&mut self) -> f64 {
        self.inner.next_f64()
    }

    /// Returns the generator's `next_normal()` value.
    fn next_normal(&mut self) -> f64 {
        self.inner.next_normal()
    }

    /// Returns a new `Rng` wrapping the result of `Rng::fork` on this generator.
    fn fork(&mut self) -> Self {
        let inner = self.inner.fork();
        Self { inner }
    }
}
/// Parses CSV text with the default `CsvConfig` and returns a `DataFrame`.
///
/// Parse failures are raised as `ValueError`.
#[pyfunction]
fn read_csv(text: &str) -> PyResult<PyDataFrame> {
    let config = csv::CsvConfig::default();
    let parsed = csv::CsvReader::new(config).parse(text.as_bytes());
    match parsed {
        Ok(inner) => Ok(PyDataFrame { inner }),
        Err(err) => Err(PyValueError::new_err(err.to_string())),
    }
}
/// Parses CSV text using a custom single-character delimiter.
///
/// `delimiter` must be exactly one ASCII character. An empty string falls
/// back to `,`. Any other value raises `ValueError`, as do parse failures.
#[pyfunction]
fn read_csv_delim(text: &str, delimiter: &str) -> PyResult<PyDataFrame> {
    // The CSV config takes a single byte. Taking only the first byte of a longer
    // string would silently drop the rest, and for a non-ASCII character it would
    // yield a UTF-8 lead byte that is not a valid delimiter on its own.
    let delim = match delimiter.as_bytes() {
        [] => b',',
        [byte] if byte.is_ascii() => *byte,
        _ => {
            return Err(PyValueError::new_err(format!(
                "delimiter must be a single ASCII character, got {:?}",
                delimiter
            )))
        }
    };
    let config = csv::CsvConfig {
        delimiter: delim,
        ..Default::default()
    };
    let reader = csv::CsvReader::new(config);
    let df = reader
        .parse(text.as_bytes())
        .map_err(|err| PyValueError::new_err(err.to_string()))?;
    Ok(PyDataFrame { inner: df })
}
/// Returns whether `regex_engine::is_match` reports a match of `pattern` in `text`.
///
/// `flags` is optional; an omitted value is passed to the engine as `""`.
#[pyfunction]
#[pyo3(signature = (pattern, text, flags=None))]
fn regex_is_match(pattern: &str, text: &str, flags: Option<&str>) -> bool {
    let flag_str = flags.unwrap_or_default();
    let haystack = text.as_bytes();
    regex_engine::is_match(pattern, flag_str, haystack)
}
/// Returns the pair produced by `regex_engine::find`, or `None` when it finds nothing.
///
/// `flags` is optional; an omitted value is passed to the engine as `""`.
// NOTE(review): the engine receives the UTF-8 bytes of `text`, so the returned
// pair is presumably byte offsets rather than Python character indices — confirm.
#[pyfunction]
#[pyo3(signature = (pattern, text, flags=None))]
fn regex_find(pattern: &str, text: &str, flags: Option<&str>) -> Option<(usize, usize)> {
    let flag_str = flags.unwrap_or_default();
    let haystack = text.as_bytes();
    regex_engine::find(pattern, flag_str, haystack)
}
/// Returns every pair produced by `regex_engine::find_all` for `pattern` in `text`.
///
/// `flags` is optional; an omitted value is passed to the engine as `""`.
// NOTE(review): the engine receives the UTF-8 bytes of `text`, so the returned
// pairs are presumably byte offsets rather than Python character indices — confirm.
#[pyfunction]
#[pyo3(signature = (pattern, text, flags=None))]
fn regex_find_all(pattern: &str, text: &str, flags: Option<&str>) -> Vec<(usize, usize)> {
    let flag_str = flags.unwrap_or_default();
    let haystack = text.as_bytes();
    regex_engine::find_all(pattern, flag_str, haystack)
}
/// Returns the pairs produced by `regex_engine::split` for `pattern` in `text`.
///
/// `flags` is optional; an omitted value is passed to the engine as `""`.
// NOTE(review): the result is a list of `(usize, usize)` pairs, not substrings;
// presumably byte ranges of the pieces between matches — confirm against
// `regex_engine::split`.
#[pyfunction]
#[pyo3(signature = (pattern, text, flags=None))]
fn regex_split(pattern: &str, text: &str, flags: Option<&str>) -> Vec<(usize, usize)> {
    let flag_str = flags.unwrap_or_default();
    let haystack = text.as_bytes();
    regex_engine::split(pattern, flag_str, haystack)
}
/// Levenshtein distance between `a` and `b`, as computed by `nlp::levenshtein`.
#[pyfunction]
fn levenshtein(a: &str, b: &str) -> usize {
    let (left, right) = (a, b);
    nlp::levenshtein(left, right)
}
/// Similarity score for `a` and `b`, as computed by `nlp::levenshtein_similarity`.
#[pyfunction]
fn levenshtein_similarity(a: &str, b: &str) -> f64 {
    let (left, right) = (a, b);
    nlp::levenshtein_similarity(left, right)
}
/// Similarity score for `a` and `b` with n-gram size `n`, as computed by
/// `nlp::jaccard_ngram_similarity`.
#[pyfunction]
fn jaccard_ngram_similarity(a: &str, b: &str, n: usize) -> f64 {
    let (left, right) = (a, b);
    nlp::jaccard_ngram_similarity(left, right, n)
}
/// Returns the string-to-count map produced by `nlp::char_ngrams(text, n)`.
///
/// The result is an ordered map and is handed to Python as a `dict`.
#[pyfunction]
fn char_ngrams(text: &str, n: usize) -> std::collections::BTreeMap<String, usize> {
    let size = n;
    nlp::char_ngrams(text, size)
}
/// Returns the string-to-count map produced by `nlp::word_ngrams(text, n)`.
///
/// The result is an ordered map and is handed to Python as a `dict`.
#[pyfunction]
fn word_ngrams(text: &str, n: usize) -> std::collections::BTreeMap<String, usize> {
    let size = n;
    nlp::word_ngrams(text, size)
}
/// Returns the `(usize, usize)` pairs produced by `nlp::tokenize_whitespace`.
// NOTE(review): the pairs are presumably token start/end offsets into `text`;
// whether they are byte or character offsets is not visible here — confirm.
#[pyfunction]
fn tokenize_whitespace(text: &str) -> Vec<(usize, usize)> {
    let input = text;
    nlp::tokenize_whitespace(input)
}
/// Returns the list of strings produced by `nlp::tokenize_words`.
#[pyfunction]
fn tokenize_words(text: &str) -> Vec<String> {
    let input = text;
    nlp::tokenize_words(input)
}
/// Returns the string-to-float map produced by `nlp::term_frequency`.
///
/// The result is an ordered map and is handed to Python as a `dict`.
#[pyfunction]
fn term_frequency(text: &str) -> std::collections::BTreeMap<String, f64> {
    let input = text;
    nlp::term_frequency(input)
}
/// Sums a list of floats with `kahan::kahan_sum`.
#[pyfunction]
fn kahan_sum(values: Vec<f64>) -> f64 {
    let samples = values;
    kahan::kahan_sum(&samples)
}
fn py_list_to_column(list: &Bound<'_, PyList>) -> PyResult<column::Column> {
if list.is_empty() {
return Ok(column::Column::Str(Vec::new()));
}
let first = list.get_item(0)?;
if first.extract::<bool>().is_ok() {
let vals: Vec<bool> = list.extract()?;
Ok(column::Column::Bool(vals))
} else if first.extract::<i64>().is_ok() {
let vals: Vec<i64> = list.extract()?;
Ok(column::Column::Int(vals))
} else if first.extract::<f64>().is_ok() {
let vals: Vec<f64> = list.extract()?;
Ok(column::Column::Float(vals))
} else {
let vals: Vec<String> = list.extract()?;
Ok(column::Column::Str(vals))
}
}
/// Converts a [`column::Column`] into a Python object by calling `to_object`
/// on the vector held by whichever variant is present.
fn column_to_py(py: Python<'_>, col: &column::Column) -> PyResult<PyObject> {
    let object = match col {
        column::Column::Int(values) => values.to_object(py),
        column::Column::Float(values) => values.to_object(py),
        column::Column::Str(values) => values.to_object(py),
        column::Column::Bool(values) => values.to_object(py),
    };
    Ok(object)
}
/// Resolves an aggregation name to a [`tidyview::TidyAgg`] over column `col`.
///
/// Matching is case-insensitive. `"count"` takes no column; every other
/// recognised name wraps `col`.
///
/// # Errors
///
/// Returns a `ValueError` for a name that is not recognised.
fn parse_agg(name: &str, col: &str) -> PyResult<tidyview::TidyAgg> {
    type AggCtor = fn(String) -> tidyview::TidyAgg;

    let normalized = name.to_lowercase();
    // Pick the variant constructor first; the column name is only needed once
    // a column-based aggregation has been recognised.
    let ctor: AggCtor = match normalized.as_str() {
        "count" => return Ok(tidyview::TidyAgg::Count),
        "sum" => tidyview::TidyAgg::Sum,
        "mean" => tidyview::TidyAgg::Mean,
        "min" => tidyview::TidyAgg::Min,
        "max" => tidyview::TidyAgg::Max,
        "sd" => tidyview::TidyAgg::Sd,
        "var" => tidyview::TidyAgg::Var,
        "first" => tidyview::TidyAgg::First,
        "last" => tidyview::TidyAgg::Last,
        "n_distinct" => tidyview::TidyAgg::NDistinct,
        _ => {
            return Err(PyValueError::new_err(format!(
                "unknown aggregation: {}",
                name
            )))
        }
    };
    Ok(ctor(col.to_string()))
}
#[pymodule]
fn virtual_frame(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<PyDataFrame>()?;
m.add_class::<PyTidyView>()?;
m.add_class::<PyKahanAccumulator>()?;
m.add_class::<PyRng>()?;
m.add_function(wrap_pyfunction!(read_csv, m)?)?;
m.add_function(wrap_pyfunction!(read_csv_delim, m)?)?;
m.add_function(wrap_pyfunction!(regex_is_match, m)?)?;
m.add_function(wrap_pyfunction!(regex_find, m)?)?;
m.add_function(wrap_pyfunction!(regex_find_all, m)?)?;
m.add_function(wrap_pyfunction!(regex_split, m)?)?;
m.add_function(wrap_pyfunction!(levenshtein, m)?)?;
m.add_function(wrap_pyfunction!(levenshtein_similarity, m)?)?;
m.add_function(wrap_pyfunction!(jaccard_ngram_similarity, m)?)?;
m.add_function(wrap_pyfunction!(char_ngrams, m)?)?;
m.add_function(wrap_pyfunction!(word_ngrams, m)?)?;
m.add_function(wrap_pyfunction!(tokenize_whitespace, m)?)?;
m.add_function(wrap_pyfunction!(tokenize_words, m)?)?;
m.add_function(wrap_pyfunction!(term_frequency, m)?)?;
m.add_function(wrap_pyfunction!(kahan_sum, m)?)?;
Ok(())
}