1pub mod bitmask;
7pub mod column;
8pub mod csv;
9pub mod dataframe;
10pub mod expr;
11pub mod kahan;
12pub mod nlp;
13pub mod regex_engine;
14pub mod rng;
15pub mod tidyview;
16
17use pyo3::prelude::*;
20use pyo3::exceptions::{PyRuntimeError, PyValueError};
21use pyo3::types::{PyDict, PyList};
22
23#[pyclass(name = "DataFrame")]
27#[derive(Clone)]
28struct PyDataFrame {
29 inner: dataframe::DataFrame,
30}
31
32#[pymethods]
33impl PyDataFrame {
34 #[new]
36 fn new(columns: &Bound<'_, PyDict>) -> PyResult<Self> {
37 let mut cols: Vec<(String, column::Column)> = Vec::new();
38 for (key, value) in columns.iter() {
39 let name: String = key.extract()?;
40 let list = value.downcast::<PyList>()?;
41 let col = py_list_to_column(list)?;
42 cols.push((name, col));
43 }
44 let df = dataframe::DataFrame::from_columns(cols)
45 .map_err(|e| PyValueError::new_err(format!("{}", e)))?;
46 Ok(PyDataFrame { inner: df })
47 }
48
49 fn nrows(&self) -> usize {
51 self.inner.nrows()
52 }
53
54 fn ncols(&self) -> usize {
56 self.inner.ncols()
57 }
58
59 fn column_names(&self) -> Vec<String> {
61 self.inner.column_names().into_iter().map(|s| s.to_string()).collect()
62 }
63
64 fn get_column(&self, name: &str) -> PyResult<PyObject> {
66 let col = self.inner.get_column(name)
67 .ok_or_else(|| PyValueError::new_err(format!("column `{}` not found", name)))?;
68 Python::with_gil(|py| column_to_py(py, col))
69 }
70
71 fn __repr__(&self) -> String {
72 format!("DataFrame(nrows={}, ncols={}, columns={:?})",
73 self.inner.nrows(), self.inner.ncols(), self.inner.column_names())
74 }
75}
76
77#[pyclass(name = "TidyView", unsendable)]
79#[derive(Clone)]
80struct PyTidyView {
81 inner: tidyview::TidyView,
82}
83
84#[pymethods]
85impl PyTidyView {
86 #[new]
88 fn new(df: &PyDataFrame) -> Self {
89 let tv = tidyview::TidyView::new(df.inner.clone());
90 PyTidyView { inner: tv }
91 }
92
93 fn nrows(&self) -> usize {
95 self.inner.nrows()
96 }
97
98 fn ncols(&self) -> usize {
100 self.inner.ncols()
101 }
102
103 fn column_names(&self) -> Vec<String> {
105 self.inner.column_names().into_iter().map(|s| s.to_string()).collect()
106 }
107
108 fn filter_gt_int(&self, col_name: &str, value: i64) -> PyResult<Self> {
110 let pred = expr::binop(
111 expr::BinOp::Gt,
112 expr::col(col_name),
113 expr::DExpr::LitInt(value),
114 );
115 let inner = self.inner.filter(&pred)
116 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
117 Ok(PyTidyView { inner })
118 }
119
120 fn filter_lt_int(&self, col_name: &str, value: i64) -> PyResult<Self> {
122 let pred = expr::binop(
123 expr::BinOp::Lt,
124 expr::col(col_name),
125 expr::DExpr::LitInt(value),
126 );
127 let inner = self.inner.filter(&pred)
128 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
129 Ok(PyTidyView { inner })
130 }
131
132 fn filter_eq_int(&self, col_name: &str, value: i64) -> PyResult<Self> {
134 let pred = expr::binop(
135 expr::BinOp::Eq,
136 expr::col(col_name),
137 expr::DExpr::LitInt(value),
138 );
139 let inner = self.inner.filter(&pred)
140 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
141 Ok(PyTidyView { inner })
142 }
143
144 fn filter_eq_str(&self, col_name: &str, value: &str) -> PyResult<Self> {
146 let pred = expr::binop(
147 expr::BinOp::Eq,
148 expr::col(col_name),
149 expr::DExpr::LitStr(value.to_string()),
150 );
151 let inner = self.inner.filter(&pred)
152 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
153 Ok(PyTidyView { inner })
154 }
155
156 fn filter_gt_float(&self, col_name: &str, value: f64) -> PyResult<Self> {
158 let pred = expr::binop(
159 expr::BinOp::Gt,
160 expr::col(col_name),
161 expr::DExpr::LitFloat(value),
162 );
163 let inner = self.inner.filter(&pred)
164 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
165 Ok(PyTidyView { inner })
166 }
167
168 fn select(&self, columns: Vec<String>) -> PyResult<Self> {
170 let refs: Vec<&str> = columns.iter().map(|s| s.as_str()).collect();
171 let inner = self.inner.select(&refs)
172 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
173 Ok(PyTidyView { inner })
174 }
175
176 fn arrange(&self, col_name: &str) -> PyResult<Self> {
178 let keys = vec![tidyview::ArrangeKey {
179 col_name: col_name.to_string(),
180 descending: false,
181 }];
182 let inner = self.inner.arrange(&keys)
183 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
184 Ok(PyTidyView { inner })
185 }
186
187 fn arrange_desc(&self, col_name: &str) -> PyResult<Self> {
189 let keys = vec![tidyview::ArrangeKey {
190 col_name: col_name.to_string(),
191 descending: true,
192 }];
193 let inner = self.inner.arrange(&keys)
194 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
195 Ok(PyTidyView { inner })
196 }
197
198 fn slice_head(&self, n: usize) -> Self {
200 PyTidyView { inner: self.inner.slice_head(n) }
201 }
202
203 fn slice_tail(&self, n: usize) -> Self {
205 PyTidyView { inner: self.inner.slice_tail(n) }
206 }
207
208 fn slice_sample(&self, n: usize, seed: u64) -> Self {
210 PyTidyView { inner: self.inner.slice_sample(n, seed) }
211 }
212
213 fn distinct(&self, columns: Vec<String>) -> PyResult<Self> {
215 let refs: Vec<&str> = columns.iter().map(|s| s.as_str()).collect();
216 let inner = self.inner.distinct(&refs)
217 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
218 Ok(PyTidyView { inner })
219 }
220
221 fn group_summarise(&self, group_cols: Vec<String>, agg_col: &str, agg_fn: &str, output_name: &str) -> PyResult<PyDataFrame> {
228 let refs: Vec<&str> = group_cols.iter().map(|s| s.as_str()).collect();
229 let agg = parse_agg(agg_fn, agg_col)?;
230 let grouped = self.inner.group_by(&refs)
231 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
232 let result_df = grouped.summarise(&[(output_name, agg)])
233 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
234 Ok(PyDataFrame { inner: result_df })
235 }
236
237 fn inner_join(&self, other: &PyTidyView, by: Vec<String>) -> PyResult<PyDataFrame> {
241 let pairs: Vec<(&str, &str)> = by.iter().map(|s| (s.as_str(), s.as_str())).collect();
242 let result_df = self.inner.inner_join(&other.inner, &pairs)
243 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
244 Ok(PyDataFrame { inner: result_df })
245 }
246
247 fn left_join(&self, other: &PyTidyView, by: Vec<String>) -> PyResult<PyDataFrame> {
249 let pairs: Vec<(&str, &str)> = by.iter().map(|s| (s.as_str(), s.as_str())).collect();
250 let result_df = self.inner.left_join(&other.inner, &pairs)
251 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
252 Ok(PyDataFrame { inner: result_df })
253 }
254
255 fn materialize(&self) -> PyResult<PyDataFrame> {
257 let df = self.inner.materialize()
258 .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
259 Ok(PyDataFrame { inner: df })
260 }
261
262 fn __repr__(&self) -> String {
263 format!("TidyView(nrows={}, ncols={}, columns={:?})",
264 self.nrows(), self.ncols(), self.column_names())
265 }
266}
267
268#[pyclass(name = "KahanAccumulator")]
270struct PyKahanAccumulator {
271 inner: kahan::KahanAccumulator,
272}
273
274#[pymethods]
275impl PyKahanAccumulator {
276 #[new]
277 fn new() -> Self {
278 PyKahanAccumulator { inner: kahan::KahanAccumulator::new() }
279 }
280
281 fn add(&mut self, value: f64) {
282 self.inner.add(value);
283 }
284
285 fn add_slice(&mut self, values: Vec<f64>) {
286 self.inner.add_slice(&values);
287 }
288
289 fn finalize(&self) -> f64 {
290 self.inner.finalize()
291 }
292
293 fn count(&self) -> usize {
294 self.inner.count()
295 }
296}
297
298#[pyclass(name = "Rng")]
300struct PyRng {
301 inner: rng::Rng,
302}
303
304#[pymethods]
305impl PyRng {
306 #[new]
307 fn new(seed: u64) -> Self {
308 PyRng { inner: rng::Rng::seeded(seed) }
309 }
310
311 fn next_u64(&mut self) -> u64 {
312 self.inner.next_u64()
313 }
314
315 fn next_f64(&mut self) -> f64 {
316 self.inner.next_f64()
317 }
318
319 fn next_normal(&mut self) -> f64 {
320 self.inner.next_normal()
321 }
322
323 fn fork(&mut self) -> Self {
324 PyRng { inner: self.inner.fork() }
325 }
326}
327
328#[pyfunction]
332fn read_csv(text: &str) -> PyResult<PyDataFrame> {
333 let reader = csv::CsvReader::new(csv::CsvConfig::default());
334 let df = reader.parse(text.as_bytes())
335 .map_err(|e| PyValueError::new_err(format!("{}", e)))?;
336 Ok(PyDataFrame { inner: df })
337}
338
339#[pyfunction]
341fn read_csv_delim(text: &str, delimiter: &str) -> PyResult<PyDataFrame> {
342 let delim = delimiter.as_bytes().first().copied().unwrap_or(b',');
343 let config = csv::CsvConfig {
344 delimiter: delim,
345 ..Default::default()
346 };
347 let reader = csv::CsvReader::new(config);
348 let df = reader.parse(text.as_bytes())
349 .map_err(|e| PyValueError::new_err(format!("{}", e)))?;
350 Ok(PyDataFrame { inner: df })
351}
352
353#[pyfunction]
357#[pyo3(signature = (pattern, text, flags=None))]
358fn regex_is_match(pattern: &str, text: &str, flags: Option<&str>) -> bool {
359 regex_engine::is_match(pattern, flags.unwrap_or(""), text.as_bytes())
360}
361
362#[pyfunction]
364#[pyo3(signature = (pattern, text, flags=None))]
365fn regex_find(pattern: &str, text: &str, flags: Option<&str>) -> Option<(usize, usize)> {
366 regex_engine::find(pattern, flags.unwrap_or(""), text.as_bytes())
367}
368
369#[pyfunction]
371#[pyo3(signature = (pattern, text, flags=None))]
372fn regex_find_all(pattern: &str, text: &str, flags: Option<&str>) -> Vec<(usize, usize)> {
373 regex_engine::find_all(pattern, flags.unwrap_or(""), text.as_bytes())
374}
375
376#[pyfunction]
378#[pyo3(signature = (pattern, text, flags=None))]
379fn regex_split(pattern: &str, text: &str, flags: Option<&str>) -> Vec<(usize, usize)> {
380 regex_engine::split(pattern, flags.unwrap_or(""), text.as_bytes())
381}
382
383#[pyfunction]
387fn levenshtein(a: &str, b: &str) -> usize {
388 nlp::levenshtein(a, b)
389}
390
391#[pyfunction]
393fn levenshtein_similarity(a: &str, b: &str) -> f64 {
394 nlp::levenshtein_similarity(a, b)
395}
396
397#[pyfunction]
399fn jaccard_ngram_similarity(a: &str, b: &str, n: usize) -> f64 {
400 nlp::jaccard_ngram_similarity(a, b, n)
401}
402
403#[pyfunction]
405fn char_ngrams(text: &str, n: usize) -> std::collections::BTreeMap<String, usize> {
406 nlp::char_ngrams(text, n)
407}
408
409#[pyfunction]
411fn word_ngrams(text: &str, n: usize) -> std::collections::BTreeMap<String, usize> {
412 nlp::word_ngrams(text, n)
413}
414
415#[pyfunction]
417fn tokenize_whitespace(text: &str) -> Vec<(usize, usize)> {
418 nlp::tokenize_whitespace(text)
419}
420
421#[pyfunction]
423fn tokenize_words(text: &str) -> Vec<String> {
424 nlp::tokenize_words(text)
425}
426
427#[pyfunction]
429fn term_frequency(text: &str) -> std::collections::BTreeMap<String, f64> {
430 nlp::term_frequency(text)
431}
432
433#[pyfunction]
435fn kahan_sum(values: Vec<f64>) -> f64 {
436 kahan::kahan_sum(&values)
437}
438
439fn py_list_to_column(list: &Bound<'_, PyList>) -> PyResult<column::Column> {
442 if list.is_empty() {
443 return Ok(column::Column::Str(Vec::new()));
444 }
445
446 let first = list.get_item(0)?;
448 if first.extract::<bool>().is_ok() {
449 let vals: Vec<bool> = list.extract()?;
450 Ok(column::Column::Bool(vals))
451 } else if first.extract::<i64>().is_ok() {
452 let vals: Vec<i64> = list.extract()?;
453 Ok(column::Column::Int(vals))
454 } else if first.extract::<f64>().is_ok() {
455 let vals: Vec<f64> = list.extract()?;
456 Ok(column::Column::Float(vals))
457 } else {
458 let vals: Vec<String> = list.extract()?;
459 Ok(column::Column::Str(vals))
460 }
461}
462
463fn column_to_py(py: Python<'_>, col: &column::Column) -> PyResult<PyObject> {
464 match col {
465 column::Column::Int(v) => Ok(v.to_object(py)),
466 column::Column::Float(v) => Ok(v.to_object(py)),
467 column::Column::Str(v) => Ok(v.to_object(py)),
468 column::Column::Bool(v) => Ok(v.to_object(py)),
469 }
470}
471
472fn parse_agg(name: &str, col: &str) -> PyResult<tidyview::TidyAgg> {
473 let c = col.to_string();
474 match name.to_lowercase().as_str() {
475 "count" => Ok(tidyview::TidyAgg::Count),
476 "sum" => Ok(tidyview::TidyAgg::Sum(c)),
477 "mean" => Ok(tidyview::TidyAgg::Mean(c)),
478 "min" => Ok(tidyview::TidyAgg::Min(c)),
479 "max" => Ok(tidyview::TidyAgg::Max(c)),
480 "sd" => Ok(tidyview::TidyAgg::Sd(c)),
481 "var" => Ok(tidyview::TidyAgg::Var(c)),
482 "first" => Ok(tidyview::TidyAgg::First(c)),
483 "last" => Ok(tidyview::TidyAgg::Last(c)),
484 "n_distinct" => Ok(tidyview::TidyAgg::NDistinct(c)),
485 _ => Err(PyValueError::new_err(format!("unknown aggregation: {}", name))),
486 }
487}
488
489#[pymodule]
493fn virtual_frame(m: &Bound<'_, PyModule>) -> PyResult<()> {
494 m.add_class::<PyDataFrame>()?;
496 m.add_class::<PyTidyView>()?;
497 m.add_class::<PyKahanAccumulator>()?;
498 m.add_class::<PyRng>()?;
499
500 m.add_function(wrap_pyfunction!(read_csv, m)?)?;
502 m.add_function(wrap_pyfunction!(read_csv_delim, m)?)?;
503
504 m.add_function(wrap_pyfunction!(regex_is_match, m)?)?;
506 m.add_function(wrap_pyfunction!(regex_find, m)?)?;
507 m.add_function(wrap_pyfunction!(regex_find_all, m)?)?;
508 m.add_function(wrap_pyfunction!(regex_split, m)?)?;
509
510 m.add_function(wrap_pyfunction!(levenshtein, m)?)?;
512 m.add_function(wrap_pyfunction!(levenshtein_similarity, m)?)?;
513 m.add_function(wrap_pyfunction!(jaccard_ngram_similarity, m)?)?;
514 m.add_function(wrap_pyfunction!(char_ngrams, m)?)?;
515 m.add_function(wrap_pyfunction!(word_ngrams, m)?)?;
516 m.add_function(wrap_pyfunction!(tokenize_whitespace, m)?)?;
517 m.add_function(wrap_pyfunction!(tokenize_words, m)?)?;
518 m.add_function(wrap_pyfunction!(term_frequency, m)?)?;
519
520 m.add_function(wrap_pyfunction!(kahan_sum, m)?)?;
522
523 Ok(())
524}