// virtual_frame/lib.rs

//! virtual-frame — Deterministic data pipeline toolkit for LLM training.
//!
//! Bitmask-filtered virtual views, NFA regex, Kahan summation, NLP primitives,
//! CSV ingestion, and a deterministic RNG. Python bindings via PyO3.
pub mod bitmask;
pub mod column;
pub mod csv;
pub mod dataframe;
pub mod expr;
pub mod kahan;
pub mod nlp;
pub mod regex_engine;
pub mod rng;
pub mod tidyview;

// ── PyO3 Python bindings ──────────────────────────────────────────────────

use pyo3::exceptions::{PyRuntimeError, PyValueError};
use pyo3::prelude::*;
use pyo3::types::{PyDict, PyList};
22
23// ── Python wrappers ───────────────────────────────────────────────────────
24
/// Python-visible DataFrame wrapper.
///
/// Owns a concrete `dataframe::DataFrame`; exposed to Python under the
/// class name `DataFrame`.
#[pyclass(name = "DataFrame")]
#[derive(Clone)]
struct PyDataFrame {
    // The wrapped Rust frame holding all column data.
    inner: dataframe::DataFrame,
}
31
32#[pymethods]
33impl PyDataFrame {
34    /// Create a DataFrame from a dict of column_name → list.
35    #[new]
36    fn new(columns: &Bound<'_, PyDict>) -> PyResult<Self> {
37        let mut cols: Vec<(String, column::Column)> = Vec::new();
38        for (key, value) in columns.iter() {
39            let name: String = key.extract()?;
40            let list = value.downcast::<PyList>()?;
41            let col = py_list_to_column(list)?;
42            cols.push((name, col));
43        }
44        let df = dataframe::DataFrame::from_columns(cols)
45            .map_err(|e| PyValueError::new_err(format!("{}", e)))?;
46        Ok(PyDataFrame { inner: df })
47    }
48
49    /// Number of rows.
50    fn nrows(&self) -> usize {
51        self.inner.nrows()
52    }
53
54    /// Number of columns.
55    fn ncols(&self) -> usize {
56        self.inner.ncols()
57    }
58
59    /// Column names.
60    fn column_names(&self) -> Vec<String> {
61        self.inner.column_names().into_iter().map(|s| s.to_string()).collect()
62    }
63
64    /// Get a column as a Python list.
65    fn get_column(&self, name: &str) -> PyResult<PyObject> {
66        let col = self.inner.get_column(name)
67            .ok_or_else(|| PyValueError::new_err(format!("column `{}` not found", name)))?;
68        Python::with_gil(|py| column_to_py(py, col))
69    }
70
71    fn __repr__(&self) -> String {
72        format!("DataFrame(nrows={}, ncols={}, columns={:?})",
73            self.inner.nrows(), self.inner.ncols(), self.inner.column_names())
74    }
75}
76
/// Python-visible TidyView wrapper.
///
/// `unsendable`: this class is pinned to the thread that created it —
/// pyo3 raises a Python exception if it is accessed from another thread.
#[pyclass(name = "TidyView", unsendable)]
#[derive(Clone)]
struct PyTidyView {
    // The wrapped lazy view over a DataFrame.
    inner: tidyview::TidyView,
}
83
84#[pymethods]
85impl PyTidyView {
86    /// Create a TidyView from a DataFrame.
87    #[new]
88    fn new(df: &PyDataFrame) -> Self {
89        let tv = tidyview::TidyView::new(df.inner.clone());
90        PyTidyView { inner: tv }
91    }
92
93    /// Number of visible rows.
94    fn nrows(&self) -> usize {
95        self.inner.nrows()
96    }
97
98    /// Number of visible columns.
99    fn ncols(&self) -> usize {
100        self.inner.ncols()
101    }
102
103    /// Visible column names.
104    fn column_names(&self) -> Vec<String> {
105        self.inner.column_names().into_iter().map(|s| s.to_string()).collect()
106    }
107
108    /// Filter rows where column > value (integer).
109    fn filter_gt_int(&self, col_name: &str, value: i64) -> PyResult<Self> {
110        let pred = expr::binop(
111            expr::BinOp::Gt,
112            expr::col(col_name),
113            expr::DExpr::LitInt(value),
114        );
115        let inner = self.inner.filter(&pred)
116            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
117        Ok(PyTidyView { inner })
118    }
119
120    /// Filter rows where column < value (integer).
121    fn filter_lt_int(&self, col_name: &str, value: i64) -> PyResult<Self> {
122        let pred = expr::binop(
123            expr::BinOp::Lt,
124            expr::col(col_name),
125            expr::DExpr::LitInt(value),
126        );
127        let inner = self.inner.filter(&pred)
128            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
129        Ok(PyTidyView { inner })
130    }
131
132    /// Filter rows where column == value (integer).
133    fn filter_eq_int(&self, col_name: &str, value: i64) -> PyResult<Self> {
134        let pred = expr::binop(
135            expr::BinOp::Eq,
136            expr::col(col_name),
137            expr::DExpr::LitInt(value),
138        );
139        let inner = self.inner.filter(&pred)
140            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
141        Ok(PyTidyView { inner })
142    }
143
144    /// Filter rows where column == value (string).
145    fn filter_eq_str(&self, col_name: &str, value: &str) -> PyResult<Self> {
146        let pred = expr::binop(
147            expr::BinOp::Eq,
148            expr::col(col_name),
149            expr::DExpr::LitStr(value.to_string()),
150        );
151        let inner = self.inner.filter(&pred)
152            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
153        Ok(PyTidyView { inner })
154    }
155
156    /// Filter rows where column > value (float).
157    fn filter_gt_float(&self, col_name: &str, value: f64) -> PyResult<Self> {
158        let pred = expr::binop(
159            expr::BinOp::Gt,
160            expr::col(col_name),
161            expr::DExpr::LitFloat(value),
162        );
163        let inner = self.inner.filter(&pred)
164            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
165        Ok(PyTidyView { inner })
166    }
167
168    /// Select specific columns by name.
169    fn select(&self, columns: Vec<String>) -> PyResult<Self> {
170        let refs: Vec<&str> = columns.iter().map(|s| s.as_str()).collect();
171        let inner = self.inner.select(&refs)
172            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
173        Ok(PyTidyView { inner })
174    }
175
176    /// Sort by a column (ascending).
177    fn arrange(&self, col_name: &str) -> PyResult<Self> {
178        let keys = vec![tidyview::ArrangeKey {
179            col_name: col_name.to_string(),
180            descending: false,
181        }];
182        let inner = self.inner.arrange(&keys)
183            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
184        Ok(PyTidyView { inner })
185    }
186
187    /// Sort by a column (descending).
188    fn arrange_desc(&self, col_name: &str) -> PyResult<Self> {
189        let keys = vec![tidyview::ArrangeKey {
190            col_name: col_name.to_string(),
191            descending: true,
192        }];
193        let inner = self.inner.arrange(&keys)
194            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
195        Ok(PyTidyView { inner })
196    }
197
198    /// Take the first n rows.
199    fn slice_head(&self, n: usize) -> Self {
200        PyTidyView { inner: self.inner.slice_head(n) }
201    }
202
203    /// Take the last n rows.
204    fn slice_tail(&self, n: usize) -> Self {
205        PyTidyView { inner: self.inner.slice_tail(n) }
206    }
207
208    /// Deterministic random sample of n rows.
209    fn slice_sample(&self, n: usize, seed: u64) -> Self {
210        PyTidyView { inner: self.inner.slice_sample(n, seed) }
211    }
212
213    /// Distinct rows by specified columns.
214    fn distinct(&self, columns: Vec<String>) -> PyResult<Self> {
215        let refs: Vec<&str> = columns.iter().map(|s| s.as_str()).collect();
216        let inner = self.inner.distinct(&refs)
217            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
218        Ok(PyTidyView { inner })
219    }
220
221    /// Group by columns and summarise with an aggregation.
222    ///
223    /// `agg_fn` is one of: "count", "sum", "mean", "min", "max", "sd", "var",
224    /// "first", "last", "n_distinct".
225    /// `agg_col` is the source column to aggregate.
226    /// `output_name` is the name for the output column (e.g., "mean_score").
227    fn group_summarise(&self, group_cols: Vec<String>, agg_col: &str, agg_fn: &str, output_name: &str) -> PyResult<PyDataFrame> {
228        let refs: Vec<&str> = group_cols.iter().map(|s| s.as_str()).collect();
229        let agg = parse_agg(agg_fn, agg_col)?;
230        let grouped = self.inner.group_by(&refs)
231            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
232        let result_df = grouped.summarise(&[(output_name, agg)])
233            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
234        Ok(PyDataFrame { inner: result_df })
235    }
236
237    /// Inner join with another TidyView on specified columns.
238    ///
239    /// `by` is a list of column names to join on (same name in both views).
240    fn inner_join(&self, other: &PyTidyView, by: Vec<String>) -> PyResult<PyDataFrame> {
241        let pairs: Vec<(&str, &str)> = by.iter().map(|s| (s.as_str(), s.as_str())).collect();
242        let result_df = self.inner.inner_join(&other.inner, &pairs)
243            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
244        Ok(PyDataFrame { inner: result_df })
245    }
246
247    /// Left join with another TidyView on specified columns.
248    fn left_join(&self, other: &PyTidyView, by: Vec<String>) -> PyResult<PyDataFrame> {
249        let pairs: Vec<(&str, &str)> = by.iter().map(|s| (s.as_str(), s.as_str())).collect();
250        let result_df = self.inner.left_join(&other.inner, &pairs)
251            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
252        Ok(PyDataFrame { inner: result_df })
253    }
254
255    /// Materialize the view into a concrete DataFrame.
256    fn materialize(&self) -> PyResult<PyDataFrame> {
257        let df = self.inner.materialize()
258            .map_err(|e| PyRuntimeError::new_err(format!("{}", e)))?;
259        Ok(PyDataFrame { inner: df })
260    }
261
262    fn __repr__(&self) -> String {
263        format!("TidyView(nrows={}, ncols={}, columns={:?})",
264            self.nrows(), self.ncols(), self.column_names())
265    }
266}
267
/// Python-visible Kahan accumulator.
///
/// Wraps `kahan::KahanAccumulator` for compensated floating-point summation.
#[pyclass(name = "KahanAccumulator")]
struct PyKahanAccumulator {
    // The wrapped compensated-sum state.
    inner: kahan::KahanAccumulator,
}
273
#[pymethods]
impl PyKahanAccumulator {
    /// Create an empty accumulator.
    #[new]
    fn new() -> Self {
        PyKahanAccumulator { inner: kahan::KahanAccumulator::new() }
    }

    /// Add a single value to the running compensated sum.
    fn add(&mut self, value: f64) {
        self.inner.add(value);
    }

    /// Add every value in `values` to the running compensated sum.
    fn add_slice(&mut self, values: Vec<f64>) {
        self.inner.add_slice(&values);
    }

    /// Return the current compensated sum (does not reset the accumulator).
    fn finalize(&self) -> f64 {
        self.inner.finalize()
    }

    /// Number of values accumulated so far.
    fn count(&self) -> usize {
        self.inner.count()
    }
}
297
/// Python-visible deterministic RNG.
///
/// Wraps `rng::Rng`; all streams are reproducible from the seed.
#[pyclass(name = "Rng")]
struct PyRng {
    // The wrapped deterministic generator state.
    inner: rng::Rng,
}
303
#[pymethods]
impl PyRng {
    /// Create a generator from a 64-bit seed.
    #[new]
    fn new(seed: u64) -> Self {
        PyRng { inner: rng::Rng::seeded(seed) }
    }

    /// Next raw 64-bit value from the stream.
    fn next_u64(&mut self) -> u64 {
        self.inner.next_u64()
    }

    /// Next float from the stream (range defined by the rng module;
    /// presumably [0, 1) — confirm against `rng::Rng::next_f64`).
    fn next_f64(&mut self) -> f64 {
        self.inner.next_f64()
    }

    /// Next normally-distributed float from the stream.
    fn next_normal(&mut self) -> f64 {
        self.inner.next_normal()
    }

    /// Derive a new generator from this one; advances this generator's
    /// state (note `&mut self`). Stream independence is defined by
    /// `rng::Rng::fork`.
    fn fork(&mut self) -> Self {
        PyRng { inner: self.inner.fork() }
    }
}
327
328// ── CSV functions ─────────────────────────────────────────────────────────
329
330/// Parse CSV text into a DataFrame.
331#[pyfunction]
332fn read_csv(text: &str) -> PyResult<PyDataFrame> {
333    let reader = csv::CsvReader::new(csv::CsvConfig::default());
334    let df = reader.parse(text.as_bytes())
335        .map_err(|e| PyValueError::new_err(format!("{}", e)))?;
336    Ok(PyDataFrame { inner: df })
337}
338
339/// Parse CSV with a custom delimiter.
340#[pyfunction]
341fn read_csv_delim(text: &str, delimiter: &str) -> PyResult<PyDataFrame> {
342    let delim = delimiter.as_bytes().first().copied().unwrap_or(b',');
343    let config = csv::CsvConfig {
344        delimiter: delim,
345        ..Default::default()
346    };
347    let reader = csv::CsvReader::new(config);
348    let df = reader.parse(text.as_bytes())
349        .map_err(|e| PyValueError::new_err(format!("{}", e)))?;
350    Ok(PyDataFrame { inner: df })
351}
352
353// ── Regex functions ───────────────────────────────────────────────────────
354
355/// Test if a regex pattern matches anywhere in the text.
356#[pyfunction]
357#[pyo3(signature = (pattern, text, flags=None))]
358fn regex_is_match(pattern: &str, text: &str, flags: Option<&str>) -> bool {
359    regex_engine::is_match(pattern, flags.unwrap_or(""), text.as_bytes())
360}
361
362/// Find the first match span (start, end) or None.
363#[pyfunction]
364#[pyo3(signature = (pattern, text, flags=None))]
365fn regex_find(pattern: &str, text: &str, flags: Option<&str>) -> Option<(usize, usize)> {
366    regex_engine::find(pattern, flags.unwrap_or(""), text.as_bytes())
367}
368
369/// Find all non-overlapping match spans.
370#[pyfunction]
371#[pyo3(signature = (pattern, text, flags=None))]
372fn regex_find_all(pattern: &str, text: &str, flags: Option<&str>) -> Vec<(usize, usize)> {
373    regex_engine::find_all(pattern, flags.unwrap_or(""), text.as_bytes())
374}
375
376/// Split text by a regex pattern, returning segment spans.
377#[pyfunction]
378#[pyo3(signature = (pattern, text, flags=None))]
379fn regex_split(pattern: &str, text: &str, flags: Option<&str>) -> Vec<(usize, usize)> {
380    regex_engine::split(pattern, flags.unwrap_or(""), text.as_bytes())
381}
382
383// ── NLP functions ─────────────────────────────────────────────────────────
384
/// Levenshtein edit distance between two strings.
///
/// Thin wrapper over `nlp::levenshtein` (the distance unit — chars vs
/// bytes — is defined by the nlp module).
#[pyfunction]
fn levenshtein(a: &str, b: &str) -> usize {
    nlp::levenshtein(a, b)
}
390
/// Normalized Levenshtein similarity in [0.0, 1.0].
///
/// Thin wrapper over `nlp::levenshtein_similarity`.
#[pyfunction]
fn levenshtein_similarity(a: &str, b: &str) -> f64 {
    nlp::levenshtein_similarity(a, b)
}
396
/// Jaccard similarity between character n-gram sets.
///
/// `n` is the n-gram length; wrapper over `nlp::jaccard_ngram_similarity`.
#[pyfunction]
fn jaccard_ngram_similarity(a: &str, b: &str, n: usize) -> f64 {
    nlp::jaccard_ngram_similarity(a, b, n)
}
402
/// Extract character n-grams with frequency counts.
///
/// Returns a dict mapping n-gram → count; BTreeMap ordering means keys
/// arrive in sorted order on the Python side.
#[pyfunction]
fn char_ngrams(text: &str, n: usize) -> std::collections::BTreeMap<String, usize> {
    nlp::char_ngrams(text, n)
}
408
/// Extract word n-grams with frequency counts.
///
/// Returns a dict mapping n-gram → count; BTreeMap ordering means keys
/// arrive in sorted order on the Python side.
#[pyfunction]
fn word_ngrams(text: &str, n: usize) -> std::collections::BTreeMap<String, usize> {
    nlp::word_ngrams(text, n)
}
414
/// Tokenize by whitespace, returning (start, end) byte spans.
///
/// Spans index into the UTF-8 bytes of `text`, not character positions.
#[pyfunction]
fn tokenize_whitespace(text: &str) -> Vec<(usize, usize)> {
    nlp::tokenize_whitespace(text)
}
420
/// Tokenize into words and punctuation.
///
/// Returns owned token strings; splitting rules are defined by
/// `nlp::tokenize_words`.
#[pyfunction]
fn tokenize_words(text: &str) -> Vec<String> {
    nlp::tokenize_words(text)
}
426
/// Term frequency (TF) for each word.
///
/// Returns a dict mapping word → frequency; BTreeMap ordering means keys
/// arrive in sorted order on the Python side.
#[pyfunction]
fn term_frequency(text: &str) -> std::collections::BTreeMap<String, f64> {
    nlp::term_frequency(text)
}
432
/// One-shot Kahan-compensated sum of a list of floats.
///
/// Convenience wrapper over `kahan::kahan_sum` for callers that do not
/// need an incremental `KahanAccumulator`.
#[pyfunction]
fn kahan_sum(values: Vec<f64>) -> f64 {
    kahan::kahan_sum(&values)
}
438
439// ── Helper functions ──────────────────────────────────────────────────────
440
/// Infer a typed `Column` from a (homogeneous) Python list.
///
/// The element type is detected from the FIRST element only, then the whole
/// list is extracted as that type. The probe order is significant:
/// `bool` must be tried before `i64` (a Python `bool` is an `int` subclass
/// and would also extract as an integer), and `i64` before `f64` (a Python
/// `int` would also extract as a float). Anything else falls through to
/// `String`.
///
/// NOTE(review): a mixed-type list (e.g. `[1, 2.5]`) fails the bulk
/// `extract` with a pyo3 conversion error rather than promoting to the
/// wider type — confirm this is the intended contract.
fn py_list_to_column(list: &Bound<'_, PyList>) -> PyResult<column::Column> {
    if list.is_empty() {
        // Empty lists carry no type information; typed as an empty
        // string column by convention.
        return Ok(column::Column::Str(Vec::new()));
    }

    // Detect type from the first element (probe order matters; see above).
    let first = list.get_item(0)?;
    if first.extract::<bool>().is_ok() {
        let vals: Vec<bool> = list.extract()?;
        Ok(column::Column::Bool(vals))
    } else if first.extract::<i64>().is_ok() {
        let vals: Vec<i64> = list.extract()?;
        Ok(column::Column::Int(vals))
    } else if first.extract::<f64>().is_ok() {
        let vals: Vec<f64> = list.extract()?;
        Ok(column::Column::Float(vals))
    } else {
        let vals: Vec<String> = list.extract()?;
        Ok(column::Column::Str(vals))
    }
}
462
/// Convert a `Column` into a Python list object.
///
/// Copies the column data into a fresh Python object (no view/sharing);
/// each variant converts via pyo3's `Vec` → list conversion.
fn column_to_py(py: Python<'_>, col: &column::Column) -> PyResult<PyObject> {
    match col {
        column::Column::Int(v) => Ok(v.to_object(py)),
        column::Column::Float(v) => Ok(v.to_object(py)),
        column::Column::Str(v) => Ok(v.to_object(py)),
        column::Column::Bool(v) => Ok(v.to_object(py)),
    }
}
471
472fn parse_agg(name: &str, col: &str) -> PyResult<tidyview::TidyAgg> {
473    let c = col.to_string();
474    match name.to_lowercase().as_str() {
475        "count" => Ok(tidyview::TidyAgg::Count),
476        "sum" => Ok(tidyview::TidyAgg::Sum(c)),
477        "mean" => Ok(tidyview::TidyAgg::Mean(c)),
478        "min" => Ok(tidyview::TidyAgg::Min(c)),
479        "max" => Ok(tidyview::TidyAgg::Max(c)),
480        "sd" => Ok(tidyview::TidyAgg::Sd(c)),
481        "var" => Ok(tidyview::TidyAgg::Var(c)),
482        "first" => Ok(tidyview::TidyAgg::First(c)),
483        "last" => Ok(tidyview::TidyAgg::Last(c)),
484        "n_distinct" => Ok(tidyview::TidyAgg::NDistinct(c)),
485        _ => Err(PyValueError::new_err(format!("unknown aggregation: {}", name))),
486    }
487}
488
489// ── Python module ─────────────────────────────────────────────────────────
490
491/// virtual-frame Python module.
492#[pymodule]
493fn virtual_frame(m: &Bound<'_, PyModule>) -> PyResult<()> {
494    // Classes
495    m.add_class::<PyDataFrame>()?;
496    m.add_class::<PyTidyView>()?;
497    m.add_class::<PyKahanAccumulator>()?;
498    m.add_class::<PyRng>()?;
499
500    // CSV functions
501    m.add_function(wrap_pyfunction!(read_csv, m)?)?;
502    m.add_function(wrap_pyfunction!(read_csv_delim, m)?)?;
503
504    // Regex functions
505    m.add_function(wrap_pyfunction!(regex_is_match, m)?)?;
506    m.add_function(wrap_pyfunction!(regex_find, m)?)?;
507    m.add_function(wrap_pyfunction!(regex_find_all, m)?)?;
508    m.add_function(wrap_pyfunction!(regex_split, m)?)?;
509
510    // NLP functions
511    m.add_function(wrap_pyfunction!(levenshtein, m)?)?;
512    m.add_function(wrap_pyfunction!(levenshtein_similarity, m)?)?;
513    m.add_function(wrap_pyfunction!(jaccard_ngram_similarity, m)?)?;
514    m.add_function(wrap_pyfunction!(char_ngrams, m)?)?;
515    m.add_function(wrap_pyfunction!(word_ngrams, m)?)?;
516    m.add_function(wrap_pyfunction!(tokenize_whitespace, m)?)?;
517    m.add_function(wrap_pyfunction!(tokenize_words, m)?)?;
518    m.add_function(wrap_pyfunction!(term_frequency, m)?)?;
519
520    // Math
521    m.add_function(wrap_pyfunction!(kahan_sum, m)?)?;
522
523    Ok(())
524}