Skip to main content

robin_sparkless/
lib.rs

1//! Robin Sparkless - A Rust DataFrame library with PySpark-like API
2//!
3//! This library provides a PySpark-compatible API. The **root crate** is engine-agnostic:
4//! it depends on [robin-sparkless-core](https://docs.rs/robin-sparkless-core) (types, expression IR, config)
5//! and one backend—currently **robin-sparkless-polars**, which uses [Polars](https://www.pola.rs/)
6//! for execution. The public API exposes engine-agnostic expression types where possible.
7//!
8//! # Expression APIs
9//!
10//! - **ExprIr (engine-agnostic):** Use [`col`], [`lit_i64`], [`lit_str`], [`when`], [`gt`], [`eq`], etc.
11//!   from the crate root (re-exported from `robin_sparkless_core`). These build an [`ExprIr`] tree.
12//!   Use [`DataFrame::filter_expr_ir`], [`DataFrame::select_expr_ir`], [`DataFrame::with_column_expr_ir`],
13//!   [`DataFrame::collect_rows`], and [`GroupedData::agg_expr_ir`] with `&ExprIr` / `&[ExprIr]`.
14//!   Collect returns [`CollectedRows`] (JSON-like rows). Prefer this for new code and embeddings.
15//!
16//! - **Column / Expr (Polars-backed):** Use [`prelude`] or `robin_sparkless::functions::{col, lit_i64, ...}`
17//!   for the full PySpark-like API that returns [`Column`] and uses Polars [`Expr`]. Use
18//!   [`DataFrame::filter`], [`DataFrame::with_column`], [`DataFrame::select_exprs`], etc.
19//!   with those types. Still supported for compatibility and advanced use.
20//!
21//! # Getting started and embedding
22//!
23//! For application code and embedding, use the [prelude]: `use robin_sparkless::prelude::*`.
24//! For a minimal FFI surface, use [prelude::embed]. For engine-agnostic expressions, use the
25//! root re-exports (`col`, `lit_i64`, `gt`, etc.) and the `*_expr_ir` / `collect_rows` methods.
26//!
27//! # Panics and errors
28//!
29//! Some functions panic when used with invalid or empty inputs (e.g. calling
30//! `when(cond).otherwise(val)` without `.then()`, or passing no columns to
31//! `format_string`, `elt`, `concat`, `coalesce`, or `named_struct` in Rust).
32//! In Rust, `create_map` and `array` return `Result` for empty input instead of
33//! panicking. From Python, empty columns for `coalesce`, `format_string`,
34//! `printf`, and `named_struct` raise `ValueError`. See the documentation for
35//! each function for details.
36//!
37//! # API stability
38//!
39//! While the crate is in the 0.x series, we follow [semver](https://semver.org/) but may introduce
40//! breaking changes in minor releases (e.g. 0.1 → 0.2) until 1.0. For behavioral caveats and
41//! intentional differences from PySpark, see the [repository documentation](https://github.com/eddiethedean/robin-sparkless/blob/main/docs/PYSPARK_DIFFERENCES.md).
42
43#![allow(clippy::collapsible_if)]
44#![allow(clippy::let_and_return)]
45
46pub mod config;
47pub mod dataframe;
48pub mod prelude;
49pub mod schema;
50pub mod session;
51pub mod traits;
52
53/// Engine-agnostic types, traits, and expression IR re-exported from `robin-sparkless-core`.
54/// Prefer these for engine-generic code and embeddings that should not depend on Polars.
55pub mod engine {
56    pub use robin_sparkless_core::engine::{
57        CollectedRows, DataFrameBackend, DataFrameReaderBackend, GroupedDataBackend, JoinType,
58        PlanExecutor, SparkSessionBackend,
59    };
60    pub use robin_sparkless_core::expr::{
61        ExprIr, LiteralValue, WhenBuilder, WhenThenBuilder, alias, and_, approx_count_distinct,
62        between, bool_and, call, col, collect_list, collect_set, count, count_distinct, count_if,
63        eq, every, first, ge, gt, is_in, is_null, kurtosis, le, lit_bool, lit_f64, lit_i32,
64        lit_i64, lit_null, lit_str, lt, max, mean, median, min, mode, ne, not_, or_, skewness, std,
65        stddev, stddev_pop, stddev_samp, sum, try_avg, try_sum, var_pop, var_samp, variance, when,
66    };
67    pub use robin_sparkless_core::{DataType, EngineError, StructField, StructType};
68}
69
70/// Polars-backed types and helper functions re-exported from `robin-sparkless-polars`.
71/// These are useful when you explicitly want access to Polars-level APIs from Rust.
72pub mod polars {
73    pub use robin_sparkless_polars::functions::{SortOrder, *};
74    pub use robin_sparkless_polars::{
75        Column, Expr, PlDataFrame, PlDataType, PolarsError, RustUdf, StructTypePolarsExt,
76        UdfRegistry, broadcast, expression, schema_from_json,
77    };
78    pub use robin_sparkless_polars::{column, error, functions, type_coercion};
79}
80
81// Backward-compatible re-exports at the crate root. New code should prefer the
82// `engine` and `polars` modules above for clearer boundaries, but we keep these
83// for existing users.
84pub use engine::CollectedRows;
85pub use engine::{
86    DataType, EngineError, ExprIr, LiteralValue, StructField, StructType, WhenBuilder,
87    WhenThenBuilder, alias, and_, approx_count_distinct, between, bool_and, call, col,
88    collect_list, collect_set, count, count_distinct, count_if, eq, every, first, ge, gt, is_in,
89    is_null, kurtosis, le, lit_bool, lit_f64, lit_i32, lit_i64, lit_null, lit_str, lt, max, mean,
90    median, min, mode, ne, not_, or_, skewness, std, stddev, stddev_pop, stddev_samp, sum, try_avg,
91    try_sum, var_pop, var_samp, variance, when,
92};
93pub use polars::{
94    Column, Expr, PolarsError, RustUdf, StructTypePolarsExt, UdfRegistry, broadcast, expression,
95    schema_from_json,
96};
97/// Backwards-compatible module re-export so `robin_sparkless::functions::*` continues to work.
98pub use robin_sparkless_polars::functions;
99pub use robin_sparkless_polars::functions::{SortOrder, *};
100
101// Root-owned entry-point types (delegate to robin-sparkless-polars).
102pub use dataframe::{
103    CubeRollupData, DataFrame, DataFrameNa, DataFrameStat, DataFrameWriter, GroupBySpec,
104    GroupedData, JoinType, PivotedGroupedData, SaveMode, SelectItem, WriteFormat, WriteMode,
105    expr_contains_only_join_key_equalities, try_extract_join_eq_columns,
106    try_extract_join_eq_columns_all,
107};
108pub use session::{DataFrameReader, SparkSession, SparkSessionBuilder};
109
110// Root-owned traits (work with root DataFrame/SparkSession); plan re-export.
111pub use robin_sparkless_polars::plan::{PlanError, PlanExprError};
112pub use traits::{FromRobinDf, IntoRobinDf};
113
114/// Execute a logical plan; returns root-owned [`DataFrame`].
115pub fn execute_plan(
116    session: &SparkSession,
117    data: Vec<Vec<serde_json::Value>>,
118    schema: Vec<(String, String)>,
119    plan: &[serde_json::Value],
120) -> Result<DataFrame, PlanError> {
121    use robin_sparkless_core::engine::PlanExecutor as _;
122
123    // Execute via the engine-generic PlanExecutor trait implemented by the Polars backend.
124    let boxed = robin_sparkless_polars::plan::PolarsPlanExecutor::execute_plan(
125        &session.0, data, schema, plan,
126    )
127    .map_err(|e| PlanError::InvalidPlan(e.to_string()))?;
128
129    crate::dataframe::from_backend(boxed).map_err(|e| PlanError::InvalidPlan(e.to_string()))
130}
131
132pub use config::SparklessConfig;
133
134/// Convert PolarsError to EngineError (for APIs that still return PolarsError).
135pub fn to_engine_error(e: PolarsError) -> EngineError {
136    robin_sparkless_polars::polars_to_core_error(e)
137}
138
// Re-export the thread UDF context functions (exposed as part of the test-isolation fix).
140pub use robin_sparkless_polars::{
141    clear_thread_udf_context, set_thread_udf_context, set_thread_udf_context_with_tz,
142};
143
#[cfg(feature = "sql")]
pub mod sql {
    //! SQL parsing and execution. The wrappers defined here accept the root-owned session
    //! and return the root-owned DataFrame where applicable.
    use crate::dataframe::DataFrame;
    use crate::session::SparkSession;
    use robin_sparkless_polars::sql as backend_sql;
    use robin_sparkless_polars::{Expr, PolarsError};

    pub use robin_sparkless_polars::sql::{Statement, execute_sql, parse_sql};

    /// Parse one SQL expression string into a Polars `Expr`.
    ///
    /// The session and DataFrame are handed to the backend parser (via their inner
    /// handles) so it can use them for resolution.
    pub fn expr_string_to_polars(
        expr_str: &str,
        session: &SparkSession,
        df: &DataFrame,
    ) -> Result<Expr, PolarsError> {
        let (inner_session, inner_df) = (&session.0, &df.0);
        backend_sql::expr_string_to_polars(expr_str, inner_session, inner_df)
    }

    /// Run a SQL query against the session and wrap the backend result in a root-owned
    /// `DataFrame`. Backend errors are returned unchanged.
    pub fn execute_sql_root(session: &SparkSession, query: &str) -> Result<DataFrame, PolarsError> {
        let backend_df = backend_sql::execute_sql(&session.0, query)?;
        Ok(DataFrame(backend_df))
    }
}
167
#[cfg(feature = "delta")]
pub mod delta {
    //! Delta Lake read/write. The `*_root` wrappers defined here return the root-owned
    //! DataFrame.
    use crate::dataframe::DataFrame;
    use robin_sparkless_polars::PolarsError;
    use robin_sparkless_polars::delta as backend_delta;
    use std::path::Path;

    pub use robin_sparkless_polars::delta::{read_delta, read_delta_with_version, write_delta};

    /// Read the Delta table at `path` and wrap the backend result in a root-owned
    /// `DataFrame`. `case_sensitive` is forwarded to the backend reader unchanged.
    pub fn read_delta_root(
        path: impl AsRef<Path>,
        case_sensitive: bool,
    ) -> Result<DataFrame, PolarsError> {
        let backend_df = backend_delta::read_delta(path, case_sensitive)?;
        Ok(DataFrame(backend_df))
    }

    /// Read the Delta table at `path` for the given optional `version` and wrap the backend
    /// result in a root-owned `DataFrame`. All arguments are forwarded to the backend reader
    /// unchanged.
    pub fn read_delta_with_version_root(
        path: impl AsRef<Path>,
        version: Option<i64>,
        case_sensitive: bool,
    ) -> Result<DataFrame, PolarsError> {
        let backend_df = backend_delta::read_delta_with_version(path, version, case_sensitive)?;
        Ok(DataFrame(backend_df))
    }
}