pub struct DataFrame { /* private fields */ }
DataFrame is composed of a SparkSession referencing a Spark Connect enabled cluster, and a LogicalPlanBuilder which represents the unresolved spark::Plan to be submitted to the cluster when an action is called.
The LogicalPlanBuilder is a series of unresolved logical plans, and every additional transformation takes the prior spark::Plan and builds onto it. The final unresolved logical plan is submitted to the spark connect server.
§create_dataframe & range
A DataFrame can be created with an arrow::array::RecordBatch, or with spark.range(...)
use std::sync::Arc;
use arrow::array::{ArrayRef, Int64Array, StringArray};
use arrow::record_batch::RecordBatch;

let name: ArrayRef = Arc::new(StringArray::from(vec!["Tom", "Alice", "Bob"]));
let age: ArrayRef = Arc::new(Int64Array::from(vec![14, 23, 16]));
let data = RecordBatch::try_from_iter(vec![("name", name), ("age", age)])?;
let df = spark.create_dataframe(&data).await?;
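A minimal range sketch, with the argument order (start, end, step, num_partitions) assumed from this crate's examples:

// Sketch: a single-column DataFrame of ids 0..100 across 8 partitions
let df = spark.range(None, 100, 1, Some(8));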
§sql
A DataFrame is created from a spark.sql() statement
let df = spark.sql("SELECT * FROM json.`/opt/spark/work-dir/datasets/employees.json`").await?;
§read & readStream
A DataFrame is also created from spark.read() and spark.read_stream() statements.
let df = spark
.read()
.format("csv")
.option("header", "True")
.option("delimiter", ";")
.load(paths)?;
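A streaming read uses the same builder pattern. A sketch with Spark's built-in rate source, assuming load takes an optional path as in this crate's streaming examples:

let df = spark
    .read_stream()
    .format("rate")
    .option("rowsPerSecond", "5")
    .load(None)?;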
Implementations§
impl DataFrame
pub fn agg<I>(self, exprs: I) -> DataFrame
Aggregate on the entire DataFrame without groups (shorthand for df.group_by().agg())
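A minimal sketch, assuming min and max helpers exist in the crate's functions module alongside col:

use spark_connect_rs::functions::{col, max, min};

// Sketch: min/max over the whole DataFrame, no grouping
let batch = df.agg([min(col("age")), max(col("age"))]).collect().await?;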
pub async fn approx_quantile<I, P>(
    self,
    cols: I,
    probabilities: P,
    relative_error: f64,
) -> Result<RecordBatch, SparkError>
Calculates the approximate quantiles of numerical columns of a DataFrame.
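A minimal sketch; the column name and probabilities are illustrative, and a relative_error of 0.0 is assumed to request exact quantiles at higher cost, as in Spark:

// Sketch: approximate quartiles of the `age` column
let quartiles = df
    .approx_quantile(["age"], [0.25, 0.5, 0.75], 0.0)
    .await?;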
pub async fn cache(self) -> DataFrame
Persists the DataFrame with the default storage::StorageLevel::MemoryAndDiskDeser (MEMORY_AND_DISK_DESER).
pub fn coalesce(self, num_partitions: u32) -> DataFrame
Returns a new DataFrame that has exactly num_partitions partitions.
pub fn col_regex(self, col_name: &str) -> Column
Selects column based on the column name specified as a regex and returns it as Column.
pub async fn collect(self) -> Result<RecordBatch, SparkError>
pub async fn corr(self, col1: &str, col2: &str) -> Result<f64, SparkError>
Calculates the correlation of two columns of a DataFrame as an f64. Currently only supports the Pearson Correlation Coefficient.
pub async fn count(self) -> Result<i64, SparkError>
Returns the number of rows in this DataFrame
pub async fn cov(self, col1: &str, col2: &str) -> Result<f64, SparkError>
Calculates the sample covariance for the given columns, specified by their names, as an f64
pub async fn create_global_temp_view(self, name: &str) -> Result<(), SparkError>
Creates a global temporary view with this DataFrame.
pub async fn create_or_replace_global_temp_view(
    self,
    name: &str,
) -> Result<(), SparkError>
Creates or replaces a global temporary view using the given name.
pub async fn create_or_replace_temp_view(
    self,
    name: &str,
) -> Result<(), SparkError>
Creates or replaces a local temporary view with this DataFrame
pub async fn create_temp_view(self, name: &str) -> Result<(), SparkError>
Creates a local temporary view with this DataFrame
pub fn cross_join(self, other: DataFrame) -> DataFrame
Returns the cartesian product with another DataFrame.
pub fn crosstab(self, col1: &str, col2: &str) -> DataFrame
Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
pub fn cube<I>(self, cols: I) -> GroupedData
Create a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregations on them.
pub fn describe<I, T>(self, cols: Option<I>) -> DataFrame
pub fn drop<I>(self, cols: I) -> DataFrame
Returns a new DataFrame without the specified columns
pub fn drop_duplicates<I, T>(self, cols: Option<I>) -> DataFrame
Returns a new DataFrame with duplicate rows removed, optionally considering only a given subset of columns. If no columns are supplied, all columns are used.
pub fn drop_duplicates_within_waterwmark<I, T>(
    self,
    cols: Option<I>,
) -> DataFrame
pub fn dropna(
    self,
    how: &str,
    threshold: Option<i32>,
    subset: Option<Vec<&str>>,
) -> DataFrame
Returns a new DataFrame omitting rows with null values.
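A minimal sketch; the column names are illustrative, and the "any"/"all" semantics of how are assumed to follow Spark's dropna:

// Sketch: drop rows where any of the listed columns is null
let df = df.dropna("any", None, Some(vec!["name", "age"]));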
pub async fn dtypes(self) -> Result<Vec<(String, Kind)>, SparkError>
Returns all column names and their data types as a Vec containing the field name as a String and the spark::data_type::Kind enum
pub fn except_all(self, other: DataFrame) -> DataFrame
pub async fn explain(
    self,
    mode: Option<ExplainMode>,
) -> Result<String, SparkError>
Prints the spark::Plan to the console
§Arguments:
mode: ExplainMode (optional). Defaults to unspecified; one of the following (see the sketch after this list):
simple
extended
codegen
cost
formatted
unspecified
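A minimal sketch, assuming ExplainMode is exported from the dataframe module alongside JoinType:

use spark_connect_rs::dataframe::ExplainMode;

let plan = df.explain(Some(ExplainMode::Formatted)).await?;
println!("{plan}");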
pub fn fillna<I, T, L>(self, cols: Option<I>, values: T) -> DataFrame
Replace null values, alias for df.na().fill().
pub fn filter(self, condition: impl ToFilterExpr) -> DataFrame
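A minimal sketch of a filter condition, assuming Column provides a gt comparison and that lit is exported by the functions module:

use spark_connect_rs::functions::{col, lit};

// Sketch: keep rows where `age` is greater than 18
let adults = df.filter(col("age").gt(lit(18)));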
pub async fn first(self) -> Result<RecordBatch, SparkError>
Returns the first row as a RecordBatch.
pub fn freq_items<I, S>(self, cols: I, support: Option<f64>) -> DataFrame
Finds frequent items for columns, possibly with false positives.
pub fn group_by<I>(self, cols: Option<I>) -> GroupedData
Groups the DataFrame using the specified columns, and returns a GroupedData object
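A minimal sketch, assuming an avg helper in the functions module:

use spark_connect_rs::functions::{avg, col};

// Sketch: average age per name
let df = df.group_by(Some([col("name")])).agg([avg(col("age"))]);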
pub async fn head(self, n: Option<i32>) -> Result<RecordBatch, SparkError>
Returns the first n rows.
pub fn hint<I>(self, name: &str, parameters: Option<I>) -> DataFrame
Specifies some hint on the current DataFrame.
pub async fn input_files(self) -> Result<Vec<String>, SparkError>
Returns a best-effort snapshot of the files that compose this DataFrame
pub fn intersect_all(self, other: DataFrame) -> DataFrame
pub async fn is_empty(self) -> Result<bool, SparkError>
Checks if the DataFrame is empty and returns a boolean value.
pub async fn is_local(self) -> Result<bool, SparkError>
Returns true if the collect() and take() methods can be run locally (without any Spark executors).
pub async fn is_streaming(self) -> Result<bool, SparkError>
Returns true if this DataFrame contains one or more sources that continuously return data as it arrives.
pub fn join<T: Into<Expression>>(
    self,
    other: DataFrame,
    on: Option<T>,
    how: JoinType,
) -> DataFrame
Joins with another DataFrame, using the given join expression.
§Example:
use spark_connect_rs::functions::col;
use spark_connect_rs::dataframe::JoinType;
async {
    // join two dataframes where `id` == `name`
    let condition = Some(col("id").eq(col("name")));
    let df = df.join(df2, condition, JoinType::Inner);
}
pub fn melt<I>(
    self,
    ids: I,
    values: Option<I>,
    variable_column_name: &str,
    value_column_name: &str,
) -> DataFrame
Alias for DataFrame::unpivot
pub fn na(self) -> DataFrameNaFunctions
Returns a DataFrameNaFunctions for handling missing values.
pub fn order_by<I>(self, cols: I) -> DataFrame
Returns a new DataFrame sorted by the specified column(s).
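A minimal sketch, assuming Column exposes a desc modifier:

use spark_connect_rs::functions::col;

// Sketch: oldest first
let df = df.order_by([col("age").desc()]);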
pub async fn persist(self, storage_level: StorageLevel) -> DataFrame
Sets the storage level to persist the contents of the DataFrame across operations after the first time it is computed.
pub async fn print_schema(
    self,
    level: Option<i32>,
) -> Result<String, SparkError>
Prints out the schema in the tree format, up to the given level (if provided).
pub fn random_split<I>(self, weights: I, seed: Option<i64>) -> Vec<DataFrame>
Randomly splits this DataFrame with the provided weights.
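A minimal sketch with a fixed seed for reproducibility:

// Sketch: 80/20 train/test split
let splits = df.random_split([0.8, 0.2], Some(42));
let (train, test) = (splits[0].clone(), splits[1].clone());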
pub fn repartition_by_range<I>(
    self,
    num_partitions: Option<i32>,
    cols: I,
) -> DataFrame
Returns a new DataFrame partitioned by the given partitioning expressions.
pub fn replace<I, T>(
    self,
    to_replace: T,
    value: T,
    subset: Option<I>,
) -> DataFrame
Returns a new DataFrame replacing a value with another value.
pub fn rollup<I>(self, cols: I) -> GroupedData
Create a multi-dimensional rollup for the current DataFrame using the specified columns, and returns a GroupedData object
pub async fn same_semantics(self, other: DataFrame) -> Result<bool, SparkError>
Returns true when the logical query plans inside both DataFrames are equal and therefore return the same results.
pub fn sample(
    self,
    lower_bound: f64,
    upper_bound: f64,
    with_replacement: Option<bool>,
    seed: Option<i64>,
) -> DataFrame
Returns a sampled subset of this DataFrame
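A minimal sketch; the lower/upper bounds are assumed to define the sampled fraction range, so 0.0 to 0.1 draws roughly 10% of rows:

// Sketch: ~10% sample without replacement, seeded
let sampled = df.sample(0.0, 0.1, Some(false), Some(42));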
pub fn sample_by<K, I>(
    self,
    col: Column,
    fractions: I,
    seed: Option<i64>,
) -> DataFrame
Returns a stratified sample without replacement based on the fraction given on each stratum.
pub async fn schema(self) -> Result<DataType, SparkError>
Returns the schema of this DataFrame as a spark::DataType.
pub fn select_expr<I>(self, cols: I) -> DataFrame
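A minimal sketch, assuming the iterator items are SQL expression strings:

// Sketch: project computed columns by SQL expression
let df = df.select_expr(["age * 2 AS double_age", "upper(name) AS name"]);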
pub async fn semantic_hash(self) -> Result<i32, SparkError>
Returns a hash code of the logical query plan against this DataFrame.
pub async fn show(
    self,
    num_rows: Option<i32>,
    truncate: Option<i32>,
    vertical: Option<bool>,
) -> Result<(), SparkError>
Prints the first n rows to the console
§Arguments:
num_rows: (int, optional) number of rows to show (default 10)
truncate: (int, optional) If set to 0, it truncates the string. Any other number will not truncate the strings
vertical: (bool, optional) If set to true, prints output rows vertically (one line per column value).
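For example:

// Print 5 rows, untruncated, in the default horizontal layout
df.show(Some(5), None, None).await?;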
pub fn sort<I>(self, cols: I) -> DataFrame
Returns a new DataFrame sorted by the specified column(s).
pub fn sort_within_partitions<I>(self, cols: I) -> DataFrame
Returns a new DataFrame with each partition sorted by the specified column(s).
pub fn spark_session(self) -> Box<SparkSession>
Returns the Spark session that created this DataFrame.
pub fn stat(self) -> DataFrameStatFunctions
Returns a DataFrameStatFunctions for statistic functions.
pub async fn storage_level(self) -> Result<StorageLevel, SparkError>
Get the DataFrame’s current storage level.
pub fn summary<I>(self, statistics: Option<I>) -> DataFrame
Computes specified statistics for numeric and string columns. Available statistics are:
count
mean
stddev
min
max
arbitrary approximate percentiles specified as a percentage (e.g., 75%)
If no statistics are given, this function computes count, mean, stddev, min, approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
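A minimal sketch, assuming the statistics are passed as strings:

// Sketch: a reduced summary, collected locally
let stats = df.summary(Some(["count", "mean", "max"])).collect().await?;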
pub async fn tail(self, limit: i32) -> Result<RecordBatch, SparkError>
Returns the last n rows as a RecordBatch. Running tail requires moving the data and results in an action.
pub async fn take(self, n: i32) -> Result<RecordBatch, SparkError>
Returns the first n rows as a RecordBatch.
pub fn to_df<I>(self, cols: I) -> DataFrame
Returns a new DataFrame with the specified new column names
pub async fn to_json(self) -> Result<String, SparkError>
Converts a DataFrame into a String representation of JSON. Each row is turned into a JSON document.
pub fn union_by_name(
    self,
    other: DataFrame,
    allow_missing_columns: Option<bool>,
) -> DataFrame
pub async fn unpersist(self, blocking: Option<bool>) -> DataFrame
Marks the DataFrame as non-persistent, and removes all blocks for it from memory and disk.
pub fn unpivot<I, T>(
    self,
    ids: I,
    values: Option<T>,
    variable_column_name: &str,
    value_column_name: &str,
) -> DataFrame
Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns set. This is the reverse of groupBy(…).pivot(…).agg(…), except for the aggregation, which cannot be reversed.
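A minimal sketch (the column names are illustrative):

use spark_connect_rs::functions::col;

// Sketch: wide -> long, keeping `id` as the identifier column
let df = df.unpivot(
    [col("id")],
    Some([col("q1"), col("q2")]),
    "quarter",
    "sales",
);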
pub fn with_column(self, col_name: &str, col: Column) -> DataFrame
Returns a new DataFrame by adding a column or replacing the existing column that has the same name.
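A minimal sketch, assuming a lit helper in the functions module:

use spark_connect_rs::functions::lit;

// Sketch: add (or overwrite) a constant column
let df = df.with_column("country", lit("US"));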
pub fn with_columns<I, K>(self, col_map: I) -> DataFrame
Returns a new DataFrame by adding multiple columns or replacing the existing columns that have the same names.
pub fn with_column_renamed<K, V>(self, existing: K, new: V) -> DataFrame
Returns a new DataFrame by renaming an existing column.
pub fn with_columns_renamed<I, K, V>(self, cols: I) -> DataFrame
Returns a new DataFrame by renaming multiple columns, given an iterator of key/value pairs where the key is the existing column name and the value is the new column name.
pub fn with_metadata(self, col: &str, metadata: &str) -> DataFrame
Returns a new DataFrame by updating an existing column with metadata.
pub fn with_watermark(
    self,
    event_time: &str,
    delay_threshold: &str,
) -> DataFrame
Defines an event time watermark for this DataFrame.
pub fn write(self) -> DataFrameWriter
Returns a DataFrameWriter struct based on the current DataFrame
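A minimal sketch (the format and path are illustrative; save is assumed to be the terminal async action on the writer, as in this crate's examples):

// Sketch: persist the DataFrame as parquet
df.write()
    .format("parquet")
    .save("/tmp/output")
    .await?;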
pub fn write_stream(self) -> DataStreamWriter
Interface for DataStreamWriter to save the content of the streaming DataFrame out into external storage.
pub fn write_to(self, table: &str) -> DataFrameWriterV2
Create a write configuration builder for v2 sources with DataFrameWriterV2.
Trait Implementations§
Auto Trait Implementations§
impl Freeze for DataFrame
impl !RefUnwindSafe for DataFrame
impl Send for DataFrame
impl Sync for DataFrame
impl Unpin for DataFrame
impl !UnwindSafe for DataFrame
Blanket Implementations§
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
impl<T> CloneToUninit for T
where
    T: Clone,
impl<T> Instrument for T
fn instrument(self, span: Span) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
impl<T> IntoRequest<T> for T
fn into_request(self) -> Request<T>
Wraps T in a tonic::Request.