Struct spark_connect_rs::dataframe::DataFrame
pub struct DataFrame {
pub spark_session: SparkSession,
pub logical_plan: LogicalPlanBuilder,
}
DataFrame is composed of a spark_session connecting to a remote Spark Connect enabled cluster, and a logical_plan which represents the Plan to be submitted to the cluster when an action is called.
Fields§
spark_session: SparkSession
Global SparkSession connecting to the remote cluster
logical_plan: LogicalPlanBuilder
Logical Plan representing the unresolved Relation which will be submitted to the remote cluster
Implementations§
impl DataFrame
pub fn new(spark_session: SparkSession, logical_plan: LogicalPlanBuilder) -> DataFrame
Creates a default DataFrame based on a Spark session and an initial logical plan.
pub async fn cache(&mut self) -> DataFrame
Persists the DataFrame with the default storage::StorageLevel::MemoryAndDiskDeser (MEMORY_AND_DISK_DESER).
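A minimal sketch, not taken from the crate's docs: it assumes a mutable DataFrame df and uses the persist, unpersist, and StorageLevel items referenced on this page.
async {
    // cache with the default storage level (MEMORY_AND_DISK_DESER)
    let mut df = df.cache().await;
    // or pick the level explicitly, and release it again when done
    let mut df = df.persist(StorageLevel::MemoryAndDiskDeser).await;
    df.unpersist(Some(false)).await;
}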
pub fn coalesce(&mut self, num_partitions: u32) -> DataFrame
Returns a new DataFrame that has exactly num_partitions partitions.
pub fn colRegex(self, col_name: &str) -> Column
Selects column based on the column name specified as a regex and returns it as Column.
pub async fn collect(&mut self) -> Result<RecordBatch, SparkError>
pub async fn corr(&mut self, col1: &str, col2: &str) -> Option<f64>
Calculates the correlation of two columns of a DataFrame as a f64.
Currently only supports the Pearson Correlation Coefficient.
pub async fn cov(&mut self, col1: &str, col2: &str) -> Option<f64>
Calculates the sample covariance for the given columns, specified by their names, as a f64.
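A hedged sketch, not from the crate docs, assuming a mutable DataFrame df with numeric columns "salary" and "bonus" (illustrative names):
async {
    // Pearson correlation and sample covariance of the two columns
    let corr: Option<f64> = df.corr("salary", "bonus").await;
    let cov: Option<f64> = df.cov("salary", "bonus").await;
    println!("corr = {:?}, cov = {:?}", corr, cov);
}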
pub async fn createTempView(&mut self, name: &str)
pub async fn createGlobalTempView(&mut self, name: &str)
pub async fn createOrReplaceGlobalTempView(&mut self, name: &str)
pub async fn createOrReplaceTempView(&mut self, name: &str)
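A minimal sketch of the temporary view methods above, assuming a mutable DataFrame df, a SparkSession spark, and an illustrative view name:
async {
    // register the DataFrame as a session-scoped temporary view, then query it with SQL
    df.createOrReplaceTempView("people").await;
    spark
        .sql("SELECT count(*) AS n FROM people")
        .await?
        .show(Some(1), None, None)
        .await?;
}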
pub fn crossJoin(&mut self, other: DataFrame) -> DataFrame
Returns the Cartesian product with another DataFrame.
pub fn crosstab(&mut self, col1: &str, col2: &str) -> DataFrame
Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
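A hedged sketch, assuming a mutable DataFrame df with the illustrative columns "department" and "gender":
async {
    // pair-wise frequency (contingency) table of the two columns
    df.crosstab("department", "gender")
        .show(Some(10), None, None)
        .await?;
}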
pub fn describe(&mut self, cols: Option<Vec<&str>>) -> DataFrame
pub fn drop<T: ToVecExpr>(&mut self, cols: T) -> DataFrame
Returns a new DataFrame without the specified columns
pub fn drop_duplicates(&mut self, cols: Option<Vec<&str>>) -> DataFrame
Returns a new DataFrame with duplicate rows removed,
optionally only considering certain columns from a Vec<&str>.
If no columns are supplied then all columns are used.
Alias for dropDuplicates
pub fn dropDuplicates(&mut self, cols: Option<Vec<&str>>) -> DataFrame
pub fn dropna( &mut self, how: &str, threshold: Option<i32>, subset: Option<Vec<&str>> ) -> DataFrame
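A hedged sketch combining the clean-up methods above; a mutable DataFrame df is assumed and the column names are illustrative:
async {
    // drop a column, de-duplicate on a subset of columns,
    // then drop rows where any value in that subset is null
    df.drop("middle_name")
        .dropDuplicates(Some(vec!["first_name", "last_name"]))
        .dropna("any", None, Some(vec!["first_name", "last_name"]))
        .show(Some(10), None, None)
        .await?;
}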
pub async fn dtypes(&mut self) -> Vec<(String, Option<Kind>)>
Returns all column names and their data types as a Vec containing the field name as a String and the spark::data_type::Kind enum
pub fn exceptAll(&mut self, other: DataFrame) -> DataFrame
pub async fn explain(&mut self, mode: Option<ExplainMode>) -> Result<String, SparkError>
Prints the spark::Plan to the console
§Arguments:
mode: ExplainMode (optional) Defaults to unspecified. One of simple, extended, codegen, cost, formatted, unspecified
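A minimal sketch, assuming a mutable DataFrame df; the ExplainMode variant name is inferred from the mode list above:
async {
    // fetch the formatted plan as a String
    let plan = df.explain(Some(ExplainMode::Formatted)).await?;
    println!("{plan}");
}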
pub fn filter<T: ToFilterExpr>(&mut self, condition: T) -> DataFrame
Filters rows using the given condition and returns a new DataFrame
§Example:
async {
df.filter("salary > 4000").collect().await?;
}
Examples found in repository
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut spark: SparkSession =
SparkSessionBuilder::remote("sc://127.0.0.1:15002/;user_id=example_rs")
.build()
.await?;
let mut df = spark
.sql("SELECT * FROM json.`/opt/spark/examples/src/main/resources/employees.json`")
.await?;
df.filter("salary >= 3500")
.select("*")
.show(Some(5), None, None)
.await?;
// +-----------------+
// | show_string |
// +-----------------+
// | +------+------+ |
// | |name |salary| |
// | +------+------+ |
// | |Andy |4500 | |
// | |Justin|3500 | |
// | |Berta |4000 | |
// | +------+------+ |
// | |
// +-----------------+
Ok(())
}
pub async fn first(&mut self) -> RecordBatch
pub fn freqItems<'a, I>(&mut self, cols: I, support: Option<f64>) -> DataFrame
where
    I: IntoIterator<Item = &'a str>,
pub async fn head(&mut self, n: Option<i32>) -> RecordBatch
pub fn hint<T: ToVecExpr>( &mut self, name: &str, parameters: Option<T> ) -> DataFrame
pub async fn inputFiles(&mut self) -> Vec<String>
pub fn intersect(&mut self, other: DataFrame) -> DataFrame
pub fn intersectAll(&mut self, other: DataFrame) -> DataFrame
pub async fn isEmpty(&mut self) -> bool
pub fn join<T: ToExpr>(&mut self, other: DataFrame, on: Option<T>, how: JoinType) -> DataFrame
Joins with another DataFrame, using the given join expression.
§Example:
use spark_connect_rs::functions::*;
use spark_connect_rs::dataframe::JoinType;
async {
// join two dataframes where `id` == `name`
let condition = Some(col("id").eq("name"));
let df = df.join(df2, condition, JoinType::Inner);
}
pub fn offset(&mut self, num: i32) -> DataFrame
Returns a new DataFrame by skipping the first n rows
pub fn orderBy<I>(&mut self, cols: I) -> DataFrame
where
    I: IntoIterator<Item = Column>,
pub async fn persist(&mut self, storage_level: StorageLevel) -> DataFrame
pub async fn printSchema(&mut self, level: Option<i32>) -> String
pub async fn sameSemantics(&mut self, other: DataFrame) -> bool
pub fn sample(&mut self, lower_bound: f64, upper_bound: f64, with_replacement: Option<bool>, seed: Option<i64>) -> DataFrame
Returns a sampled subset of this DataFrame
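A hedged sketch, assuming a mutable DataFrame df; the bounds take roughly a 10% slice and the seed fixes it for repeatability:
async {
    df.sample(0.0, 0.1, Some(false), Some(42))
        .show(Some(10), None, None)
        .await?;
}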
pub async fn schema(&mut self) -> Option<DataType>
Returns the schema of this DataFrame as a spark::DataType
pub fn select<T: ToVecExpr>(&mut self, cols: T) -> DataFrame
Projects a set of expressions and returns a new DataFrame
§Arguments:
cols - An object that implements ToVecExpr
§Example:
async {
df.select(vec![col("age"), col("name")]).collect().await?;
}
Examples found in repository
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut spark: SparkSession =
SparkSessionBuilder::remote("sc://127.0.0.1:15002/;user_id=example_rs")
.build()
.await?;
let mut df = spark
.sql("SELECT * FROM json.`/opt/spark/examples/src/main/resources/employees.json`")
.await?;
df.filter("salary >= 3500")
.select("*")
.show(Some(5), None, None)
.await?;
// +-----------------+
// | show_string |
// +-----------------+
// | +------+------+ |
// | |name |salary| |
// | +------+------+ |
// | |Andy |4500 | |
// | |Justin|3500 | |
// | |Berta |4000 | |
// | +------+------+ |
// | |
// +-----------------+
Ok(())
}
More examples
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::default().build().await?;
let path = ["/opt/spark/examples/src/main/resources/people.csv"];
let mut df = spark
.read()
.format("csv")
.option("header", "True")
.option("delimiter", ";")
.load(path);
df.select([
F::col("name"),
F::col("age").cast("int").alias("age_int"),
(F::lit(3.0) + F::col("age").cast("int")).alias("addition"),
])
.sort(vec![F::col("name").desc()])
.show(Some(5), None, None)
.await?;
// print results
// +--------------------------+
// | show_string |
// +--------------------------+
// | +-----+-------+--------+ |
// | |name |age_int|addition| |
// | +-----+-------+--------+ |
// | |Jorge|30 |33.0 | |
// | |Bob |32 |35.0 | |
// | +-----+-------+--------+ |
// | |
// +--------------------------+
Ok(())
}
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::default().build().await?;
let df = spark
.clone()
.range(None, 1000, 1, Some(16))
.select(col("id").alias("range_id"));
let path = "/opt/spark/examples/src/main/rust/employees/";
df.write()
.format("csv")
.mode(SaveMode::Overwrite)
.option("header", "true")
.save(path)
.await?;
let mut df = spark
.clone()
.read()
.format("csv")
.option("header", "true")
.load([path]);
df.show(Some(10), None, None).await?;
// print results may slightly vary but should be close to the below
// +--------------------------+
// | show_string |
// +--------------------------+
// | +--------+ |
// | |range_id| |
// | +--------+ |
// | |312 | |
// | |313 | |
// | |314 | |
// | |315 | |
// | |316 | |
// | |317 | |
// | |318 | |
// | |319 | |
// | |320 | |
// | |321 | |
// | +--------+ |
// | only showing top 10 rows |
// | |
// +--------------------------+
Ok(())
}
pub fn selectExpr<'a, I>(&mut self, cols: I) -> DataFrame
where
    I: IntoIterator<Item = &'a str>,
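A minimal sketch, assuming a mutable DataFrame df with the illustrative columns "name" and "age":
async {
    // project SQL expression strings instead of Column values
    df.selectExpr(["name", "age + 1 AS age_next_year"])
        .show(Some(5), None, None)
        .await?;
}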
pub async fn semanticHash(&mut self) -> i32
pub async fn show(&mut self, num_rows: Option<i32>, truncate: Option<i32>, vertical: Option<bool>) -> Result<(), SparkError>
Prints the first n rows to the console
§Arguments:
num_rows: (int, optional) number of rows to show (default 10)
truncate: (int, optional) If set to 0, it truncates the string. Any other number will not truncate the strings
vertical: (bool, optional) If set to true, prints output rows vertically (one line per column value).
Examples found in repository
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut spark: SparkSession =
SparkSessionBuilder::remote("sc://127.0.0.1:15002/;user_id=example_rs")
.build()
.await?;
let mut df = spark
.sql("SELECT * FROM json.`/opt/spark/examples/src/main/resources/employees.json`")
.await?;
df.filter("salary >= 3500")
.select("*")
.show(Some(5), None, None)
.await?;
// +-----------------+
// | show_string |
// +-----------------+
// | +------+------+ |
// | |name |salary| |
// | +------+------+ |
// | |Andy |4500 | |
// | |Justin|3500 | |
// | |Berta |4000 | |
// | +------+------+ |
// | |
// +-----------------+
Ok(())
}
More examples
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::default().build().await?;
let path = ["/opt/spark/examples/src/main/resources/people.csv"];
let mut df = spark
.read()
.format("csv")
.option("header", "True")
.option("delimiter", ";")
.load(path);
df.select([
F::col("name"),
F::col("age").cast("int").alias("age_int"),
(F::lit(3.0) + F::col("age").cast("int")).alias("addition"),
])
.sort(vec![F::col("name").desc()])
.show(Some(5), None, None)
.await?;
// print results
// +--------------------------+
// | show_string |
// +--------------------------+
// | +-----+-------+--------+ |
// | |name |age_int|addition| |
// | +-----+-------+--------+ |
// | |Jorge|30 |33.0 | |
// | |Bob |32 |35.0 | |
// | +-----+-------+--------+ |
// | |
// +--------------------------+
Ok(())
}
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::default().build().await?;
let df = spark
.clone()
.range(None, 1000, 1, Some(16))
.select(col("id").alias("range_id"));
let path = "/opt/spark/examples/src/main/rust/employees/";
df.write()
.format("csv")
.mode(SaveMode::Overwrite)
.option("header", "true")
.save(path)
.await?;
let mut df = spark
.clone()
.read()
.format("csv")
.option("header", "true")
.load([path]);
df.show(Some(10), None, None).await?;
// print results may slightly vary but should be close to the below
// +--------------------------+
// | show_string |
// +--------------------------+
// | +--------+ |
// | |range_id| |
// | +--------+ |
// | |312 | |
// | |313 | |
// | |314 | |
// | |315 | |
// | |316 | |
// | |317 | |
// | |318 | |
// | |319 | |
// | |320 | |
// | |321 | |
// | +--------+ |
// | only showing top 10 rows |
// | |
// +--------------------------+
Ok(())
}
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut spark: SparkSession = SparkSessionBuilder::default().build().await?;
let paths = ["/opt/spark/examples/src/main/resources/people.csv"];
let df = spark
.clone()
.read()
.format("csv")
.option("header", "True")
.option("delimiter", ";")
.option("inferSchema", "True")
.load(paths);
df.write()
.format("delta")
.mode(SaveMode::Overwrite)
.saveAsTable("default.people_delta")
.await
.unwrap();
spark
.sql("DESCRIBE HISTORY default.people_delta")
.await?
.show(Some(1), None, Some(true))
.await
.unwrap();
// print results
// +-------------------------------------------------------------------------------------------------------+
// | show_string |
// +-------------------------------------------------------------------------------------------------------+
// | -RECORD 0-------------------------------------------------------------------------------------------- |
// | version | 3 |
// | timestamp | 2024-03-16 13:46:23.552 |
// | userId | NULL |
// | userName | NULL |
// | operation | CREATE OR REPLACE TABLE AS SELECT |
// | operationParameters | {isManaged -> true, description -> NULL, partitionBy -> [], properties -> {}} |
// | job | NULL |
// | notebook | NULL |
// | clusterId | NULL |
// | readVersion | 2 |
// | isolationLevel | Serializable |
// | isBlindAppend | false |
// | operationMetrics | {numFiles -> 1, numOutputRows -> 2, numOutputBytes -> 988} |
// | userMetadata | NULL |
// | engineInfo | Apache-Spark/3.5.0 Delta-Lake/3.0.0 |
// | only showing top 1 row |
// | |
// +-------------------------------------------------------------------------------------------------------+
Ok(())
}
pub fn sort(&mut self, cols: Vec<Column>) -> DataFrame
Examples found in repository
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::default().build().await?;
let path = ["/opt/spark/examples/src/main/resources/people.csv"];
let mut df = spark
.read()
.format("csv")
.option("header", "True")
.option("delimiter", ";")
.load(path);
df.select([
F::col("name"),
F::col("age").cast("int").alias("age_int"),
(F::lit(3.0) + F::col("age").cast("int")).alias("addition"),
])
.sort(vec![F::col("name").desc()])
.show(Some(5), None, None)
.await?;
// print results
// +--------------------------+
// | show_string |
// +--------------------------+
// | +-----+-------+--------+ |
// | |name |age_int|addition| |
// | +-----+-------+--------+ |
// | |Jorge|30 |33.0 | |
// | |Bob |32 |35.0 | |
// | +-----+-------+--------+ |
// | |
// +--------------------------+
Ok(())
}
pub fn sparkSession(self) -> SparkSession
pub async fn storageLevel(&mut self) -> StorageLevel
pub fn subtract(&mut self, other: DataFrame) -> DataFrame
pub async fn tail(&mut self, limit: i32) -> RecordBatch
Returns the last n rows as a RecordBatch
Running tail requires moving the data and results in an action
pub async fn take(&mut self, n: i32) -> RecordBatch
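A hedged sketch, assuming a mutable DataFrame df; both calls are actions and move data to the client:
async {
    let first_three = df.take(3).await;
    let last_three = df.tail(3).await;
    println!("{} and {} rows", first_three.num_rows(), last_three.num_rows());
}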
pub fn toDF<'a, I>(&mut self, cols: I) -> DataFrame
where
    I: IntoIterator<Item = &'a str>,
pub fn union(&mut self, other: DataFrame) -> DataFrame
pub fn unionAll(&mut self, other: DataFrame) -> DataFrame
pub fn unionByName( &mut self, other: DataFrame, allow_missing_columns: Option<bool> ) -> DataFrame
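A minimal sketch, assuming mutable DataFrames df1, df2, and df3 with compatible schemas (illustrative names):
async {
    // UNION ALL semantics, matching columns by position
    df1.union(df2).show(Some(10), None, None).await?;
    // match columns by name instead, tolerating columns missing on one side
    df1.unionByName(df3, Some(true)).show(Some(10), None, None).await?;
}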
pub async fn unpersist(&mut self, blocking: Option<bool>) -> DataFrame
pub fn withColumn(&mut self, colName: &str, col: Column) -> DataFrame
pub fn withColumns<I, K>(&mut self, colMap: I) -> DataFrame
pub fn withColumnsRenamed<I, K, V>(&mut self, cols: I) -> DataFrame
Returns a new DataFrame by renaming multiple columns from an iterator containing key/value pairs,
with the key as the existing column name and the value as the new column name.
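A hedged sketch, assuming a mutable DataFrame df with an "age" column and that col/lit are imported from the crate's functions module as in the examples above:
async {
    // add a derived column, then rename columns via key/value pairs
    df.withColumn("age_plus_one", lit(1.0) + col("age").cast("int"))
        .withColumnsRenamed([("age", "age_years"), ("age_plus_one", "age_next")])
        .show(Some(5), None, None)
        .await?;
}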
pub fn write(self) -> DataFrameWriter
Returns a DataFrameWriter struct based on the current DataFrame
Examples found in repository
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::default().build().await?;
let df = spark
.clone()
.range(None, 1000, 1, Some(16))
.select(col("id").alias("range_id"));
let path = "/opt/spark/examples/src/main/rust/employees/";
df.write()
.format("csv")
.mode(SaveMode::Overwrite)
.option("header", "true")
.save(path)
.await?;
let mut df = spark
.clone()
.read()
.format("csv")
.option("header", "true")
.load([path]);
df.show(Some(10), None, None).await?;
// print results may slightly vary but should be close to the below
// +--------------------------+
// | show_string |
// +--------------------------+
// | +--------+ |
// | |range_id| |
// | +--------+ |
// | |312 | |
// | |313 | |
// | |314 | |
// | |315 | |
// | |316 | |
// | |317 | |
// | |318 | |
// | |319 | |
// | |320 | |
// | |321 | |
// | +--------+ |
// | only showing top 10 rows |
// | |
// +--------------------------+
Ok(())
}
More examples
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut spark: SparkSession = SparkSessionBuilder::default().build().await?;
let paths = ["/opt/spark/examples/src/main/resources/people.csv"];
let df = spark
.clone()
.read()
.format("csv")
.option("header", "True")
.option("delimiter", ";")
.option("inferSchema", "True")
.load(paths);
df.write()
.format("delta")
.mode(SaveMode::Overwrite)
.saveAsTable("default.people_delta")
.await
.unwrap();
spark
.sql("DESCRIBE HISTORY default.people_delta")
.await?
.show(Some(1), None, Some(true))
.await
.unwrap();
// print results
// +-------------------------------------------------------------------------------------------------------+
// | show_string |
// +-------------------------------------------------------------------------------------------------------+
// | -RECORD 0-------------------------------------------------------------------------------------------- |
// | version | 3 |
// | timestamp | 2024-03-16 13:46:23.552 |
// | userId | NULL |
// | userName | NULL |
// | operation | CREATE OR REPLACE TABLE AS SELECT |
// | operationParameters | {isManaged -> true, description -> NULL, partitionBy -> [], properties -> {}} |
// | job | NULL |
// | notebook | NULL |
// | clusterId | NULL |
// | readVersion | 2 |
// | isolationLevel | Serializable |
// | isBlindAppend | false |
// | operationMetrics | {numFiles -> 1, numOutputRows -> 2, numOutputBytes -> 988} |
// | userMetadata | NULL |
// | engineInfo | Apache-Spark/3.5.0 Delta-Lake/3.0.0 |
// | only showing top 1 row |
// | |
// +-------------------------------------------------------------------------------------------------------+
Ok(())
}
Trait Implementations§
Auto Trait Implementations§
impl Freeze for DataFrame
impl !RefUnwindSafe for DataFrame
impl Send for DataFrame
impl Sync for DataFrame
impl Unpin for DataFrame
impl !UnwindSafe for DataFrame
Blanket Implementations§
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
impl<T> Instrument for T
fn instrument(self, span: Span) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
impl<T> IntoRequest<T> for T
fn into_request(self) -> Request<T>
Wrap the input message T in a tonic::Request