Struct spark_connect_rs::DataFrame
pub struct DataFrame { /* private fields */ }
The spark-connect-rs crate is currently just a meta-package shim for spark-connect-core.
A DataFrame is composed of a SparkSession referencing a Spark Connect enabled cluster, and a LogicalPlanBuilder which represents the unresolved spark::Plan to be submitted to the cluster when an action is called.
The LogicalPlanBuilder is a series of unresolved logical plans, and every additional transformation takes the prior spark::Plan and builds onto it. The final unresolved logical plan is submitted to the Spark Connect server.
§createDataFrame & range
A DataFrame can be created with an arrow::array::RecordBatch, or with spark.range(...).
use std::sync::Arc;
use arrow::array::{ArrayRef, Int64Array, RecordBatch, StringArray};

let name: ArrayRef = Arc::new(StringArray::from(vec!["Tom", "Alice", "Bob"]));
let age: ArrayRef = Arc::new(Int64Array::from(vec![14, 23, 16]));
let data = RecordBatch::try_from_iter(vec![("name", name), ("age", age)])?;

let df = spark.createDataFrame(&data).await?;
§sql
A DataFrame is created from a spark.sql() statement.
let df = spark.sql("SELECT * FROM json.`/opt/spark/examples/src/main/resources/employees.json`").await?;
§read & readStream
A DataFrame is also created from a spark.read() or spark.readStream() statement.
let paths = ["/opt/spark/examples/src/main/resources/people.csv"];

let df = spark
    .read()
    .format("csv")
    .option("header", "True")
    .option("delimiter", ";")
    .load(paths)?;
Implementations§
impl DataFrame
pub fn new(spark_session: SparkSession, plan: LogicalPlanBuilder) -> DataFrame
Creates a DataFrame based on a SparkSession and an initial logical plan.
pub fn agg<T>(self, exprs: T) -> DataFrame where T: ToVecExpr
Aggregate on the entire DataFrame without groups (shorthand for df.groupBy().agg()).
pub async fn cache(self) -> DataFrame
Persists the DataFrame with the default storage::StorageLevel::MemoryAndDiskDeser (MEMORY_AND_DISK_DESER).
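A minimal sketch of chaining cache into a later action (assuming df is an existing DataFrame):
// cache the plan with the default storage level, then run an action on it
let cached = df.cache().await;
let n = cached.count().await?;
println!("cached row count: {n}");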
pub fn coalesce(self, num_partitions: u32) -> DataFrame
Returns a new DataFrame that has exactly num_partitions partitions.
pub async fn count(self) -> Result<i64, SparkError>
Returns the number of rows in this DataFrame
pub fn colRegex(self, col_name: &str) -> Column
Selects column based on the column name specified as a regex and returns it as Column.
pub async fn collect(self) -> Result<RecordBatch, SparkError>
pub async fn corr(self, col1: &str, col2: &str) -> Result<f64, SparkError>
Calculates the correlation of two columns of a DataFrame as an f64. Currently only supports the Pearson Correlation Coefficient.
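For example (the column names here are hypothetical):
// compute the Pearson correlation between two numeric columns
let pearson: f64 = df.corr("age", "salary").await?;
println!("corr(age, salary) = {pearson}");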
pub async fn cov(self, col1: &str, col2: &str) -> Result<f64, SparkError>
Calculates the sample covariance for the given columns, specified by their names, as an f64.
pub async fn createTempView(self, name: &str) -> Result<(), SparkError>
Creates a local temporary view with this DataFrame.
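A minimal sketch of registering a view and then querying it with spark.sql() (the view and column names are hypothetical):
// register the DataFrame as a temporary view on the current session
df.createTempView("people").await?;
let names = spark.sql("SELECT name FROM people").await?;
names.show(Some(5), None, None).await?;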
pub async fn createGlobalTempView(self, name: &str) -> Result<(), SparkError>
pub async fn createOrReplaceGlobalTempView(self, name: &str) -> Result<(), SparkError>
pub async fn createOrReplaceTempView(self, name: &str) -> Result<(), SparkError>
Creates or replaces a local temporary view with this DataFrame
pub fn crossJoin(self, other: DataFrame) -> DataFrame
Returns the cartesian product with another DataFrame.
pub fn crosstab(self, col1: &str, col2: &str) -> DataFrame
Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
pub fn cube<T>(self, cols: T) -> GroupedData where T: ToVecExpr
Create a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregations on them.
pub fn describe<'a, I>(self, cols: Option<I>) -> DataFrame
pub fn drop<T>(self, cols: T) -> DataFrame where T: ToVecExpr
Returns a new DataFrame without the specified columns
pub fn drop_duplicates(self, cols: Option<Vec<&str>>) -> DataFrame
Returns a new DataFrame with duplicate rows removed, optionally only considering the columns given in a Vec<&str>. If no columns are supplied, all columns are used. Alias for dropDuplicates.
pub fn dropDuplicates(self, cols: Option<Vec<&str>>) -> DataFrame
Return a new DataFrame with duplicate rows removed, optionally only considering certain columns.
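For example, deduplicating on a single column (the column name is hypothetical):
// keep only the first row seen for each distinct `name`
let deduped = df.dropDuplicates(Some(vec!["name"]));
deduped.show(Some(10), None, None).await?;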
pub fn dropna(self, how: &str, threshold: Option<i32>, subset: Option<Vec<&str>>) -> DataFrame
Returns a new DataFrame omitting rows with null values.
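A minimal sketch, assuming how takes the Spark-style values "any" or "all" (the column name is hypothetical):
// drop any row where the `age` column is null
let cleaned = df.dropna("any", None, Some(vec!["age"]));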
pub async fn dtypes(self) -> Result<Vec<(String, Kind)>, SparkError>
Returns all column names and their data types as a Vec containing the field name as a String and the spark::data_type::Kind enum
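For example, listing each column with its kind (assuming spark::data_type::Kind implements Debug):
for (name, kind) in df.dtypes().await? {
    println!("{name}: {kind:?}");
}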
pub fn exceptAll(self, other: DataFrame) -> DataFrame
Return a new DataFrame containing rows in this DataFrame but not in another DataFrame while preserving duplicates.
pub async fn explain(self, mode: Option<ExplainMode>) -> Result<String, SparkError>
Prints the spark::Plan to the console
§Arguments:
mode: ExplainMode, defaults to unspecified. Possible values: simple, extended, codegen, cost, formatted, unspecified. A usage sketch follows.
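A minimal sketch (the ExplainMode variant name is assumed to mirror the simple mode listed above):
// fetch the plan explanation as a String and print it
let plan = df.explain(Some(ExplainMode::Simple)).await?;
println!("{plan}");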
pub fn filter<T>(self, condition: T) -> DataFrame where T: ToFilterExpr
pub async fn first(self) -> Result<RecordBatch, SparkError>
Returns the first row as a RecordBatch.
pub fn freqItems<'a, I>(self, cols: I, support: Option<f64>) -> DataFrame where I: IntoIterator<Item = &'a str>
Finds frequent items for columns, possibly with false positives.
pub fn groupBy<T>(self, cols: Option<T>) -> GroupedData where T: ToVecExpr
Groups the DataFrame using the specified columns, and returns a GroupedData object
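A minimal sketch, assuming GroupedData::agg accepts the same ToVecExpr-style input as DataFrame::agg, and that an aggregate helper such as F::count exists in the functions module (the column names are hypothetical):
// group by `name` and count the non-null `age` values per group
let counts = df
    .groupBy(Some([F::col("name")]))
    .agg([F::count(F::col("age"))]);
counts.show(Some(10), None, None).await?;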
pub async fn head(self, n: Option<i32>) -> Result<RecordBatch, SparkError>
Returns the first n rows.
pub fn hint<T>(self, name: &str, parameters: Option<T>) -> DataFrame where T: ToVecExpr
Specifies some hint on the current DataFrame.
pub async fn inputFiles(self) -> Result<Vec<String>, SparkError>
Returns a best-effort snapshot of the files that compose this DataFrame
pub fn intersect(self, other: DataFrame) -> DataFrame
Return a new DataFrame containing rows only in both this DataFrame and another DataFrame.
pub fn intersectAll(self, other: DataFrame) -> DataFrame
pub async fn isEmpty(self) -> Result<bool, SparkError>
Checks if the DataFrame is empty and returns a boolean value.
pub async fn isStreaming(self) -> Result<bool, SparkError>
Returns True if this DataFrame contains one or more sources that continuously return data as it arrives.
pub fn join<T>(self, other: DataFrame, on: Option<T>, how: JoinType) -> DataFrame where T: ToExpr
Joins with another DataFrame, using the given join expression.
§Example:
use spark_connect_rs::functions::col;
use spark_connect_rs::dataframe::JoinType;
async {
// join two dataframes where `id` == `name`
let condition = Some(col("id").eq(col("name")));
let df = df.join(df2, condition, JoinType::Inner);
}
pub fn melt<I, K>(self, ids: I, values: Option<K>, variable_column_name: &str, value_column_name: &str) -> DataFrame
Alias for DataFrame::unpivot
pub fn orderBy<I>(self, cols: I) -> DataFrame where I: IntoIterator<Item = Column>
pub async fn persist(self, storage_level: StorageLevel) -> DataFrame
pub async fn printSchema(self, level: Option<i32>) -> Result<String, SparkError>
Prints out the schema in the tree format to a specific level number.
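For example, fetching and printing the schema tree:
let tree = df.printSchema(None).await?;
println!("{tree}");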
pub fn rollup<T>(self, cols: T) -> GroupedData where T: ToVecExpr
Create a multi-dimensional rollup for the current DataFrame using the specified columns, and returns a GroupedData object
pub async fn sameSemantics(self, other: DataFrame) -> Result<bool, SparkError>
Returns True when the logical query plans inside both DataFrames are equal and therefore return the same results.
pub fn sample(self, lower_bound: f64, upper_bound: f64, with_replacement: Option<bool>, seed: Option<i64>) -> DataFrame
Returns a sampled subset of this DataFrame
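For example, a reproducible sample of roughly 10% of the rows without replacement (assuming the bounds are interpreted as a fraction range):
let sampled = df.sample(0.0, 0.1, Some(false), Some(42));
sampled.show(Some(10), None, None).await?;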
pub async fn schema(self) -> Result<DataType, SparkError>
Returns the schema of this DataFrame as a spark::DataType.
pub fn select<T>(self, cols: T) -> DataFrame where T: ToVecExpr
Projects a set of expressions and returns a new DataFrame
§Arguments:
cols - An object that implements ToVecExpr
§Example:
async {
df.select(vec![col("age"), col("name")]).collect().await?;
}
Examples found in repository
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::default().build().await?;
let path = ["/opt/spark/examples/src/main/resources/people.csv"];
let df = spark
.read()
.format("csv")
.option("header", "True")
.option("delimiter", ";")
.load(path)?;
df.select([
F::col("name"),
F::col("age").cast("int").alias("age_int"),
(F::lit(3.0) + F::col("age").cast("int")).alias("addition"),
])
.sort(vec![F::col("name").desc()])
.show(Some(5), None, None)
.await?;
// print results
// +--------------------------+
// | show_string |
// +--------------------------+
// | +-----+-------+--------+ |
// | |name |age_int|addition| |
// | +-----+-------+--------+ |
// | |Jorge|30 |33.0 | |
// | |Bob |32 |35.0 | |
// | +-----+-------+--------+ |
// | |
// +--------------------------+
Ok(())
}
More examples
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::remote("sc://127.0.0.1:15002/")
.build()
.await?;
let df = spark
.clone()
.range(None, 1000, 1, Some(16))
.select(col("id").alias("range_id"));
let path = "/opt/spark/examples/src/main/rust/employees/";
df.write()
.format("csv")
.mode(SaveMode::Overwrite)
.option("header", "true")
.save(path)
.await?;
let df = spark
.clone()
.read()
.format("csv")
.option("header", "true")
.load([path])?;
df.show(Some(10), None, None).await?;
// print results may slightly vary but should be close to the below
// +--------------------------+
// | show_string |
// +--------------------------+
// | +--------+ |
// | |range_id| |
// | +--------+ |
// | |312 | |
// | |313 | |
// | |314 | |
// | |315 | |
// | |316 | |
// | |317 | |
// | |318 | |
// | |319 | |
// | |320 | |
// | |321 | |
// | +--------+ |
// | only showing top 10 rows |
// | |
// +--------------------------+
Ok(())
}
pub fn selectExpr<'a, I>(self, cols: I) -> DataFrame where I: IntoIterator<Item = &'a str>
pub async fn semanticHash(self) -> Result<i32, SparkError>
pub async fn show(self, num_rows: Option<i32>, truncate: Option<i32>, vertical: Option<bool>) -> Result<(), SparkError>
Prints the first n rows to the console.
§Arguments:
num_rows: (int, optional) number of rows to show (default 10)
truncate: (int, optional) If set to 0, it truncates the strings. Any other number will not truncate the strings
vertical: (bool, optional) If set to true, prints output rows vertically (one line per column value)
Examples found in repository
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::remote("sc://127.0.0.1:15002/")
.build()
.await?;
let df = spark
.clone()
.sql("select 'apple' as word, 123 as count")
.await?;
df.write()
.mode(SaveMode::Overwrite)
.format("parquet")
.save("file:///tmp/spark-connect-write-example-output.parquet")
.await?;
let df = spark
.read()
.format("parquet")
.load(["file:///tmp/spark-connect-write-example-output.parquet"])?;
df.show(Some(100), None, None).await?;
// +---------------+
// | show_string |
// +---------------+
// | +-----+-----+ |
// | |word |count| |
// | +-----+-----+ |
// | |apple|123 | |
// | +-----+-----+ |
// | |
// +---------------+
Ok(())
}
More examples
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::default().build().await?;
let path = ["/opt/spark/examples/src/main/resources/people.csv"];
let df = spark
.read()
.format("csv")
.option("header", "True")
.option("delimiter", ";")
.load(path)?;
df.select([
F::col("name"),
F::col("age").cast("int").alias("age_int"),
(F::lit(3.0) + F::col("age").cast("int")).alias("addition"),
])
.sort(vec![F::col("name").desc()])
.show(Some(5), None, None)
.await?;
// print results
// +--------------------------+
// | show_string |
// +--------------------------+
// | +-----+-------+--------+ |
// | |name |age_int|addition| |
// | +-----+-------+--------+ |
// | |Jorge|30 |33.0 | |
// | |Bob |32 |35.0 | |
// | +-----+-------+--------+ |
// | |
// +--------------------------+
Ok(())
}
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::remote("sc://127.0.0.1:15002/")
.build()
.await?;
let df = spark
.clone()
.range(None, 1000, 1, Some(16))
.select(col("id").alias("range_id"));
let path = "/opt/spark/examples/src/main/rust/employees/";
df.write()
.format("csv")
.mode(SaveMode::Overwrite)
.option("header", "true")
.save(path)
.await?;
let df = spark
.clone()
.read()
.format("csv")
.option("header", "true")
.load([path])?;
df.show(Some(10), None, None).await?;
// print results may slightly vary but should be close to the below
// +--------------------------+
// | show_string |
// +--------------------------+
// | +--------+ |
// | |range_id| |
// | +--------+ |
// | |312 | |
// | |313 | |
// | |314 | |
// | |315 | |
// | |316 | |
// | |317 | |
// | |318 | |
// | |319 | |
// | |320 | |
// | |321 | |
// | +--------+ |
// | only showing top 10 rows |
// | |
// +--------------------------+
Ok(())
}
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::remote("sc://127.0.0.1:15002/")
.build()
.await?;
let paths = ["/opt/spark/examples/src/main/resources/people.csv"];
let df = spark
.clone()
.read()
.format("csv")
.option("header", "True")
.option("delimiter", ";")
.option("inferSchema", "True")
.load(paths)?;
df.write()
.format("delta")
.mode(SaveMode::Overwrite)
.saveAsTable("default.people_delta")
.await?;
spark
.sql("DESCRIBE HISTORY default.people_delta")
.await?
.show(Some(1), None, Some(true))
.await?;
// print results
// +-------------------------------------------------------------------------------------------------------+
// | show_string |
// +-------------------------------------------------------------------------------------------------------+
// | -RECORD 0-------------------------------------------------------------------------------------------- |
// | version | 3 |
// | timestamp | 2024-03-16 13:46:23.552 |
// | userId | NULL |
// | userName | NULL |
// | operation | CREATE OR REPLACE TABLE AS SELECT |
// | operationParameters | {isManaged -> true, description -> NULL, partitionBy -> [], properties -> {}} |
// | job | NULL |
// | notebook | NULL |
// | clusterId | NULL |
// | readVersion | 2 |
// | isolationLevel | Serializable |
// | isBlindAppend | false |
// | operationMetrics | {numFiles -> 1, numOutputRows -> 2, numOutputBytes -> 988} |
// | userMetadata | NULL |
// | engineInfo | Apache-Spark/3.5.0 Delta-Lake/3.0.0 |
// | only showing top 1 row |
// | |
// +-------------------------------------------------------------------------------------------------------+
Ok(())
}
pub fn sort<I>(self, cols: I) -> DataFrame where I: IntoIterator<Item = Column>
Examples found in repository
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::default().build().await?;
let path = ["/opt/spark/examples/src/main/resources/people.csv"];
let df = spark
.read()
.format("csv")
.option("header", "True")
.option("delimiter", ";")
.load(path)?;
df.select([
F::col("name"),
F::col("age").cast("int").alias("age_int"),
(F::lit(3.0) + F::col("age").cast("int")).alias("addition"),
])
.sort(vec![F::col("name").desc()])
.show(Some(5), None, None)
.await?;
// print results
// +--------------------------+
// | show_string |
// +--------------------------+
// | +-----+-------+--------+ |
// | |name |age_int|addition| |
// | +-----+-------+--------+ |
// | |Jorge|30 |33.0 | |
// | |Bob |32 |35.0 | |
// | +-----+-------+--------+ |
// | |
// +--------------------------+
Ok(())
}
pub fn sparkSession(self) -> Box<SparkSession>
pub async fn storageLevel(self) -> Result<StorageLevel, SparkError>
pub fn subtract(self, other: DataFrame) -> DataFrame
pub async fn tail(self, limit: i32) -> Result<RecordBatch, SparkError>
Returns the last n rows as a RecordBatch. Running tail requires moving the data and results in an action.
pub async fn take(self, n: i32) -> Result<RecordBatch, SparkError>
pub fn toDF<'a, I>(self, cols: I) -> DataFrame where I: IntoIterator<Item = &'a str>
pub fn union(self, other: DataFrame) -> DataFrame
pub fn unionAll(self, other: DataFrame) -> DataFrame
pub fn unionByName(self, other: DataFrame, allow_missing_columns: Option<bool>) -> DataFrame
pub async fn unpersist(self, blocking: Option<bool>) -> DataFrame
pub fn unpivot<I, K>(self, ids: I, values: Option<K>, variable_column_name: &str, value_column_name: &str) -> DataFrame
Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns set. This is the reverse to groupBy(…).pivot(…).agg(…), except for the aggregation, which cannot be reversed.
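A minimal sketch, assuming ids and values accept the same column collections as select (the column names are hypothetical):
// keep `id` as the identifier and melt `col_a`/`col_b` into variable/value pairs
let long_df = df.unpivot(
    [F::col("id")],
    Some([F::col("col_a"), F::col("col_b")]),
    "variable",
    "value",
);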
pub fn withColumn(self, colName: &str, col: Column) -> DataFrame
pub fn withColumns<I, K>(self, colMap: I) -> DataFrame
pub fn withColumnsRenamed<I, K, V>(self, cols: I) -> DataFrame
Returns a new DataFrame by renaming multiple columns from an iterator of key/value pairs, with the key as the existing column name and the value as the new column name.
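A minimal sketch, assuming the iterator yields (existing_name, new_name) string pairs (the column names are hypothetical):
let renamed = df.withColumnsRenamed([("age", "age_years"), ("name", "full_name")]);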
pub fn write(self) -> DataFrameWriter
Returns a DataFrameWriter struct based on the current DataFrame
Examples found in repository
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::remote("sc://127.0.0.1:15002/")
.build()
.await?;
let df = spark
.clone()
.sql("select 'apple' as word, 123 as count")
.await?;
df.write()
.mode(SaveMode::Overwrite)
.format("parquet")
.save("file:///tmp/spark-connect-write-example-output.parquet")
.await?;
let df = spark
.read()
.format("parquet")
.load(["file:///tmp/spark-connect-write-example-output.parquet"])?;
df.show(Some(100), None, None).await?;
// +---------------+
// | show_string |
// +---------------+
// | +-----+-----+ |
// | |word |count| |
// | +-----+-----+ |
// | |apple|123 | |
// | +-----+-----+ |
// | |
// +---------------+
Ok(())
}
More examples
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::remote("sc://127.0.0.1:15002/")
.build()
.await?;
let df = spark
.clone()
.range(None, 1000, 1, Some(16))
.select(col("id").alias("range_id"));
let path = "/opt/spark/examples/src/main/rust/employees/";
df.write()
.format("csv")
.mode(SaveMode::Overwrite)
.option("header", "true")
.save(path)
.await?;
let df = spark
.clone()
.read()
.format("csv")
.option("header", "true")
.load([path])?;
df.show(Some(10), None, None).await?;
// print results may slightly vary but should be close to the below
// +--------------------------+
// | show_string |
// +--------------------------+
// | +--------+ |
// | |range_id| |
// | +--------+ |
// | |312 | |
// | |313 | |
// | |314 | |
// | |315 | |
// | |316 | |
// | |317 | |
// | |318 | |
// | |319 | |
// | |320 | |
// | |321 | |
// | +--------+ |
// | only showing top 10 rows |
// | |
// +--------------------------+
Ok(())
}
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession = SparkSessionBuilder::remote("sc://127.0.0.1:15002/")
.build()
.await?;
let paths = ["/opt/spark/examples/src/main/resources/people.csv"];
let df = spark
.clone()
.read()
.format("csv")
.option("header", "True")
.option("delimiter", ";")
.option("inferSchema", "True")
.load(paths)?;
df.write()
.format("delta")
.mode(SaveMode::Overwrite)
.saveAsTable("default.people_delta")
.await?;
spark
.sql("DESCRIBE HISTORY default.people_delta")
.await?
.show(Some(1), None, Some(true))
.await?;
// print results
// +-------------------------------------------------------------------------------------------------------+
// | show_string |
// +-------------------------------------------------------------------------------------------------------+
// | -RECORD 0-------------------------------------------------------------------------------------------- |
// | version | 3 |
// | timestamp | 2024-03-16 13:46:23.552 |
// | userId | NULL |
// | userName | NULL |
// | operation | CREATE OR REPLACE TABLE AS SELECT |
// | operationParameters | {isManaged -> true, description -> NULL, partitionBy -> [], properties -> {}} |
// | job | NULL |
// | notebook | NULL |
// | clusterId | NULL |
// | readVersion | 2 |
// | isolationLevel | Serializable |
// | isBlindAppend | false |
// | operationMetrics | {numFiles -> 1, numOutputRows -> 2, numOutputBytes -> 988} |
// | userMetadata | NULL |
// | engineInfo | Apache-Spark/3.5.0 Delta-Lake/3.0.0 |
// | only showing top 1 row |
// | |
// +-------------------------------------------------------------------------------------------------------+
Ok(())
}
pub fn writeStream(self) -> DataStreamWriter
Interface for DataStreamWriter to save the content of the streaming DataFrame out into external storage.
Examples found in repository
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let spark: SparkSession =
SparkSessionBuilder::remote("sc://127.0.0.1:15002/;user_id=stream_example")
.build()
.await?;
let df = spark
.readStream()
.format("rate")
.option("rowsPerSecond", "5")
.load(None)?;
let query = df
.writeStream()
.format("console")
.queryName("example_stream")
.outputMode(OutputMode::Append)
.trigger(Trigger::ProcessingTimeInterval("1 seconds".to_string()))
.start(None)
.await?;
// loop to get multiple progression stats
for _ in 1..5 {
thread::sleep(time::Duration::from_secs(5));
let val = &query.clone().lastProgress().await?;
println!("{}", val);
}
// stop the active stream
query.stop().await?;
Ok(())
}
Trait Implementations§
Auto Trait Implementations§
impl Freeze for DataFrame
impl !RefUnwindSafe for DataFrame
impl Send for DataFrame
impl Sync for DataFrame
impl Unpin for DataFrame
impl !UnwindSafe for DataFrame
Blanket Implementations§
impl<T> BorrowMut<T> for T where T: ?Sized
fn borrow_mut(&mut self) -> &mut T
impl<T> Instrument for T
fn instrument(self, span: Span) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
impl<T> IntoRequest<T> for T
fn into_request(self) -> Request<T>
Wrap the input message T in a tonic::Request