Struct datafusion::dataframe::DataFrame
pub struct DataFrame { /* private fields */ }
DataFrame represents a logical set of rows with the same named columns, similar to a Pandas DataFrame or Spark DataFrame.

DataFrames are typically created by the read_csv and read_parquet methods on the SessionContext, and can then be modified by calling transformation methods such as filter, select, aggregate, and limit to build up a query definition. The query can be executed by calling the collect method.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let df = df.filter(col("a").lt_eq(col("b")))?
.aggregate(vec![col("a")], vec![min(col("b"))])?
.limit(0, Some(100))?;
let results = df.collect().await?;
Implementations
impl DataFrame
pub fn new(session_state: Arc<RwLock<SessionState>>, plan: &LogicalPlan) -> Self
Create a new DataFrame based on an existing logical plan.
pub async fn create_physical_plan(&self) -> Result<Arc<dyn ExecutionPlan>>
Create a physical plan
pub fn select_columns(&self, columns: &[&str]) -> Result<Arc<DataFrame>>
Select columns by name. Returns a new DataFrame containing only the specified columns.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let df = df.select_columns(&["a", "b"])?;
pub fn select(&self, expr_list: Vec<Expr>) -> Result<Arc<DataFrame>>
Create a projection based on arbitrary expressions.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let df = df.select(vec![col("a") * col("b"), col("c")])?;
pub fn filter(&self, predicate: Expr) -> Result<Arc<DataFrame>>
Filter a DataFrame to only include rows that match the specified filter expression.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let df = df.filter(col("a").lt_eq(col("b")))?;
pub fn aggregate(
&self,
group_expr: Vec<Expr>,
aggr_expr: Vec<Expr>
) -> Result<Arc<DataFrame>>
Perform an aggregate query with optional grouping expressions.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
// The following use is the equivalent of "SELECT MIN(b) GROUP BY a"
let _ = df.aggregate(vec![col("a")], vec![min(col("b"))])?;
// The following use is the equivalent of "SELECT MIN(b)"
let _ = df.aggregate(vec![], vec![min(col("b"))])?;
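The grouping semantics can be sketched in plain Rust over in-memory rows (an illustrative sketch only, not DataFusion's execution engine): for each distinct value of the group key, the aggregate keeps the minimum of the second column.

```rust
use std::collections::BTreeMap;

// Illustrative sketch of "SELECT a, MIN(b) ... GROUP BY a" over in-memory
// (a, b) rows. Plain Rust, not DataFusion code.
fn min_by_group(rows: &[(i32, i32)]) -> BTreeMap<i32, i32> {
    let mut mins = BTreeMap::new();
    for &(a, b) in rows {
        // Keep the smallest b seen so far for this value of a.
        mins.entry(a)
            .and_modify(|m: &mut i32| *m = (*m).min(b))
            .or_insert(b);
    }
    mins
}

fn main() {
    let rows = [(1, 10), (2, 5), (1, 3), (2, 8)];
    let mins = min_by_group(&rows);
    assert_eq!(mins[&1], 3);
    assert_eq!(mins[&2], 5);
    println!("{:?}", mins);
}
```

With an empty `group_expr`, as in the second call above, the whole input is one group and the result is a single row.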
pub fn limit(&self, skip: usize, fetch: Option<usize>) -> Result<Arc<DataFrame>>
Limit the number of rows returned from this DataFrame.

skip - Number of rows to skip before fetching any rows.
fetch - Maximum number of rows to fetch, after skipping skip rows.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let df = df.limit(0, Some(100))?;
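The skip-then-fetch behaviour can be sketched with plain iterators (illustrative only, not DataFusion code):

```rust
// Illustrative sketch of limit(skip, fetch) semantics: drop `skip` rows,
// then take at most `fetch` rows (or all remaining rows for None).
fn limit(rows: &[i32], skip: usize, fetch: Option<usize>) -> Vec<i32> {
    let it = rows.iter().copied().skip(skip);
    match fetch {
        Some(n) => it.take(n).collect(),
        None => it.collect(),
    }
}

fn main() {
    let rows: Vec<i32> = (0..10).collect();
    // Skip 2 rows, then fetch at most 3.
    assert_eq!(limit(&rows, 2, Some(3)), vec![2, 3, 4]);
    // Skip 8 rows, fetch everything that remains.
    assert_eq!(limit(&rows, 8, None), vec![8, 9]);
    println!("ok");
}
```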
pub fn distinct(&self) -> Result<Arc<DataFrame>>
Filter out duplicate rows.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let df = df.distinct()?;
pub fn sort(&self, expr: Vec<Expr>) -> Result<Arc<DataFrame>>
Sort the DataFrame by the specified sorting expressions. Any expression can be turned into a sort expression by calling its sort method.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let df = df.sort(vec![col("a").sort(true, true), col("b").sort(false, false)])?;
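The two boolean flags to sort are (asc, nulls_first). Their effect can be sketched on a single nullable column, modeled here as Option<i32> (an illustrative sketch, not DataFusion code):

```rust
use std::cmp::Ordering;

// Illustrative sketch of sort(asc, nulls_first) semantics on one nullable
// column. NULL is modeled as None. Not DataFusion's implementation.
fn sort_column(mut vals: Vec<Option<i32>>, asc: bool, nulls_first: bool) -> Vec<Option<i32>> {
    vals.sort_by(|a, b| match (a, b) {
        (None, None) => Ordering::Equal,
        // nulls_first decides where None values land relative to values.
        (None, Some(_)) => if nulls_first { Ordering::Less } else { Ordering::Greater },
        (Some(_), None) => if nulls_first { Ordering::Greater } else { Ordering::Less },
        // asc decides the direction for non-null values.
        (Some(x), Some(y)) => if asc { x.cmp(y) } else { y.cmp(x) },
    });
    vals
}

fn main() {
    let vals = vec![Some(3), None, Some(1)];
    // sort(true, true): ascending, NULLs first.
    assert_eq!(sort_column(vals.clone(), true, true), vec![None, Some(1), Some(3)]);
    // sort(false, false): descending, NULLs last.
    assert_eq!(sort_column(vals, false, false), vec![Some(3), Some(1), None]);
    println!("ok");
}
```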
pub fn join(
&self,
right: Arc<DataFrame>,
join_type: JoinType,
left_cols: &[&str],
right_cols: &[&str],
filter: Option<Expr>
) -> Result<Arc<DataFrame>>
Join this DataFrame with another DataFrame using the specified columns as join keys.
The filter expression is expected to contain non-equality predicates that cannot be pushed down to either join input. In the case of an outer join, the filter is applied only to matched rows.
let ctx = SessionContext::new();
let left = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let right = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?
.select(vec![
col("a").alias("a2"),
col("b").alias("b2"),
col("c").alias("c2")])?;
let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"], None)?;
let batches = join.collect().await?;
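An inner equi-join on a single key column can be sketched in the spirit of a hash join: build a hash table over one side, probe with the other (an illustrative sketch with hypothetical row types, not DataFusion's implementation):

```rust
use std::collections::HashMap;

// Illustrative sketch of an inner equi-join on one key column.
// Rows are (key, payload) pairs; output is one row per key match.
fn inner_join(left: &[(i32, &str)], right: &[(i32, &str)]) -> Vec<(i32, String, String)> {
    // Build phase: index the right side by the join key.
    let mut index: HashMap<i32, Vec<&str>> = HashMap::new();
    for &(k, v) in right {
        index.entry(k).or_default().push(v);
    }
    // Probe phase: for each left row, emit one output row per match.
    let mut out = Vec::new();
    for &(k, lv) in left {
        if let Some(matches) = index.get(&k) {
            for rv in matches {
                out.push((k, lv.to_string(), rv.to_string()));
            }
        }
    }
    out
}

fn main() {
    let left = [(1, "l1"), (2, "l2"), (3, "l3")];
    let right = [(2, "r2"), (3, "r3"), (3, "r3b")];
    let joined = inner_join(&left, &right);
    // Key 1 has no match; key 3 matches twice.
    assert_eq!(joined.len(), 3);
    assert_eq!(joined[0], (2, "l2".to_string(), "r2".to_string()));
    println!("{:?}", joined);
}
```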
pub fn repartition(
&self,
partitioning_scheme: Partitioning
) -> Result<Arc<DataFrame>>
Repartition a DataFrame based on a logical partitioning scheme.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?;
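Round-robin repartitioning deals incoming batches out to the target partitions in turn, which can be sketched like this (illustrative only, not DataFusion code; batches are stand-in values here):

```rust
// Illustrative sketch of Partitioning::RoundRobinBatch(n): batch i goes
// to partition i % n, balancing batch counts across partitions.
fn round_robin<T: Clone>(batches: &[T], n: usize) -> Vec<Vec<T>> {
    let mut parts = vec![Vec::new(); n];
    for (i, b) in batches.iter().enumerate() {
        parts[i % n].push(b.clone());
    }
    parts
}

fn main() {
    let batches: Vec<usize> = (0..10).collect();
    let parts = round_robin(&batches, 4);
    assert_eq!(parts[0], vec![0, 4, 8]);
    assert_eq!(parts[3], vec![3, 7]);
    println!("{:?}", parts);
}
```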
pub async fn collect(&self) -> Result<Vec<RecordBatch>>
Convert the logical plan represented by this DataFrame into a physical plan and execute it, collecting all resulting batches into memory as a vector of RecordBatch.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let batches = df.collect().await?;
pub async fn show(&self) -> Result<()>
Print results.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
df.show().await?;
pub async fn show_limit(&self, num: usize) -> Result<()>
Print results, limited to num rows.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
df.show_limit(10).await?;
pub async fn execute_stream(&self) -> Result<SendableRecordBatchStream>
Executes this DataFrame and returns a stream over a single partition.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let stream = df.execute_stream().await?;
pub async fn collect_partitioned(&self) -> Result<Vec<Vec<RecordBatch>>>
Executes this DataFrame and collects all results into a vector of vectors of RecordBatch, maintaining the input partitioning.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let batches = df.collect_partitioned().await?;
pub async fn execute_stream_partitioned(
&self
) -> Result<Vec<SendableRecordBatchStream>>
Executes this DataFrame and returns one stream per partition.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let batches = df.execute_stream_partitioned().await?;
pub fn schema(&self) -> &DFSchema
Returns the schema describing the output of this DataFrame in terms of columns returned, where each column has a name, data type, and nullability attribute.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let schema = df.schema();
pub fn to_unoptimized_plan(&self) -> LogicalPlan
Return the unoptimized logical plan represented by this DataFrame.
pub fn to_logical_plan(&self) -> Result<LogicalPlan>
Return the optimized logical plan represented by this DataFrame.
pub fn explain(&self, verbose: bool, analyze: bool) -> Result<Arc<DataFrame>>
Return a DataFrame with the explanation of its plan so far.
If analyze is specified, the plan is run and metrics are reported.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let batches = df.limit(0, Some(100))?.explain(false, false)?.collect().await?;
pub fn registry(&self) -> Arc<dyn FunctionRegistry>
Return a FunctionRegistry used to plan UDF calls.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let f = df.registry();
// use f.udf("name", vec![...]) to use the udf
pub async fn write_parquet(
&self,
path: &str,
writer_properties: Option<WriterProperties>
) -> Result<()>
Write a DataFrame to a Parquet file.
pub async fn write_json(&self, path: impl AsRef<str>) -> Result<()>
Executes a query and writes the results to a partitioned JSON file.
pub fn with_column(&self, name: &str, expr: Expr) -> Result<Arc<DataFrame>>
Add an additional column to the DataFrame.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let df = df.with_column("ab_sum", col("a") + col("b"))?;
pub fn with_column_renamed(
&self,
old_name: &str,
new_name: &str
) -> Result<Arc<DataFrame>>
Rename one column by applying a new projection. This is a no-op if the column to be renamed does not exist.
let ctx = SessionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
let df = df.with_column_renamed("ab_sum", "total")?;
Trait Implementations
impl TableProvider for DataFrame
fn table_type(&self) -> TableType
Get the type of this table for metadata/catalog purposes.
fn scan<'life0, 'life1, 'life2, 'life3, 'async_trait>(
&'life0 self,
_ctx: &'life1 SessionState,
projection: &'life2 Option<Vec<usize>>,
filters: &'life3 [Expr],
limit: Option<usize>
) -> Pin<Box<dyn Future<Output = Result<Arc<dyn ExecutionPlan>>> + Send + 'async_trait>>
where
'life0: 'async_trait,
'life1: 'async_trait,
'life2: 'async_trait,
'life3: 'async_trait,
Self: 'async_trait,
Create an ExecutionPlan that will scan the table. The table provider is usually responsible for grouping the source data into partitions that can be efficiently parallelized or distributed.
fn get_table_definition(&self) -> Option<&str>
Get the create statement used to create this table, if available.
fn supports_filter_pushdown(
&self,
_filter: &Expr
) -> Result<TableProviderFilterPushDown>
Tests whether the table provider can make use of a filter expression to optimise data retrieval.
Auto Trait Implementations
impl !RefUnwindSafe for DataFrame
impl Send for DataFrame
impl Sync for DataFrame
impl Unpin for DataFrame
impl !UnwindSafe for DataFrame
Blanket Implementations
impl<T> BorrowMut<T> for T
where
T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more