Trait datafusion::dataframe::DataFrame[−][src]

pub trait DataFrame: Send + Sync {
Show methods
    fn select_columns(&self, columns: &[&str]) -> Result<Arc<dyn DataFrame>>;
    fn select(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>;
    fn filter(&self, expr: Expr) -> Result<Arc<dyn DataFrame>>;
    fn aggregate(
        &self, 
        group_expr: Vec<Expr>, 
        aggr_expr: Vec<Expr>
    ) -> Result<Arc<dyn DataFrame>>;
    fn limit(&self, n: usize) -> Result<Arc<dyn DataFrame>>;
    fn union(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>>;
    fn sort(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>;
    fn join(
        &self, 
        right: Arc<dyn DataFrame>, 
        join_type: JoinType, 
        left_cols: &[&str], 
        right_cols: &[&str]
    ) -> Result<Arc<dyn DataFrame>>;
    fn repartition(
        &self, 
        partitioning_scheme: Partitioning
    ) -> Result<Arc<dyn DataFrame>>;
    #[must_use]
    fn collect<'life0, 'async_trait>(
        &'life0 self
    ) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>>
    where
        'life0: 'async_trait,
        Self: 'async_trait;
    #[must_use]
    fn collect_partitioned<'life0, 'async_trait>(
        &'life0 self
    ) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>>
    where
        'life0: 'async_trait,
        Self: 'async_trait;
    fn schema(&self) -> &DFSchema;
    fn to_logical_plan(&self) -> LogicalPlan;
    fn explain(&self, verbose: bool) -> Result<Arc<dyn DataFrame>>;
    fn registry(&self) -> Arc<dyn FunctionRegistry>;
}

A DataFrame represents a logical set of rows with the same named columns, similar to a Pandas DataFrame or a Spark DataFrame.

DataFrames are typically created by the read_csv and read_parquet methods on the ExecutionContext and can then be modified by calling the transformation methods, such as filter, select, aggregate, and limit, to build up a query definition.

The query can be executed by calling the collect method and awaiting the future it returns.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let df = df.filter(col("a").lt_eq(col("b")))?
           .aggregate(vec![col("a")], vec![min(col("b"))])?
           .limit(100)?;
let results = df.collect().await?;

Required methods

`fn select_columns(&self, columns: &[&str]) -> Result<Arc<dyn DataFrame>>`[src]

Filter the DataFrame by column. Returns a new DataFrame only containing the specified columns.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let df = df.select_columns(&["a", "b"])?;

`fn select(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>`[src]

Create a projection based on arbitrary expressions.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let df = df.select(vec![col("a") * col("b"), col("c")])?;

`fn filter(&self, expr: Expr) -> Result<Arc<dyn DataFrame>>`[src]

Filter a DataFrame to only include rows that match the specified filter expression.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let df = df.filter(col("a").lt_eq(col("b")))?;

`fn aggregate( &self, group_expr: Vec<Expr>, aggr_expr: Vec<Expr> ) -> Result<Arc<dyn DataFrame>>`[src]

Perform an aggregate query with optional grouping expressions.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;

// The following use is the equivalent of "SELECT MIN(b) GROUP BY a"
let _ = df.aggregate(vec![col("a")], vec![min(col("b"))])?;

// The following use is the equivalent of "SELECT MIN(b)"
let _ = df.aggregate(vec![], vec![min(col("b"))])?;

`fn limit(&self, n: usize) -> Result<Arc<dyn DataFrame>>`[src]

Limit the number of rows returned from this DataFrame.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let df = df.limit(100)?;

`fn union(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>>`[src]

Calculate the union of two DataFrames. The two DataFrames must have exactly the same schema.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let df = df.union(df.clone())?;

`fn sort(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>`[src]

Sort the DataFrame by the specified sorting expressions. Any expression can be turned into a sort expression by calling its sort method.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let df = df.sort(vec![col("a").sort(true, true), col("b").sort(false, false)])?;

`fn join( &self, right: Arc<dyn DataFrame>, join_type: JoinType, left_cols: &[&str], right_cols: &[&str] ) -> Result<Arc<dyn DataFrame>>`[src]

Join this DataFrame with another DataFrame using the specified columns as join keys.

let mut ctx = ExecutionContext::new();
let left = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let right = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?
  .select(vec![
    col("a").alias("a2"),
    col("b").alias("b2"),
    col("c").alias("c2")])?;
let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"])?;
let batches = join.collect().await?;

`fn repartition( &self, partitioning_scheme: Partitioning ) -> Result<Arc<dyn DataFrame>>`[src]

Repartition a DataFrame based on a logical partitioning scheme.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?;

`#[must_use] fn collect<'life0, 'async_trait>( &'life0 self ) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait,` [src]

Executes this DataFrame and collects all results into a vector of RecordBatch.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let batches = df.collect().await?;

`#[must_use] fn collect_partitioned<'life0, 'async_trait>( &'life0 self ) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait,` [src]

Executes this DataFrame and collects all results into a vector of vectors of RecordBatch, maintaining the input partitioning.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let batches = df.collect_partitioned().await?;

`fn schema(&self) -> &DFSchema`[src]

Returns the schema describing the output of this DataFrame in terms of columns returned, where each column has a name, data type, and nullability attribute.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let schema = df.schema();

`fn to_logical_plan(&self) -> LogicalPlan`[src]

Return the logical plan represented by this DataFrame.

`fn explain(&self, verbose: bool) -> Result<Arc<dyn DataFrame>>`[src]

Return a DataFrame with the explanation of its plan so far.

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let batches = df.limit(100)?.explain(false)?.collect().await?;

`fn registry(&self) -> Arc<dyn FunctionRegistry>`[src]

Return a FunctionRegistry used to plan calls to user-defined functions (UDFs).

let mut ctx = ExecutionContext::new();
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?;
let f = df.registry();
// use f.udf("name", vec![...]) to use the udf

Loading content...

Implementors

`impl DataFrame for DataFrameImpl`[src]

`fn select_columns(&self, columns: &[&str]) -> Result<Arc<dyn DataFrame>>`[src]

Apply a projection based on a list of column names

`fn select(&self, expr_list: Vec<Expr>) -> Result<Arc<dyn DataFrame>>`[src]

Create a projection based on arbitrary expressions

`fn filter(&self, predicate: Expr) -> Result<Arc<dyn DataFrame>>`[src]

Create a filter based on a predicate expression

`fn aggregate( &self, group_expr: Vec<Expr>, aggr_expr: Vec<Expr> ) -> Result<Arc<dyn DataFrame>>`[src]

Perform an aggregate query

`fn limit(&self, n: usize) -> Result<Arc<dyn DataFrame>>`[src]

Limit the number of rows

`fn sort(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>`[src]

Sort by specified sorting expressions

`fn join( &self, right: Arc<dyn DataFrame>, join_type: JoinType, left_cols: &[&str], right_cols: &[&str] ) -> Result<Arc<dyn DataFrame>>`[src]

Join with another DataFrame

`fn repartition( &self, partitioning_scheme: Partitioning ) -> Result<Arc<dyn DataFrame>>`[src]

`fn to_logical_plan(&self) -> LogicalPlan`[src]

Convert to logical plan

`fn collect<'life0, 'async_trait>( &'life0 self ) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait,` [src]

`fn collect_partitioned<'life0, 'async_trait>( &'life0 self ) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait,` [src]

`fn schema(&self) -> &DFSchema`[src]

Returns the schema from the logical plan

`fn explain(&self, verbose: bool) -> Result<Arc<dyn DataFrame>>`[src]

`fn registry(&self) -> Arc<dyn FunctionRegistry>`[src]

`fn union(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>>`[src]

Loading content...

Trait datafusion::dataframe::DataFrame⎘[−][src]

Required methods

fn select_columns(&self, columns: &[&str]) -> Result<Arc<dyn DataFrame>>[src]

fn select(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>[src]

fn filter(&self, expr: Expr) -> Result<Arc<dyn DataFrame>>[src]

fn aggregate( &self, group_expr: Vec<Expr>, aggr_expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>[src]

fn limit(&self, n: usize) -> Result<Arc<dyn DataFrame>>[src]

fn union(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>>[src]

fn sort(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>[src]

fn join( &self, right: Arc<dyn DataFrame>, join_type: JoinType, left_cols: &[&str], right_cols: &[&str]) -> Result<Arc<dyn DataFrame>>[src]

fn repartition( &self, partitioning_scheme: Partitioning) -> Result<Arc<dyn DataFrame>>[src]

#[must_use]fn collect<'life0, 'async_trait>( &'life0 self) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait, [src]

#[must_use]fn collect_partitioned<'life0, 'async_trait>( &'life0 self) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait, [src]

fn schema(&self) -> &DFSchema[src]

fn to_logical_plan(&self) -> LogicalPlan[src]

fn explain(&self, verbose: bool) -> Result<Arc<dyn DataFrame>>[src]

fn registry(&self) -> Arc<dyn FunctionRegistry>[src]

Implementors

impl DataFrame for DataFrameImpl[src]

fn select_columns(&self, columns: &[&str]) -> Result<Arc<dyn DataFrame>>[src]

fn select(&self, expr_list: Vec<Expr>) -> Result<Arc<dyn DataFrame>>[src]

fn filter(&self, predicate: Expr) -> Result<Arc<dyn DataFrame>>[src]

fn aggregate( &self, group_expr: Vec<Expr>, aggr_expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>[src]

fn limit(&self, n: usize) -> Result<Arc<dyn DataFrame>>[src]

fn sort(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>[src]

fn join( &self, right: Arc<dyn DataFrame>, join_type: JoinType, left_cols: &[&str], right_cols: &[&str]) -> Result<Arc<dyn DataFrame>>[src]

fn repartition( &self, partitioning_scheme: Partitioning) -> Result<Arc<dyn DataFrame>>[src]

fn to_logical_plan(&self) -> LogicalPlan[src]

fn collect<'life0, 'async_trait>( &'life0 self) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait, [src]

fn collect_partitioned<'life0, 'async_trait>( &'life0 self) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait, [src]

fn schema(&self) -> &DFSchema[src]

fn explain(&self, verbose: bool) -> Result<Arc<dyn DataFrame>>[src]

fn registry(&self) -> Arc<dyn FunctionRegistry>[src]

fn union(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>>[src]

Trait datafusion::dataframe::DataFrame[−][src]

`fn select_columns(&self, columns: &[&str]) -> Result<Arc<dyn DataFrame>>`[src]

`fn select(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>`[src]

`fn filter(&self, expr: Expr) -> Result<Arc<dyn DataFrame>>`[src]

`fn aggregate( &self, group_expr: Vec<Expr>, aggr_expr: Vec<Expr> ) -> Result<Arc<dyn DataFrame>>`[src]

`fn limit(&self, n: usize) -> Result<Arc<dyn DataFrame>>`[src]

`fn union(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>>`[src]

`fn sort(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>`[src]

`fn join( &self, right: Arc<dyn DataFrame>, join_type: JoinType, left_cols: &[&str], right_cols: &[&str] ) -> Result<Arc<dyn DataFrame>>`[src]

`fn repartition( &self, partitioning_scheme: Partitioning ) -> Result<Arc<dyn DataFrame>>`[src]

`#[must_use] fn collect<'life0, 'async_trait>( &'life0 self ) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait,` [src]

`#[must_use] fn collect_partitioned<'life0, 'async_trait>( &'life0 self ) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait,` [src]

`fn schema(&self) -> &DFSchema`[src]

`fn to_logical_plan(&self) -> LogicalPlan`[src]

`fn explain(&self, verbose: bool) -> Result<Arc<dyn DataFrame>>`[src]

`fn registry(&self) -> Arc<dyn FunctionRegistry>`[src]

`impl DataFrame for DataFrameImpl`[src]

`fn select_columns(&self, columns: &[&str]) -> Result<Arc<dyn DataFrame>>`[src]

`fn select(&self, expr_list: Vec<Expr>) -> Result<Arc<dyn DataFrame>>`[src]

`fn filter(&self, predicate: Expr) -> Result<Arc<dyn DataFrame>>`[src]

`fn aggregate( &self, group_expr: Vec<Expr>, aggr_expr: Vec<Expr> ) -> Result<Arc<dyn DataFrame>>`[src]

`fn limit(&self, n: usize) -> Result<Arc<dyn DataFrame>>`[src]

`fn sort(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>`[src]

`fn join( &self, right: Arc<dyn DataFrame>, join_type: JoinType, left_cols: &[&str], right_cols: &[&str] ) -> Result<Arc<dyn DataFrame>>`[src]

`fn repartition( &self, partitioning_scheme: Partitioning ) -> Result<Arc<dyn DataFrame>>`[src]

`fn to_logical_plan(&self) -> LogicalPlan`[src]

`fn collect<'life0, 'async_trait>( &'life0 self ) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait,` [src]

`fn collect_partitioned<'life0, 'async_trait>( &'life0 self ) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>> where 'life0: 'async_trait, Self: 'async_trait,` [src]

`fn schema(&self) -> &DFSchema`[src]

`fn explain(&self, verbose: bool) -> Result<Arc<dyn DataFrame>>`[src]

`fn registry(&self) -> Arc<dyn FunctionRegistry>`[src]

`fn union(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>>`[src]