Trait datafusion::dataframe::DataFrame [−][src]
pub trait DataFrame: Send + Sync {}Show methods
fn select_columns(&self, columns: &[&str]) -> Result<Arc<dyn DataFrame>>; fn select(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>; fn filter(&self, expr: Expr) -> Result<Arc<dyn DataFrame>>; fn aggregate(
&self,
group_expr: Vec<Expr>,
aggr_expr: Vec<Expr>
) -> Result<Arc<dyn DataFrame>>; fn limit(&self, n: usize) -> Result<Arc<dyn DataFrame>>; fn union(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>>; fn sort(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>; fn join(
&self,
right: Arc<dyn DataFrame>,
join_type: JoinType,
left_cols: &[&str],
right_cols: &[&str]
) -> Result<Arc<dyn DataFrame>>; fn repartition(
&self,
partitioning_scheme: Partitioning
) -> Result<Arc<dyn DataFrame>>; #[must_use] fn collect<'life0, 'async_trait>(
&'life0 self
) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>>
where
'life0: 'async_trait,
Self: 'async_trait; #[must_use] fn collect_partitioned<'life0, 'async_trait>(
&'life0 self
) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>>
where
'life0: 'async_trait,
Self: 'async_trait; fn schema(&self) -> &DFSchema; fn to_logical_plan(&self) -> LogicalPlan; fn explain(&self, verbose: bool) -> Result<Arc<dyn DataFrame>>; fn registry(&self) -> Arc<dyn FunctionRegistry>;
DataFrame represents a logical set of rows with the same named columns, similar to a Pandas DataFrame or a Spark DataFrame.
DataFrames are typically created by the read_csv and read_parquet methods on the ExecutionContext and can then be modified by calling the transformation methods, such as filter, select, aggregate, and limit, to build up a query definition.
The query can be executed by calling the collect method.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let df = df.filter(col("a").lt_eq(col("b")))? .aggregate(vec![col("a")], vec![min(col("b"))])? .limit(100)?; let results = df.collect();
Required methods
fn select_columns(&self, columns: &[&str]) -> Result<Arc<dyn DataFrame>>
[src]
Filter the DataFrame by column. Returns a new DataFrame only containing the specified columns.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let df = df.select_columns(&["a", "b"])?;
fn select(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>
[src]
Create a projection based on arbitrary expressions.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let df = df.select(vec![col("a") * col("b"), col("c")])?;
fn filter(&self, expr: Expr) -> Result<Arc<dyn DataFrame>>
[src]
Filter a DataFrame to only include rows that match the specified filter expression.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let df = df.filter(col("a").lt_eq(col("b")))?;
fn aggregate(
&self,
group_expr: Vec<Expr>,
aggr_expr: Vec<Expr>
) -> Result<Arc<dyn DataFrame>>
[src]
&self,
group_expr: Vec<Expr>,
aggr_expr: Vec<Expr>
) -> Result<Arc<dyn DataFrame>>
Perform an aggregate query with optional grouping expressions.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; // The following use is the equivalent of "SELECT MIN(b) GROUP BY a" let _ = df.aggregate(vec![col("a")], vec![min(col("b"))])?; // The following use is the equivalent of "SELECT MIN(b)" let _ = df.aggregate(vec![], vec![min(col("b"))])?;
fn limit(&self, n: usize) -> Result<Arc<dyn DataFrame>>
[src]
Limit the number of rows returned from this DataFrame.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let df = df.limit(100)?;
fn union(&self, dataframe: Arc<dyn DataFrame>) -> Result<Arc<dyn DataFrame>>
[src]
Calculate the union of two DataFrames. The two DataFrames must have exactly the same schema.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let df = df.union(df.clone())?;
fn sort(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>
[src]
Sort the DataFrame by the specified sorting expressions. Any expression can be turned into a sort expression by calling its sort method.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let df = df.sort(vec![col("a").sort(true, true), col("b").sort(false, false)])?;
fn join(
&self,
right: Arc<dyn DataFrame>,
join_type: JoinType,
left_cols: &[&str],
right_cols: &[&str]
) -> Result<Arc<dyn DataFrame>>
[src]
&self,
right: Arc<dyn DataFrame>,
join_type: JoinType,
left_cols: &[&str],
right_cols: &[&str]
) -> Result<Arc<dyn DataFrame>>
Join this DataFrame with another DataFrame using the specified columns as join keys
let mut ctx = ExecutionContext::new(); let left = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let right = ctx.read_csv("tests/example.csv", CsvReadOptions::new())? .select(vec![ col("a").alias("a2"), col("b").alias("b2"), col("c").alias("c2")])?; let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"])?; let batches = join.collect().await?;
fn repartition(
&self,
partitioning_scheme: Partitioning
) -> Result<Arc<dyn DataFrame>>
[src]
&self,
partitioning_scheme: Partitioning
) -> Result<Arc<dyn DataFrame>>
Repartition a DataFrame based on a logical partitioning scheme.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let df1 = df.repartition(Partitioning::RoundRobinBatch(4))?;
#[must_use]fn collect<'life0, 'async_trait>(
&'life0 self
) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>> where
'life0: 'async_trait,
Self: 'async_trait,
[src]
&'life0 self
) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>> where
'life0: 'async_trait,
Self: 'async_trait,
Executes this DataFrame and collects all results into a vector of RecordBatch.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let batches = df.collect().await?;
#[must_use]fn collect_partitioned<'life0, 'async_trait>(
&'life0 self
) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>> where
'life0: 'async_trait,
Self: 'async_trait,
[src]
&'life0 self
) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>> where
'life0: 'async_trait,
Self: 'async_trait,
Executes this DataFrame and collects all results into a vector of vectors of RecordBatch, maintaining the input partitioning.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let batches = df.collect_partitioned().await?;
fn schema(&self) -> &DFSchema
[src]
Returns the schema describing the output of this DataFrame in terms of columns returned, where each column has a name, data type, and nullability attribute.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let schema = df.schema();
fn to_logical_plan(&self) -> LogicalPlan
[src]
Return the logical plan represented by this DataFrame.
fn explain(&self, verbose: bool) -> Result<Arc<dyn DataFrame>>
[src]
Return a DataFrame with the explanation of its plan so far.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let batches = df.limit(100)?.explain(false)?.collect().await?;
fn registry(&self) -> Arc<dyn FunctionRegistry>
[src]
Return a FunctionRegistry used to plan UDF calls.
let mut ctx = ExecutionContext::new(); let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new())?; let f = df.registry(); // use f.udf("name", vec![...]) to use the udf
Implementors
impl DataFrame for DataFrameImpl
[src]
impl DataFrame for DataFrameImpl
[src]fn select_columns(&self, columns: &[&str]) -> Result<Arc<dyn DataFrame>>
[src]
Apply a projection based on a list of column names
fn select(&self, expr_list: Vec<Expr>) -> Result<Arc<dyn DataFrame>>
[src]
Create a projection based on arbitrary expressions
fn filter(&self, predicate: Expr) -> Result<Arc<dyn DataFrame>>
[src]
Create a filter based on a predicate expression
fn aggregate(
&self,
group_expr: Vec<Expr>,
aggr_expr: Vec<Expr>
) -> Result<Arc<dyn DataFrame>>
[src]
&self,
group_expr: Vec<Expr>,
aggr_expr: Vec<Expr>
) -> Result<Arc<dyn DataFrame>>
Perform an aggregate query
fn limit(&self, n: usize) -> Result<Arc<dyn DataFrame>>
[src]
Limit the number of rows
fn sort(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>
[src]
Sort by specified sorting expressions
fn join(
&self,
right: Arc<dyn DataFrame>,
join_type: JoinType,
left_cols: &[&str],
right_cols: &[&str]
) -> Result<Arc<dyn DataFrame>>
[src]
&self,
right: Arc<dyn DataFrame>,
join_type: JoinType,
left_cols: &[&str],
right_cols: &[&str]
) -> Result<Arc<dyn DataFrame>>
Join with another DataFrame
fn repartition(
&self,
partitioning_scheme: Partitioning
) -> Result<Arc<dyn DataFrame>>
[src]
&self,
partitioning_scheme: Partitioning
) -> Result<Arc<dyn DataFrame>>
fn to_logical_plan(&self) -> LogicalPlan
[src]
Convert to logical plan
fn collect<'life0, 'async_trait>(
&'life0 self
) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>> where
'life0: 'async_trait,
Self: 'async_trait,
[src]
&'life0 self
) -> Pin<Box<dyn Future<Output = Result<Vec<RecordBatch>>> + Send + 'async_trait>> where
'life0: 'async_trait,
Self: 'async_trait,
fn collect_partitioned<'life0, 'async_trait>(
&'life0 self
) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>> where
'life0: 'async_trait,
Self: 'async_trait,
[src]
&'life0 self
) -> Pin<Box<dyn Future<Output = Result<Vec<Vec<RecordBatch>>>> + Send + 'async_trait>> where
'life0: 'async_trait,
Self: 'async_trait,
fn schema(&self) -> &DFSchema
[src]
Returns the schema from the logical plan