substrait_validator/parse/relations/
set.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Module providing parse/validation functions for set relations.
4//!
//! The set operation encompasses several set-level operations that support
//! combining datasets, possibly excluding records based on various types of
//! record-level matching.
8//!
9//! See <https://substrait.io/relations/logical_relations/#set-operation>
10
11use std::sync::Arc;
12
13use crate::input::proto::substrait;
14use crate::output::diagnostic;
15use crate::parse::context;
16use crate::parse::types;
17
/// Resolved flavor of a set relation, used to pick the description and
/// summary attached to the relation node.
///
/// This is derived from the protobuf `SetOp` value combined with whether the
/// relation has more than two inputs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Operation {
    /// The set operation was left unspecified.
    Invalid,
    /// Rows of the first dataset that do not exist in the second dataset.
    Subtract,
    /// Rows of the first dataset that do not exist in any of the other
    /// datasets.
    SubtractByUnion,
    /// Rows of the first dataset that do not exist in all of the other
    /// datasets.
    SubtractByIntersection,
    /// Rows of the first dataset that exist in all datasets.
    Intersect,
    /// Rows of the first dataset that exist in any of the other datasets.
    IntersectWithUnion,
    /// Rows that exist in any dataset, with duplicates removed.
    Union,
    /// All rows from all incoming datasets.
    Merge,
}
28
29/// Parse set relation.
30pub fn parse_set_rel(x: &substrait::SetRel, y: &mut context::Context) -> diagnostic::Result<()> {
31    use substrait::set_rel::SetOp;
32
33    // Parse inputs.
34    let in_types: Vec<_> = handle_rel_inputs!(x, y).collect();
35
36    // Check inputs and derive schema.
37    if in_types.len() < 2 {
38        diagnostic!(
39            y,
40            Error,
41            RelationMissing,
42            "set operations require at least two input relations"
43        );
44    }
45    let mut schema = Arc::default();
46    for in_type in in_types.iter() {
47        schema = types::assert_equal(
48            y,
49            &in_type.strip_field_names(),
50            &schema,
51            "all set inputs must have matching schemas",
52        );
53    }
54    y.set_schema(schema);
55
56    // Check set operation.
57    let op = proto_required_enum_field!(x, y, op, SetOp)
58        .1
59        .unwrap_or_default();
60    let op = match (op, in_types.len() > 2) {
61        (SetOp::Unspecified, _) => Operation::Invalid,
62        (SetOp::MinusPrimary, true) => Operation::SubtractByUnion,
63        (SetOp::MinusPrimary, false) => Operation::Subtract,
64        (SetOp::MinusMultiset, true) => Operation::SubtractByIntersection,
65        (SetOp::MinusMultiset, false) => Operation::Subtract,
66        (SetOp::IntersectionPrimary, true) => Operation::IntersectWithUnion,
67        (SetOp::IntersectionPrimary, false) => Operation::Intersect,
68        (SetOp::IntersectionMultiset, _) => Operation::Intersect,
69        (SetOp::UnionDistinct, _) => Operation::Union,
70        (SetOp::UnionAll, _) => Operation::Merge,
71        (SetOp::MinusPrimaryAll, _) | (SetOp::IntersectionMultisetAll, _) => {
72            diagnostic!(
73                y,
74                Warning,
75                NotYetImplemented,
76                "Set variant {:?} not yet supported",
77                op
78            );
79
80            handle_rel_common!(x, y);
81            handle_advanced_extension!(x, y);
82            return Ok(());
83        }
84    };
85
86    // Describe the relation.
87    match op {
88        Operation::Invalid => {
89            describe!(y, Relation, "Invalid set operation");
90        }
91        Operation::Subtract => {
92            describe!(y, Relation, "Set subtraction");
93            summary!(
94                y,
95                "Yields all rows from the first dataset that do not exist \
96                in the second dataset."
97            );
98        }
99        Operation::SubtractByUnion => {
100            describe!(y, Relation, "Set subtract by union");
101            summary!(
102                y,
103                "Yields all rows from the first dataset that do not exist \
104                in any of the other datasets."
105            );
106        }
107        Operation::SubtractByIntersection => {
108            describe!(y, Relation, "Set subtract by intersection");
109            summary!(
110                y,
111                "Yields all rows from the first dataset that do not exist in \
112                all of the other datasets."
113            );
114        }
115        Operation::Intersect => {
116            describe!(y, Relation, "Set intersection");
117            summary!(
118                y,
119                "Yields all rows from the first dataset that exist in all \
120                datasets."
121            );
122        }
123        Operation::IntersectWithUnion => {
124            describe!(y, Relation, "Set intersect with union");
125            summary!(
126                y,
127                "Yields all rows from the first dataset that exist in any of \
128                the other datasets."
129            );
130        }
131        Operation::Union => {
132            describe!(y, Relation, "Set union");
133            summary!(
134                y,
135                "Yields all rows that exist in any dataset, removing duplicates."
136            );
137        }
138        Operation::Merge => {
139            describe!(y, Relation, "Merge");
140            summary!(y, "Yields all rows from all incoming datasets.");
141        }
142    };
143
144    // Handle the common field.
145    handle_rel_common!(x, y);
146
147    // Handle the advanced extension field.
148    handle_advanced_extension!(x, y);
149
150    Ok(())
151}