// data_transform/parser/ast.rs

use serde::{Deserialize, Serialize};
2
3#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
4pub struct Program {
5    pub statements: Vec<Statement>,
6}
7
8#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
9pub enum Statement {
10    Assignment { name: String, pipeline: Pipeline },
11    Pipeline(Pipeline),
12}
13
14#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
15pub struct Pipeline {
16    pub source: Option<Source>,
17    pub operations: Vec<Operation>,
18}
19
20#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
21pub enum Source {
22    Read(ReadOp),
23    Variable(String),
24}
25
26#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
27pub enum Operation {
28    Read(ReadOp),
29    Variable(String),  // Variable reference (e.g., "data" in "data | filter(...)")
30    Write(WriteOp),
31    Select(SelectOp),
32    Filter(FilterOp),
33    Mutate(MutateOp),
34    Rename(RenameOp),
35    RenameAll(RenameAllOp),
36    Sort(SortOp),
37    Take(TakeOp),
38    Skip(SkipOp),
39    Slice(SliceOp),
40    Drop(DropOp),
41    Distinct(DistinctOp),
42}
43
44#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
45pub struct ReadOp {
46    pub path: String,
47    pub format: Option<String>,
48    pub delimiter: Option<char>,
49    pub header: Option<bool>,  // NEW: Whether the file has a header row
50    pub skip_rows: Option<usize>,  // NEW: Number of rows to skip before reading
51    pub trim_whitespace: Option<bool>,  // NEW: Trim leading/trailing whitespace from each line
52}
53
54#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
55pub struct WriteOp {
56    pub path: String,
57    pub format: Option<String>,
58    pub header: Option<bool>,
59    pub delimiter: Option<char>,  // NEW: Delimiter character for output
60}
61
62#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
63pub struct SelectOp {
64    pub selectors: Vec<(ColumnSelector, Option<String>)>, // (selector, optional alias)
65}
66
67#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
68pub enum ColumnSelector {
69    Name(String),
70    Index(usize), // 0-based internally, only via $N syntax
71    Range(usize, usize), // 0-based internally, only via $N..$M syntax
72    Regex(String),
73    Type(Vec<DataType>),
74    All,
75    Except(Box<ColumnSelector>),
76    And(Box<ColumnSelector>, Box<ColumnSelector>),
77}
78
79#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
80pub enum DataType {
81    Number,
82    String,
83    Boolean,
84    Date,
85    DateTime,
86}
87
88#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
89pub struct FilterOp {
90    pub condition: Expression,
91}
92
93#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
94pub struct MutateOp {
95    pub assignments: Vec<Assignment>,
96}
97
98#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
99pub struct Assignment {
100    pub column: AssignmentTarget,
101    pub expression: Expression,
102}
103
104#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
105pub enum AssignmentTarget {
106    Name(String),      // Named column: name, column_name
107    Position(usize),   // Positional column: $1, $2, etc. (1-based)
108}
109
110#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
111pub struct RenameOp {
112    pub mappings: Vec<(ColumnRef, String)>,
113}
114
115#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
116pub struct RenameAllOp {
117    pub strategy: RenameStrategy,
118}
119
120#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
121pub enum RenameStrategy {
122    Replace { old: String, new: String },
123    Sequential { prefix: String, start: usize, end: usize },
124}
125
126#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
127pub enum ColumnRef {
128    Name(String),
129    Index(usize),      // 0-based index for internal use
130    Position(usize),   // 1-based AWK-style ($1, $2, etc.)
131}
132
133#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
134pub struct SortOp {
135    pub columns: Vec<(ColumnRef, bool)>, // (column, descending)
136}
137
138#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
139pub struct TakeOp {
140    pub n: usize,
141}
142
143#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
144pub struct SkipOp {
145    pub n: usize,
146}
147
148#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
149pub struct SliceOp {
150    pub start: usize,
151    pub end: usize,
152}
153
154#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
155pub struct DropOp {
156    pub columns: Vec<ColumnSelector>,
157}
158
159#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
160pub struct DistinctOp {
161    pub columns: Option<Vec<ColumnSelector>>,  // None = all columns
162}
163
164#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
165pub enum Expression {
166    Literal(Literal),
167    Column(ColumnRef),
168    List(Vec<Literal>),  // List literal for 'in' operator: ['a', 'b', 'c']
169    Variable(String),  // Variable reference (e.g., "want" in "filter($3 in want)")
170    BinaryOp {
171        left: Box<Expression>,
172        op: BinOp,
173        right: Box<Expression>,
174    },
175    MethodCall {
176        object: Box<Expression>,
177        method: String,
178        args: Vec<Expression>,
179    },
180    Split {
181        string: Box<Expression>,
182        delimiter: Box<Expression>,
183        index: usize,
184    },
185    Lookup {
186        table: String,              // Variable name of the lookup table
187        key: Box<Expression>,       // Expression to evaluate as lookup key
188        on: LookupField,            // Field in lookup table to match against
189        return_field: LookupField,  // Field to return from lookup table
190    },
191    Replace {
192        text: Box<Expression>,      // Expression to perform replacement on
193        old: Box<Expression>,       // Pattern to replace
194        new: Box<Expression>,       // Replacement text
195    },
196    Regex(String),  // Regex pattern literal: re('pattern')
197}
198
199#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
200pub enum LookupField {
201    Name(String),          // Explicit column name: 'column_name'
202    Position(usize),       // Positional column: $1, $2, etc. (1-based)
203}
204
205#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
206pub enum Literal {
207    Number(f64),
208    String(String),
209    Boolean(bool),
210    Null,
211}
212
213#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
214pub enum BinOp {
215    Add,
216    Sub,
217    Mul,
218    Div,
219    Gt,
220    Lt,
221    Gte,
222    Lte,
223    Eq,
224    Neq,
225    And,
226    Or,
227    In,  // Membership test (value in collection)
228}