1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
use std::collections::HashMap;

pub use field_expr::{
    BinaryParam, ComponentFieldRefParam, CondParam, ConstParam, FieldExpression, FieldRefParam,
    RecordRefParam, UnaryParam, ConstValue, StringRepr
};
use serde::{Deserialize, Serialize};

use strum::AsRefStr;

mod field_expr;

/// A node of the GRASS intermediate representation.
///
/// Every variant wraps a dedicated parameter struct. The enum is serialized
/// in serde's internally tagged form: the variant name is stored in an
/// `opcode` field alongside the fields of the parameter struct.
#[derive(Serialize, Deserialize, Debug, Clone, AsRefStr)]
#[serde(tag = "opcode")]
pub enum GrassIR {
    /// Cast the inner data stream to a bed3 data stream
    CastToBed3(CastToBed3Param),
    /// Assign a label to a GRASS expression
    Let(LetBinding),
    /// Reference to an existing GRASS expression
    Ref(RefParam),
    /// Open an external data source
    Open(OpenParam),
    /// Write the result of a GRASS expression to a file/file_no
    WriteFile(WriteFileParam),
    /// Modify a field for each record in a GRASS expression
    Alter(AlterParam),
    /// Filter records in a GRASS expression
    Filter(FilterParam),
    /// Merge any overlapped records in a GRASS expression
    Merge(MergeParam),
    /// Intersect two GRASS expressions
    Intersection(IntersectParam),
    /// Customize the output format of records in a GRASS expression
    Format(FormatParam),
    /// Group the records in a GRASS expression into groups
    GroupBy(GroupByParam),

    /// Wraps an inner GRASS expression.
    /// NOTE(review): presumably tells the backend to treat the inner
    /// expression as already sorted -- confirm against the consumer of this IR.
    AssumeSorted(AssumeSortedParam),
}

/// Parameters of [`GrassIR::Ref`]: a reference to a symbol by its identifier.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct RefParam {
    /// The symbol we are referencing
    pub id: String,
}

/// Parameters of [`GrassIR::GroupBy`].
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct GroupByParam {
    /// The expression to group (appears as `inner` in the serialized form)
    #[serde(rename = "inner")]
    pub expr: Box<GrassIR>,
    /// The list of key expressions for grouping
    pub keys: Vec<FieldExpression>,
}

/// Parameters of [`GrassIR::Format`].
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct FormatParam {
    /// The expression to be formatted (appears as `inner` in the serialized form)
    #[serde(rename = "inner")]
    pub expr: Box<GrassIR>,
    /// The formatting string
    pub fmt_str: String,
    /// The values referred to by the formatting string.
    /// NOTE(review): keys are presumably the placeholder names used in
    /// `fmt_str` -- confirm against the code that renders the format.
    pub values: HashMap<String, FieldExpression>,
}

/// The flavor of an intersection, as selected by [`IntersectParam::flavor`].
///
/// Variants are (de)serialized under their kebab-case names:
/// `inner`, `outer`, `left-outer` and `right-outer`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum IntersectFlavor {
    Inner,
    Outer,
    LeftOuter,
    RightOuter,
}

/// Parameters of [`GrassIR::Intersection`].
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct IntersectParam {
    /// The flavor of the intersection operator
    pub flavor: IntersectFlavor,
    /// The left-hand-side operand
    pub lhs: Box<GrassIR>,
    /// The right-hand-side operand
    pub rhs: Box<GrassIR>,
    /// Whether we are using the sorted algorithm
    pub sorted: bool,
}

/// Parameters of [`GrassIR::Merge`].
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct MergeParam {
    /// The expression whose records are merged (appears as `inner` in the
    /// serialized form)
    #[serde(rename = "inner")]
    pub input_expr: Box<GrassIR>,
    /// NOTE(review): presumably selects the sorted algorithm, mirroring
    /// `IntersectParam::sorted` -- confirm against the executor.
    pub sorted: bool,
}

/// Parameters of [`GrassIR::Filter`].
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct FilterParam {
    /// The original expression (appears as `inner` in the serialized form)
    #[serde(rename = "inner")]
    pub input_expr: Box<GrassIR>,
    /// The condition expression
    pub cond: FieldExpression,
}

/// Parameters of [`GrassIR::Alter`].
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct AlterParam {
    /// The original expression (appears as `inner` in the serialized form)
    #[serde(rename = "inner")]
    pub original_expr: Box<GrassIR>,
    /// The name of the field that we are going to modify
    pub field: String,
    /// The new value this field should be assigned
    pub value: FieldExpression,
}


/// Parameters of [`GrassIR::AssumeSorted`].
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct AssumeSortedParam {
    /// The wrapped expression.
    /// NOTE(review): presumably the expression that is to be treated as
    /// sorted -- confirm against the consumer of this IR.
    pub inner: Box<GrassIR>,
}

/// Parameters of [`GrassIR::CastToBed3`].
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct CastToBed3Param {
    /// The expression whose data stream is cast to bed3
    pub inner: Box<GrassIR>,
}

/// The input format expected by [`OpenParam::format`].
///
/// No serde rename is applied, so each variant is (de)serialized under its
/// Rust name exactly as written (e.g. `"Bam"`).
#[derive(Serialize, Deserialize, Debug, Clone)]
pub enum InputFormat {
    Bam,
    Bed,
    Cram,
    Vcf,
    Fasta,
}

/// Parameters of [`GrassIR::Open`]: describes an external data source.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct OpenParam {
    /// The path to the data source
    pub path: String,
    /// The format we expect the input to be in
    pub format: InputFormat,
    /// How many fields each record has
    pub num_of_fields: i32,
    /// Whether the input file is compressed
    pub compression: bool,
    /// Whether this file is known to be sorted
    pub sorted: bool,
}

/// Destination of a [`GrassIR::WriteFile`] operation.
///
/// The enum is untagged: it is (de)serialized as the bare payload, so a
/// string maps to [`WriteTarget::Path`] and an integer to
/// [`WriteTarget::FileNo`].
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(untagged)]
pub enum WriteTarget {
    /// Write to the file at this path
    Path(String),
    /// Write to this file number.
    /// NOTE(review): presumably an OS file descriptor -- confirm with the writer.
    FileNo(i32),
}

/// Parameters of [`GrassIR::WriteFile`].
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct WriteFileParam {
    /// The expression we want to write
    pub what: Box<GrassIR>,
    /// The target file or file number
    pub target: WriteTarget,
}

/// Parameters of [`GrassIR::Let`]: binds an expression to a symbol.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct LetBinding {
    /// The symbol the value is bound to
    pub id: String,
    /// The actual expression that is assigned to this symbol
    pub value: Box<GrassIR>,
}

/// Round-trip tests: every IR fixture must deserialize into [`GrassIR`] and
/// serialize back to JSON that is structurally equal to the fixture.
#[cfg(test)]
mod test {
    use std::{collections::BTreeMap, error::Error};

    use serde::{Deserialize, Serialize};
    use serde_json::from_str;

    use crate::GrassIR;

    /// A minimal JSON document model used only for structural comparison.
    ///
    /// All numbers are held as `f64`, so an integer and a float literal with
    /// the same value compare equal. Objects are stored in a `BTreeMap`, which
    /// makes the comparison independent of key order.
    #[derive(Serialize, Deserialize, PartialEq, Clone, Debug)]
    #[serde(untagged)]
    enum JsonValue {
        /// JSON `null`; without this variant a fixture containing `null`
        /// could not be parsed into `JsonValue` at all.
        Null,
        String(String),
        Number(f64),
        Boolean(bool),
        List(Vec<JsonValue>),
        Object(BTreeMap<String, JsonValue>),
    }

    /// Asserts that `obj`, once serialized to JSON, is structurally equal to
    /// the JSON text in `input`.
    ///
    /// # Panics
    ///
    /// Panics if `input` is not valid JSON, if `obj` fails to serialize, or if
    /// the two documents differ.
    fn validate_object<T: Serialize>(input: &str, obj: &T) {
        let expected: JsonValue = from_str(input).expect("fixture is not valid JSON");
        let serialized = serde_json::to_string(obj).expect("object failed to serialize to JSON");
        let actual: JsonValue =
            from_str(&serialized).expect("serialized object is not valid JSON");
        assert_eq!(actual, expected);
    }

    /// Generates a `#[test]` named `$name` that embeds the fixture at `$path`
    /// at compile time, parses it as [`GrassIR`], and checks that serializing
    /// the parsed value reproduces the fixture.
    macro_rules! parse_test {
        ($name: ident, $path : expr) => {
            #[test]
            fn $name() -> Result<(), Box<dyn Error>> {
                let input = include_str!($path);
                let data: GrassIR = from_str(input)?;
                validate_object(input, &data);
                Ok(())
            }
        };
    }
    parse_test!(parse_bam_to_bed, "../../data/ir/bam-to-bed.py.json");
    parse_test!(
        parse_expand_interval,
        "../../data/ir/expand-interval.py.json"
    );
    parse_test!(parse_filter, "../../data/ir/filter.py.json");
    parse_test!(parse_merge, "../../data/ir/merge.py.json");
    parse_test!(parse_slop, "../../data/ir/slop.py.json");
    parse_test!(
        parse_sorted_intersect_custom_format,
        "../../data/ir/sorted-intersect-custom-fmt.py.json"
    );
    parse_test!(
        parse_sorted_intersect_groupby,
        "../../data/ir/sorted-intersect-group.py.json"
    );
    parse_test!(
        parse_sorted_intersect_leftouter,
        "../../data/ir/sorted-intersect-leftouter.py.json"
    );
    parse_test!(
        parse_sorted_intersect_overlap_filter,
        "../../data/ir/sorted-intersect-overlap-filter.py.json"
    );
    parse_test!(
        parse_sorted_intersect,
        "../../data/ir/sorted-intersect.py.json"
    );
    parse_test!(parse_sorted_window, "../../data/ir/window.py.json");
}