grass_ir/
lib.rs

1use std::{collections::BTreeMap, marker::PhantomData};
2
3pub use field_expr::{
4    BinaryParam, ComponentFieldRefParam, CondParam, ConstParam, ConstValue, FieldExpression,
5    FieldRefParam, RecordRefParam, StringRepr, UnaryParam,
6};
7use serde::{Deserialize, Serialize};
8
9use strum::AsRefStr;
10
11mod field_expr;
12
// TODO: Make sure that we use this type for all the IR values that can be passed by the runtime environment
14#[derive(Serialize, Deserialize, Debug, Clone)]
15pub struct ConstBagRef<T> {
16    const_bag_key: usize,
17    #[serde(skip)]
18    _phantom_data: PhantomData<T>,
19}
20
21impl<T> ConstBagRef<T> {
22    pub fn get_const_bag_ident(&self) -> String {
23        format!("__CONST_BAG_VALUE_{}", self.const_bag_key)
24    }
25}
26
27#[derive(Serialize, Deserialize, Debug, Clone)]
28#[serde(untagged)]
29pub enum ConstOrEnv<T> {
30    Const(T),
31    Env(ConstBagRef<T>),
32}
33
34#[derive(Serialize, Deserialize, Debug, Clone, AsRefStr)]
35#[serde(tag = "opcode")]
36pub enum GrassIR {
37    /// Cast the inner data stream to a bed3 data stream
38    CastToBed(CastToBedParam),
39    /// Assign a label to a GRASS expression
40    Let(LetBinding),
41    /// Reference to an existing GRASS expression
42    Ref(RefParam),
43    /// Open a external data source
44    Open(OpenParam),
45    /// Write the result of GRASS expression to a file/file_no
46    WriteFile(WriteFileParam),
47    /// Modify a field for each record in a GRASS expression
48    Alter(AlterParam),
49    /// Filter records in a GRASS expression
50    Filter(FilterParam),
51    /// Merge any overlapped records in a GRASS expression
52    MergeOverlap(MergeOverlapParam),
53    /// Intersect two GRASS expression
54    Intersection(IntersectParam),
55    /// Customize the output format of records in a GRASS expression
56    Format(FormatParam),
57    /// Group the records in a GRASS expression into groups
58    GroupBy(GroupByParam),
59
60    AssumeSorted(AssumeSortedParam),
61
62    InlineRust(InlineRustParam),
63
64    LoadGenomeFile(LoadGenomeFileParam),
65
66    SortedRandom(SortedRandomParam),
67
68    InternalSort(InternalSortParam),
69
70    Invert(InvertParam),
71
72    AssignTag(AssignTagParam),
73
74    TwoWayMerge(TwoWayMergeParam),
75
76    Limit(LimitParam),
77
78    Nop(NopParam),
79}
80
81#[derive(Serialize, Deserialize, Debug, Clone)]
82pub struct LimitParam {
83    pub what: Box<GrassIR>,
84    pub count: ConstOrEnv<f64>,
85}
86
87#[derive(Serialize, Deserialize, Debug, Clone)]
88pub struct TwoWayMergeParam {
89    pub expr_1: Box<GrassIR>,
90    pub expr_2: Box<GrassIR>,
91}
92
93#[derive(Serialize, Deserialize, Debug, Clone)]
94#[serde(untagged)]
95pub enum TagValue {
96    String(String),
97    Int(i64),
98    Float(f64),
99}
100
101#[derive(Serialize, Deserialize, Debug, Clone)]
102pub struct AssignTagParam {
103    pub inner: Box<GrassIR>,
104    pub tag: TagValue,
105}
106
107#[derive(Serialize, Deserialize, Debug, Clone)]
108pub struct InvertParam {
109    pub inner: Box<GrassIR>,
110}
111
112#[derive(Serialize, Deserialize, Debug, Clone)]
113pub struct InternalSortParam {
114    pub inner: Box<GrassIR>,
115}
116
117#[derive(Serialize, Deserialize, Debug, Clone)]
118pub struct NopParam {
119    pub inner: Box<GrassIR>,
120}
121
122#[derive(Serialize, Deserialize, Debug, Clone)]
123pub struct SortedRandomParam {
124    pub count: ConstOrEnv<usize>,
125    pub min_length: ConstOrEnv<u32>,
126    pub max_length: ConstOrEnv<u32>,
127}
128
129#[derive(Serialize, Deserialize, Debug, Clone)]
130pub enum LoadGenomeFileParam {
131    File(ConstOrEnv<String>),
132}
133
134#[derive(Serialize, Deserialize, Debug, Clone)]
135pub struct InlineRustParam {
136    pub env: BTreeMap<String, GrassIR>,
137    pub src: String,
138}
139
140#[derive(Serialize, Deserialize, Debug, Clone)]
141pub struct RefParam {
142    /// The symbol we are referencing
143    pub id: String,
144}
145
146#[derive(Serialize, Deserialize, Debug, Clone)]
147pub struct GroupByParam {
148    /// The expression to group
149    #[serde(rename = "inner")]
150    pub expr: Box<GrassIR>,
151    /// The list of key expressions for grouping
152    pub keys: Vec<FieldExpression>,
153}
154
155#[derive(Serialize, Deserialize, Debug, Clone)]
156pub struct FormatParam {
157    /// The expression to be formatted
158    #[serde(rename = "inner")]
159    pub expr: Box<GrassIR>,
160    /// The formatting string
161    pub fmt_str: String,
162    /// The value referred by the formatting string
163    pub values: BTreeMap<String, FieldExpression>,
164}
165
166#[derive(Serialize, Deserialize, Debug, Clone)]
167pub enum IntersectFlavor {
168    #[serde(rename = "inner")]
169    Inner,
170    #[serde(rename = "outer")]
171    Outer,
172    #[serde(rename = "left-outer")]
173    LeftOuter,
174    #[serde(rename = "right-outer")]
175    RightOuter,
176}
177
178#[derive(Serialize, Deserialize, Debug, Clone)]
179pub struct IntersectParam {
180    /// The flavor of the insection operator
181    pub flavor: IntersectFlavor,
182    /// The left-hand-side operand
183    pub lhs: Box<GrassIR>,
184    /// The right-hand-side operand
185    pub rhs: Box<GrassIR>,
186    /// If we are using the sorted algorithm
187    pub sorted: bool,
188}
189
190#[derive(Serialize, Deserialize, Debug, Clone)]
191pub struct MergeOverlapParam {
192    #[serde(rename = "inner")]
193    pub input_expr: Box<GrassIR>,
194}
195
196#[derive(Serialize, Deserialize, Debug, Clone)]
197pub struct FilterParam {
198    /// The original expression
199    #[serde(rename = "inner")]
200    pub input_expr: Box<GrassIR>,
201    ///  The condition expression
202    pub cond: FieldExpression,
203}
204
205#[derive(Serialize, Deserialize, Debug, Clone)]
206pub struct AlterParam {
207    /// The original expression
208    #[serde(rename = "inner")]
209    pub original_expr: Box<GrassIR>,
210    /// The field name that we are going to modify
211    pub field: String,
212    /// The new value this field should assigned to
213    pub value: FieldExpression,
214    pub sorted: bool,
215}
216
217#[derive(Serialize, Deserialize, Debug, Clone)]
218pub struct AssumeSortedParam {
219    pub inner: Box<GrassIR>,
220}
221
222#[derive(Serialize, Deserialize, Debug, Clone)]
223pub struct CastToBedParam {
224    pub inner: Box<GrassIR>,
225    pub num_of_fields: u32,
226    pub sorted: bool,
227}
228
229#[derive(Serialize, Deserialize, Debug, Clone)]
230pub enum InputFormat {
231    Bam,
232    Bed,
233    Cram,
234    Vcf,
235    Fasta,
236}
237
238#[derive(Serialize, Deserialize, Debug, Clone)]
239pub enum OpenTarget {
240    Path(ConstOrEnv<String>),
241    FileNo(u32),
242    CmdArg(u32),
243}
244
245#[derive(Serialize, Deserialize, Debug, Clone)]
246pub struct OpenParam {
247    /// The path to the data source
248    pub target: OpenTarget,
249    /// What format of the input we are expecting
250    pub format: InputFormat,
251    /// How many field for each record
252    pub num_of_fields: i32,
253    /// If the input file is compressed
254    pub compression: bool,
255    /// If this file is known sorted
256    pub sorted: bool,
257}
258
259#[derive(Serialize, Deserialize, Debug, Clone)]
260#[serde(untagged)]
261pub enum WriteTarget {
262    Path(ConstOrEnv<String>),
263    FileNo(i32),
264}
265
266#[derive(Serialize, Deserialize, Debug, Clone)]
267pub struct WriteFileParam {
268    /// The expression we want to write
269    pub what: Box<GrassIR>,
270    /// The target file or file number
271    pub target: WriteTarget,
272}
273
274#[derive(Serialize, Deserialize, Debug, Clone)]
275pub struct LetBinding {
276    /// The symbol of the value
277    pub id: String,
278    /// The actual expression that assigned to this symbol
279    pub value: Box<GrassIR>,
280}
281
#[cfg(test)]
mod test {
    use std::{collections::BTreeMap, error::Error};

    use serde::{Deserialize, Serialize};
    use serde_json::from_str;

    use crate::GrassIR;

    /// Minimal comparable model of a JSON document, used to compare two JSON
    /// texts structurally rather than byte-for-byte.
    ///
    /// All numbers are read as `f64` and objects are kept in a `BTreeMap`, so
    /// key order does not influence equality. There is no variant for JSON
    /// `null`; a document containing one fails to parse into this type.
    #[derive(Serialize, Deserialize, PartialEq, Clone, Debug)]
    #[serde(untagged)]
    enum JsonValue {
        String(String),
        Number(f64),
        Boolean(bool),
        List(Vec<JsonValue>),
        Object(BTreeMap<String, JsonValue>),
    }

    /// Asserts that serializing `obj` yields JSON structurally equal to
    /// `input`.
    ///
    /// Both sides are normalized through [`JsonValue`] before comparison.
    ///
    /// # Panics
    ///
    /// Panics if either JSON text cannot be parsed into a `JsonValue`, if
    /// `obj` cannot be serialized, or if the two documents differ.
    fn validate_object<T: Serialize>(input: &str, obj: &T) {
        let input_dict: JsonValue =
            from_str(input).expect("input JSON cannot be represented as JsonValue");
        let obj_str = serde_json::to_string(obj).expect("object failed to serialize to JSON");
        let obj_dict: JsonValue =
            from_str(&obj_str).expect("serialized object cannot be represented as JsonValue");
        assert_eq!(obj_dict, input_dict);
    }

    /// Generates a round-trip test: the JSON file at `$path` is embedded at
    /// compile time, parsed as `GrassIR`, and the parsed value must serialize
    /// back to a structurally identical document.
    macro_rules! parse_test {
        ($name: ident, $path : expr) => {
            #[test]
            fn $name() -> Result<(), Box<dyn Error>> {
                let input = include_str!($path);
                let data: GrassIR = from_str(input)?;
                validate_object(input, &data);
                Ok(())
            }
        };
    }
    parse_test!(parse_bam_to_bed, "../../data/ir/bam-to-bed.py.json");
    parse_test!(
        parse_expand_interval,
        "../../data/ir/expand-interval.py.json"
    );
    parse_test!(parse_filter, "../../data/ir/filter.py.json");
    parse_test!(parse_merge, "../../data/ir/merge.py.json");
    parse_test!(parse_slop, "../../data/ir/slop.py.json");
    parse_test!(
        parse_sorted_intersect_custom_format,
        "../../data/ir/sorted-intersect-custom-fmt.py.json"
    );
    parse_test!(
        parse_sorted_intersect_groupby,
        "../../data/ir/sorted-intersect-group.py.json"
    );
    parse_test!(
        parse_sorted_intersect_leftouter,
        "../../data/ir/sorted-intersect-leftouter.py.json"
    );
    parse_test!(
        parse_sorted_intersect_overlap_filter,
        "../../data/ir/sorted-intersect-overlap-filter.py.json"
    );
    parse_test!(
        parse_sorted_intersect,
        "../../data/ir/sorted-intersect.py.json"
    );
    parse_test!(parse_sorted_window, "../../data/ir/window.py.json");
}
346    );
347    parse_test!(parse_sorted_window, "../../data/ir/window.py.json");
348}