wdl-doc 0.3.1

Documentation generator for Workflow Description Language (WDL) documents.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
## Read groups are defined in the SAM spec
## - ID: Read group identifier. Each Read Group must have a unique ID.
##     The value of ID is used in the RG tags of alignment records.
## - BC: Barcode sequence identifying the sample or library. This value is the
##     expected barcode bases as read by the sequencing machine in the absence
##     of errors. If there are several barcodes for the sample/library
##     (e.g., one on each end of the template), the recommended implementation
##     concatenates all the barcodes separating them with hyphens (`-`).
## - CN: Name of sequencing center producing the read.
## - DS: Description.
## - DT: Date the run was produced (ISO8601 date or date/time).
## - FO: Flow order. The array of nucleotide bases that correspond to the nucleotides
##     used for each flow of each read. Multi-base flows are encoded in IUPAC format,
##     and non-nucleotide flows by various other characters.
##     Format: /\\*|[ACMGRSVTWYHKDBN]+/
## - KS: The array of nucleotide bases that correspond to the key sequence of each read.
## - LB: Library.
## - PG: Programs used for processing the read group.
## - PI: Predicted median insert size, rounded to the nearest integer.
## - PL: Platform/technology used to produce the reads.
##     Valid values: CAPILLARY, DNBSEQ (MGI/BGI), ELEMENT, HELICOS, ILLUMINA, IONTORRENT,
##     LS454, ONT (Oxford Nanopore), PACBIO (Pacific Biosciences), SINGULAR, SOLID,
##     and ULTIMA. This field should be omitted when the technology is not in this list
##     (though the PM field may still be present in this case) or is unknown.
## - PM: Platform model. Free-form text providing further details of the
##     platform/technology used.
## - PU: Platform unit (e.g., flowcell-barcode.lane for Illumina or slide
##     for SOLiD). Unique identifier.
## - SM: Sample. Use pool name where a pool is being sequenced.
##
## An example input JSON entry for `read_group` might look like this:
## ```
## {
##     "read_group": {
##         "ID": "rg1",
##         "PI": 150,
##         "PL": "ILLUMINA",
##         "SM": "Sample",
##         "LB": "Sample"
##     }
## }
## ```

version 1.1

# SAM read group (`@RG`) record. Field definitions are listed in the
# documentation preamble at the top of this file. Only `ID` is required;
# every other field is optional.
#@ except: SnakeCase
struct ReadGroup {
    String ID  # Read group identifier (required)
    String? BC  # Barcode sequence identifying the sample or library
    String? CN  # Name of sequencing center producing the read
    String? DS  # Description
    String? DT  # Date the run was produced (ISO8601 date or date/time)
    String? FO  # Flow order
    String? KS  # Nucleotide bases corresponding to the key sequence of each read
    String? LB  # Library
    String? PG  # Programs used for processing the read group
    Int? PI  # Predicted median insert size, rounded to the nearest integer
    String? PL  # Platform/technology used to produce the reads
    String? PM  # Platform model (free-form text)
    String? PU  # Platform unit
    String? SM  # Sample
}

task read_group_to_string {
    meta {
        description: "Stringifies a ReadGroup struct"
        outputs: {
            stringified_read_group: "Input ReadGroup as a string"
        }
    }

    parameter_meta {
        read_group: "ReadGroup struct to stringify"
    }

    input {
        ReadGroup read_group
    }

    # Every optional field renders as " TAG:value" (with a leading space) when
    # it is defined and as the empty string when it is not.
    String bc_tag = if defined(read_group.BC) then " BC:~{read_group.BC}" else ""
    String cn_tag = if defined(read_group.CN) then " CN:~{read_group.CN}" else ""
    String ds_tag = if defined(read_group.DS) then " DS:~{read_group.DS}" else ""
    String dt_tag = if defined(read_group.DT) then " DT:~{read_group.DT}" else ""
    String fo_tag = if defined(read_group.FO) then " FO:~{read_group.FO}" else ""
    String ks_tag = if defined(read_group.KS) then " KS:~{read_group.KS}" else ""
    String lb_tag = if defined(read_group.LB) then " LB:~{read_group.LB}" else ""
    String pg_tag = if defined(read_group.PG) then " PG:~{read_group.PG}" else ""
    String pi_tag = if defined(read_group.PI) then " PI:~{read_group.PI}" else ""
    String pl_tag = if defined(read_group.PL) then " PL:~{read_group.PL}" else ""
    String pm_tag = if defined(read_group.PM) then " PM:~{read_group.PM}" else ""
    String pu_tag = if defined(read_group.PU) then " PU:~{read_group.PU}" else ""
    String sm_tag = if defined(read_group.SM) then " SM:~{read_group.SM}" else ""

    # Tags are emitted in a fixed order: BC, CN, DS, DT, FO, KS, LB, PG, PI,
    # PL, PM, PU, SM.
    String optional_tags = (
        bc_tag + cn_tag + ds_tag + dt_tag + fo_tag + ks_tag + lb_tag
        + pg_tag + pi_tag + pl_tag + pm_tag + pu_tag + sm_tag
    )

    command <<<
        # ID is the only required field, so it always leads the line.
        echo "ID:~{read_group.ID}~{optional_tags}" > out.txt
    >>>

    output {
        String stringified_read_group = read_string("out.txt")
    }

    runtime {
        memory: "4 GB"
        disks: "10 GB"
        container: "ghcr.io/stjudecloud/util:1.4.0"
        maxRetries: 1
    }
}

task get_read_groups {
    meta {
        description: "Gets read group information from a BAM file and writes it out as JSON which is converted to a WDL struct."
        outputs: {
            read_groups: "An array of ReadGroup structs containing read group information."
        }
    }

    parameter_meta {
        bam: "Input BAM format file to get read groups from"
        modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB."
    }

    input {
        File bam
        Int modify_disk_size_gb = 0
    }

    # Disk request: BAM size rounded up to whole GiB, plus 10 GB of headroom,
    # plus the caller-supplied adjustment.
    Float bam_size = size(bam, "GiB")
    Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb

    #@ except: LineWidth
    command <<<
        set -euo pipefail
        BAM="~{bam}" OUTFILE="read_groups.json" python - <<END
        import os  # lint-check: ignore
        import pysam  # lint-check: ignore
        import json  # lint-check: ignore

        # Read the RG header records first. The context manager closes the BAM
        # handle even when the lookup raises (e.g. the header has no RG records),
        # and no output file is created in that case.
        with pysam.AlignmentFile(os.environ["BAM"], "rb") as sam:
            header = sam.header.to_dict()["RG"]

        # Emit read groups sorted by ID, with the PL value upper-cased and all
        # other fields passed through unchanged.
        modified_header = [
            {k: v.upper() if k == 'PL' else v for k, v in read_group.items()}
            for read_group in sorted(header, key=lambda d: d['ID'])
        ]

        # The context manager guarantees the JSON is flushed and the file closed.
        with open(os.environ["OUTFILE"], "w") as out_file:
            json.dump(modified_header, out_file)
        END
    >>>

    output {
        Array[ReadGroup] read_groups = read_json("read_groups.json")
    }

    runtime {
        memory: "4 GB"
        disks: "~{disk_size_gb} GB"
        container: "quay.io/biocontainers/pysam:0.22.0--py38h15b938a_1"
        maxRetries: 1
    }
}

task validate_read_group {
    meta {
        description: "Validate a ReadGroup struct's fields are defined"
        outputs: {
            check: "Dummy output to indicate success and enable call-caching"
        }
    }

    parameter_meta {
        read_group: "ReadGroup struct to validate"
        required_fields: "Array of read group fields that must be defined. The ID field is always required and does not need to be specified."
        restrictive: "If true, run a less permissive validation of field values. Otherwise, check against SAM spec-defined values."
    }

    input {
        ReadGroup read_group
        Array[String] required_fields = []
        Boolean restrictive = true
    }

    # The SAM spec allows any printable ASCII character in header fields.
    String sam_spec_pattern = "[\\ -~]+"
    # We have the opinion that is too permissive for ID and SM.
    # In restrictive mode an ID that is exactly `id_pattern`, an SM that is
    # exactly `sample_pattern`, or either value containing a space is rejected.
    String id_pattern = "id"
    String sample_pattern = "sample.?"
    String restrictive_pattern = "\\ "  # Disallow spaces
    # Platform names accepted for the PL field.
    Array[String] platforms = [
        "CAPILLARY", "DNBSEQ", "ELEMENT", "HELICOS", "ILLUMINA", "IONTORRENT", "LS454",
        "ONT", "PACBIO", "SINGULAR", "SOLID", "ULTIMA",
    ]

    command <<<
        # Every check runs and reports to stderr; the task fails once at the end
        # if any check failed, so all problems are reported in a single run.
        #
        # Each optional field gets two checks:
        #   1. if the field is listed in required_fields, it must be non-empty;
        #   2. if the field is defined, its value must match the field's pattern.
        error=0
        if [[ ~{restrictive} == "true" ]]
        then
            # The ID must be quoted: an unquoted value that is empty or contains
            # a space is a syntax error inside [[ ]] rather than a failed match.
            if [[ "~{read_group.ID}" =~ ^~{id_pattern}$ ]] \
            || [[ "~{read_group.ID}" =~ ~{restrictive_pattern} ]]
            then
                >&2 echo "ID (~{read_group.ID}) must not match patterns:"
                >&2 echo "'~{id_pattern}' or '~{restrictive_pattern}'"
                error=1
            fi
        fi
        if [[ ! "~{read_group.ID}" =~ ^~{sam_spec_pattern}$ ]]
        then
            >&2 echo "ID must match pattern ~{sam_spec_pattern}"
            error=1
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "SM") -eq 1 ]
        then
            if [ -z "~{read_group.SM}" ]
            then
                >&2 echo "SM is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.SM)}" == "true" ]
        then
            if [[ ~{restrictive} == "true" ]]
            then
                if [[ "~{read_group.SM}" =~ ^~{sample_pattern}$ ]] \
                || [[ "~{read_group.SM}" =~ ~{restrictive_pattern} ]]
                then
                    >&2 echo "SM must not match patterns:"
                    >&2 echo "'~{sample_pattern}' or '~{restrictive_pattern}'"
                    error=1
                fi
            fi
            if [[ ! "~{read_group.SM}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "SM must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "BC") -eq 1 ]
        then
            if [ -z "~{read_group.BC}" ]
            then
                >&2 echo "BC is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.BC)}" == "true" ]
        then
            if [[ ! "~{read_group.BC}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "BC must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "CN") -eq 1 ]
        then
            if [ -z "~{read_group.CN}" ]
            then
                >&2 echo "CN is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.CN)}" == "true" ]
        then
            if [[ ! "~{read_group.CN}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "CN must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "DS") -eq 1 ]
        then
            if [ -z "~{read_group.DS}" ]
            then
                >&2 echo "DS is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.DS)}" == "true" ]
        then
            if [[ ! "~{read_group.DS}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "DS must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "DT") -eq 1 ]
        then
            if [ -z "~{read_group.DT}" ]
            then
                >&2 echo "DT is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.DT)}" == "true" ]
        then
            if [[ ! "~{read_group.DT}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "DT must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "FO") -eq 1 ]
        then
            if [ -z "~{read_group.FO}" ]
            then
                >&2 echo "FO is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.FO)}" == "true" ]
        then
            # The alternation is grouped so that both anchors apply to both
            # alternatives; ungrouped, "^\*" and "[...]+$" are anchored on one
            # side only and values such as "*junk" or "junkA" would pass.
            if [[ ! "~{read_group.FO}" =~ ^(\*|[ACMGRSVTWYHKDBN]+)$ ]]
            then
                >&2 echo "FO must match pattern \*|[ACMGRSVTWYHKDBN]+"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "KS") -eq 1 ]
        then
            if [ -z "~{if defined(read_group.KS) then read_group.KS else ""}" ]
            then
                >&2 echo "KS is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.KS)}" == "true" ]
        then
            if [[ ! "~{read_group.KS}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "KS must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "LB") -eq 1 ]
        then
            if [ -z "~{read_group.LB}" ]
            then
                >&2 echo "LB is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.LB)}" == "true" ]
        then
            if [[ ! "~{read_group.LB}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "LB must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "PG") -eq 1 ]
        then
            if [ -z "~{read_group.PG}" ]
            then
                >&2 echo "PG is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.PG)}" == "true" ]
        then
            if [[ ! "~{read_group.PG}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "PG must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "PI") -eq 1 ]
        then
            if [ -z "~{read_group.PI}" ]
            then
                >&2 echo "PI is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.PI)}" == "true" ]
        then
            if [[ ! "~{read_group.PI}" =~ ^[0-9]+$ ]]
            then
                >&2 echo "PI must match pattern [0-9]+"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "PL") -eq 1 ]
        then
            if [ -z "~{read_group.PL}" ]
            then
                >&2 echo "PL is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.PL)}" == "true" ]
        then
            # The platform alternation is grouped so the value must equal one
            # platform name exactly; ungrouped, only the first and last names
            # are anchored and any value containing e.g. "ILLUMINA" would pass.
            if [[ ! "~{read_group.PL}" =~ ^(~{sep("|", platforms)})$ ]]
            then
                >&2 echo "PL must match pattern ~{sep("|", platforms)}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "PM") -eq 1 ]
        then
            if [ -z "~{read_group.PM}" ]
            then
                >&2 echo "PM is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.PM)}" == "true" ]
        then
            if [[ ! "~{read_group.PM}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "PM must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "PU") -eq 1 ]
        then
            if [ -z "~{read_group.PU}" ]
            then
                >&2 echo "PU is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.PU)}" == "true" ]
        then
            if [[ ! "~{read_group.PU}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "PU must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi

        if [ $error -eq 1 ]
        then
            exit 1
        fi
    >>>

    output {
        String check = "passed"
    }

    runtime {
        memory: "4 GB"
        disks: "10 GB"
        container: "ghcr.io/stjudecloud/util:1.4.0"
        maxRetries: 0
    }
}