wdl-doc 0.3.1 - Docs.rs

## Read groups are defined in the SAM spec
## - ID: Read group identifier. Each Read Group must have a unique ID.
##     The value of ID is used in the RG tags of alignment records.
## - BC: Barcode sequence identifying the sample or library. This value is the
##     expected barcode bases as read by the sequencing machine in the absence
##     of errors. If there are several barcodes for the sample/library
##     (e.g., one on each end of the template), the recommended implementation
##     concatenates all the barcodes separating them with hyphens (`-`).
## - CN: Name of sequencing center producing the read.
## - DS: Description.
## - DT: Date the run was produced (ISO8601 date or date/time).
## - FO: Flow order. The array of nucleotide bases that correspond to the nucleotides
##     used for each flow of each read. Multi-base flows are encoded in IUPAC format,
##     and non-nucleotide flows by various other characters.
##     Format: /\\*|[ACMGRSVTWYHKDBN]+/
## - KS: The array of nucleotide bases that correspond to the key sequence of each read.
## - LB: Library.
## - PG: Programs used for processing the read group.
## - PI: Predicted median insert size, rounded to the nearest integer.
## - PL: Platform/technology used to produce the reads.
##     Valid values: CAPILLARY, DNBSEQ (MGI/BGI), ELEMENT, HELICOS, ILLUMINA, IONTORRENT,
##     LS454, ONT (Oxford Nanopore), PACBIO (Pacific Biosciences), SINGULAR, SOLID,
##     and ULTIMA. This field should be omitted when the technology is not in this list
##     (though the PM field may still be present in this case) or is unknown.
## - PM: Platform model. Free-form text providing further details of the
##     platform/technology used.
## - PU: Platform unit (e.g., flowcell-barcode.lane for Illumina or slide
##     for SOLiD). Unique identifier.
## - SM: Sample. Use pool name where a pool is being sequenced.
##
## An example input JSON entry for `read_group` might look like this:
## ```
## {
##     "read_group": {
##         "ID": "rg1",
##         "PI": 150,
##         "PL": "ILLUMINA",
##         "SM": "Sample",
##         "LB": "Sample"
##     }
## }
## ```

version 1.1

# Representation of a single SAM `@RG` header record. Only `ID` is required;
# every other tag is optional. The two-letter, upper-case member names mirror
# the SAM tag names, which is why the SnakeCase lint rule is suppressed below.
# The preamble at the top of this file summarizes each tag.
# See the `read_groups` `parameter_meta` for definitions of each field
#@ except: SnakeCase
struct ReadGroup {
    String ID  # Read group identifier (required)
    String? BC  # Barcode sequence identifying the sample or library
    String? CN  # Name of sequencing center producing the read
    String? DS  # Description
    String? DT  # Date the run was produced (ISO8601 date or date/time)
    String? FO  # Flow order
    String? KS  # Nucleotide bases corresponding to the key sequence of each read
    String? LB  # Library
    String? PG  # Programs used for processing the read group
    Int? PI  # Predicted median insert size, rounded to the nearest integer
    String? PL  # Platform/technology used to produce the reads
    String? PM  # Platform model (free-form text)
    String? PU  # Platform unit; unique identifier
    String? SM  # Sample (or pool name where a pool is being sequenced)
}

task read_group_to_string {
    meta {
        description: "Stringifies a ReadGroup struct"
        outputs: {
            stringified_read_group: "Input ReadGroup as a string"
        }
    }

    parameter_meta {
        read_group: "ReadGroup struct to stringify"
    }

    input {
        ReadGroup read_group
    }

    # One " TAG:value" fragment per optional field, in the fixed order used for
    # the final string. A field that is not defined contributes an empty string,
    # so it simply disappears from the result.
    Array[String] optional_tags = [
        if defined(read_group.BC) then " BC:~{read_group.BC}" else "",
        if defined(read_group.CN) then " CN:~{read_group.CN}" else "",
        if defined(read_group.DS) then " DS:~{read_group.DS}" else "",
        if defined(read_group.DT) then " DT:~{read_group.DT}" else "",
        if defined(read_group.FO) then " FO:~{read_group.FO}" else "",
        if defined(read_group.KS) then " KS:~{read_group.KS}" else "",
        if defined(read_group.LB) then " LB:~{read_group.LB}" else "",
        if defined(read_group.PG) then " PG:~{read_group.PG}" else "",
        if defined(read_group.PI) then " PI:~{read_group.PI}" else "",
        if defined(read_group.PL) then " PL:~{read_group.PL}" else "",
        if defined(read_group.PM) then " PM:~{read_group.PM}" else "",
        if defined(read_group.PU) then " PU:~{read_group.PU}" else "",
        if defined(read_group.SM) then " SM:~{read_group.SM}" else ""
    ]

    # ID is the only required field, so it is always written first; the
    # optional fragments (each carrying its own leading space) follow it on the
    # same line.
    command <<<
        echo "ID:~{read_group.ID}~{sep("", optional_tags)}" > out.txt
    >>>

    output {
        String stringified_read_group = read_string("out.txt")
    }

    runtime {
        memory: "4 GB"
        disks: "10 GB"
        container: "ghcr.io/stjudecloud/util:1.4.0"
        maxRetries: 1
    }
}

task get_read_groups {
    meta {
        description: "Gets read group information from a BAM file and writes it out as JSON which is converted to a WDL struct."
        outputs: {
            read_groups: "An array of ReadGroup structs containing read group information."
        }
    }

    parameter_meta {
        bam: "Input BAM format file to get read groups from"
        modify_disk_size_gb: "Add to or subtract from dynamic disk space allocation. Default disk size is determined by the size of the inputs. Specified in GB."
    }

    input {
        File bam
        Int modify_disk_size_gb = 0
    }

    # Disk request: BAM size rounded up to whole GiB, plus 10 GB of headroom,
    # plus the caller-supplied adjustment.
    Float bam_size = size(bam, "GiB")
    Int disk_size_gb = ceil(bam_size) + 10 + modify_disk_size_gb

    # The embedded Python script:
    #   1. reads the `RG` entries from the BAM header,
    #   2. sorts them by `ID`,
    #   3. upper-cases the `PL` value of each entry (all other values are kept as-is),
    #   4. writes the resulting list as JSON to `read_groups.json`.
    # Both the BAM handle and the output file are managed by `with` blocks so they
    # are closed even if an error occurs. A header with no `@RG` records aborts
    # with an explicit message rather than an opaque `KeyError`.
    #@ except: LineWidth
    command <<<
        set -euo pipefail
        BAM="~{bam}" OUTFILE="read_groups.json" python - <<END
        import json  # lint-check: ignore
        import os  # lint-check: ignore
        import sys  # lint-check: ignore
        import pysam  # lint-check: ignore

        with pysam.AlignmentFile(os.environ["BAM"], "rb") as sam:
            header = sam.header.to_dict().get("RG", [])

        if not header:
            sys.exit("No @RG records were found in the header of the input BAM")

        modified_header = []
        for read_group in sorted(header, key=lambda d: d['ID']):
            modified_header.append({k:v.upper() if k=='PL' else v for k,v in read_group.items()})

        with open(os.environ["OUTFILE"], "w") as out_file:
            json.dump(modified_header, out_file)
        END
    >>>

    output {
        Array[ReadGroup] read_groups = read_json("read_groups.json")
    }

    runtime {
        memory: "4 GB"
        disks: "~{disk_size_gb} GB"
        container: "quay.io/biocontainers/pysam:0.22.0--py38h15b938a_1"
        maxRetries: 1
    }
}

task validate_read_group {
    meta {
        description: "Validate a ReadGroup struct's fields are defined"
        outputs: {
            check: "Dummy output to indicate success and enable call-caching"
        }
    }

    parameter_meta {
        read_group: "ReadGroup struct to validate"
        required_fields: "Array of read group fields that must be defined. The ID field is always required and does not need to be specified."
        restrictive: "If true, run a less permissive validation of field values. Otherwise, check against SAM spec-defined values."
    }

    input {
        ReadGroup read_group
        Array[String] required_fields = []
        Boolean restrictive = true
    }

    # The SAM spec allows any printable ASCII character in header fields.
    String sam_spec_pattern = "[\\ -~]+"
    # We have the opinion that is too permissive for ID and SM.
    # In restrictive mode, ID must not be exactly `id`, SM must not be `sample`
    # optionally followed by one character, and neither may contain a space.
    String id_pattern = "id"
    String sample_pattern = "sample.?"
    String restrictive_pattern = "\\ "  # Disallow spaces
    # Platform names accepted for the PL field.
    Array[String] platforms = [
        "CAPILLARY", "DNBSEQ", "ELEMENT", "HELICOS", "ILLUMINA", "IONTORRENT", "LS454",
        "ONT", "PACBIO", "SINGULAR", "SOLID", "ULTIMA",
    ]

    # Script layout: ID is validated first, then every optional field gets the
    # same two checks:
    #   1. if the field is listed in `required_fields`, it must be non-empty;
    #   2. if the field is defined, its value must match the field's pattern.
    # All failures are reported on stderr and accumulated in `error` so that a
    # single run reports every problem before exiting non-zero.
    command <<<
        error=0
        if [[ ~{restrictive} == "true" ]]
        then
            # The value is quoted so an ID containing spaces is still a single
            # word and can be reported cleanly instead of breaking the test.
            if [[ "~{read_group.ID}" =~ ^~{id_pattern}$ ]] \
            || [[ "~{read_group.ID}" =~ ~{restrictive_pattern} ]]
            then
                >&2 echo "ID (~{read_group.ID}) must not match patterns:"
                >&2 echo "'~{id_pattern}' or '~{restrictive_pattern}'"
                error=1
            fi
        fi
        if [[ ! "~{read_group.ID}" =~ ^~{sam_spec_pattern}$ ]]
        then
            >&2 echo "ID must match pattern ~{sam_spec_pattern}"
            error=1
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "SM") -eq 1 ]
        then
            if [ -z "~{read_group.SM}" ]
            then
                >&2 echo "SM is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.SM)}" == "true" ]
        then
            if [[ ~{restrictive} == "true" ]]
            then
                if [[ "~{read_group.SM}" =~ ^~{sample_pattern}$ ]] \
                || [[ "~{read_group.SM}" =~ ~{restrictive_pattern} ]]
                then
                    >&2 echo "SM must not match patterns:"
                    >&2 echo "'~{sample_pattern}' or '~{restrictive_pattern}'"
                    error=1
                fi
            fi
            if [[ ! "~{read_group.SM}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "SM must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "BC") -eq 1 ]
        then
            if [ -z "~{read_group.BC}" ]
            then
                >&2 echo "BC is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.BC)}" == "true" ]
        then
            if [[ ! "~{read_group.BC}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "BC must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "CN") -eq 1 ]
        then
            if [ -z "~{read_group.CN}" ]
            then
                >&2 echo "CN is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.CN)}" == "true" ]
        then
            if [[ ! "~{read_group.CN}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "CN must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "DS") -eq 1 ]
        then
            if [ -z "~{read_group.DS}" ]
            then
                >&2 echo "DS is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.DS)}" == "true" ]
        then
            if [[ ! "~{read_group.DS}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "DS must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "DT") -eq 1 ]
        then
            if [ -z "~{read_group.DT}" ]
            then
                >&2 echo "DT is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.DT)}" == "true" ]
        then
            if [[ ! "~{read_group.DT}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "DT must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "FO") -eq 1 ]
        then
            if [ -z "~{read_group.FO}" ]
            then
                >&2 echo "FO is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.FO)}" == "true" ]
        then
            # The alternation is grouped so that both anchors apply to both
            # branches; without the group any value merely starting with '*'
            # or merely ending in a valid base would be accepted.
            if [[ ! "~{read_group.FO}" =~ ^(\*|[ACMGRSVTWYHKDBN]+)$ ]]
            then
                >&2 echo "FO must match pattern \*|[ACMGRSVTWYHKDBN]+"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "KS") -eq 1 ]
        then
            if [ -z "~{read_group.KS}" ]
            then
                >&2 echo "KS is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.KS)}" == "true" ]
        then
            if [[ ! "~{read_group.KS}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "KS must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "LB") -eq 1 ]
        then
            if [ -z "~{read_group.LB}" ]
            then
                >&2 echo "LB is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.LB)}" == "true" ]
        then
            if [[ ! "~{read_group.LB}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "LB must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "PG") -eq 1 ]
        then
            if [ -z "~{read_group.PG}" ]
            then
                >&2 echo "PG is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.PG)}" == "true" ]
        then
            if [[ ! "~{read_group.PG}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "PG must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "PI") -eq 1 ]
        then
            if [ -z "~{read_group.PI}" ]
            then
                >&2 echo "PI is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.PI)}" == "true" ]
        then
            if [[ ! "~{read_group.PI}" =~ ^[0-9]+$ ]]
            then
                >&2 echo "PI must match pattern [0-9]+"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "PL") -eq 1 ]
        then
            if [ -z "~{read_group.PL}" ]
            then
                >&2 echo "PL is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.PL)}" == "true" ]
        then
            # Grouped for the same reason as FO: the anchors must bind to the
            # whole list of platforms, not just the first and last names.
            if [[ ! "~{read_group.PL}" =~ ^(~{sep("|", platforms)})$ ]]
            then
                >&2 echo "PL must match pattern ~{sep("|", platforms)}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "PM") -eq 1 ]
        then
            if [ -z "~{read_group.PM}" ]
            then
                >&2 echo "PM is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.PM)}" == "true" ]
        then
            if [[ ! "~{read_group.PM}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "PM must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi
        if [ $(echo "~{sep(" ", required_fields)}" | grep -Ewc "PU") -eq 1 ]
        then
            if [ -z "~{read_group.PU}" ]
            then
                >&2 echo "PU is required"
                error=1
            fi
        fi
        if [ "~{defined(read_group.PU)}" == "true" ]
        then
            if [[ ! "~{read_group.PU}" =~ ^~{sam_spec_pattern}$ ]]
            then
                >&2 echo "PU must match pattern ~{sam_spec_pattern}"
                error=1
            fi
        fi

        if [ $error -eq 1 ]
        then
            exit 1
        fi
    >>>

    output {
        String check = "passed"
    }

    runtime {
        memory: "4 GB"
        disks: "10 GB"
        container: "ghcr.io/stjudecloud/util:1.4.0"
        maxRetries: 0
    }
}