bstr 0.2.6

A string type that is not required to be valid UTF-8.
Documentation
#!/bin/sh

# Stop at the first command that exits with a non-zero status.
set -o errexit
# Directory holding this script; the regex helper scripts are resolved
# relative to it.
D=$(dirname "$0")

# Dependency guard: returns successfully when the command named by $1 can be
# resolved by the shell; otherwise reports the missing dependency on stderr
# and terminates the script with status 1.
requires() {
    cmd="$1"
    if command -v "$cmd" > /dev/null 2>&1; then
        return 0
    fi

    echo "DEPENDENCY MISSING: $cmd must be installed" >&2
    exit 1
}

# Membership test: succeeds (status 0) when the first argument is equal to
# any of the remaining arguments, and fails (status 1) otherwise.
array_exists() {
    needle="$1"
    shift

    # Walk the remaining arguments one at a time, consuming each as we go.
    while [ $# -gt 0 ]; do
        el="$1"
        [ "$el" != "$needle" ] || return 0
        shift
    done
    return 1
}

# Generate the forward and reverse grapheme DFAs. Both are compiled from the
# regex printed by regex/grapheme.sh (located next to this script) and both
# invocations are given src/unicode/fsm/ as their directory argument.
graphemes() {
    regex=$(sh "$D/regex/grapheme.sh")

    # Trailing arguments shared by both invocations. The positional
    # parameters are scoped to this function call.
    set -- --sparse --minimize --anchored --state-size 2 \
        src/unicode/fsm/ "$regex"

    echo "generating forward grapheme DFA"
    ucd-generate dfa --name GRAPHEME_BREAK_FWD "$@"

    echo "generating reverse grapheme DFA"
    ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest "$@"
}

# Generate the forward word DFA from the regex printed by regex/word.sh
# (located next to this script), passing src/unicode/fsm/ as the directory
# argument.
words() {
    # The regex is produced before anything is printed.
    regex=$(sh "$D/regex/word.sh")
    set -- dfa --name WORD_BREAK_FWD \
        --sparse --minimize --anchored --state-size 4 \
        src/unicode/fsm/ "$regex"

    echo "generating forward word DFA (this can take a while)"
    ucd-generate "$@"
}

# Generate the forward sentence DFA from the regex printed by
# regex/sentence.sh (located next to this script), passing src/unicode/fsm/
# as the directory argument.
sentences() {
    # The regex is produced before anything is printed.
    regex=$(sh "$D/regex/sentence.sh")
    set -- dfa --name SENTENCE_BREAK_FWD \
        --minimize --sparse --anchored --state-size 4 \
        src/unicode/fsm/ "$regex"

    echo "generating forward sentence DFA (this can take a while)"
    ucd-generate "$@"
}

# Generate the reverse DFA that finds all occurrences of regional indicators.
# It exists so that regional indicators can be handled as a special case by
# the reverse grapheme iterator and the reverse word iterator.
regional_indicator() {
    set -- dfa --name REGIONAL_INDICATOR_REV --reverse \
        --classes --minimize --anchored --premultiply --state-size 1 \
        src/unicode/fsm/ "\p{gcb=Regional_Indicator}"

    echo "generating regional indicator DFA"
    ucd-generate "$@"
}

# Generate the forward "simple word" DFA, compiled from the regex \w, with
# src/unicode/fsm/ as the directory argument.
simple_word() {
    set -- dfa --name SIMPLE_WORD_FWD \
        --sparse --minimize --state-size 2 \
        src/unicode/fsm/ "\w"

    echo "generating forward simple word DFA"
    ucd-generate "$@"
}

# Generate the forward and reverse anchored whitespace DFAs, both compiled
# from the regex \s+ with src/unicode/fsm/ as the directory argument.
whitespace() {
    # Trailing arguments shared by both invocations. The positional
    # parameters are scoped to this function call.
    set -- --state-size 1 src/unicode/fsm/ "\s+"

    echo "generating forward whitespace DFA"
    # Only the forward DFA is generated with --premultiply.
    ucd-generate dfa --name WHITESPACE_ANCHORED_FWD \
        --anchored --classes --premultiply --minimize "$@"

    echo "generating reverse whitespace DFA"
    ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse \
        --anchored --classes --minimize "$@"
}

# Entry point: interpret the command-line arguments and run the requested
# DFA generators.
#
# Arguments:
#   -h, --help        print a usage line to stderr and exit
#   --list-commands   print the known command names, one per line, and exit
#   <command> ...     names from the $commands list below, or "all"; with no
#                     arguments, or when "all" is present, every command runs
#
# Returns:
#   0 when every requested command was recognized; 1 when at least one
#   argument was not a known command. Recognized commands are still run in
#   that case, but the failure is no longer hidden behind a zero status.
main() {
    if array_exists "-h" "$@" || array_exists "--help" "$@"; then
        echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2
        exit
    fi

    # User-facing command names. A hyphenated name maps to the function of
    # the same name with underscores.
    commands="
        graphemes
        sentences
        words
        regional-indicator
        simple-word
        whitespace
    "
    if array_exists "--list-commands" "$@"; then
        # Unquoted on purpose: split the list on whitespace.
        for cmd in $commands; do
            echo "$cmd"
        done
        exit
    fi

    # ucd-generate is used to compile regexes into DFAs.
    requires ucd-generate

    mkdir -p src/unicode/fsm/

    cmds=$*
    if [ $# -eq 0 ] || array_exists "all" "$@"; then
        cmds=$commands
    fi

    status=0
    for cmd in $cmds; do
        # Unquoted $commands on purpose: each name becomes its own argument.
        if array_exists "$cmd" $commands; then
            fun="$(echo "$cmd" | sed 's/-/_/g')"
            # $cmd was validated against the fixed list above, so the
            # function can be invoked directly; no eval is needed.
            "$fun"
        else
            echo "unrecognized command: $cmd" >&2
            status=1
        fi
    done
    return "$status"
}

# Run the entry point, forwarding every command-line argument unchanged.
main "$@"