fst 0.1.23

Use finite state transducers to compactly represents sets or maps of many strings (> 1 billion is possible).
Documentation
#!/usr/bin/env python

from __future__ import absolute_import, division, print_function
import codecs
from operator import itemgetter
import sys


if __name__ == '__main__':
    # Get frequency counts of each byte.
    freqs = [0] * 256  # byte |--> frequency
    for fpath in sys.argv[1:]:
        with codecs.open(fpath, 'r', 'utf-8') as fin:
            for line in fin:
                for byte in line.strip().encode('utf-8'):
                    freqs[byte] += 1

    # Create the inverse mapping.
    orders = [0] * 256  # byte |--> sort index, descending
    sort_by_freq = sorted(zip(range(256), freqs),
                          key=itemgetter(1), reverse=True)
    for sort_idx, byte in enumerate(map(itemgetter(0), sort_by_freq)):
        orders[byte] = sort_idx

    # Now write Rust.
    olines = ['pub const COMMON_INPUTS: [u8; 256] = [']
    for byte in range(256):
        olines.append('    %3d, // %r' % (orders[byte], chr(byte)))
    olines.append('];')
    olines.append('')
    olines.append('pub const COMMON_INPUTS_INV: [u8; 256] = [')
    for sort_idx in range(256):
        byte = orders.index(sort_idx)
        if byte <= 127:
            olines.append('    b%r,' % chr(byte))
        else:
            olines.append("    b'\\x%x'," % byte)
    olines.append('];')
    print('\n'.join(olines))