wordfreq-model 0.2.3

Model loaders for wordfreq-rs
Documentation
import argparse
import gzip

import msgpack
import zstandard as zstd


def cB_to_freq(cB: int) -> float:
    """
    Turn a centibel value into a plain proportion.

    Centibels are a base-10 logarithmic scale with 100 steps per factor of
    ten: 0 cB maps to 1.0, -100 cB maps to 0.1, -200 cB maps to 0.01, and in
    general x cB maps to 10 ** (x / 100).

    Raises ValueError when `cB` is greater than zero, because that would
    correspond to a proportion above 1.
    """
    is_positive = cB > 0
    if is_positive:
        raise ValueError("A frequency cannot be a positive number of centibels.")

    exponent = cB / 100
    return 10 ** exponent


def main():
    """
    Command-line entry point: re-encode a gzipped msgpack frequency list as
    zstd-compressed text.

    Takes two positional arguments, `input` and `output`. The input is read
    through gzip and decoded with msgpack; its first element must be the
    header {'format': 'cB', 'version': 1}, otherwise ValueError is raised.
    Each following element is treated as a bucket of words, where the bucket
    at position i (counting from 0 after the header) is assigned the
    frequency of -i centibels. One "word frequency" line per word is written
    to the output through a level-19 zstd stream.
    """
    cli = argparse.ArgumentParser()
    for positional in ("input", "output"):
        cli.add_argument(positional, type=str)
    opts = cli.parse_args()

    with gzip.open(opts.input, "rb") as packed:
        cbpack = msgpack.load(packed, raw=False)

    expected_header = {'format': 'cB', 'version': 1}
    header = cbpack[0]
    if header != expected_header:
        raise ValueError(f"Unexpected header: {header}")

    zstd_ctx = zstd.ZstdCompressor(level=19)
    with open(opts.output, "wb") as sink:
        writer = zstd_ctx.stream_writer(sink)
        for neg_cb, words in enumerate(cbpack[1:]):
            # Every word in a bucket shares one frequency, so format the
            # tail of the line once per bucket.
            line_tail = f" {cB_to_freq(-neg_cb)}\n"
            for word in words:
                writer.write(f"{word}{line_tail}".encode())
        # Finish the zstd frame so the output is a complete stream.
        writer.flush(zstd.FLUSH_FRAME)


# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()