tokengeex 1.1.0

TokenGeeX is an efficient tokenizer for code based on UnigramLM and TokenMonster.
Documentation
import os
import sys
import time

import tiktoken

num_threads = int(sys.argv[1]) if len(sys.argv) > 1 else 1

os.environ["TOKENIZERS_PARALLELISM"] = "1"
os.environ["TOKENGEEX_PARALLELISM"] = "1"

os.environ["RAYON_NUM_THREADS"] = str(num_threads)


def bytes_to_mb(bytes: int) -> float:
    return round(bytes / 1024 / 1024, 2)


def benchmark_batch(documents: list[str]) -> None:
    num_bytes = sum(map(len, map(str.encode, documents)))

    enc = tiktoken.get_encoding("gpt2")
    enc.encode("warmup")

    start = time.perf_counter_ns()
    enc.encode_ordinary_batch(documents, num_threads=num_threads)
    end = time.perf_counter_ns()
    print(
        f"TikToken     {bytes_to_mb(int(num_bytes / (end-start) * 1e9)):>5} MB/s {round((end - start) / 1e9, 2):>5}s ({'single thread' if num_threads < 2 else '{} threads'.format(num_threads)})"
    )

    from transformers import AutoTokenizer

    hf_enc = AutoTokenizer.from_pretrained("gpt2")

    hf_enc.model_max_length = 1e30  # type: ignore
    hf_enc.encode("warmup")
    start = time.perf_counter_ns()
    hf_enc(documents)
    end = time.perf_counter_ns()
    print(
        f"HuggingFace  {bytes_to_mb(int(num_bytes / (end-start) * 1e9)):>5} MB/s {round((end - start) / 1e9, 2):>5}s ({'single thread' if num_threads < 2 else '{} threads'.format(num_threads)})"
    )

    import tokengeex

    tokenizer = tokengeex.load("./data/unigram-65k.json")

    start = time.perf_counter_ns()
    tokenizer.encode_batch(documents)
    end = time.perf_counter_ns()
    print(
        f"TokenGeex    {bytes_to_mb(int(num_bytes / (end-start) * 1e9)):>5} MB/s {round((end - start) / 1e9, 2):>5}s ({'single thread' if num_threads < 2 else '{} threads'.format(num_threads)})"
    )


samples = open("./data/train.bin", "rb").read().split(b"\0")

samples = samples * 10

benchmark_batch(list(map(bytes.decode, samples)))