# splintr 0.9.1
#
# Fast Rust tokenizer (BPE + SentencePiece + WordPiece) with Python bindings.
# Documentation
#
# Reusable test workflow: lint, check, and test.
#
# Called by:
#   - ci.yml      (PR checks)
#   - release.yml (pre-publish gate)

# Display name of this workflow.
name: Test

# The only trigger is `workflow_call`: this file runs when another workflow
# invokes it. No inputs or secrets are declared.
on:
  workflow_call:

# Workflow-level token permissions: read-only access to repository contents.
permissions:
  contents: read

# Workflow-level environment: ask Cargo to always emit colored output.
env:
  CARGO_TERM_COLOR: always

jobs:
  # Static checks on a single Ubuntu runner: rustfmt formatting and clippy lints.
  lint:
    name: Lint & Format
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5

      # Stable toolchain plus the two components invoked by the steps below.
      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable
        with:
          components: rustfmt, clippy

      # PCRE2 development package from apt; presumably required to compile a
      # PCRE2-backed dependency during clippy — confirm against Cargo.toml.
      - name: Install PCRE2
        run: sudo apt-get update && sudo apt-get install -y libpcre2-dev

      # Cargo cache, namespaced with a job-specific key prefix.
      - uses: Swatinem/rust-cache@v2
        with:
          prefix-key: lint

      # `--check` reports formatting differences without rewriting files.
      - name: Check formatting
        run: cargo fmt --all --check

      # `-D warnings` turns every warning into an error, so any lint fails the job.
      - name: Run clippy
        run: cargo clippy --all-targets -- -D warnings

  # Cross-platform test job: runs `cargo test` on Linux, macOS, and Windows.
  # Each OS has its own PCRE2 install step and exports PYO3_PYTHON (the Python
  # found on PATH after the setup-python step) through GITHUB_ENV so that the
  # later `cargo test` step sees it.
  test:
    name: Test (${{ matrix.target }})
    runs-on: ${{ matrix.runs-on }}
    strategy:
      # Keep the other platforms running when one of them fails.
      fail-fast: false
      matrix:
        # `target` only labels the job and namespaces the cache key; no step
        # in this job passes it to cargo.
        include:
          - runs-on: ubuntu-latest
            target: x86_64-unknown-linux-gnu
          - runs-on: macos-latest
            target: aarch64-apple-darwin
          - runs-on: windows-latest
            target: x86_64-pc-windows-msvc
    steps:
      - uses: actions/checkout@v5

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      # One cache namespace per matrix entry.
      - uses: Swatinem/rust-cache@v2
        with:
          prefix-key: test-${{ matrix.target }}

      - name: Install PCRE2 (Ubuntu)
        if: runner.os == 'Linux'
        run: sudo apt-get update && sudo apt-get install -y libpcre2-dev

      # "$GITHUB_ENV" is quoted so the redirection target is never subject to
      # word splitting or globbing (shellcheck SC2086).
      - name: Configure Python for PyO3 (Ubuntu)
        if: runner.os == 'Linux'
        run: echo "PYO3_PYTHON=$(which python3)" >> "$GITHUB_ENV"

      - name: Install PCRE2 (macOS)
        if: runner.os == 'macOS'
        run: brew install pcre2

      # Besides PYO3_PYTHON, this points LIBRARY_PATH and DYLD_LIBRARY_PATH at
      # the `lib` directory under the interpreter's sys.prefix and passes
      # `-undefined dynamic_lookup` to the linker via CARGO_BUILD_RUSTFLAGS.
      # NOTE(review): presumably this lets test binaries link without
      # resolving Python symbols at link time — confirm against the PyO3 docs.
      - name: Configure Python for PyO3 (macOS)
        if: runner.os == 'macOS'
        run: |
          echo "PYO3_PYTHON=$(which python3)" >> "$GITHUB_ENV"
          PYTHON_PREFIX=$(python3 -c "import sys; print(sys.prefix)")
          echo "LIBRARY_PATH=${PYTHON_PREFIX}/lib" >> "$GITHUB_ENV"
          echo "DYLD_LIBRARY_PATH=${PYTHON_PREFIX}/lib" >> "$GITHUB_ENV"
          echo "CARGO_BUILD_RUSTFLAGS=-C link-arg=-undefined -C link-arg=dynamic_lookup" >> "$GITHUB_ENV"

      # Installs PCRE2 through vcpkg and exports PCRE2_SYS_STATIC=1.
      # NOTE(review): presumably read by the pcre2-sys build script to request
      # static linking — confirm.
      - name: Install PCRE2 (Windows)
        if: runner.os == 'Windows'
        run: |
          vcpkg install pcre2:x64-windows
          echo "PCRE2_SYS_STATIC=1" >> $env:GITHUB_ENV

      # PowerShell equivalent of the `which python3` lookup used on Unix.
      - name: Configure Python for PyO3 (Windows)
        if: runner.os == 'Windows'
        run: |
          $pythonPath = (Get-Command python).Source
          echo "PYO3_PYTHON=$pythonPath" >> $env:GITHUB_ENV

      - name: Run tests
        run: cargo test

  # Builds the Python extension with maturin inside a virtualenv and runs a
  # smoke test comparing splintr's encodings with tiktoken's, then exercises
  # the streaming decoder.
  python:
    name: Python bindings
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      # Use the same stable toolchain as the lint and test jobs instead of
      # whatever Rust happens to be preinstalled on the runner image. Placed
      # before rust-cache so the cache sees the toolchain that will be used.
      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      - name: Install PCRE2
        run: sudo apt-get update && sudo apt-get install -y libpcre2-dev

      # Cargo cache, namespaced with a job-specific key prefix.
      - uses: Swatinem/rust-cache@v2
        with:
          prefix-key: python

      # Creates .venv, installs maturin and tiktoken into it, then builds and
      # installs the extension in release mode with `maturin develop`.
      - name: Install dependencies and build
        run: |
          python -m venv .venv
          . .venv/bin/activate
          python -m pip install --upgrade pip
          pip install maturin tiktoken
          maturin develop --release

      # Runs an inline script with the virtualenv's interpreter; any failed
      # assertion makes the interpreter exit non-zero and fails the step.
      - name: Test Python bindings
        run: |
          .venv/bin/python -c "
          import splintr
          import tiktoken

          # Test cl100k_base
          tok = splintr.Tokenizer.from_pretrained('cl100k_base')
          tik = tiktoken.get_encoding('cl100k_base')

          text = 'Hello, world!'
          assert tok.encode(text) == list(tik.encode(text)), 'cl100k_base mismatch'

          # Test o200k_base
          tok2 = splintr.Tokenizer.from_pretrained('o200k_base')
          tik2 = tiktoken.get_encoding('o200k_base')
          assert tok2.encode(text) == list(tik2.encode(text)), 'o200k_base mismatch'

          # Test streaming decoder
          decoder = tok.streaming_decoder()
          tokens = tok.encode('Hello')
          result = []
          for t in tokens:
              chunk = decoder.add_token(t)
              if chunk:
                  result.append(chunk)
          result.append(decoder.flush())
          assert ''.join(result) == 'Hello', 'Streaming decoder failed'

          print('All Python tests passed!')
          "