---
# File-level metadata for the BPE tokenization contract spec.
metadata:
  version: "1.0.0"
  created: "2026-03-21"
  author: "PAIML Engineering"
  description: "BPE tokenization encode/decode contracts"
  references:
    - "Sennrich et al. (2016) Neural Machine Translation of Rare Words with Subword Units"
# Formal equations governing BPE training and the encode/decode pair.
equations:
  # Training: apply the k most-frequent byte-pair merges to the corpus.
  bpe_merge:
    formula: "merge(corpus, k) applies k most-frequent byte-pair merges"
    domain: "corpus: list of strings, k <= vocab_size - base_vocab_size"
    preconditions:
      - "!corpus.is_empty()"
      - "k > 0"
      - "k <= vocab_size - base_vocab_size"
    postconditions:
      - "vocab.len() == base_vocab_size + k"
      - "total_tokens_after <= total_tokens_before"
    invariants:
      - "Each merge reduces total token count in corpus"
      - "Vocabulary size increases by 1 per merge"
  # Round-trip: decoding an encoding recovers the (normalized) input text.
  encode_decode_identity:
    formula: "decode(encode(text)) == normalize(text)"
    domain: "text: valid UTF-8 string"
    preconditions:
      - "!text.is_empty()"
      - "tokenizer.is_trained()"
    postconditions:
      - "!ret.is_empty()"
      - "ret.iter().all(|id| *id < vocab_size)"
    invariants:
      - "Encoding is deterministic for same input"
      - "Token IDs are valid indices into vocabulary"
# Proof obligations derived from the equations above.
proof_obligations:
  # Round-trip is stated against normalize(text) so it agrees with the
  # encode_decode_identity formula (decode(encode(text)) == normalize(text)),
  # rather than the stricter == text, which normalization may not satisfy.
  - type: equivalence
    property: "Round-trip identity"
    formal: "decode(encode(text)) == normalize(text) for trained tokenizer"
    applies_to: all
  - type: invariant
    property: "Token IDs in range"
    formal: "0 <= id < vocab_size for all ids in encode output"
    applies_to: all
  # Merges only ever combine tokens, so output never exceeds input length.
  - type: bound
    property: "Output length bounded"
    formal: "len(encode(text)) <= len(text) (each char produces at most 1 token)"
    applies_to: all
# Concrete tests that would falsify the contracts if they ever fail.
falsification_tests:
  - id: FALSIFY-TOK-001
    rule: "BPE encode-decode round-trip"
    prediction: "Decoded output matches original text after training"
    test: "tokenizer::bpe::tests::test_bpe_encode_decode"
    if_fails: "Merge rules or vocabulary mapping inconsistent"
  - id: FALSIFY-TOK-002
    rule: "BPE training produces valid vocabulary"
    prediction: "After training, vocab_size > 0 and is_trained == true"
    test: "tokenizer::bpe::tests::test_bpe_train"
    if_fails: "Training loop not populating vocabulary"
  - id: FALSIFY-TOK-003
    rule: "Untrained tokenizer rejects encode"
    prediction: "Encoding before training returns error"
    test: "tokenizer::bpe::tests::test_bpe_encode_not_trained"
    if_fails: "Missing is_trained guard in encode path"
  - id: FALSIFY-TOK-004
    rule: "Token-to-ID and ID-to-token bijection"
    prediction: "id_to_token(token_to_id(t)) == t for all tokens in vocabulary"
    test: "tokenizer::bpe::tests::test_bpe_id_to_token"
    if_fails: "Forward and reverse maps out of sync"
# Bounded model-checking harnesses (Kani) backing the proof obligations.
kani_harnesses:
  - id: tok-kani-001
    obligation: "Token IDs in valid range"
    # Unwind/verification bound for the bounded-int strategy.
    bound: 4
    strategy: bounded_int