import pytest
from splintr import Tokenizer
class TestMistralV2ExactTokens:
    """Exact token-ID mappings and lossless roundtrips for mistral_v2."""

    @pytest.fixture
    def tokenizer(self):
        return Tokenizer.from_pretrained("mistral_v2")

    def test_hello_world_tokens(self, tokenizer):
        # A short two-word phrase should survive a roundtrip and need >= 2 ids.
        ids = tokenizer.encode("Hello world")
        assert tokenizer.decode(ids) == "Hello world"
        assert len(ids) >= 2

    def test_control_tokens_exact(self, tokenizer):
        # Each control marker must map to exactly one reserved token ID.
        expected = {
            "[INST]": [3],
            "[/INST]": [4],
            "[TOOL_CALLS]": [5],
            "[AVAILABLE_TOOLS]": [6],
        }
        for marker, ids in expected.items():
            assert tokenizer.encode_with_special(marker) == ids

    def test_space_preservation(self, tokenizer):
        # Leading whitespace must not be stripped by encode/decode.
        restored = tokenizer.decode(tokenizer.encode(" world!"))
        assert restored == " world!", f"Space not preserved: got {restored!r}"

    def test_chinese_tokens(self, tokenizer):
        sample = "你好世界"
        restored = tokenizer.decode(tokenizer.encode(sample))
        assert restored == sample, f"Chinese roundtrip failed: {restored!r}"

    def test_emoji_tokens(self, tokenizer):
        sample = "Hello 🌍 World!"
        restored = tokenizer.decode(tokenizer.encode(sample))
        assert restored == sample, f"Emoji roundtrip failed: {restored!r}"
class TestMistralV2ControlTokens:
    """Encoding and decoding of [INST]/[TOOL_CALLS]-style control tokens."""

    @pytest.fixture
    def tokenizer(self):
        return Tokenizer.from_pretrained("mistral_v2")

    def test_instruction_format(self, tokenizer):
        text = "[INST]Hello, how are you?[/INST]I'm doing great!"
        ids = tokenizer.encode_with_special(text)
        # Reserved instruction markers must appear as their single IDs.
        assert 3 in ids, "[INST] not found"
        assert 4 in ids, "[/INST] not found"
        assert tokenizer.decode(ids) == text

    def test_tool_calling_format(self, tokenizer):
        text = "[AVAILABLE_TOOLS]get_weather[/AVAILABLE_TOOLS][TOOL_CALLS]get_weather()"
        ids = tokenizer.encode_with_special(text)
        assert 5 in ids, "[TOOL_CALLS] not found"
        assert 6 in ids, "[AVAILABLE_TOOLS] not found"
        rendered = tokenizer.decode(ids)
        assert "[AVAILABLE_TOOLS]" in rendered
        assert "[TOOL_CALLS]" in rendered

    def test_decode_control_tokens(self, tokenizer):
        # Decoding a lone control ID yields its literal marker text.
        for token_id, literal in [
            (3, "[INST]"),
            (4, "[/INST]"),
            (5, "[TOOL_CALLS]"),
            (6, "[AVAILABLE_TOOLS]"),
        ]:
            assert tokenizer.decode([token_id]) == literal

    def test_mixed_control_and_text(self, tokenizer):
        text = "[INST]Write a poem about Rust[/INST]Rust is fast and safe..."
        assert tokenizer.decode(tokenizer.encode_with_special(text)) == text
class TestMistralV2Roundtrip:
    """encode/decode must be lossless across scripts, symbols and code."""

    @pytest.fixture
    def tokenizer(self):
        return Tokenizer.from_pretrained("mistral_v2")

    def test_encode_decode_roundtrip(self, tokenizer):
        samples = (
            "Hello, world!",
            "The quick brown fox jumps over the lazy dog.",
            "Rust is a systems programming language.",
            "1234567890",
            "Special characters: !@#$%^&*()",
            "Unicode: こんにちは 世界 🦀",
            "Mixed: Hello 你好 🌍 World!",
        )
        for sample in samples:
            restored = tokenizer.decode(tokenizer.encode(sample))
            assert restored == sample, f"Roundtrip failed for: {sample!r}"

    def test_multiline_roundtrip(self, tokenizer):
        text = "Multi-line\ntext\nwith\nnewlines"
        restored = tokenizer.decode(tokenizer.encode(text))
        assert restored == text, f"Roundtrip failed for: {text!r}"

    def test_code_content(self, tokenizer):
        # NOTE(review): the literal's internal indentation was lost in the
        # source mangle; reconstructed with conventional 4-space indent.
        code = '''def hello_world():
    print("Hello, World!")
if __name__ == "__main__":
    hello_world()
'''
        assert tokenizer.decode(tokenizer.encode(code)) == code
class TestMistralV2SpecialTokens:
    """Reserved BOS/EOS IDs and the extended agent-token IDs."""

    @pytest.fixture
    def tokenizer(self):
        return Tokenizer.from_pretrained("mistral_v2")

    def test_bos_eos_tokens(self, tokenizer):
        bos = tokenizer.encode_with_special("<s>")
        assert bos == [1], f"<s> should be token 1, got {bos}"
        eos = tokenizer.encode_with_special("</s>")
        assert eos == [2], f"</s> should be token 2, got {eos}"

    def test_agent_tokens(self, tokenizer):
        think = tokenizer.encode_with_special("<|think|>")
        assert think == [32773], f"<|think|> should be [32773], got {think}"
        func = tokenizer.encode_with_special("<|function|>")
        assert func == [32783], f"<|function|> should be [32783], got {func}"

    def test_decode_agent_tokens(self, tokenizer):
        assert tokenizer.decode([32773]) == "<|think|>"
        assert tokenizer.decode([32783]) == "<|function|>"
class TestMistralV2VocabSize:
    """Vocabulary sizes and pretrained-name validation."""

    def test_vocab_size(self):
        assert Tokenizer.from_pretrained("mistral_v2").vocab_size == 32822

    def test_v2_larger_than_v1(self):
        v1_size = Tokenizer.from_pretrained("mistral_v1").vocab_size
        v2_size = Tokenizer.from_pretrained("mistral_v2").vocab_size
        assert v1_size == 32054
        assert v2_size == 32822
        assert v2_size > v1_size

    def test_hyphenated_names_rejected(self):
        # Only underscore-separated pretrained names are accepted.
        for bad_name in ("mistral-v2", "codestral"):
            with pytest.raises(ValueError):
                Tokenizer.from_pretrained(bad_name)
class TestMistralV2VsV1:
    """Behavioural differences between the v1 and v2 vocabularies."""

    def test_v2_has_control_tokens(self):
        # v1 lacks the reserved control tokens, so [INST] splits into
        # multiple ordinary text tokens there; v2 maps it to ID 3.
        v1 = Tokenizer.from_pretrained("mistral_v1")
        v2 = Tokenizer.from_pretrained("mistral_v2")
        assert len(v1.encode_with_special("[INST]")) > 1, "V1 should tokenize [INST] as text"
        assert v2.encode_with_special("[INST]") == [3], "V2 should have [INST] as token 3"

    def test_different_vocabularies(self):
        sample = "This is a test message"
        v1_ids = Tokenizer.from_pretrained("mistral_v1").encode(sample)
        v2_ids = Tokenizer.from_pretrained("mistral_v2").encode(sample)
        assert v1_ids != v2_ids, "V1 and V2 should encode differently"
class TestMistralV2Batch:
    """Batch encoding must agree with per-string encoding."""

    @pytest.fixture
    def tokenizer(self):
        return Tokenizer.from_pretrained("mistral_v2")

    def test_batch_matches_individual(self, tokenizer):
        texts = [
            "Hello, world!",
            "How are you?",
            "I'm doing great!",
            "Unicode: 你好 🌍",
        ]
        batched = tokenizer.encode_batch(texts)
        assert len(batched) == len(texts)
        # Every batch entry must be identical to a standalone encode.
        for i, (ids, text) in enumerate(zip(batched, texts)):
            assert ids == tokenizer.encode(text), (
                f"Batch mismatch for text {i}: {text!r}"
            )

    def test_empty_input(self, tokenizer):
        # Empty string and empty ID list are each other's images.
        assert tokenizer.encode("") == []
        assert tokenizer.decode([]) == ""
class TestMistralV2Utf8Boundaries:
    """Multi-byte punctuation (em dashes, curly quotes) must not split
    UTF-8 sequences, on either regex backend."""

    @pytest.fixture
    def tokenizer(self):
        return Tokenizer.from_pretrained("mistral_v2")

    @pytest.fixture
    def tokenizer_pcre2(self):
        return Tokenizer.from_pretrained("mistral_v2").pcre2(True)

    def test_em_dash(self, tokenizer):
        text = "I'm sorry you're hurting—breakups suck, but you'll get through it."
        assert tokenizer.decode(tokenizer.encode(text)) == text

    def test_curly_quotes(self, tokenizer):
        text = 'He said, \u2018Hello\u2019 and she replied, \u201cGoodbye\u201d.'
        assert tokenizer.decode(tokenizer.encode(text)) == text

    def test_em_dash_at_boundaries(self, tokenizer):
        # Em dash at every position: interior, edges, repeated.
        samples = (
            "word—word",
            "a—b",
            "test—",
            "—start",
            "one—two—three",
            "Check your brake pads—they might be worn out.",
        )
        for sample in samples:
            restored = tokenizer.decode(tokenizer.encode(sample))
            assert restored == sample, f"Failed for: {sample!r}"

    def test_backend_consistency_multibyte(self, tokenizer, tokenizer_pcre2):
        # Both regex backends must produce identical token streams.
        samples = (
            "word—word",
            "I'm sorry you're hurting—breakups suck.",
            'He said, \u2018Hello\u2019 and she replied, \u201cGoodbye\u201d.',
            "Check credentials—API key—in headers.",
        )
        for sample in samples:
            assert tokenizer.encode(sample) == tokenizer_pcre2.encode(sample), (
                f"Backend mismatch for: {sample!r}"
            )
class TestMistralV2LargeScaleBatch:
    """Batch encoding over a large, varied workload stays correct."""

    @pytest.fixture
    def tokenizer(self):
        return Tokenizer.from_pretrained("mistral_v2")

    def test_large_batch_parallel(self, tokenizer):
        base_texts = [
            "I'm sorry you're hurting—breakups suck, but you'll get through it.",
            "Check if you're using valid credentials—API key, token—in headers.",
            "你好世界!这是一个测试。",
            "Hello 🌍 World! 🦀 Rust is great!",
            "Mixed: Hello 你好 🌍 —test— World!",
            "Code: def foo(): return 42",
            "A 403 Forbidden error means permission denied.",
        ]
        texts = base_texts * 100
        all_tokens = tokenizer.encode_batch(texts)
        assert len(all_tokens) == len(texts)
        # Spot-check roundtrips at a stride to keep the test fast.
        for i in range(0, len(texts), 50):
            assert tokenizer.decode(all_tokens[i]) == texts[i], f"Failed roundtrip for text {i}"
class TestMistralV2BackendOptions:
    """Regex backend and JIT toggles must not change tokenization."""

    def test_default_backend(self):
        tok = Tokenizer.from_pretrained("mistral_v2")
        ids = tok.encode("Hello, world!")
        assert tok.decode(ids) == "Hello, world!"

    def test_pcre2_backend(self):
        tok = Tokenizer.from_pretrained("mistral_v2").pcre2(True)
        ids = tok.encode("Hello, world!")
        assert tok.decode(ids) == "Hello, world!"

    def test_jit_disabled(self):
        tok = Tokenizer.from_pretrained("mistral_v2").jit(False)
        ids = tok.encode("Hello, world!")
        assert tok.decode(ids) == "Hello, world!"

    def test_backend_consistency(self):
        # All backend configurations must agree on a mixed-script input.
        text = "The quick brown fox 你好 🦀 jumps—over—the lazy dog."
        baseline = Tokenizer.from_pretrained("mistral_v2").encode(text)
        pcre2_ids = Tokenizer.from_pretrained("mistral_v2").pcre2(True).encode(text)
        no_jit_ids = Tokenizer.from_pretrained("mistral_v2").jit(False).encode(text)
        assert pcre2_ids == baseline, "PCRE2 should match default"
        assert no_jit_ids == baseline, "Non-JIT should match default"