"""Compare the rust_transformers tokenizers against their reference Python
implementations from the transformers library, using the sentences of the
SST-2 training set as input.

Requires the SST2_PATH environment variable to point at a local copy of the
GLUE SST-2 data. All tests are marked slow.
"""
import os
import tempfile
from pathlib import Path

import pytest
from transformers.data.processors.glue import Sst2Processor
from transformers.file_utils import get_from_cache
from transformers.tokenization_bert import BertTokenizer
from transformers.tokenization_ctrl import CTRLTokenizer
from transformers.tokenization_distilbert import DistilBertTokenizer
from transformers.tokenization_gpt2 import GPT2Tokenizer
from transformers.tokenization_openai import OpenAIGPTTokenizer
from transformers.tokenization_roberta import RobertaTokenizer

from rust_transformers import PyBertTokenizer, PyCtrlTokenizer, PyGpt2Tokenizer, PyOpenAiGptTokenizer, \
    PyRobertaTokenizer


@pytest.mark.slow
class TestTokenizationSST2:
    def setup_class(self):
        # Load the SST-2 training sentences once for the whole class;
        # SST2_PATH must point at a local copy of the GLUE SST-2 data.
        self.processor = Sst2Processor()
        self.examples = self.processor.get_train_examples(os.environ["SST2_PATH"])
        # Temporary cache directory for the downloaded pretrained files.
        self.test_dir = Path(tempfile.mkdtemp())
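
    # Hedged addition, not in the original suite: tempfile.mkdtemp() does not
    # clean up after itself, so remove the cache directory once the class has
    # run. This assumes nothing else reuses the downloaded files afterwards.
    def teardown_class(self):
        import shutil
        shutil.rmtree(self.test_dir, ignore_errors=True)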

    def test_tokenization_bert(self):
        base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                                       cache_dir=self.test_dir)
        rust_tokenizer = PyBertTokenizer(
            get_from_cache(base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']))
        # Baseline: encode every sentence with the reference Python tokenizer.
        output_baseline = []
        for example in self.examples:
            output_baseline.append(base_tokenizer.encode_plus(example.text_a,
                                                              add_special_tokens=True,
                                                              return_overflowing_tokens=True,
                                                              return_special_tokens_mask=True,
                                                              max_length=128))
        output_rust = rust_tokenizer.encode_list([example.text_a for example in self.examples],
                                                 max_len=128,
                                                 truncation_strategy='longest_first',
                                                 stride=0)
        # The Rust tokenizer must reproduce the baseline field for field.
        for rust, baseline in zip(output_rust, output_baseline):
            assert rust.token_ids == baseline['input_ids']
            assert rust.segment_ids == baseline['token_type_ids']
            assert rust.special_tokens_mask == baseline['special_tokens_mask']

    def test_tokenization_distilbert(self):
        # DistilBERT uses the same WordPiece format as BERT, so the Rust BERT
        # tokenizer is reused with the distilbert vocabulary file.
        base_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True,
                                                             cache_dir=self.test_dir)
        rust_tokenizer = PyBertTokenizer(
            get_from_cache(base_tokenizer.pretrained_vocab_files_map['vocab_file']['distilbert-base-uncased']))
        output_baseline = []
        for example in self.examples:
            output_baseline.append(base_tokenizer.encode_plus(example.text_a,
                                                              add_special_tokens=True,
                                                              return_overflowing_tokens=True,
                                                              return_special_tokens_mask=True,
                                                              max_length=128))
        output_rust = rust_tokenizer.encode_list([example.text_a for example in self.examples],
                                                 max_len=128,
                                                 truncation_strategy='longest_first',
                                                 stride=0)
        for rust, baseline in zip(output_rust, output_baseline):
            assert rust.token_ids == baseline['input_ids']
            assert rust.segment_ids == baseline['token_type_ids']
            assert rust.special_tokens_mask == baseline['special_tokens_mask']

    def test_tokenization_ctrl(self):
        # CTRL uses a case-sensitive BPE tokenizer (it takes no do_lower_case
        # option) and needs both a vocabulary and a merges file.
        base_tokenizer = CTRLTokenizer.from_pretrained('ctrl', cache_dir=self.test_dir)
        rust_tokenizer = PyCtrlTokenizer(
            get_from_cache(base_tokenizer.pretrained_vocab_files_map['vocab_file']['ctrl']),
            get_from_cache(base_tokenizer.pretrained_vocab_files_map['merges_file']['ctrl'])
        )
        output_baseline = []
        for example in self.examples:
            output_baseline.append(base_tokenizer.encode_plus(example.text_a,
                                                              add_special_tokens=True,
                                                              return_overflowing_tokens=True,
                                                              return_special_tokens_mask=True,
                                                              max_length=128))
        output_rust = rust_tokenizer.encode_list([example.text_a for example in self.examples],
                                                 max_len=128,
                                                 truncation_strategy='longest_first',
                                                 stride=0)
        for rust, baseline in zip(output_rust, output_baseline):
            assert rust.token_ids == baseline['input_ids']
            assert rust.segment_ids == baseline['token_type_ids']
            assert rust.special_tokens_mask == baseline['special_tokens_mask']

    def test_tokenization_gpt2(self):
        # GPT-2 uses a case-sensitive byte-level BPE; do_lower_case is not a
        # supported option.
        base_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=self.test_dir)
        rust_tokenizer = PyGpt2Tokenizer(
            get_from_cache(base_tokenizer.pretrained_vocab_files_map['vocab_file']['gpt2']),
            get_from_cache(base_tokenizer.pretrained_vocab_files_map['merges_file']['gpt2'])
        )
        output_baseline = []
        for example in self.examples:
            output_baseline.append(base_tokenizer.encode_plus(example.text_a,
                                                              add_special_tokens=True,
                                                              return_overflowing_tokens=True,
                                                              return_special_tokens_mask=True,
                                                              max_length=128))
        output_rust = rust_tokenizer.encode_list([example.text_a for example in self.examples],
                                                 max_len=128,
                                                 truncation_strategy='longest_first',
                                                 stride=0)
        for rust, baseline in zip(output_rust, output_baseline):
            assert rust.token_ids == baseline['input_ids']
            assert rust.segment_ids == baseline['token_type_ids']
            assert rust.special_tokens_mask == baseline['special_tokens_mask']

    def test_tokenization_roberta(self):
        # RoBERTa shares GPT-2's case-sensitive byte-level BPE; do_lower_case
        # is not a supported option.
        base_tokenizer = RobertaTokenizer.from_pretrained('roberta-base', cache_dir=self.test_dir)
        rust_tokenizer = PyRobertaTokenizer(
            get_from_cache(base_tokenizer.pretrained_vocab_files_map['vocab_file']['roberta-base']),
            get_from_cache(base_tokenizer.pretrained_vocab_files_map['merges_file']['roberta-base'])
        )
        output_baseline = []
        for example in self.examples:
            output_baseline.append(base_tokenizer.encode_plus(example.text_a,
                                                              add_special_tokens=True,
                                                              return_overflowing_tokens=True,
                                                              return_special_tokens_mask=True,
                                                              max_length=128))
        output_rust = rust_tokenizer.encode_list([example.text_a for example in self.examples],
                                                 max_len=128,
                                                 truncation_strategy='longest_first',
                                                 stride=0)
        for rust, baseline in zip(output_rust, output_baseline):
            assert rust.token_ids == baseline['input_ids']
            assert rust.segment_ids == baseline['token_type_ids']
            assert rust.special_tokens_mask == baseline['special_tokens_mask']

    def test_tokenization_openai_gpt(self):
        # The OpenAI GPT tokenizer lower-cases internally; do_lower_case is
        # not a constructor option.
        base_tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt', cache_dir=self.test_dir)
        rust_tokenizer = PyOpenAiGptTokenizer(
            get_from_cache(base_tokenizer.pretrained_vocab_files_map['vocab_file']['openai-gpt']),
            get_from_cache(base_tokenizer.pretrained_vocab_files_map['merges_file']['openai-gpt'])
        )
        output_baseline = []
        for example in self.examples:
            output_baseline.append(base_tokenizer.encode_plus(example.text_a,
                                                              add_special_tokens=True,
                                                              return_overflowing_tokens=True,
                                                              return_special_tokens_mask=True,
                                                              max_length=128))
        output_rust = rust_tokenizer.encode_list([example.text_a for example in self.examples],
                                                 max_len=128,
                                                 truncation_strategy='longest_first',
                                                 stride=0)
        for rust, baseline in zip(output_rust, output_baseline):
            assert rust.token_ids == baseline['input_ids']
            assert rust.segment_ids == baseline['token_type_ids']
            assert rust.special_tokens_mask == baseline['special_tokens_mask']
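

# ---------------------------------------------------------------------------
# The tests above differ only in which tokenizer pair they load and whether
# that pair needs a merges file. As a sketch, they could be collapsed into a
# single parametrized test along the lines below. This is illustrative, not
# part of the original suite: the _PAIRS table and _check_tokenization name
# are hypothetical, and the leading underscore keeps pytest from collecting
# the function as-is.
# ---------------------------------------------------------------------------
_PAIRS = [
    ('bert-base-uncased', BertTokenizer, PyBertTokenizer, False, {'do_lower_case': True}),
    ('distilbert-base-uncased', DistilBertTokenizer, PyBertTokenizer, False, {'do_lower_case': True}),
    ('ctrl', CTRLTokenizer, PyCtrlTokenizer, True, {}),
    ('gpt2', GPT2Tokenizer, PyGpt2Tokenizer, True, {}),
    ('roberta-base', RobertaTokenizer, PyRobertaTokenizer, True, {}),
    ('openai-gpt', OpenAIGPTTokenizer, PyOpenAiGptTokenizer, True, {}),
]


@pytest.mark.slow
@pytest.mark.parametrize('name, base_cls, rust_cls, needs_merges, kwargs', _PAIRS)
def _check_tokenization(name, base_cls, rust_cls, needs_merges, kwargs, tmp_path):
    examples = Sst2Processor().get_train_examples(os.environ['SST2_PATH'])
    base_tokenizer = base_cls.from_pretrained(name, cache_dir=str(tmp_path), **kwargs)
    files = base_tokenizer.pretrained_vocab_files_map
    vocab = get_from_cache(files['vocab_file'][name])
    if needs_merges:
        rust_tokenizer = rust_cls(vocab, get_from_cache(files['merges_file'][name]))
    else:
        rust_tokenizer = rust_cls(vocab)
    baseline = [base_tokenizer.encode_plus(example.text_a,
                                           add_special_tokens=True,
                                           return_overflowing_tokens=True,
                                           return_special_tokens_mask=True,
                                           max_length=128)
                for example in examples]
    output_rust = rust_tokenizer.encode_list([example.text_a for example in examples],
                                             max_len=128,
                                             truncation_strategy='longest_first',
                                             stride=0)
    for rust, base in zip(output_rust, baseline):
        assert rust.token_ids == base['input_ids']
        assert rust.segment_ids == base['token_type_ids']
        assert rust.special_tokens_mask == base['special_tokens_mask']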