1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
//! Logos DFA lexer for the `r50k_base` pattern (GPT-2).
use logos::Logos;
use super::gpt2_family::{
Gpt2FamilyLogos,
Gpt2FamilyTokenRole,
};
use crate::pretrained::openai::OA_R50K_BASE_PATTERN;
/// Logos token variants for `r50k_base`.
#[derive(Logos, Debug, PartialEq, Clone)]
pub(crate) enum R50kToken {
// Case-sensitive: the r50k regex `'(?:[sdmt]|ll|ve|re)` only matches
// lowercase suffixes, unlike cl100k which uses `(?i:...)`.
#[regex(r"'([sdtm]|re|ve|ll)")]
Contraction,
// Contraction suffix followed by more letters. Beats Contraction by
// longest-match. When preceded by whitespace, the apostrophe is absorbed
// as punctuation and the suffix+letters become one span. Without
// whitespace, contraction_split separates the contraction prefix.
#[regex(r"'([sdtm]|re|ve|ll)\p{Letter}+")]
ContractionLetters,
#[regex(r" ?\p{Letter}+")]
Letters,
#[regex(r" ?\p{Number}+")]
Digits,
#[regex(r" ?[^\s\p{Letter}\p{Number}]+")]
Punctuation,
// All whitespace including \r\n. The r50k regex treats all \s uniformly
// via `\s+(?!\S)` and `\s`; there is no separate newline branch.
#[regex(r"\s+")]
Whitespace,
}
impl Gpt2FamilyLogos<'_> for R50kToken {
fn family_role(&self) -> Gpt2FamilyTokenRole {
match self {
Self::Whitespace => Gpt2FamilyTokenRole::Whitespace,
// All three r50k content patterns use ` ?X` which absorbs only
// literal ASCII space. This matches the Punctuation role behavior.
Self::Letters | Self::Digits | Self::Punctuation => Gpt2FamilyTokenRole::Punctuation,
Self::Contraction => Gpt2FamilyTokenRole::Word {
check_contraction: false,
first_char_is_letter: false,
},
// With whitespace: apostrophe is absorbed as punctuation prefix,
// contraction_split on the suffix finds no apostrophe, so the
// remaining letters emit as one span.
// Without whitespace: contraction_split separates the contraction
// prefix from trailing letters (e.g. "'tA" -> "'t" + "A").
Self::ContractionLetters => Gpt2FamilyTokenRole::Word {
check_contraction: true,
first_char_is_letter: false,
},
}
}
}
logos_lexer! {
/// Logos DFA word scanner for `r50k_base` (GPT-2).
pub struct R50kLexer;
token = R50kToken;
pattern = OA_R50K_BASE_PATTERN;
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{
alloc::{
string::ToString,
sync::Arc,
vec,
vec::Vec,
},
spanners::{
SpanRef,
TextSpanner,
span_lexers::{
LexerTextSpanner,
SpanLexer,
},
},
};
/// Build a `TextSpanner` from a logos lexer with no specials.
fn spanner(lexer: impl SpanLexer + 'static) -> LexerTextSpanner {
LexerTextSpanner::new(Arc::new(lexer), None)
}
#[test]
fn test_logos_basic_splitting() {
let s = spanner(R50kLexer);
let text = "hello world";
let spans = s.split_spans(text, None);
assert_eq!(spans, vec![SpanRef::Word(0..5), SpanRef::Word(5..11),]);
}
#[test]
fn test_logos_with_specials() {
let special_pattern = crate::support::regex::alternate_choice_regex_pattern(&[
"<|FNORD|>".to_string(),
"<|NORP|>".to_string(),
]);
let s = LexerTextSpanner::new(
Arc::new(R50kLexer),
Some(Arc::new(special_pattern.compile().unwrap()) as Arc<dyn SpanLexer>),
);
let text = "hello<|FNORD|> world<|NORP|>!";
let spans = s.split_spans(text, None);
assert_eq!(
spans,
vec![
SpanRef::Word(0..5),
SpanRef::Special(5..14),
SpanRef::Word(14..20),
SpanRef::Special(20..28),
SpanRef::Word(28..29),
]
);
}
#[test]
fn test_logos_digits() {
let s = spanner(R50kLexer);
let text = "abc 123 4567";
let spans = s.split_spans(text, None);
// Space absorbed by ` ?` prefix in each content pattern.
assert_eq!(
spans,
vec![
SpanRef::Word(0..3), // "abc"
SpanRef::Word(3..7), // " 123"
SpanRef::Word(7..12), // " 4567"
]
);
}
#[test]
fn test_logos_contractions() {
let s = spanner(R50kLexer);
let text = "don't I'll she's";
let spans = s.split_spans(text, None);
let words: Vec<&str> = spans
.iter()
.filter_map(|s| match s {
SpanRef::Word(r) => Some(&text[r.clone()]),
_ => None,
})
.collect();
assert!(words.contains(&"don"));
assert!(words.contains(&"'t"));
assert!(words.contains(&"'ll"));
assert!(words.contains(&"'s"));
}
#[test]
fn test_r50k_common() {
crate::spanners::span_lexers::logos::testutil::common_lexer_tests(
crate::alloc::boxed::Box::new(R50kLexer),
);
}
#[test]
fn test_logos_camel_case() {
let s = spanner(R50kLexer);
// r50k uses \p{L}+ so CamelCase is one token.
assert_eq!(s.split_spans("CamelCase", None), vec![SpanRef::Word(0..9)]);
assert_eq!(
s.split_spans("getElementById", None),
vec![SpanRef::Word(0..14)]
);
}
// Regression: contraction suffix must merge with trailing letters
// when whitespace absorbs the apostrophe as punctuation.
#[test]
fn test_contraction_letters_after_whitespace() {
let s = spanner(R50kLexer);
// " 'sA": ws split -> " ", punct absorbs " '", letters "sA"
assert_eq!(
s.split_spans(" 'sA", None),
vec![
SpanRef::Word(0..1), // " "
SpanRef::Word(1..3), // " '"
SpanRef::Word(3..5), // "sA"
]
);
// " 'llA": same pattern with two-char contraction suffix
assert_eq!(
s.split_spans(" 'llA", None),
vec![
SpanRef::Word(0..1), // " "
SpanRef::Word(1..3), // " '"
SpanRef::Word(3..6), // "llA"
]
);
}
// Without whitespace, contraction_split separates the contraction.
#[test]
fn test_contraction_letters_without_whitespace() {
let s = spanner(R50kLexer);
// "don'tA": contraction split -> "don", "'t", "A"
let text = "don'tA";
let words: Vec<&str> = s
.split_spans(text, None)
.iter()
.filter_map(|s| match s {
SpanRef::Word(r) => Some(&text[r.clone()]),
_ => None,
})
.collect();
assert_eq!(words, vec!["don", "'t", "A"]);
}
}