1use std::ops::Deref;
10use std::hash::{Hash, Hasher};
11
12use prelude::LetterCase;
13
// Bit masks stored in `Token::flags`. The first eight occupy bits 0-7.

/// The source text of the token ended with a period.
const HAS_FINAL_PERIOD: u16 = 1 << 0;
/// The token was created with the ellipsis marker set.
const IS_ELLIPSIS: u16 = 1 << 1;
/// The token has been marked as an abbreviation.
const IS_ABBREV: u16 = 1 << 2;
/// The token has been marked as a sentence break.
const IS_SENTENCE_BREAK: u16 = 1 << 3;
/// The token was created with the paragraph-start marker set.
const IS_PARAGRAPH_START: u16 = 1 << 4;
/// The token was created with the newline-start marker set.
const IS_NEWLINE_START: u16 = 1 << 5;
/// The first character of the source text is uppercase.
const IS_UPPERCASE: u16 = 1 << 6;
/// The first character of the source text is lowercase.
const IS_LOWERCASE: u16 = 1 << 7;

// The remaining masks live in the upper byte (bits 10 and 13-15).

/// The source text is a single letter followed by a period.
const IS_INITIAL: u16 = 1 << 15;
/// The source text looks like a number.
const IS_NUMERIC: u16 = 1 << 14;
/// The source text contains at least one alphabetic character or underscore.
const IS_NON_PUNCT: u16 = 1 << 13;
/// The source text contains only alphabetic characters, underscores and digits.
const IS_ALPHABETIC: u16 = 1 << 10;
29
/// A single word token together with a set of boolean properties packed
/// into a bit field.
///
/// `Eq` is derived; it relies on the hand-written `PartialEq`
/// implementation for `Token`.
#[derive(Eq)]
pub struct Token {
    /// Lowercased token text. `Token::new` always leaves a trailing period
    /// here, appending one when the source text did not end with one.
    inner: String,
    /// Bit set built from the `u16` flag constants (`HAS_FINAL_PERIOD`,
    /// `IS_NUMERIC`, ...).
    flags: u16,
}
35
36impl Token {
37 pub fn new(slice: &str, is_el: bool, is_pg: bool, is_nl: bool) -> Token {
38 debug_assert!(slice.len() > 0);
39
40 let first = slice.chars().nth(0).unwrap();
41 let mut has_punct = false;
42
43 let mut tok = if slice.as_bytes()[slice.len() - 1] == b'.' {
46 let mut tok = Token {
47 inner: String::with_capacity(slice.len()),
48 flags: 0x00,
49 };
50
51 tok.set_has_final_period(true);
52 tok
53 } else {
54 Token {
55 inner: String::with_capacity(slice.len() + 1),
56 flags: 0x00,
57 }
58 };
59
60 if is_str_numeric(slice) {
61 tok.set_is_numeric(true);
62 } else if is_str_initial(slice) {
63 tok.set_is_initial(true);
64 }
65
66 for c in slice.chars() {
67 for c0 in c.to_lowercase() {
68 tok.inner.push(c0);
69 }
70
71 if c.is_alphabetic() || c == '_' {
72 tok.set_is_non_punct(true);
73 } else if !c.is_digit(10) {
74 has_punct = true;
75 }
76 }
77
78 if !tok.has_final_period() {
79 tok.inner.push('.');
80 }
81
82 if first.is_uppercase() {
83 tok.set_is_uppercase(true);
84 } else if first.is_lowercase() {
85 tok.set_is_lowercase(true);
86 }
87
88 tok.set_is_alphabetic(!has_punct);
89 tok.set_is_ellipsis(is_el);
90 tok.set_is_paragraph_start(is_pg);
91 tok.set_is_newline_start(is_nl);
92
93 tok
94 }
95
96 #[inline(always)]
99 pub fn tok(&self) -> &str {
100 if self.has_final_period() {
101 &self.inner[..]
102 } else {
103 &self.inner[..self.inner.len() - 1]
104 }
105 }
106
107 #[inline(always)]
109 pub fn tok_without_period(&self) -> &str {
110 if self.has_final_period() {
111 &self.tok()[..self.len() - 1]
112 } else {
113 self.tok()
114 }
115 }
116
117 #[inline(always)]
120 pub fn typ(&self) -> &str {
121 if self.is_numeric() {
122 "##number##"
123 } else {
124 self.tok()
125 }
126 }
127
128 #[inline(always)]
132 pub fn typ_with_period(&self) -> &str {
133 if self.is_numeric() {
134 "##number##."
135 } else {
136 &self.inner[..]
137 }
138 }
139
140 #[inline(always)]
144 pub fn typ_without_period(&self) -> &str {
145 if self.tok().len() > 1 && self.has_final_period() {
146 &self.typ_with_period()[..self.typ_with_period().len() - 1]
147 } else {
148 self.typ()
149 }
150 }
151
152 #[inline(always)]
155 pub fn typ_without_break_or_period(&self) -> &str {
156 if self.is_sentence_break() {
157 self.typ_without_period()
158 } else {
159 self.typ()
160 }
161 }
162
163 #[inline(always)]
164 pub fn first_case(&self) -> LetterCase {
165 if self.is_uppercase() {
166 LetterCase::Upper
167 } else if self.is_lowercase() {
168 LetterCase::Lower
169 } else {
170 LetterCase::Unknown
171 }
172 }
173
174 #[inline(always)]
175 pub fn is_uppercase(&self) -> bool {
176 self.flags & IS_UPPERCASE != 0
177 }
178
179 #[inline(always)]
180 pub fn is_lowercase(&self) -> bool {
181 self.flags & IS_LOWERCASE != 0
182 }
183
184 #[inline(always)]
185 pub fn is_ellipsis(&self) -> bool {
186 self.flags & IS_ELLIPSIS != 0
187 }
188
189 #[inline(always)]
190 pub fn is_abbrev(&self) -> bool {
191 self.flags & IS_ABBREV != 0
192 }
193
194 #[inline(always)]
195 pub fn is_sentence_break(&self) -> bool {
196 self.flags & IS_SENTENCE_BREAK != 0
197 }
198
199 #[inline(always)]
200 pub fn has_final_period(&self) -> bool {
201 self.flags & HAS_FINAL_PERIOD != 0
202 }
203
204 #[inline(always)]
205 pub fn is_paragraph_start(&self) -> bool {
206 self.flags & IS_PARAGRAPH_START != 0
207 }
208
209 #[inline(always)]
210 pub fn is_newline_start(&self) -> bool {
211 self.flags & IS_NEWLINE_START != 0
212 }
213
214 #[inline(always)]
215 pub fn is_numeric(&self) -> bool {
216 self.flags & IS_NUMERIC != 0
217 }
218
219 #[inline(always)]
220 pub fn is_initial(&self) -> bool {
221 self.flags & IS_INITIAL != 0
222 }
223
224 #[inline(always)]
228 pub fn is_non_punct(&self) -> bool {
229 (self.flags & IS_NON_PUNCT != 0) || self.is_numeric()
230 }
231
232 #[inline(always)]
233 pub fn is_alphabetic(&self) -> bool {
234 self.flags & IS_ALPHABETIC != 0
235 }
236
237 #[inline(always)]
238 pub fn set_is_ellipsis(&mut self, b: bool) {
239 if b {
240 self.flags |= IS_ELLIPSIS;
241 } else if self.is_ellipsis() {
242 self.flags ^= IS_ELLIPSIS;
243 }
244 }
245
246 #[inline(always)]
247 pub fn set_is_abbrev(&mut self, b: bool) {
248 if b {
249 self.flags |= IS_ABBREV;
250 } else if self.is_abbrev() {
251 self.flags ^= IS_ABBREV;
252 }
253 }
254
255 #[inline(always)]
256 pub fn set_is_sentence_break(&mut self, b: bool) {
257 if b {
258 self.flags |= IS_SENTENCE_BREAK;
259 } else if self.is_sentence_break() {
260 self.flags ^= IS_SENTENCE_BREAK;
261 }
262 }
263
264 #[inline(always)]
265 pub fn set_has_final_period(&mut self, b: bool) {
266 if b {
267 self.flags |= HAS_FINAL_PERIOD;
268 } else if self.has_final_period() {
269 self.flags ^= HAS_FINAL_PERIOD;
270 }
271 }
272
273 #[inline(always)]
274 pub fn set_is_paragraph_start(&mut self, b: bool) {
275 if b {
276 self.flags |= IS_PARAGRAPH_START;
277 } else if self.is_paragraph_start() {
278 self.flags ^= IS_PARAGRAPH_START;
279 }
280 }
281
282 #[inline(always)]
283 pub fn set_is_newline_start(&mut self, b: bool) {
284 if b {
285 self.flags |= IS_NEWLINE_START;
286 } else if self.is_newline_start() {
287 self.flags ^= IS_NEWLINE_START;
288 }
289 }
290
291 #[inline(always)]
292 pub fn set_is_uppercase(&mut self, b: bool) {
293 if b {
294 self.flags |= IS_UPPERCASE;
295 } else if self.is_uppercase() {
296 self.flags ^= IS_UPPERCASE;
297 }
298 }
299
300 #[inline(always)]
301 pub fn set_is_lowercase(&mut self, b: bool) {
302 if b {
303 self.flags |= IS_LOWERCASE;
304 } else if self.is_lowercase() {
305 self.flags ^= IS_LOWERCASE;
306 }
307 }
308
309 #[inline(always)]
310 pub fn set_is_numeric(&mut self, b: bool) {
311 if b {
312 self.flags |= IS_NUMERIC;
313 } else if self.is_numeric() {
314 self.flags ^= IS_NUMERIC;
315 }
316 }
317
318 #[inline(always)]
319 pub fn set_is_initial(&mut self, b: bool) {
320 if b {
321 self.flags |= IS_INITIAL;
322 } else if self.is_initial() {
323 self.flags ^= IS_INITIAL;
324 }
325 }
326
327 #[inline(always)]
328 pub fn set_is_non_punct(&mut self, b: bool) {
329 if b {
330 self.flags |= IS_NON_PUNCT;
331 } else if self.is_non_punct() {
332 self.flags ^= IS_NON_PUNCT;
333 }
334 }
335
336 #[inline(always)]
337 pub fn set_is_alphabetic(&mut self, b: bool) {
338 if b {
339 self.flags |= IS_ALPHABETIC;
340 } else if self.is_alphabetic() {
341 self.flags ^= IS_ALPHABETIC;
342 }
343 }
344}
345
346impl Deref for Token {
347 type Target = str;
348
349 #[inline(always)]
350 fn deref(&self) -> &str {
351 &self.inner[..]
352 }
353}
354
355impl PartialEq for Token {
356 #[inline(always)]
357 fn eq(&self, other: &Token) -> bool {
358 self.typ() == other.typ()
359 }
360}
361
362impl Hash for Token {
363 #[inline(always)]
364 fn hash<H>(&self, state: &mut H)
365 where
366 H: Hasher,
367 {
368 self.typ().hash(state)
369 }
370}
371
/// Decides whether `tok` looks like a number.
///
/// A numeric string contains at least one ASCII digit. Besides digits it may
/// contain:
///
/// * `,`, `.` or `-` anywhere after the first digit;
/// * `,` or `.` at byte offset 0 or 1 (leading, or right after a sign);
/// * `-` at byte offset 0.
///
/// Any other character makes the string non-numeric.
#[inline]
fn is_str_numeric(tok: &str) -> bool {
    let mut seen_digit = false;

    for (offset, ch) in tok.char_indices() {
        if ch.is_digit(10) {
            seen_digit = true;
            continue;
        }

        let accepted = match ch {
            ',' | '.' => seen_digit || offset <= 1,
            '-' => seen_digit || offset == 0,
            _ => false,
        };

        if !accepted {
            return false;
        }
    }

    seen_digit
}
406
/// Decides whether `tok` is an initial: exactly one alphabetic character
/// followed by a period, and nothing else.
#[inline]
fn is_str_initial(tok: &str) -> bool {
    let mut chars = tok.chars();

    let first_is_letter = chars.next().map_or(false, char::is_alphabetic);
    let second_is_period = chars.next() == Some('.');

    first_is_letter && second_is_period && chars.next().is_none()
}
419
/// Checks that every flag setter is observable through its matching getter,
/// both when switching the flag on and when switching it off again.
#[test]
fn test_token_flags() {
    // Switch a flag on, check the getter sees it, switch it off, check again.
    macro_rules! perform_flag_test {
        ($tok:expr, $f:ident, $t:ident) => {{
            $tok.$f(true);
            assert!($tok.$t());
            $tok.$f(false);
            assert!(!$tok.$t());
        }};
    }

    let mut tok = Token::new("test", false, false, false);

    // "test" is created with the non-punct, lowercase and alphabetic flags
    // set; clear them so every check below starts from an empty flag field.
    tok.set_is_non_punct(false);
    tok.set_is_lowercase(false);
    tok.set_is_alphabetic(false);

    assert_eq!(tok.flags, 0);

    perform_flag_test!(tok, set_is_ellipsis, is_ellipsis);
    perform_flag_test!(tok, set_is_abbrev, is_abbrev);
    perform_flag_test!(tok, set_is_sentence_break, is_sentence_break);
    perform_flag_test!(tok, set_has_final_period, has_final_period);
    perform_flag_test!(tok, set_is_paragraph_start, is_paragraph_start);
    perform_flag_test!(tok, set_is_newline_start, is_newline_start);
    perform_flag_test!(tok, set_is_uppercase, is_uppercase);
    perform_flag_test!(tok, set_is_lowercase, is_lowercase);
    perform_flag_test!(tok, set_is_numeric, is_numeric);
    perform_flag_test!(tok, set_is_initial, is_initial);
    perform_flag_test!(tok, set_is_non_punct, is_non_punct);
    perform_flag_test!(tok, set_is_alphabetic, is_alphabetic);

    // Every flag was switched back off, so the field must be empty again.
    assert_eq!(tok.flags, 0);
}