punkt/
token.rs

1// Copyright 2016 rust-punkt developers
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9use std::ops::Deref;
10use std::hash::{Hash, Hasher};
11
12use prelude::LetterCase;
13
// Bit masks for the `flags` field of `Token`. Every mask is a single bit,
// so the masks can be combined and tested independently of each other.

// The eight flags below occupy the low byte (bits 0 through 7).
const HAS_FINAL_PERIOD: u16 = 1 << 0;
const IS_ELLIPSIS: u16 = 1 << 1;
const IS_ABBREV: u16 = 1 << 2;
const IS_SENTENCE_BREAK: u16 = 1 << 3;
const IS_PARAGRAPH_START: u16 = 1 << 4;
const IS_NEWLINE_START: u16 = 1 << 5;
const IS_UPPERCASE: u16 = 1 << 6;
const IS_LOWERCASE: u16 = 1 << 7;

// The four flags below live in the high byte (bits 8 through 15). Bits 8, 9,
// 11 and 12 are currently unused.
const IS_INITIAL: u16 = 1 << 15;
const IS_NUMERIC: u16 = 1 << 14;
const IS_NON_PUNCT: u16 = 1 << 13;
const IS_ALPHABETIC: u16 = 1 << 10;
29
/// A single word token together with the properties recorded about it.
///
/// Equality and hashing are implemented by hand in terms of `typ()`, which is
/// why only `Eq` is derived here.
#[derive(Eq)]
pub struct Token {
  /// Lowercased token text. `Token::new` always leaves a period as the last
  /// character, appending one when the input did not end with it.
  inner: String,
  /// Bit set built from the `IS_*` / `HAS_FINAL_PERIOD` masks.
  flags: u16,
}
35
36impl Token {
37  pub fn new(slice: &str, is_el: bool, is_pg: bool, is_nl: bool) -> Token {
38    debug_assert!(slice.len() > 0);
39
40    let first = slice.chars().nth(0).unwrap();
41    let mut has_punct = false;
42
43    // Add a period to any tokens without a period. This is an optimization
44    // to avoid creating an entirely new token when using as a key.
45    let mut tok = if slice.as_bytes()[slice.len() - 1] == b'.' {
46      let mut tok = Token {
47        inner: String::with_capacity(slice.len()),
48        flags: 0x00,
49      };
50
51      tok.set_has_final_period(true);
52      tok
53    } else {
54      Token {
55        inner: String::with_capacity(slice.len() + 1),
56        flags: 0x00,
57      }
58    };
59
60    if is_str_numeric(slice) {
61      tok.set_is_numeric(true);
62    } else if is_str_initial(slice) {
63      tok.set_is_initial(true);
64    }
65
66    for c in slice.chars() {
67      for c0 in c.to_lowercase() {
68        tok.inner.push(c0);
69      }
70
71      if c.is_alphabetic() || c == '_' {
72        tok.set_is_non_punct(true);
73      } else if !c.is_digit(10) {
74        has_punct = true;
75      }
76    }
77
78    if !tok.has_final_period() {
79      tok.inner.push('.');
80    }
81
82    if first.is_uppercase() {
83      tok.set_is_uppercase(true);
84    } else if first.is_lowercase() {
85      tok.set_is_lowercase(true);
86    }
87
88    tok.set_is_alphabetic(!has_punct);
89    tok.set_is_ellipsis(is_el);
90    tok.set_is_paragraph_start(is_pg);
91    tok.set_is_newline_start(is_nl);
92
93    tok
94  }
95
96  /// Returns the normalized original token (which can be reconstructed from
97  /// the inner representation of the token, and the flags on the token).
98  #[inline(always)]
99  pub fn tok(&self) -> &str {
100    if self.has_final_period() {
101      &self.inner[..]
102    } else {
103      &self.inner[..self.inner.len() - 1]
104    }
105  }
106
107  /// Returns the token with any ending period truncated.
108  #[inline(always)]
109  pub fn tok_without_period(&self) -> &str {
110    if self.has_final_period() {
111      &self.tok()[..self.len() - 1]
112    } else {
113      self.tok()
114    }
115  }
116
117  /// Returns the type of the token. If the token is numeric (determined by flags),
118  /// returns `##number##`, otherwise returns the normalized token.
119  #[inline(always)]
120  pub fn typ(&self) -> &str {
121    if self.is_numeric() {
122      "##number##"
123    } else {
124      self.tok()
125    }
126  }
127
128  /// Returns the type of the token with a period appended to it. Returns
129  /// `##number##.` if the token is numeric (determined by flags), otherwise
130  /// returns the original token with a period appended to it.
131  #[inline(always)]
132  pub fn typ_with_period(&self) -> &str {
133    if self.is_numeric() {
134      "##number##."
135    } else {
136      &self.inner[..]
137    }
138  }
139
140  /// Returns the type of the token without a period appended to it. Will return
141  /// `.`, if it is the only character in the string; otherwise, will slice type
142  /// to exclude the final period.
143  #[inline(always)]
144  pub fn typ_without_period(&self) -> &str {
145    if self.tok().len() > 1 && self.has_final_period() {
146      &self.typ_with_period()[..self.typ_with_period().len() - 1]
147    } else {
148      self.typ()
149    }
150  }
151
152  /// Returns the type of the token without a break or period if it had one originally
153  /// at the end.
154  #[inline(always)]
155  pub fn typ_without_break_or_period(&self) -> &str {
156    if self.is_sentence_break() {
157      self.typ_without_period()
158    } else {
159      self.typ()
160    }
161  }
162
163  #[inline(always)]
164  pub fn first_case(&self) -> LetterCase {
165    if self.is_uppercase() {
166      LetterCase::Upper
167    } else if self.is_lowercase() {
168      LetterCase::Lower
169    } else {
170      LetterCase::Unknown
171    }
172  }
173
174  #[inline(always)]
175  pub fn is_uppercase(&self) -> bool {
176    self.flags & IS_UPPERCASE != 0
177  }
178
179  #[inline(always)]
180  pub fn is_lowercase(&self) -> bool {
181    self.flags & IS_LOWERCASE != 0
182  }
183
184  #[inline(always)]
185  pub fn is_ellipsis(&self) -> bool {
186    self.flags & IS_ELLIPSIS != 0
187  }
188
189  #[inline(always)]
190  pub fn is_abbrev(&self) -> bool {
191    self.flags & IS_ABBREV != 0
192  }
193
194  #[inline(always)]
195  pub fn is_sentence_break(&self) -> bool {
196    self.flags & IS_SENTENCE_BREAK != 0
197  }
198
199  #[inline(always)]
200  pub fn has_final_period(&self) -> bool {
201    self.flags & HAS_FINAL_PERIOD != 0
202  }
203
204  #[inline(always)]
205  pub fn is_paragraph_start(&self) -> bool {
206    self.flags & IS_PARAGRAPH_START != 0
207  }
208
209  #[inline(always)]
210  pub fn is_newline_start(&self) -> bool {
211    self.flags & IS_NEWLINE_START != 0
212  }
213
214  #[inline(always)]
215  pub fn is_numeric(&self) -> bool {
216    self.flags & IS_NUMERIC != 0
217  }
218
219  #[inline(always)]
220  pub fn is_initial(&self) -> bool {
221    self.flags & IS_INITIAL != 0
222  }
223
224  // The NLTK docs note that all numeric tokens are considered to be contain
225  // only punctuation, because they are converted to `##number##`, which clearly
226  // has alphabetic characters.
227  #[inline(always)]
228  pub fn is_non_punct(&self) -> bool {
229    (self.flags & IS_NON_PUNCT != 0) || self.is_numeric()
230  }
231
232  #[inline(always)]
233  pub fn is_alphabetic(&self) -> bool {
234    self.flags & IS_ALPHABETIC != 0
235  }
236
237  #[inline(always)]
238  pub fn set_is_ellipsis(&mut self, b: bool) {
239    if b {
240      self.flags |= IS_ELLIPSIS;
241    } else if self.is_ellipsis() {
242      self.flags ^= IS_ELLIPSIS;
243    }
244  }
245
246  #[inline(always)]
247  pub fn set_is_abbrev(&mut self, b: bool) {
248    if b {
249      self.flags |= IS_ABBREV;
250    } else if self.is_abbrev() {
251      self.flags ^= IS_ABBREV;
252    }
253  }
254
255  #[inline(always)]
256  pub fn set_is_sentence_break(&mut self, b: bool) {
257    if b {
258      self.flags |= IS_SENTENCE_BREAK;
259    } else if self.is_sentence_break() {
260      self.flags ^= IS_SENTENCE_BREAK;
261    }
262  }
263
264  #[inline(always)]
265  pub fn set_has_final_period(&mut self, b: bool) {
266    if b {
267      self.flags |= HAS_FINAL_PERIOD;
268    } else if self.has_final_period() {
269      self.flags ^= HAS_FINAL_PERIOD;
270    }
271  }
272
273  #[inline(always)]
274  pub fn set_is_paragraph_start(&mut self, b: bool) {
275    if b {
276      self.flags |= IS_PARAGRAPH_START;
277    } else if self.is_paragraph_start() {
278      self.flags ^= IS_PARAGRAPH_START;
279    }
280  }
281
282  #[inline(always)]
283  pub fn set_is_newline_start(&mut self, b: bool) {
284    if b {
285      self.flags |= IS_NEWLINE_START;
286    } else if self.is_newline_start() {
287      self.flags ^= IS_NEWLINE_START;
288    }
289  }
290
291  #[inline(always)]
292  pub fn set_is_uppercase(&mut self, b: bool) {
293    if b {
294      self.flags |= IS_UPPERCASE;
295    } else if self.is_uppercase() {
296      self.flags ^= IS_UPPERCASE;
297    }
298  }
299
300  #[inline(always)]
301  pub fn set_is_lowercase(&mut self, b: bool) {
302    if b {
303      self.flags |= IS_LOWERCASE;
304    } else if self.is_lowercase() {
305      self.flags ^= IS_LOWERCASE;
306    }
307  }
308
309  #[inline(always)]
310  pub fn set_is_numeric(&mut self, b: bool) {
311    if b {
312      self.flags |= IS_NUMERIC;
313    } else if self.is_numeric() {
314      self.flags ^= IS_NUMERIC;
315    }
316  }
317
318  #[inline(always)]
319  pub fn set_is_initial(&mut self, b: bool) {
320    if b {
321      self.flags |= IS_INITIAL;
322    } else if self.is_initial() {
323      self.flags ^= IS_INITIAL;
324    }
325  }
326
327  #[inline(always)]
328  pub fn set_is_non_punct(&mut self, b: bool) {
329    if b {
330      self.flags |= IS_NON_PUNCT;
331    } else if self.is_non_punct() {
332      self.flags ^= IS_NON_PUNCT;
333    }
334  }
335
336  #[inline(always)]
337  pub fn set_is_alphabetic(&mut self, b: bool) {
338    if b {
339      self.flags |= IS_ALPHABETIC;
340    } else if self.is_alphabetic() {
341      self.flags ^= IS_ALPHABETIC;
342    }
343  }
344}
345
346impl Deref for Token {
347  type Target = str;
348
349  #[inline(always)]
350  fn deref(&self) -> &str {
351    &self.inner[..]
352  }
353}
354
355impl PartialEq for Token {
356  #[inline(always)]
357  fn eq(&self, other: &Token) -> bool {
358    self.typ() == other.typ()
359  }
360}
361
362impl Hash for Token {
363  #[inline(always)]
364  fn hash<H>(&self, state: &mut H)
365  where
366    H: Hasher,
367  {
368    self.typ().hash(state)
369  }
370}
371
/// Decides whether `tok` looks like a number.
///
/// A number may begin with a negative sign ('-') and otherwise consists of
/// digits mixed with periods, commas, or dashes. At least one digit is
/// required.
///
/// Note: the parser producing word tokens is assumed to have removed
/// multi-character punctuation already, so a numeric token SHOULD not contain
/// any. If that assumption is broken, odd inputs such as "5.4--5" are still
/// accepted.
#[inline]
fn is_str_numeric(tok: &str) -> bool {
  let mut seen_digit = false;

  for (offset, ch) in tok.char_indices() {
    if ch.is_digit(10) {
      // Remember the digit: it makes any later delimiter acceptable.
      seen_digit = true;
      continue;
    }

    let allowed = match ch {
      // A comma or period is fine after a digit, as the first character,
      // or at byte offset 1 (i.e. right after a leading sign/delimiter).
      ',' | '.' => seen_digit || offset <= 1,
      // A dash is fine after a digit, or as the leading negative sign.
      '-' => seen_digit || offset == 0,
      // Anything else can never be part of a number.
      _ => false,
    };

    if !allowed {
      return false;
    }
  }

  seen_digit
}
406
/// Checks whether `tok` is an initial: exactly two characters, the first an
/// alphabetic character (not a digit or symbol) and the second a period.
#[inline]
fn is_str_initial(tok: &str) -> bool {
  let mut chars = tok.chars();

  let letter = match chars.next() {
    Some(c) => c,
    None => return false,
  };

  // Letter, then a period, then nothing else.
  letter.is_alphabetic() && chars.next() == Some('.') && chars.next().is_none()
}
419
/// Verifies that every flag setter on `Token` is reflected by its getter,
/// both when the flag is switched on and when it is switched off again.
#[test]
fn test_token_flags() {
  // Turns the flag on via `$f`, checks `$t` reports it, then turns it off
  // and checks `$t` no longer reports it.
  macro_rules! perform_flag_test(
    ($tok:expr, $f:ident, $t:ident) => (
      {
        $tok.$f(true);
        assert!($tok.$t());
        $tok.$f(false);
        assert!(!$tok.$t());
      }
    )
  );

  let mut tok = Token::new("test", false, false, false);

  // "test" is created with the non-punct, lowercase and alphabetic flags
  // set; clear them so the token starts from an empty flag set.
  tok.set_is_non_punct(false);
  tok.set_is_lowercase(false);
  tok.set_is_alphabetic(false);

  assert_eq!(tok.flags, 0);

  perform_flag_test!(tok, set_is_ellipsis, is_ellipsis);
  perform_flag_test!(tok, set_is_abbrev, is_abbrev);
  perform_flag_test!(tok, set_is_sentence_break, is_sentence_break);
  perform_flag_test!(tok, set_has_final_period, has_final_period);
  perform_flag_test!(tok, set_is_paragraph_start, is_paragraph_start);
  perform_flag_test!(tok, set_is_newline_start, is_newline_start);
  perform_flag_test!(tok, set_is_uppercase, is_uppercase);
  perform_flag_test!(tok, set_is_lowercase, is_lowercase);
  perform_flag_test!(tok, set_is_numeric, is_numeric);
  perform_flag_test!(tok, set_is_initial, is_initial);
  perform_flag_test!(tok, set_is_non_punct, is_non_punct);
  perform_flag_test!(tok, set_is_alphabetic, is_alphabetic);

  // Every round-trip must leave the flag set empty again.
  assert_eq!(tok.flags, 0);
}