use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
use paste::paste;

use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

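/// A document of English text, stored as a stream of [`Token`]s referencing a
/// shared character buffer.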
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
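    /// Get the indices of all tokens whose spans overlap the provided
    /// character span.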
    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

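    /// Get all tokens that overlap the provided character span, with their
    /// content attached.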
    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

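    /// Lex and parse the provided text with the provided parser and
    /// dictionary.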
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

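    /// Lex and parse the provided text with the provided parser and the
    /// curated dictionary.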
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

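    /// Lex and parse a pre-existing character buffer.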
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

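    /// Parse the provided text as plain English, using the curated
    /// dictionary.
    ///
    /// A minimal usage sketch (assumes `Document` is re-exported at the crate
    /// root):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("isn't");
    ///
    /// // The contraction is condensed into a single token.
    /// assert_eq!(doc.get_tokens().len(), 1);
    /// ```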
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

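    /// Parse the provided text as plain English, using the provided
    /// dictionary.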
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

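    /// Parse the provided text as Markdown, using the curated dictionary.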
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

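    /// Parse the provided text as Markdown with default options, using the
    /// curated dictionary.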
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

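    /// Parse the provided text as Markdown, using the provided dictionary.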
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

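    /// Parse the provided text as Markdown with default options, using the
    /// provided dictionary.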
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

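    /// Run the condensation passes over the freshly lexed token stream, then
    /// annotate each word with dictionary metadata, part-of-speech tags, and
    /// noun-phrase membership.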
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.match_quotes();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            let mut i = 0;

            for token in sent.iter_mut() {
                // `token_tags` and `np_flags` only cover non-whitespace
                // tokens, so `i` advances once per non-whitespace token.
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

                    if let Some(inner) = &mut found_meta {
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    i += 1;
                }
            }
        }
    }

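    /// Convert runs of two or more newlines into paragraph breaks.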
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind
                && n >= 2
            {
                token.kind = TokenKind::ParagraphBreak;
            }
        }
    }

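    /// Condense fixed-length stretches of tokens into single tokens, given
    /// the index at which each stretch starts. Assumes each stretch is
    /// `stretch_len` tokens long and that the stretches do not overlap.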
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Extend the span of the first token in each stretch to cover the
        // whole stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        let old = self.tokens.clone();
        self.tokens.clear();

        // Copy over the tokens before the first stretch.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Copy over the tokens after the last stretch.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

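    /// Find the token whose span contains the provided character index, if
    /// any.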
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

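    /// Iterate over the contiguous noun-phrase slices of the document, with
    /// surrounding whitespace trimmed.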
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        // The token at the offset must be whitespace...
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // ...and the token one step further along must be a word.
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

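    /// Link quotation marks into pairs, recording each one's twin location.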
    fn match_quotes(&mut self) {
        // Pair up quote tokens in the order they appear: the first with the
        // second, the third with the fourth, and so on.
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

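    /// Merge numbers with their ordinal suffixes ("1st", "2nd", ...) into
    /// single number tokens.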
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // A number token directly followed by an ordinal suffix becomes a
            // single number token carrying that suffix.
            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

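    /// Merge runs of adjacent space tokens into single space tokens.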
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense spaces that are directly adjacent in the
                    // source text.
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

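    /// Condense each match of the provided expression into a single token,
    /// applying `edit` to the token that remains.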
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Keep the first token of the match, stretch its span over the
            // whole match, and queue the rest for removal.
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

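    /// Merge runs of newline tokens into single newline tokens.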
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

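    /// Merge dotted initialisms like "N.S.A." into single word tokens.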
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            // A single-letter word followed by a period, as in "N.S.A.".
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

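    /// Merge bare filename extensions like ".exe" or "(.js)" into single
    /// unlintable tokens.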
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut ext_start = None;

        loop {
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // A period followed by a short, uniformly-cased word, either
            // surrounded by whitespace (or the document edge) or enclosed in
            // round brackets, as in ".exe" or "(.exe)".
            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

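    /// Merge "TL;DR" (in any case, optionally plural) into a single word
    /// token.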
    fn condense_tldr(&mut self) {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let tl = &self.tokens[cursor - 2];
            let semicolon = &self.tokens[cursor - 1];
            let dr = &self.tokens[cursor];

            // "tl" and "dr" (or "drs"), in any case, joined by a semicolon.
            let is_tldr_chunk = tl.kind.is_word()
                && tl.span.len() == 2
                && tl
                    .span
                    .get_content(&self.source)
                    .eq_ignore_ascii_case_chars(&['t', 'l'])
                && semicolon.kind.is_semicolon()
                && dr.kind.is_word()
                && dr.span.len() >= 2
                && dr.span.len() <= 3
                && dr
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);

            if is_tldr_chunk {
                self.tokens[cursor - 2].span = Span::new(
                    self.tokens[cursor - 2].span.start,
                    self.tokens[cursor].span.end,
                );

                to_remove.push_back(cursor - 1);
                to_remove.push_back(cursor);
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

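    /// Merge contractions like "isn't" into single word tokens.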
    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {})
    }
}

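/// Delegates the `first_*`, `last_*`, `last_*_index`, `iter_*_indices`, and
/// `iter_*s` methods of [`TokenStringExt`] to the document's token buffer.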
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[0].span.len() == 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert!(tldrs.len() == 1);
        assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
    }
}