use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_chunker, brill_tagger};
use paste::paste;

use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

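/// A document, containing its source text and the tokens parsed from it.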
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
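    /// Get the indices of all tokens whose spans intersect the given span.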
    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

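    /// Get all tokens, as [`FatToken`]s, whose spans intersect the given span.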
    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

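    /// Lexes and parses text to produce a document with the given parser and dictionary.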
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

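    /// Lexes and parses text with the given parser and the curated dictionary.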
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

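    /// Lexes and parses a pre-collected char buffer to produce a document.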
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

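    /// Parse text as plain English using the curated dictionary.
    ///
    /// A minimal usage sketch (assumes this crate is consumed as `harper_core`):
    ///
    /// ```ignore
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Isn't that nice?");
    /// assert_eq!(doc.get_full_string(), "Isn't that nice?");
    /// ```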
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

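    /// Parse text as plain English using the given dictionary.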
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

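    /// Parse text as Markdown using the curated dictionary.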
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

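    /// Parse text as Markdown, with default options, using the curated dictionary.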
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

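    /// Parse text as Markdown using the given dictionary.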
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

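    /// Parse text as Markdown, with default options, using the given dictionary.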
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

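    /// Run the analysis passes over the parsed tokens: condense multi-token
    /// constructs, then attach part-of-speech tags, noun-phrase membership,
    /// and dictionary metadata to each word.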
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.match_quotes();

        let token_strings: Vec<_> = self
            .tokens
            .iter()
            .filter(|t| !t.kind.is_whitespace())
            .map(|t| self.get_span_content_str(&t.span))
            .collect();

        let token_tags = brill_tagger().tag_sentence(&token_strings);
        let np_flags = brill_chunker().chunk_sentence(&token_strings, &token_tags);

        let mut i = 0;

        for token in self.tokens.iter_mut() {
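            // `token_strings` skipped whitespace tokens, so the tag index
            // advances only on non-whitespace tokens.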
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

                if let Some(inner) = &mut found_meta {
                    inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                    inner.np_member = Some(np_flags[i]);
                }

                *meta = found_meta;
                i += 1;
            } else if !token.kind.is_whitespace() {
                i += 1;
            }
        }
    }

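    /// Convert runs of two or more newlines into paragraph breaks.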
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

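    /// Merge each stretch of `stretch_len` tokens, starting at each of the
    /// given indices, into that stretch's first token.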
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

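        // Rebuild the token vector, dropping the tokens that were merged into
        // each stretch's first token.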
        let old = self.tokens.clone();
        self.tokens.clear();

        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

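    /// Locate the token that contains the given char index, if any.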
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

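    /// Get the token at the given index, if it exists.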
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

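    /// Get the token at the given signed offset from a base index, if it exists.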
    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

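    /// An iterator over the document's tokens.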
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

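    /// An iterator over the document's nominal phrases: maximal runs of
    /// noun-phrase member tokens, trimmed of surrounding whitespace.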
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

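    /// An iterator over the document's tokens, bundled with their content.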
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

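    /// Get the word neighboring the whitespace at `base + offset`, continuing
    /// in the direction of `offset`. Returns `None` if the token at the offset
    /// is not whitespace or its neighbor is not a word.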
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

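    /// An iterator over the document's tokens, bundled with their content as strings.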
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

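    /// Pair up quotation marks, recording the index of each quote's twin.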
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

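    /// Merge an ordinal suffix (like the "st" in "1st") into the number token
    /// preceding it.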
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

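    /// Merge runs of sequential, adjacent space tokens into single tokens.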
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

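    // A cached expression matching Latin abbreviations like "etc.", "vs.",
    // and "et al.".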
    thread_local! {
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

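    /// Condense every match of `expr` into the match's first token, widening
    /// its span to cover the whole match and applying `edit` to it.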
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

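    /// Merge runs of sequential newline tokens into single tokens.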
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

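    /// Merge dotted initialisms like "N.S.A." into single word tokens.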
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

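    /// Merge a filename extension (a period followed by a short single-case
    /// word, bounded by whitespace or the document's edges) into a single
    /// unlintable token.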
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut ext_start = None;

        loop {
            let a = self.get_token_offset(cursor, -2);
            let b = &self.tokens[cursor - 1];
            let c = &self.tokens[cursor];
            let d = self.get_token_offset(cursor, 1);

            let is_ext_chunk = a.is_none_or(|t| t.kind.is_whitespace())
                && b.kind.is_period()
                && c.kind.is_word()
                && c.span.len() <= 3
                && d.is_none_or(|t| t.kind.is_whitespace())
                && {
                    let ext_chars = c.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

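    /// An expression matching two or more sequential periods.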
    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

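    /// An expression matching contractions: a word, an apostrophe, then a word.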
    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

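    /// Merge each word-apostrophe-word sequence into a single word token.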
    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {});
    }
}

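/// Generates delegating methods on `Document` that forward `TokenStringExt`
/// queries to the inner token vector.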
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

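// Reconstructs the document's text by concatenating the source content of
// every token.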
impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }
}