use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
use paste::paste;

use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

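/// A parsed document: the original source text alongside the tokens derived
/// from it.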
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
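    /// Get the indices of all tokens whose spans overlap the provided span.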
    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

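    /// Get [`FatToken`]s for all tokens whose spans overlap the provided span.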
    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

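    /// Lexes and parses text to produce a document, using the given parser
    /// and dictionary.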
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

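    /// Lexes and parses a pre-collected character buffer.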
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

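    /// Runs the condensation passes that merge multi-token constructs
    /// (contractions, initialisms, ellipses, etc.) into single tokens, then
    /// attaches dictionary metadata, a POS tag, and noun-phrase membership to
    /// each word token. The tagger and chunker only see non-whitespace
    /// tokens, so the index into their output advances once per
    /// non-whitespace token.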
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.match_quotes();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            let mut i = 0;

            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

                    if let Some(inner) = &mut found_meta {
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    i += 1;
                }
            }
        }
    }

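    /// Converts runs of two or more newlines into paragraph breaks.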
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind
                && n >= 2
            {
                token.kind = TokenKind::ParagraphBreak;
            }
        }
    }

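    /// Condenses each stretch of `stretch_len` tokens beginning at one of the
    /// provided `indices` into its first token. Assumes `indices` is sorted
    /// and the stretches do not overlap.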
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Extend the span of the first token in each stretch to cover the
        // whole stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Rebuild the token stream, skipping the tokens each stretch absorbed.
        let old = self.tokens.clone();
        self.tokens.clear();

        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

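    /// Binary-searches for the token whose span contains the provided char
    /// index.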
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

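    /// Iterates over the noun phrases in the document, splitting on tokens
    /// that are neither noun-phrase members nor whitespace and trimming
    /// whitespace from the edges of each phrase.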
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

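    /// Get the word adjacent to the whitespace at `offset` from `base`,
    /// looking one token further in the direction of the offset. Returns
    /// `None` if the token at `offset` is not whitespace or its neighbor is
    /// not a word.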
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

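    /// Pairs up quote tokens in order of appearance, recording each quote's
    /// twin location on the token itself.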
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

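    /// Searches for number tokens followed by ordinal suffixes (like "1st"),
    /// condensing each pair into a single number token carrying the suffix.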
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

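    /// Merges runs of directly adjacent space tokens into single tokens,
    /// summing their counts.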
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    if start_tok.span.end != child_tok.span.start {
                        // The tokens aren't directly adjacent in the source,
                        // so stop merging.
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

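    /// Condenses every match of `expr` into the match's first token, widening
    /// that token's span and applying `edit` to it before the remaining
    /// tokens are removed.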
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.start..m.end].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

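    /// Merges runs of newline tokens into single tokens, summing their
    /// counts.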
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

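    /// Condenses dotted initialisms like "N.S.A." into single word tokens.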
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

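    /// Condenses file extensions like ".exe" into single unlintable tokens,
    /// provided the extension is at most three letters of uniform case and
    /// either stands alone or sits within round brackets.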
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut ext_start = None;

        loop {
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

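    /// Condenses "TL;DR" and "TL;DRs" (in any mix of cases) into single word
    /// tokens.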
    fn condense_tldr(&mut self) {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let tl = &self.tokens[cursor - 2];
            let semicolon = &self.tokens[cursor - 1];
            let dr = &self.tokens[cursor];

            let is_tldr_chunk = tl.kind.is_word()
                && tl.span.len() == 2
                && tl
                    .span
                    .get_content(&self.source)
                    .eq_ignore_ascii_case_chars(&['t', 'l'])
                && semicolon.kind.is_semicolon()
                && dr.kind.is_word()
                && dr.span.len() >= 2
                && dr.span.len() <= 3
                && dr
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);

            if is_tldr_chunk {
                self.tokens[cursor - 2].span = Span::new(
                    self.tokens[cursor - 2].span.start,
                    self.tokens[cursor].span.end,
                );

                to_remove.push_back(cursor - 1);
                to_remove.push_back(cursor);
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

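    /// Condenses single-letter ampersand pairs, currently "R&D" and "Q&A"
    /// (in any mix of cases), into single word tokens.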
    fn condense_ampersand_pairs(&mut self) {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let l1 = &self.tokens[cursor - 2];
            let and = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            let is_letter_amp_letter_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && and.kind.is_ampersand()
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_letter_amp_letter_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        matches!(
                            (l1.to_ascii_lowercase(), l2.to_ascii_lowercase()),
                            ('r', 'd') | ('q', 'a')
                        )
                    }
                    _ => false,
                };

                if is_valid_pair {
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

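    /// Condenses (word, apostrophe, word) sequences such as "isn't" into
    /// single word tokens.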
    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {})
    }
}

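/// Generates the delegating `first_*`, `last_*`, and `iter_*` methods that
/// `TokenStringExt` requires, forwarding each to the underlying token slice.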
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[0].span.len() == 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert!(tldrs.len() == 1);
        assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert!(doc.tokens.len() == 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }
}