use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
use paste::paste;

use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

/// A parsed document: the original source text plus the tokens lexed from it.
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Get the indices of all tokens whose spans overlap the given char-level span.
    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Get owned copies of all tokens whose spans overlap the given char-level span.
    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

    /// Lex and parse a string of text with the given parser and dictionary.
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lex and parse a string of text with the given parser and the curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lex and parse a pre-collected character buffer.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

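    /// Lex and parse a string of plain English using the curated dictionary.
    ///
    /// A minimal usage sketch (this assumes `Document` is re-exported at the
    /// crate root as `harper_core::Document`):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Hello there.");
    /// assert_eq!(doc.get_full_string(), "Hello there.");
    /// ```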
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Lex and parse a string of plain English using a custom dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Lex and parse Markdown using the curated dictionary.
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

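    /// Lex and parse Markdown with default options, using the curated dictionary.
    ///
    /// A sketch of the common case (again assuming the `harper_core::Document`
    /// re-export); the raw source, markup included, stays recoverable:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_markdown_default_curated("# A *heading*");
    /// assert_eq!(doc.get_full_string(), "# A *heading*");
    /// ```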
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Lex and parse Markdown using a custom dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Lex and parse Markdown with default options, using a custom dictionary.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Run the full post-lex pipeline: condense multi-token constructs, then
    /// tag each sentence with part-of-speech and noun-phrase information.
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            // The tagger and chunker never see whitespace, so `i` counts only
            // non-whitespace tokens to stay aligned with their outputs.
            let mut i = 0;

            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

                    if let Some(inner) = &mut found_meta {
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    i += 1;
                }
            }
        }
    }

    /// Convert runs of two or more newlines into paragraph breaks.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind
                && n >= 2
            {
                token.kind = TokenKind::ParagraphBreak;
            }
        }
    }

    /// Merge `stretch_len` consecutive tokens into one at each given start
    /// index, absorbing the spans of the merged tokens. For example, with
    /// `indices = [1]` and `stretch_len = 2`, tokens 1 and 2 collapse into a
    /// single token whose span covers both.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Extend each start token's span over its stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Rebuild the token list, skipping the tokens that were absorbed.
        let old = self.tokens.clone();
        self.tokens.clear();

        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
233
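    /// Locate the token whose span contains the given char index, using a
    /// binary search over the (sorted) token spans.
    ///
    /// An illustrative sketch, assuming the `harper_core::Document` re-export:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar");
    /// // Char 1 falls inside the word "Foo".
    /// let tok = doc.get_token_at_char_index(1).unwrap();
    /// assert!(tok.kind.is_word());
    /// ```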
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    /// Get a token by its index, if it exists.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    /// Get the token at a signed offset from a base index, if it exists.
    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    /// Iterate over the document's tokens.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

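    /// Iterate over the noun phrases the chunker identified, each as a slice
    /// of tokens with surrounding whitespace trimmed.
    ///
    /// A non-asserting sketch (the exact phrases depend on the bundled
    /// chunker model, so this only prints whatever it found):
    ///
    /// ```
    /// use harper_core::{Document, TokenStringExt};
    ///
    /// let doc = Document::new_plain_english_curated("The quick brown fox jumps.");
    /// for phrase in doc.iter_nominal_phrases() {
    ///     let span = phrase.span().unwrap();
    ///     println!("{}", doc.get_span_content_str(&span));
    /// }
    /// ```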
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    /// Iterate over owned copies of the document's tokens.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

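    /// Get the word adjacent to the whitespace at `base + offset`, walking in
    /// the direction of `offset`'s sign. Returns `None` unless the offset
    /// token is whitespace and the token just beyond it is a word.
    ///
    /// A sketch, assuming the `harper_core::Document` re-export:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // From token 0 ("Foo"), offset 1 is whitespace, so the word beyond is "bar".
    /// let bar = doc.get_next_word_from_offset(0, 1).unwrap();
    /// assert_eq!(doc.get_span_content(&bar.span), ['b', 'a', 'r']);
    /// ```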
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        // The token at the offset must be whitespace...
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // ...and the token just beyond it must be a word.
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    /// Iterate over owned, string-bearing copies of the document's tokens.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    /// Get the characters a span covers.
    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    /// Get the characters a span covers, as a `String`.
    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    /// Get the document's full source text as a `String`.
    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    /// Get the document's full source text as characters.
    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    /// Get the document's source characters.
    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    /// Get the document's tokens as a slice.
    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Pair up quotation marks: the first and second quotes in the document
    /// become twins, the third and fourth, and so on.
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Merge a number token with a following ordinal suffix ("1st", "2nd", ...).
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Merge runs of adjacent space tokens into single tokens.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense spaces that are actually adjacent in the source.
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Merge every token range matched by `expr` into a single token, then
    /// apply `edit` to the surviving token.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    /// Condense Latin abbreviations like "etc.", "vs." and "et al.".
    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

    /// Merge runs of adjacent newline tokens into single tokens.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condense dotted initialisms like "N.S.A." into single word tokens.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    /// Condense filename extensions like ".exe" into single `Unlintable` tokens.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut ext_start = None;

        loop {
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // A period followed by a short word, either surrounded by
            // whitespace (or the document's edges) or wrapped in parentheses,
            // with consistently lower- or uppercase letters.
            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    /// Condense "TL;DR" (in any case, with an optional plural "s") into one word token.
    fn condense_tldr(&mut self) {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let tl = &self.tokens[cursor - 2];
            let semicolon = &self.tokens[cursor - 1];
            let dr = &self.tokens[cursor];

            let is_tldr_chunk = tl.kind.is_word()
                && tl.span.len() == 2
                && tl
                    .span
                    .get_content(&self.source)
                    .eq_ignore_ascii_case_chars(&['t', 'l'])
                && semicolon.kind.is_semicolon()
                && dr.kind.is_word()
                && dr.span.len() >= 2
                && dr.span.len() <= 3
                && dr
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);

            if is_tldr_chunk {
                self.tokens[cursor - 2].span = Span::new(
                    self.tokens[cursor - 2].span.start,
                    self.tokens[cursor].span.end,
                );

                to_remove.push_back(cursor - 1);
                to_remove.push_back(cursor);
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    /// Condense single-letter pairs joined by a delimiter (e.g. "R&D", "I/O")
    /// into single word tokens, but only for the whitelisted letter pairs.
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn condense_ampersand_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_ampersand(),
            &[
                ('b', 'b'), // B&B
                ('b', 'w'), // B&W
                ('g', 't'), // G&T
                ('k', 'r'), // K&R
                ('q', 'a'), // Q&A
                ('r', 'b'), // R&B
                ('r', 'd'), // R&D
                ('r', 'r'), // R&R
                ('s', 'p'), // S&P
            ],
        );
    }

    fn condense_slash_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_slash(),
            &[
                ('a', 'c'), // A/C
                ('b', 'w'), // B/W
                ('c', 'o'), // c/o
                ('d', 'c'), // D/C
                ('d', 'l'), // D/L
                ('i', 'o'), // I/O
                ('j', 'k'), // J/K
                ('n', 'a'), // N/A
                ('r', 'c'), // R/C
                ('s', 'n'), // S/N
                ('y', 'n'), // Y/N
                ('y', 'o'), // Y/O
            ],
        );
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    /// Condense runs of two or more periods into single ellipsis tokens.
    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

    /// Condense contractions like "isn't" into single word tokens.
    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {})
    }
}

/// Generate delegating accessors (`first_*`, `last_*`, `iter_*`) on
/// `Document` for a given token predicate, forwarding to the token slice.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }
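
    // A property sketch, not from the original suite: `Display` concatenates
    // every token's span content, so printing a document should reproduce its
    // source text exactly (assuming the parser tokenizes the whole input,
    // which `PlainEnglish` does).
    #[test]
    fn display_round_trips_source() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.to_string(), text);
    }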

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }
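
    // An added sketch of `token_indices_intersecting`: the word "little"
    // occupies chars 17..23 (see the test above), so a span over it must
    // report at least that word's own token.
    #[test]
    fn intersecting_indices_include_covered_token() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let indices = document.token_indices_intersecting(Span::new(17, 23));
        let token_index = document
            .tokens()
            .position(|t| t.span == Span::new(17, 23))
            .unwrap();

        assert!(indices.contains(&token_index));
    }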

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }
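
    // An added sketch for `condense_latin`: "et al." lexes as four tokens
    // (word, space, word, period) that the Latin expression folds into one.
    // The expected count is an assumption based on that expression, not a
    // count taken from the original suite.
    #[test]
    fn condenses_et_al() {
        assert_token_count("et al.", 1);
    }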

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[0].span.len() == 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert!(tldrs.len() == 1);
        assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert!(doc.tokens.len() == 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }
}