use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
use itertools::Itertools;
use paste::paste;

use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

#[derive(Debug, Clone)]
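/// A block of text that has been lexed into tokens and parsed for grammatical
/// metadata, along with a shared copy of its source characters.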
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
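    /// Collect the indices of all tokens whose spans overlap the provided span.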
    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

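    /// Collect the [`FatToken`]s whose spans overlap the provided span.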
    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

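    /// Lex and parse a string into a document using the provided parser and dictionary.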
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

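    /// Lex and parse a string into a document using the provided parser and
    /// the curated dictionary.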
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

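    /// Lex and parse a shared character buffer into a document.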
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

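    /// Parse a string as plain English using the curated dictionary.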
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

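    /// Tokenize a string and run the token-condensing fixups, skipping
    /// dictionary lookups, part-of-speech tagging, and noun-phrase chunking
    /// entirely.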
    pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
        let source = Lrc::new(text.chars().collect_vec());
        let tokens = parser.parse(&source);
        let mut document = Self { source, tokens };
        document.apply_fixups();
        document
    }

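    /// Parse a string as plain English using the provided dictionary.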
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

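    /// Parse a string as Markdown using the curated dictionary.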
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

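    /// Parse a string as Markdown with default options, using the curated dictionary.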
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

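    /// Parse a string as Markdown using the provided dictionary.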
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

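    /// Parse a string as Markdown with default options, using the provided dictionary.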
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

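    /// Run the full set of token-condensing passes that normalize the raw
    /// token stream (spaces, newlines, initialisms, ellipses, and so on).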
    fn apply_fixups(&mut self) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();
    }

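    /// Run the fixups, then attach dictionary metadata, part-of-speech tags,
    /// and noun-phrase membership to each word token, one sentence at a time.
    /// A running index keeps the tagger and chunker output aligned with the
    /// sentence's non-whitespace tokens.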
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.apply_fixups();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            let mut i = 0;

            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary
                        .get_word_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    i += 1;
                }
            }
        }
    }

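    /// Convert tokens of two or more consecutive newlines into paragraph breaks.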
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind
                && n >= 2
            {
                token.kind = TokenKind::ParagraphBreak;
            }
        }
    }

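    /// Condense fixed-length stretches of tokens into single tokens.
    ///
    /// For each index in `indices`, the `stretch_len` tokens starting there
    /// are merged into the stretch's first token, whose span is widened to
    /// cover the whole stretch; the rest are dropped.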
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        let old = self.tokens.clone();
        self.tokens.clear();

        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

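    /// Locate the token containing the given char index, if any, via binary
    /// search (token spans are assumed sorted and non-overlapping).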
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

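    /// Iterate over contiguous runs of tokens flagged by the chunker as
    /// members of the same nominal phrase, with surrounding whitespace
    /// trimmed away.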
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

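    /// Look `offset` tokens away from `base`; if that token is whitespace,
    /// return the word token immediately past it (in the offset's direction),
    /// if there is one.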
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

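    /// Pair up quotation marks within each paragraph, recording each quote's
    /// twin location. A quote counts as opening when it starts the paragraph,
    /// follows whitespace, or follows an opening bracket or apostrophe;
    /// unmatched quotes are left without a twin.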
    fn match_quotes(&mut self) {
        let mut pg_indices: Vec<_> = vec![0];
        pg_indices.extend(self.iter_paragraph_break_indices());
        pg_indices.push(self.tokens.len());

        let mut quote_indices = Vec::new();
        let mut open_quote_indices = Vec::new();

        for (start, end) in pg_indices.into_iter().tuple_windows() {
            let pg = &mut self.tokens[start..end];

            quote_indices.clear();
            quote_indices.extend(pg.iter_quote_indices());
            open_quote_indices.clear();

            for quote in &quote_indices {
                let is_open = *quote == 0
                    || pg[0..*quote].iter_word_likes().next().is_none()
                    || pg[quote - 1].kind.is_whitespace()
                    || matches!(
                        pg[quote - 1].kind.as_punctuation(),
                        Some(Punctuation::LessThan)
                            | Some(Punctuation::OpenRound)
                            | Some(Punctuation::OpenSquare)
                            | Some(Punctuation::OpenCurly)
                            | Some(Punctuation::Apostrophe)
                    );

                if is_open {
                    open_quote_indices.push(*quote);
                }
            }

            while let Some(open_idx) = open_quote_indices.pop() {
                let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
                    continue;
                };

                if pg[close_idx + open_idx + 1]
                    .kind
                    .as_quote()
                    .unwrap()
                    .twin_loc
                    .is_some()
                {
                    continue;
                }

                pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
                    Some(close_idx + open_idx + start + 1);
                pg[close_idx + open_idx + 1]
                    .kind
                    .as_mut_quote()
                    .unwrap()
                    .twin_loc = Some(open_idx + start);
            }
        }
    }

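    /// Merge a number token followed by an ordinal suffix ("1st", "2nd",
    /// "3rd") into a single number token carrying that suffix.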
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

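    /// Merge runs of adjacent space tokens into single space tokens, provided
    /// their spans are contiguous in the source.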
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

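    /// Collapse each match of `expr` into the match's first token, widening
    /// that token's span to cover the whole match and applying `edit` to it.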
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

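    /// Merge runs of adjacent newline tokens into single newline tokens with a
    /// combined count.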
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

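    /// Merge dotted initialisms like "N.S.A." into single word tokens,
    /// presumably so their periods are not mistaken for sentence terminators.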
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

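    /// Merge a period followed by a short, uniformly-cased word (e.g. ".exe")
    /// into a single token marked unlintable, when the pair is bounded by
    /// whitespace, the document's edges, or parentheses.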
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut ext_start = None;

        loop {
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

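    /// Merge the three tokens of "TL;DR" (or plural "TL;DRs", in any case mix)
    /// into a single word token.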
    fn condense_tldr(&mut self) {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let tl = &self.tokens[cursor - 2];
            let semicolon = &self.tokens[cursor - 1];
            let dr = &self.tokens[cursor];

            let is_tldr_chunk = tl.kind.is_word()
                && tl.span.len() == 2
                && tl
                    .span
                    .get_content(&self.source)
                    .eq_ignore_ascii_case_chars(&['t', 'l'])
                && semicolon.kind.is_semicolon()
                && dr.kind.is_word()
                && dr.span.len() >= 2
                && dr.span.len() <= 3
                && dr
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);

            if is_tldr_chunk {
                self.tokens[cursor - 2].span = Span::new(
                    self.tokens[cursor - 2].span.start,
                    self.tokens[cursor].span.end,
                );

                to_remove.push_back(cursor - 1);
                to_remove.push_back(cursor);
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

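    /// Merge letter-delimiter-letter sequences (such as "R&D" or "I/O") into
    /// single word tokens, but only when the lowercased letter pair appears in
    /// `valid_pairs`.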
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

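    /// Merge common ampersand abbreviations such as "R&D", "Q&A", and "B&B"
    /// into single word tokens.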
    fn condense_ampersand_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_ampersand(),
            &[
                ('b', 'b'),
                ('b', 'w'),
                ('g', 't'),
                ('k', 'r'),
                ('q', 'a'),
                ('r', 'b'),
                ('r', 'd'),
                ('r', 'r'),
                ('s', 'p'),
            ],
        );
    }

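    /// Merge common slash abbreviations such as "I/O", "A/C", and "N/A" into
    /// single word tokens.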
    fn condense_slash_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_slash(),
            &[
                ('a', 'c'),
                ('b', 'w'),
                ('c', 'o'),
                ('d', 'c'),
                ('d', 'l'),
                ('i', 'o'),
                ('j', 'k'),
                ('n', 'a'),
                ('r', 'c'),
                ('s', 'n'),
                ('y', 'n'),
                ('y', 'o'),
            ],
        );
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }
}

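/// Generates `first_*`, `last_*`, `last_*_index`, `iter_*_indices`, and
/// `iter_*s` methods on [`Document`] that delegate to the underlying token
/// slice.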
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(heading_start);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_headings()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::TokenStringExt;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn simple_contraction4() {
        assert_condensed_contractions("doesn't", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[0].span.len() == 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert!(tldrs.len() == 1);
        assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert!(doc.tokens.len() == 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn finds_unmatched_quotes_in_document() {
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }

    #[test]
    fn issue_1901() {
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
}