use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{
    DocPattern, EitherPattern, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{
    Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt,
};
use crate::{OrdinalSuffix, Span};

/// A parsed and annotated text document: the source characters plus the
/// tokens lexed from them.
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Get the indices of all tokens whose spans overlap the provided span.
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Get the [`FatToken`]s of all tokens whose spans overlap the provided
    /// span.
    pub fn fat_tokens_intersecting(&self, span: Span) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

    /// Lex and parse the provided text with a given parser and dictionary.
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lex and parse the provided text with the curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lex and parse a pre-collected character buffer.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

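    /// Parse the provided text as plain English, using the curated
    /// dictionary.
    ///
    /// ```rust,ignore
    /// // Hypothetical usage sketch; the crate name is assumed here.
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Hello there!");
    /// assert_eq!(doc.get_full_string(), "Hello there!");
    /// ```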
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Parse the provided text as plain English, using a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Parse the provided text as Markdown, using the curated dictionary.
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse the provided text as Markdown with default options, using the
    /// curated dictionary.
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Parse the provided text as Markdown, using a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Parse the provided text as Markdown with default options, using a
    /// provided dictionary.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Re-parse the token stream: condense multi-token constructs, pair
    /// quotes, and attach dictionary metadata to each word.
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();
        self.articles_imply_nouns();

        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let found_meta = dictionary.get_word_metadata(word_source);
                *meta = found_meta.cloned();
            }
        }
    }

    fn uncached_article_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_determiner()
                .then_whitespace()
                .then(|t: &Token, _source: &[char]| t.kind.is_adjective() && t.kind.is_noun())
                .then_whitespace()
                .then_noun(),
        )
    }

    /// When a word that could be either an adjective or a noun sits between a
    /// determiner and a noun, it is acting as an adjective, so drop its noun
    /// and verb readings.
    fn articles_imply_nouns(&mut self) {
        // `thread_local!` expands to a `static`, which is not a valid
        // associated item, so the cached pattern lives in the function body.
        thread_local! {
            static ARTICLE_PATTERN: Lrc<SequencePattern> = Document::uncached_article_pattern();
        }

        let pattern = ARTICLE_PATTERN.with(|v| v.clone());

        for m in pattern.find_all_matches_in_doc(self) {
            if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start + 2].kind {
                metadata.noun = None;
                metadata.verb = None;
            }
        }
    }

    /// Convert runs of two or more newlines into paragraph breaks.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

    /// Given a list of indices, condense each stretch of `stretch_len` tokens
    /// starting at those indices down to a single token.
    ///
    /// The surviving token's span is extended to cover the whole stretch; the
    /// rest of the stretch is dropped.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Extend the span of each surviving token over its stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Rebuild the token stream, skipping the condensed-away tokens.
        let old = self.tokens.clone();
        self.tokens.clear();

        // Everything before the first stretch is kept as-is.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Everything after the last stretch is kept as-is.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

    /// Locate the token that covers the given character index, if any.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    /// Get the token at the provided index, if it exists.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    /// Get the token at `base + offset`, if it exists.
    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    /// Iterate over the tokens in the document.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

    /// Iterate over the document's tokens as [`FatToken`]s.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    /// Get the word adjacent to the whitespace at `base + offset`, in the
    /// direction `offset` points. Returns `None` if the token at the offset
    /// is not whitespace, or if its neighbor is not a word.
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    /// Iterate over the document's tokens as [`FatStringToken`]s.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Pair up quotation marks by filling in each quote token's `twin_loc`.
    ///
    /// This is done on a best-effort basis: quotes are simply paired in the
    /// order they appear.
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Condense number tokens followed by an ordinal suffix (like the "st" in
    /// "1st") into single number tokens.
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // A number token followed by a word token that parses as an
            // ordinal suffix.
            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Merge runs of adjacent space tokens into single tokens.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                // Seek forward, absorbing space tokens until something else
                // turns up.
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense tokens that are directly adjacent in the
                    // source.
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    fn uncached_latin_pattern() -> Lrc<EitherPattern> {
        Lrc::new(EitherPattern::new(vec![
            Box::new(
                SequencePattern::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequencePattern::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Condense each match of `pattern` into a single token, applying `edit`
    /// to the token that survives.
    fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = pattern.find_all_matches_in_doc(self);

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);

            // Stretch the first token's span over the whole match before the
            // other tokens are removed.
            let merged_span = self.tokens[m.start..m.end].span().unwrap();
            self.tokens[m.start].span = merged_span;
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    /// Condense Latin abbreviations ("etc.", "vs.", "et al.") into single
    /// tokens.
    fn condense_latin(&mut self) {
        thread_local! {
            static LATIN_PATTERN: Lrc<EitherPattern> = Document::uncached_latin_pattern();
        }

        self.condense_pattern(&LATIN_PATTERN.with(|v| v.clone()), |_| {})
    }

    /// Merge runs of adjacent newline tokens into single tokens.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condense dotted initialisms (like "N.S.A.") into single word tokens.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            // A single-letter word followed by a period.
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                // The initialism (if any) has ended; stretch its first token
                // over the whole run.
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
        let period = SequencePattern::default().then_period();
        Lrc::new(RepeatingPattern::new(Box::new(period), 2))
    }

    /// Condense runs of two or more periods into single ellipsis tokens.
    fn condense_ellipsis(&mut self) {
        thread_local! {
            static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
        }

        let pattern = ELLIPSIS_PATTERN.with(|v| v.clone());
        self.condense_pattern(&pattern, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    /// Condense contractions (a word, an apostrophe, and another word) into
    /// single word tokens.
    fn condense_contractions(&mut self) {
        thread_local! {
            static CONTRACTION_PATTERN: Lrc<SequencePattern> =
                Document::uncached_contraction_pattern();
        }

        let pattern = CONTRACTION_PATTERN.with(|v| v.clone());

        self.condense_pattern(&pattern, |_| {});
    }
}

/// Generate methods on `Document` that delegate to the underlying token
/// slice's implementation of [`TokenStringExt`].
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl Iterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }
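
    // A round-trip sanity check, assuming the lexer covers the entire source:
    // `Display` prints every token's span back out, so formatting a document
    // should reproduce its input. A minimal sketch, not exhaustive.
    #[test]
    fn display_round_trips_source() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.to_string(), text);
        assert_eq!(document.get_full_string(), text);
    }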

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }
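
    // A sketch of a check on `token_indices_intersecting`: every index it
    // returns should point at a token that actually overlaps the queried
    // span. The span bounds are assumptions chosen to land on the middle
    // word.
    #[test]
    fn intersecting_tokens_overlap_queried_span() {
        let document = Document::new_plain_english_curated("Foo bar baz");

        let span = Span::new(4, 7);
        let indices = document.token_indices_intersecting(span);

        assert!(!indices.is_empty());
        for idx in indices {
            assert!(document.get_token(idx).unwrap().span.overlaps_with(span));
        }
    }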

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }
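
    // A small sketch for `condense_spaces`: however the lexer tokenizes a
    // run of spaces, after parsing it should be represented by one token,
    // leaving word, space, word.
    #[test]
    fn condenses_adjacent_spaces() {
        assert_token_count("Foo   bar", 3);
    }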

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }
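
    // A sketch of a check on `newlines_to_breaks`, assuming the plain-English
    // lexer emits newline tokens for '\n': a blank line should be condensed
    // and then promoted to a paragraph break.
    #[test]
    fn blank_line_becomes_paragraph_break() {
        use crate::TokenStringExt;

        let document = Document::new_plain_english_curated("One\n\nTwo");

        assert!(document.first_paragraph_break().is_some());
    }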

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }
}