use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{
    DocPattern, EitherPattern, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{Dictionary, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt};
use crate::{NumberSuffix, Span};

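/// A document of English text, storing both the source characters and the
/// tokens lexed and parsed from them.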
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
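    /// Get the indices of all tokens whose spans overlap the provided span.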
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

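    /// Lex and parse the provided text into a document, using the given
    /// language parser and dictionary.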
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

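    /// Build a document from an already-shared vector of source characters,
    /// running the full parse and annotation passes over it.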
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

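    /// Parse the provided text as plain English, using the curated
    /// dictionary.
    ///
    /// A minimal usage sketch (this assumes the crate is consumed under the
    /// name `harper_core`):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("There were three little pigs.");
    /// assert!(!doc.get_tokens().is_empty());
    /// ```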
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

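    /// Re-run the analysis passes over the current token stream, condensing
    /// multi-token constructs and annotating words with dictionary metadata.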
    fn parse(&mut self, dictionary: &impl Dictionary) {
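        // The order here matters: later passes depend on the output of
        // earlier ones, and `articles_imply_nouns` needs the dictionary
        // metadata assigned at the end of this function.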
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();

        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let found_meta = dictionary.get_word_metadata(word_source);
                *meta = found_meta.cloned();
            }
        }

        self.articles_imply_nouns();
    }

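    /// Matches a determiner, then a word that could be read as either an
    /// adjective or a noun, then a noun (as in "a light beam").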
    fn uncached_article_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_determiner()
                .then_whitespace()
                .then(|t: &Token, _source: &[char]| t.kind.is_adjective() && t.kind.is_noun())
                .then_whitespace()
                .then_noun(),
        )
    }

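    /// A word that could be either an adjective or a noun, sitting between a
    /// determiner and a noun, is acting as an adjective, so its noun and verb
    /// readings are dropped.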
    fn articles_imply_nouns(&mut self) {
        thread_local! {
            static ARTICLE_PATTERN: Lrc<SequencePattern> = Document::uncached_article_pattern();
        }

        let pattern = ARTICLE_PATTERN.with(|v| v.clone());

        for m in pattern.find_all_matches_in_doc(self) {
            if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start + 2].kind {
                metadata.noun = None;
                metadata.verb = None;
            }
        }
    }

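    /// Convert runs of two or more newlines into paragraph breaks.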
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

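    /// For each index in `indices`, extend that token's span over the
    /// `stretch_len - 1` tokens that follow it and remove them.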
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        let old = self.tokens.clone();
        self.tokens.clear();

        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

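    /// Locate the token that contains the provided character index.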
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    pub fn get_span_content(&self, span: &Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

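    /// Pair up quotation marks on a best-effort basis: consecutive quote
    /// tokens are assumed to open and close a quotation, and each one is
    /// linked to its twin.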
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

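    /// Merge a number token with a directly following suffix (as in "1st")
    /// into a single number token.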
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) = NumberSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

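    /// Merge runs of adjacent space tokens into single tokens, summing their
    /// counts.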
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    fn uncached_latin_pattern() -> Lrc<EitherPattern> {
        Lrc::new(EitherPattern::new(vec![
            Box::new(
                SequencePattern::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequencePattern::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

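    /// Condense every run of tokens matching the provided pattern into its
    /// first token, then let `edit` rewrite that surviving token.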
    fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = pattern.find_all_matches_in_doc(self);

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.start..m.end].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

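    /// Condense Latin abbreviations such as "etc." and "et al." into single
    /// tokens.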
    fn condense_latin(&mut self) {
        thread_local! {
            static LATIN_PATTERN: Lrc<EitherPattern> = Document::uncached_latin_pattern();
        }

        self.condense_pattern(&LATIN_PATTERN.with(|v| v.clone()), |_| {})
    }

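    /// Merge runs of adjacent newline tokens into single tokens, summing
    /// their counts.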
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

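    /// Condense dotted initialisms like "N.S.A." into single word tokens.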
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

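    /// Two or more consecutive periods form an ellipsis.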
    fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
        let period = SequencePattern::default().then_period();
        Lrc::new(RepeatingPattern::new(Box::new(period), 2))
    }

    fn condense_ellipsis(&mut self) {
        thread_local! {
            static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
        }

        let pattern = ELLIPSIS_PATTERN.with(|v| v.clone());
        self.condense_pattern(&pattern, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

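    /// A word, an apostrophe, then another word: the shape of a contraction
    /// like "isn't".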
    fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    fn condense_contractions(&mut self) {
        thread_local! {
            static CONTRACTION_PATTERN: Lrc<SequencePattern> = Document::uncached_contraction_pattern();
        }

        let pattern = CONTRACTION_PATTERN.with(|v| v.clone());

        self.condense_pattern(&pattern, |_| {});
    }
}

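/// Creates the delegating accessor methods needed to implement
/// [`TokenStringExt`] on [`Document`].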
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index >](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [< iter_ $thing _indices >](&self) -> impl Iterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [< iter_ $thing s >](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(word);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(space);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(number);
    create_fns_on_doc!(at);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(comma);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }
}