use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{
    DocPattern, EitherPattern, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{Dictionary, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt};
use crate::{NumberSuffix, Span};

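/// A document containing some amount of lexed and parsed English text.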
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
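    /// Get the indices of all tokens whose spans overlap the provided span.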
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

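    /// Lexes and parses text to produce a document, using the given parser
    /// and dictionary.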
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

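    /// Lexes and parses an existing character buffer to produce a document,
    /// using the given parser and dictionary.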
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

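    /// Parse the given text as plain English, using the curated dictionary.
    ///
    /// A minimal usage sketch (marked `ignore` because the enclosing crate's
    /// name is assumed here):
    ///
    /// ```ignore
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Isn't this a document?");
    /// assert_eq!(doc.get_full_string(), "Isn't this a document?");
    /// ```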
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

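    /// Re-parse the token stream: condense multi-token constructs (spaces,
    /// newlines, contractions, initialisms, number suffixes, ellipses, and
    /// Latin abbreviations), pair up quotes, and attach dictionary metadata
    /// to each word token.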
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();

        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let found_meta = dictionary
                    .get_correct_capitalization_of(word_source)
                    .map(|canonical_caps| dictionary.get_word_metadata(canonical_caps))
                    .unwrap_or_default();
                *meta = meta.or(&found_meta);
            }
        }
    }

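    /// Convert runs of two or more newlines into paragraph breaks.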
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

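    /// Given a sorted list of start indices, condense each run of
    /// `stretch_len` tokens beginning at those indices into a single token,
    /// extending the first token's span to cover the whole run.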
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Extend the span of the first token in each run to cover the whole
        // run.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1];
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Rebuild the token vector, keeping only the first token of each run.
        let old = self.tokens.clone();
        self.tokens.clear();

        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx]);

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

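    /// Binary-search the token stream for the token whose span contains the
    /// given character index, if any.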
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(self.tokens[index])
    }

    pub fn get_token(&self, index: usize) -> Option<Token> {
        self.tokens.get(index).copied()
    }

    pub fn tokens(&self) -> impl Iterator<Item = Token> + '_ {
        self.tokens.iter().copied()
    }

    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    pub fn get_span_content(&self, span: Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

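    /// Pair up quote tokens in order of appearance, recording each quote's
    /// twin location.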
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

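    /// Find number tokens immediately followed by an ordinal suffix (like
    /// "1st" or "2nd") and condense each pair into a single number token.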
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = self.tokens[idx + 1];
            let a = self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (a.kind, b.kind) {
                if let Some(found_suffix) = NumberSuffix::from_chars(self.get_span_content(b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

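    /// Condense runs of adjacent space tokens into single tokens, summing
    /// their counts.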
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense spaces that are directly adjacent in the
                    // source text.
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    fn uncached_latin_pattern() -> Lrc<EitherPattern> {
        Lrc::new(EitherPattern::new(vec![
            Box::new(
                SequencePattern::default()
                    .then_word_set(WordSet::all(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequencePattern::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

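    /// Condense each match of the given pattern into a single token whose
    /// span covers the whole match, applying `edit` to the surviving token.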
    fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = pattern.find_all_matches_in_doc(self);

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.start..m.end].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        thread_local! {
            static LATIN_PATTERN: Lrc<EitherPattern> = Document::uncached_latin_pattern();
        }

        self.condense_pattern(&LATIN_PATTERN.with(|v| v.clone()), |_| {})
    }

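    /// Condense runs of newline tokens into single tokens, summing their
    /// counts.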
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

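    /// Condense dotted initialisms (like "N.S.A.") into single word tokens.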
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // The index of the first token of the initialism currently being
        // walked, if any.
        let mut initialism_start = None;

        loop {
            let a = self.tokens[cursor - 1];
            let b = self.tokens[cursor];

            // A single-letter word followed by a period.
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

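    /// Build the pattern used to locate ellipses: a repeated period, at
    /// least two in a row.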
    fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
        let period = SequencePattern::default().then_period();
        Lrc::new(RepeatingPattern::new(Box::new(period), 2))
    }

    fn condense_ellipsis(&mut self) {
        thread_local! {
            static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
        }

        let pattern = ELLIPSIS_PATTERN.with(|v| v.clone());
        self.condense_pattern(&pattern, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

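    /// Condense contractions (a word, an apostrophe, then another word) into
    /// single word tokens.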
    fn condense_contractions(&mut self) {
        thread_local! {
            static CONTRACTION_PATTERN: Lrc<SequencePattern> =
                Document::uncached_contraction_pattern();
        }

        let pattern = CONTRACTION_PATTERN.with(|v| v.clone());

        self.condense_pattern(&pattern, |_| {});
    }
}

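/// Generate the delegating `first_*`, `last_*`, and `iter_*` members of
/// `TokenStringExt` for `Document` by forwarding to the inner token vector.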
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl Iterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(word);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(space);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(number);
    create_fns_on_doc!(at);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(comma);

    fn first_sentence_word(&self) -> Option<Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{parsers::MarkdownOptions, Span};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }
}