use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{
    DocPattern, EitherPattern, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{Dictionary, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt};
use crate::{NumberSuffix, Span};

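/// A document containing some amount of lexed and parsed English text.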
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
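    /// Get the indices of all tokens whose spans intersect the provided span.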
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

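    /// Lexes and parses the provided text with the given parser, attaching
    /// metadata from the given dictionary to each word.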
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

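    /// Lexes and parses the provided text with the given parser, using the
    /// curated dictionary for word metadata.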
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

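    /// Builds a document from an already-shared character buffer, avoiding a
    /// copy of the source text.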
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

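    /// Parses the text as plain English, using the curated dictionary.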
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

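    /// Parses the text as plain English, using a custom dictionary.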
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

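    /// Parses the text as Markdown, using the curated dictionary.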
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

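    /// Parses the text as Markdown with default options, using the curated
    /// dictionary.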
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

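    /// Parses the text as Markdown, using a custom dictionary.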
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

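    /// Parses the text as Markdown with default options, using a custom
    /// dictionary.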
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

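    /// Runs the post-lexing passes: condenses multi-token constructs (spaces,
    /// newlines, contractions, initialisms, number suffixes, ellipses, and
    /// Latin abbreviations), pairs quotation marks, and attaches dictionary
    /// metadata to each word token.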
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();

        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let found_meta = dictionary.get_word_metadata(word_source);
                *meta = meta.or(&found_meta);
            }
        }
    }

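    /// Converts runs of two or more newlines into paragraph breaks.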
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

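    /// Condenses each stretch of `stretch_len` tokens beginning at the given
    /// indices into its first token, widening that token's span to cover the
    /// whole stretch. The indices are assumed to be sorted and the stretches
    /// non-overlapping.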
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Expand the span of the first token in each stretch to cover the
        // whole stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1];
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Rebuild the token stream, skipping the tokens that were merged in.
        let old = self.tokens.clone();
        self.tokens.clear();

        // Copy over the tokens before the first stretch.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx]);

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Copy over the tokens after the last stretch.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

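    /// Finds the token whose span contains the provided character index,
    /// using a binary search over the sorted token spans.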
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(self.tokens[index])
    }

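    /// Gets the token at the provided index, if it exists.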
    pub fn get_token(&self, index: usize) -> Option<Token> {
        self.tokens.get(index).copied()
    }

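    /// Iterates over the document's tokens.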
    pub fn tokens(&self) -> impl Iterator<Item = Token> + '_ {
        self.tokens.iter().copied()
    }

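    /// Iterates over the document's tokens, with each token's source content
    /// embedded.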
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    pub fn get_span_content(&self, span: Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

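    /// Pairs off the document's quotation marks in order of appearance,
    /// recording each quote's twin in its `twin_loc` field.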
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

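    /// Searches for number tokens immediately followed by a recognized suffix
    /// word (e.g. "1st" lexed as "1" + "st") and condenses each pair into a
    /// single number token carrying the suffix.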
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = self.tokens[idx + 1];
            let a = self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (a.kind, b.kind) {
                if let Some(found_suffix) = NumberSuffix::from_chars(self.get_span_content(b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

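    /// Searches for runs of adjacent space tokens and condenses them into
    /// single tokens, summing their counts.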
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense tokens that are directly adjacent.
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_PATTERN: Lrc<EitherPattern> = Document::uncached_latin_pattern();
    }

    fn uncached_latin_pattern() -> Lrc<EitherPattern> {
        Lrc::new(EitherPattern::new(vec![
            Box::new(
                SequencePattern::default()
                    .then_word_set(WordSet::all(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequencePattern::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

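    /// Condenses each match of the given pattern into the match's first
    /// token, widening its span to cover the whole match, then runs `edit`
    /// on that token.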
    fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = pattern.find_all_matches_in_doc(self);

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.start..m.end].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_pattern(&Self::LATIN_PATTERN.with(|v| v.clone()), |_| {})
    }

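    /// Searches for runs of adjacent newline tokens and condenses them into
    /// single tokens, summing their counts.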
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

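    /// Searches for dotted initialisms (like "N.S.A.") lexed as alternating
    /// single-letter words and periods, condensing each one into the token
    /// that starts it.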
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = self.tokens[cursor - 1];
            let b = self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
        let period = SequencePattern::default().then_period();
        Lrc::new(RepeatingPattern::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let pattern = Self::ELLIPSIS_PATTERN.with(|v| v.clone());
        self.condense_pattern(&pattern, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_PATTERN: Lrc<SequencePattern> = Document::uncached_contraction_pattern();
    }

    fn condense_contractions(&mut self) {
        let pattern = Self::CONTRACTION_PATTERN.with(|v| v.clone());

        self.condense_pattern(&pattern, |_| {});
    }
}

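/// Generates `TokenStringExt` accessor methods for a given token kind by
/// delegating to the document's token slice.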
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl Iterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(space);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(number);
    create_fns_on_doc!(at);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(comma);

    fn first_sentence_word(&self) -> Option<Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{parsers::MarkdownOptions, Span};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }
}