1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7use paste::paste;
8
9use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
10use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
11use crate::punctuation::Punctuation;
12use crate::spell::{Dictionary, FstDictionary};
13use crate::vec_ext::VecExt;
14use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
15use crate::{OrdinalSuffix, Span};
16
/// A lexed and parsed body of English text, along with the tokens produced
/// from it.
#[derive(Debug, Clone)]
pub struct Document {
    // The raw source text, stored as characters so spans index by char.
    source: Lrc<Vec<char>>,
    // The parsed tokens; each token's span indexes into `source`.
    tokens: Vec<Token>,
}
23
impl Default for Document {
    /// An empty document, parsed as plain English against the curated dictionary.
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}
29
30impl Document {
31 pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
35 self.tokens()
36 .enumerate()
37 .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
38 .collect()
39 }
40
41 pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
45 let indices = self.token_indices_intersecting(span);
46
47 indices
48 .into_iter()
49 .map(|i| self.tokens[i].to_fat(&self.source))
50 .collect()
51 }
52
53 pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
56 let source: Vec<_> = text.chars().collect();
57
58 Self::new_from_vec(Lrc::new(source), parser, dictionary)
59 }
60
61 pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
64 let source: Vec<_> = text.chars().collect();
65
66 Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
67 }
68
    /// Construct a document from already-collected source characters: run the
    /// parser to produce tokens, then annotate them against the dictionary.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        // Runs the fixup passes plus dictionary/POS annotation.
        document.parse(dictionary);

        document
    }
83
84 pub fn new_plain_english_curated(text: &str) -> Self {
87 Self::new(text, &PlainEnglish, &FstDictionary::curated())
88 }
89
    /// Tokenize the given text and run only the structural fixup passes,
    /// skipping dictionary lookup and POS tagging entirely. For internal
    /// callers that do not need word metadata.
    pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
        let source = Lrc::new(text.chars().collect_vec());
        let tokens = parser.parse(&source);
        let mut document = Self { source, tokens };
        document.apply_fixups();
        document
    }
102
    /// Parse the given text as plain English, annotated against the provided
    /// dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }
108
109 pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
112 Self::new(
113 text,
114 &Markdown::new(markdown_options),
115 &FstDictionary::curated(),
116 )
117 }
118
    /// Parse the given text as Markdown with default options, annotated
    /// against the curated dictionary.
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }
124
    /// Parse the given text as Markdown with the provided options, annotated
    /// against the provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }
134
    /// Parse the given text as Markdown with default options, annotated
    /// against the provided dictionary.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }
140
    /// Run the token-stream normalization passes that merge multi-token
    /// constructs and pair quotes.
    ///
    /// NOTE(review): the order appears deliberate (e.g. newlines are condensed
    /// before runs of them are promoted to paragraph breaks, and paragraph
    /// breaks must exist before `match_quotes` partitions on them) — confirm
    /// before reordering.
    fn apply_fixups(&mut self) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_common_top_level_domains();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();
    }
156
    /// Run the fixup passes, then annotate each word token with dictionary
    /// metadata, a POS tag from the tagger, and a noun-phrase membership flag
    /// from the chunker.
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.apply_fixups();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            // The tagger and chunker receive only non-whitespace tokens.
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            // `i` walks `token_tags`/`np_flags` in lockstep with the sentence's
            // non-whitespace tokens, so it must be bumped for every
            // non-whitespace token, word or not.
            let mut i = 0;

            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary
                        .get_word_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        // Prefer the tagger's verdict; fall back to inference
                        // from the dictionary metadata.
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    i += 1;
                }
            }
        }
    }
199
200 fn newlines_to_breaks(&mut self) {
202 for token in &mut self.tokens {
203 if let TokenKind::Newline(n) = token.kind
204 && n >= 2
205 {
206 token.kind = TokenKind::ParagraphBreak;
207 }
208 }
209 }
210
    /// Condense each stretch of `stretch_len` tokens beginning at the given
    /// indices into its first token, which is stretched to cover the entire
    /// run; the remaining tokens of each stretch are dropped.
    ///
    /// NOTE(review): assumes `indices` is sorted ascending and the stretches
    /// do not overlap — confirm at call sites.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // First pass: extend each stretch's first token over the whole stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Second pass: rebuild the token vector, skipping absorbed tokens.
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep everything before the first stretch verbatim.
        // (With empty `indices` this copies `old[0..0]`, i.e. nothing.)
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        // Push each condensed token followed by the untouched run up to the
        // next stretch start.
        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep everything after the last stretch verbatim.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
252
    /// Find the token whose span contains the given character index, if any.
    ///
    /// Binary-searches the token vector (sorted by span start); a span that
    /// overlaps the one-character probe is treated as an exact hit.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }
267
    /// Get the token at the given index, or `None` if out of bounds.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }
272
273 pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
275 match base.checked_add_signed(offset) {
276 None => None,
277 Some(idx) => self.get_token(idx),
278 }
279 }
280
    /// Iterate over the document's tokens in order.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }
285
    /// Iterate over the document's noun phrases: maximal runs of tokens whose
    /// words were flagged as noun-phrase members by the chunker, with
    /// surrounding whitespace trimmed.
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        // Whether the token's word metadata marks it as a noun-phrase member.
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        // Strip leading and trailing whitespace tokens from a run.
        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        // Split on tokens that are neither members nor whitespace, then keep
        // only trimmed runs containing at least one real member (dropping
        // runs that were pure whitespace).
        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }
319
    /// Iterate over fat (text-carrying) copies of the document's tokens.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }
324
    /// Starting from the whitespace token at `base + offset`, return the word
    /// token one step further in the direction of `offset`.
    ///
    /// Returns `None` when the token at `base + offset` is not whitespace, or
    /// the adjacent token is missing or not a word.
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // `offset.signum()` steps one further in the same direction.
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }
337
    /// Iterate over fat tokens converted into their string-carrying form.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }
342
    /// Borrow the source characters covered by the given span.
    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }
346
    /// Copy the text covered by the given span into an owned `String`.
    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }
350
    /// Copy the entire source text into an owned `String`.
    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }
354
    /// Borrow the entire source text as characters.
    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }
358
    /// Borrow the entire source text as characters.
    /// Equivalent to [`Self::get_full_content`].
    pub fn get_source(&self) -> &[char] {
        &self.source
    }
362
    /// Borrow the document's tokens as a slice.
    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }
366
367 fn match_quotes(&mut self) {
373 let mut pg_indices: Vec<_> = vec![0];
374 pg_indices.extend(self.iter_paragraph_break_indices());
375 pg_indices.push(self.tokens.len());
376
377 let mut quote_indices = Vec::new();
379 let mut open_quote_indices = Vec::new();
380
381 for (start, end) in pg_indices.into_iter().tuple_windows() {
382 let pg = &mut self.tokens[start..end];
383
384 quote_indices.clear();
385 quote_indices.extend(pg.iter_quote_indices());
386 open_quote_indices.clear();
387
388 for quote in "e_indices {
390 let is_open = *quote == 0
391 || pg[0..*quote].iter_word_likes().next().is_none()
392 || pg[quote - 1].kind.is_whitespace()
393 || matches!(
394 pg[quote - 1].kind.as_punctuation(),
395 Some(Punctuation::LessThan)
396 | Some(Punctuation::OpenRound)
397 | Some(Punctuation::OpenSquare)
398 | Some(Punctuation::OpenCurly)
399 | Some(Punctuation::Apostrophe)
400 );
401
402 if is_open {
403 open_quote_indices.push(*quote);
404 }
405 }
406
407 while let Some(open_idx) = open_quote_indices.pop() {
408 let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
409 continue;
410 };
411
412 if pg[close_idx + open_idx + 1]
413 .kind
414 .as_quote()
415 .unwrap()
416 .twin_loc
417 .is_some()
418 {
419 continue;
420 }
421
422 pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
423 Some(close_idx + open_idx + start + 1);
424 pg[close_idx + open_idx + 1]
425 .kind
426 .as_mut_quote()
427 .unwrap()
428 .twin_loc = Some(open_idx + start);
429 }
430 }
431 }
432
    /// Attach ordinal suffixes ("1st", "2nd", "3rd", …) to the preceding
    /// number token and condense each pair into a single token.
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // A number directly followed by a word that parses as an ordinal
            // suffix.
            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        self.condense_indices(&replace_starts, 2);
    }
458
459 fn condense_spaces(&mut self) {
462 let mut cursor = 0;
463 let copy = self.tokens.clone();
464
465 let mut remove_these = VecDeque::new();
466
467 while cursor < self.tokens.len() {
468 let start_tok = &mut self.tokens[cursor];
470
471 if let TokenKind::Space(start_count) = &mut start_tok.kind {
472 loop {
473 cursor += 1;
474
475 if cursor >= copy.len() {
476 break;
477 }
478
479 let child_tok = ©[cursor];
480
481 if start_tok.span.end != child_tok.span.start {
483 break;
484 }
485
486 if let TokenKind::Space(n) = child_tok.kind {
487 *start_count += n;
488 start_tok.span.end = child_tok.span.end;
489 remove_these.push_back(cursor);
490 cursor += 1;
491 } else {
492 break;
493 };
494 }
495 }
496
497 cursor += 1;
498 }
499
500 self.tokens.remove_indices(remove_these);
501 }
502
    thread_local! {
        // Per-thread cache of the Latin-abbreviation expression
        // ("etc.", "vs.", "et al."); built once per thread.
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }
506
    /// Build the expression matching common Latin abbreviations:
    /// "etc.", "vs.", and "et al.".
    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(SequenceExpr::word_set(&["etc", "vs"]).then_period()),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }
518
    /// Condense every match of `expr` into its first token: the first token's
    /// span is stretched over the whole match, `edit` is applied to it, and
    /// the remaining tokens of the match are removed.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Drop every token of the match except the first.
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }
537
    /// Condense Latin abbreviations ("etc.", "vs.", "et al.") into single
    /// tokens, leaving the token kind unchanged.
    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }
541
542 fn condense_newlines(&mut self) {
545 let mut cursor = 0;
546 let copy = self.tokens.clone();
547
548 let mut remove_these = VecDeque::new();
549
550 while cursor < self.tokens.len() {
551 let start_tok = &mut self.tokens[cursor];
553
554 if let TokenKind::Newline(start_count) = &mut start_tok.kind {
555 loop {
556 cursor += 1;
557
558 if cursor >= copy.len() {
559 break;
560 }
561
562 let child_tok = ©[cursor];
563 if let TokenKind::Newline(n) = child_tok.kind {
564 *start_count += n;
565 start_tok.span.end = child_tok.span.end;
566 remove_these.push_back(cursor);
567 cursor += 1;
568 } else {
569 break;
570 };
571 }
572 }
573
574 cursor += 1;
575 }
576
577 self.tokens.remove_indices(remove_these);
578 }
579
    /// Condense dotted initialisms (e.g. "N.S.A.") into a single word token
    /// covering all of the letters and periods.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Index of the first token of the initialism currently being scanned.
        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            // A single-letter word immediately followed by a period.
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    // Keep the first letter token; it will absorb the rest.
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                // Together with the increment below, skip past the period to
                // the next letter/period pair.
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    // Stretch the first token over the whole initialism.
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
627
    /// Condense a period followed by a short (≤3 chars, uniformly-cased) word
    /// into a single `Unlintable` token when it looks like a filename
    /// extension — i.e. bounded by whitespace/document edges or wrapped in
    /// parentheses.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Index of the first (period) token of the extension being scanned.
        let mut ext_start = None;

        loop {
            // l = token before the period, d = the period, x = the extension
            // word, r = token after the word.
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    // All-lowercase or all-uppercase only; mixed case is
                    // rejected (".Exe" is not treated as an extension).
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    // The kept token becomes unlintable so spell-checking
                    // ignores it.
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    // Stretch the first token over the condensed run.
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
689
    /// Condense a period followed by a well-known top-level domain (".com",
    /// ".org", …) into a single `Unlintable` token, when bounded by
    /// whitespace/document edges or wrapped in parentheses.
    fn condense_common_top_level_domains(&mut self) {
        const COMMON_TOP_LEVEL_DOMAINS: &[&str; 106] = &[
            "ai", "app", "blog", "co", "com", "dev", "edu", "gov", "info", "io", "me", "mil",
            "net", "org", "shop", "tech", "uk", "us", "xyz", "jp", "de", "fr", "br", "it", "ru",
            "es", "pl", "ca", "au", "cn", "in", "nl", "eu", "ch", "id", "at", "kr", "cz", "mx",
            "be", "tv", "se", "tr", "tw", "al", "ua", "ir", "vn", "cl", "sk", "ly", "cc", "to",
            "no", "fi", "pt", "dk", "ar", "hu", "tk", "gr", "il", "news", "ro", "my", "biz", "ie",
            "za", "nz", "sg", "ee", "th", "pe", "bg", "hk", "rs", "lt", "link", "ph", "club", "si",
            "site", "mobi", "by", "cat", "wiki", "la", "ga", "xxx", "cf", "hr", "ng", "jobs",
            "online", "kz", "ug", "gq", "ae", "is", "lv", "pro", "fm", "tips", "ms", "sa", "int",
        ];

        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();
        for cursor in 1..self.tokens.len() {
            // l = token before the period, d = the period, tld = the domain
            // word, r = token after the word.
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let tld = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_tld_chunk = d.kind.is_period()
                && tld.kind.is_word()
                && tld
                    .span
                    .get_content(&self.source)
                    .iter()
                    .all(|c| c.is_ascii_alphabetic())
                && tld
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_str(COMMON_TOP_LEVEL_DOMAINS)
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())));

            if is_tld_chunk {
                // Keep the period token, stretched over the domain and marked
                // unlintable; drop the domain word.
                self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                self.tokens[cursor - 1].span.end = self.tokens[cursor].span.end;
                to_remove.push_back(cursor);
            }
        }

        self.tokens.remove_indices(to_remove);
    }
740
741 fn condense_tldr(&mut self) {
743 if self.tokens.len() < 3 {
744 return;
745 }
746
747 let mut to_remove = VecDeque::new();
748 let mut cursor = 2;
749
750 loop {
751 let tl = &self.tokens[cursor - 2];
752 let simicolon = &self.tokens[cursor - 1];
753 let dr = &self.tokens[cursor];
754
755 let is_tldr_chunk = tl.kind.is_word()
756 && tl.span.len() == 2
757 && tl
758 .span
759 .get_content(&self.source)
760 .eq_ignore_ascii_case_chars(&['t', 'l'])
761 && simicolon.kind.is_semicolon()
762 && dr.kind.is_word()
763 && dr.span.len() >= 2
764 && dr.span.len() <= 3
765 && dr
766 .span
767 .get_content(&self.source)
768 .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
769
770 if is_tldr_chunk {
771 self.tokens[cursor - 2].span = Span::new(
773 self.tokens[cursor - 2].span.start,
774 self.tokens[cursor].span.end,
775 );
776
777 to_remove.push_back(cursor - 1);
779 to_remove.push_back(cursor);
780 }
781
782 cursor += 1;
784
785 if cursor >= self.tokens.len() {
786 break;
787 }
788 }
789
790 self.tokens.remove_indices(to_remove);
792 }
793
    /// Condense three-token sequences of the form `<letter><delimiter><letter>`
    /// (e.g. "R&D", "i/o") into a single token, but only when the lowercased
    /// letter pair appears in `valid_pairs`.
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            // Single-letter word, delimiter, single-letter word.
            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                // The lowercased pair must be on the allow-list.
                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    // Stretch the first token over all three; drop the others.
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
855
856 fn condense_ampersand_pairs(&mut self) {
858 self.condense_delimited_pairs(
859 |kind| kind.is_ampersand(),
860 &[
861 ('b', 'b'), ('b', 'w'), ('g', 't'), ('k', 'r'), ('q', 'a'), ('r', 'b'), ('r', 'd'), ('r', 'r'), ('s', 'p'), ],
871 );
872 }
873
874 fn condense_slash_pairs(&mut self) {
876 self.condense_delimited_pairs(
877 |kind| kind.is_slash(),
878 &[
879 ('a', 'c'), ('b', 'w'), ('c', 'o'), ('d', 'c'), ('d', 'l'), ('i', 'o'), ('j', 'k'), ('n', 'a'), ('r', 'c'), ('s', 'n'), ('y', 'n'), ('y', 'o'), ],
892 );
893 }
894
    /// Build the expression matching an ellipsis: two or more periods in a row.
    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }
899
    thread_local! {
        // Per-thread cache of the ellipsis expression; built once per thread.
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
903
    /// Condense runs of two or more periods into a single `Ellipsis`
    /// punctuation token.
    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }
910}
911
/// Generates the `TokenStringExt` accessor family (`first_*`, `last_*`,
/// `last_*_index`, `iter_*_indices`, `iter_*s`) for `Document`, each
/// delegating to the same-named method on the inner token vector.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
938
// `Document` implements `TokenStringExt` by delegating every method to its
// internal token vector; the bulk of the accessors are macro-generated.
impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(heading_start);

    // The remaining methods do not fit the macro's naming scheme and are
    // delegated by hand.
    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_headings()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}
1005
1006impl Display for Document {
1007 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1008 for token in &self.tokens {
1009 write!(f, "{}", self.get_span_content_str(&token.span))?;
1010 }
1011
1012 Ok(())
1013 }
1014}
1015
1016#[cfg(test)]
1017mod tests {
1018 use itertools::Itertools;
1019
1020 use super::Document;
1021 use crate::TokenStringExt;
1022 use crate::{Span, parsers::MarkdownOptions};
1023
1024 fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
1025 let document = Document::new_plain_english_curated(text);
1026
1027 assert_eq!(document.tokens.len(), final_tok_count);
1028
1029 let document = Document::new_markdown_curated(text, MarkdownOptions::default());
1030
1031 assert_eq!(document.tokens.len(), final_tok_count);
1032 }
1033
1034 #[test]
1035 fn simple_contraction() {
1036 assert_condensed_contractions("isn't", 1);
1037 }
1038
1039 #[test]
1040 fn simple_contraction2() {
1041 assert_condensed_contractions("wasn't", 1);
1042 }
1043
1044 #[test]
1045 fn simple_contraction3() {
1046 assert_condensed_contractions("There's", 1);
1047 }
1048
1049 #[test]
1050 fn simple_contraction4() {
1051 assert_condensed_contractions("doesn't", 1);
1052 }
1053
1054 #[test]
1055 fn medium_contraction() {
1056 assert_condensed_contractions("isn't wasn't", 3);
1057 }
1058
1059 #[test]
1060 fn medium_contraction2() {
1061 assert_condensed_contractions("There's no way", 5);
1062 }
1063
1064 #[test]
1065 fn selects_token_at_char_index() {
1066 let text = "There were three little pigs. They built three little homes.";
1067 let document = Document::new_plain_english_curated(text);
1068
1069 let got = document.get_token_at_char_index(19).unwrap();
1070
1071 assert!(got.kind.is_word());
1072 assert_eq!(got.span, Span::new(17, 23));
1073 }
1074
1075 fn assert_token_count(source: &str, count: usize) {
1076 let document = Document::new_plain_english_curated(source);
1077
1078 dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
1079 assert_eq!(document.tokens.len(), count);
1080 }
1081
1082 #[test]
1083 fn condenses_number_suffixes() {
1084 assert_token_count("1st", 1);
1085 assert_token_count("This is the 2nd test", 9);
1086 assert_token_count("This is the 3rd test", 9);
1087 assert_token_count(
1088 "It works even with weird capitalization like this: 600nD",
1089 18,
1090 );
1091 }
1092
1093 #[test]
1094 fn condenses_ie() {
1095 assert_token_count("There is a thing (i.e. that one)", 15);
1096 assert_token_count("We are trying to condense \"i.e.\"", 13);
1097 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1098 }
1099
1100 #[test]
1101 fn condenses_eg() {
1102 assert_token_count("We are trying to condense \"e.g.\"", 13);
1103 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1104 }
1105
1106 #[test]
1107 fn condenses_nsa() {
1108 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1109 }
1110
1111 #[test]
1112 fn parses_ellipsis() {
1113 assert_token_count("...", 1);
1114 }
1115
1116 #[test]
1117 fn parses_long_ellipsis() {
1118 assert_token_count(".....", 1);
1119 }
1120
1121 #[test]
1122 fn parses_short_ellipsis() {
1123 assert_token_count("..", 1);
1124 }
1125
1126 #[test]
1127 fn selects_token_at_offset() {
1128 let doc = Document::new_plain_english_curated("Foo bar baz");
1129
1130 let tok = doc.get_token_offset(1, -1).unwrap();
1131
1132 assert_eq!(tok.span, Span::new(0, 3));
1133 }
1134
1135 #[test]
1136 fn cant_select_token_before_start() {
1137 let doc = Document::new_plain_english_curated("Foo bar baz");
1138
1139 let tok = doc.get_token_offset(0, -1);
1140
1141 assert!(tok.is_none());
1142 }
1143
1144 #[test]
1145 fn select_next_word_pos_offset() {
1146 let doc = Document::new_plain_english_curated("Foo bar baz");
1147
1148 let bar = doc.get_next_word_from_offset(0, 1).unwrap();
1149 let bar = doc.get_span_content(&bar.span);
1150 assert_eq!(bar, ['b', 'a', 'r']);
1151 }
1152
1153 #[test]
1154 fn select_next_word_neg_offset() {
1155 let doc = Document::new_plain_english_curated("Foo bar baz");
1156
1157 let bar = doc.get_next_word_from_offset(2, -1).unwrap();
1158 let bar = doc.get_span_content(&bar.span);
1159 assert_eq!(bar, ['F', 'o', 'o']);
1160 }
1161
1162 #[test]
1163 fn cant_select_next_word_not_from_whitespace() {
1164 let doc = Document::new_plain_english_curated("Foo bar baz");
1165
1166 let tok = doc.get_next_word_from_offset(0, 2);
1167
1168 assert!(tok.is_none());
1169 }
1170
1171 #[test]
1172 fn cant_select_next_word_before_start() {
1173 let doc = Document::new_plain_english_curated("Foo bar baz");
1174
1175 let tok = doc.get_next_word_from_offset(0, -1);
1176
1177 assert!(tok.is_none());
1178 }
1179
1180 #[test]
1181 fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
1182 let doc = Document::new_plain_english_curated("Foo, bar, baz");
1183
1184 let tok = doc.get_next_word_from_offset(0, 1);
1185
1186 assert!(tok.is_none());
1187 }
1188
1189 #[test]
1190 fn cant_select_next_word_with_punctuation_after_whitespace() {
1191 let doc = Document::new_plain_english_curated("Foo \"bar\", baz");
1192
1193 let tok = doc.get_next_word_from_offset(0, 1);
1194
1195 assert!(tok.is_none());
1196 }
1197
1198 #[test]
1199 fn condenses_filename_extensions() {
1200 let doc = Document::new_plain_english_curated(".c and .exe and .js");
1201 assert!(doc.tokens[0].kind.is_unlintable());
1202 assert!(doc.tokens[4].kind.is_unlintable());
1203 assert!(doc.tokens[8].kind.is_unlintable());
1204 }
1205
1206 #[test]
1207 fn condense_filename_extension_ok_at_start_and_end() {
1208 let doc = Document::new_plain_english_curated(".c and .EXE");
1209 assert!(doc.tokens.len() == 5);
1210 assert!(doc.tokens[0].kind.is_unlintable());
1211 assert!(doc.tokens[4].kind.is_unlintable());
1212 }
1213
1214 #[test]
1215 fn doesnt_condense_filename_extensions_with_mixed_case() {
1216 let doc = Document::new_plain_english_curated(".c and .Exe");
1217 assert!(doc.tokens.len() == 6);
1218 assert!(doc.tokens[0].kind.is_unlintable());
1219 assert!(doc.tokens[4].kind.is_punctuation());
1220 assert!(doc.tokens[5].kind.is_word());
1221 }
1222
1223 #[test]
1224 fn doesnt_condense_filename_extensions_with_non_letters() {
1225 let doc = Document::new_plain_english_curated(".COM and .C0M");
1226 assert!(doc.tokens.len() == 6);
1227 assert!(doc.tokens[0].kind.is_unlintable());
1228 assert!(doc.tokens[4].kind.is_punctuation());
1229 assert!(doc.tokens[5].kind.is_word());
1230 }
1231
1232 #[test]
1233 fn doesnt_condense_filename_extensions_longer_than_three() {
1234 let doc = Document::new_plain_english_curated(".dll and .dlls");
1235 assert!(doc.tokens.len() == 6);
1236 assert!(doc.tokens[0].kind.is_unlintable());
1237 assert!(doc.tokens[4].kind.is_punctuation());
1238 assert!(doc.tokens[5].kind.is_word());
1239 }
1240
1241 #[test]
1242 fn condense_filename_extension_in_parens() {
1243 let doc = Document::new_plain_english_curated(
1244 "true for the manual installation when trying to run the executable(.exe) after a manual download",
1245 );
1246 assert!(doc.tokens.len() > 23);
1247 assert!(doc.tokens[21].kind.is_open_round());
1248 assert!(doc.tokens[22].kind.is_unlintable());
1249 assert!(doc.tokens[23].kind.is_close_round());
1250 }
1251
1252 #[test]
1253 fn condense_tldr_uppercase() {
1254 let doc = Document::new_plain_english_curated("TL;DR");
1255 assert!(doc.tokens.len() == 1);
1256 assert!(doc.tokens[0].kind.is_word());
1257 assert!(doc.tokens[0].span.len() == 5);
1258 }
1259
1260 #[test]
1261 fn condense_tldr_lowercase() {
1262 let doc = Document::new_plain_english_curated("tl;dr");
1263 assert!(doc.tokens.len() == 1);
1264 assert!(doc.tokens[0].kind.is_word());
1265 }
1266
1267 #[test]
1268 fn condense_tldr_mixed_case_1() {
1269 let doc = Document::new_plain_english_curated("tl;DR");
1270 assert!(doc.tokens.len() == 1);
1271 assert!(doc.tokens[0].kind.is_word());
1272 }
1273
1274 #[test]
1275 fn condense_tldr_mixed_case_2() {
1276 let doc = Document::new_plain_english_curated("TL;Dr");
1277 assert!(doc.tokens.len() == 1);
1278 assert!(doc.tokens[0].kind.is_word());
1279 }
1280
1281 #[test]
1282 fn condense_tldr_pural() {
1283 let doc = Document::new_plain_english_curated(
1284 "managing the flow between components to produce relevant TL;DRs of current news articles",
1285 );
1286 assert!(
1288 doc.tokens
1289 .iter()
1290 .all(|t| t.kind.is_word() || t.kind.is_whitespace())
1291 );
1292 let tldrs = doc
1294 .tokens
1295 .iter()
1296 .filter(|t| t.span.get_content(&doc.source).contains(&';'))
1297 .collect_vec();
1298 assert!(tldrs.len() == 1);
1299 assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
1300 }
1301
1302 #[test]
1303 fn condense_common_top_level_domains() {
1304 let doc = Document::new_plain_english_curated(".blog and .com and .NET");
1305 assert!(doc.tokens.len() == 9);
1306 assert!(doc.tokens[0].kind.is_unlintable());
1307 assert!(doc.tokens[4].kind.is_unlintable());
1308 assert!(doc.tokens[8].kind.is_unlintable());
1309 }
1310
1311 #[test]
1312 fn condense_common_top_level_domains_in_parens() {
1313 let doc = Document::new_plain_english_curated("(.blog)");
1314 assert!(doc.tokens.len() == 3);
1315 assert!(doc.tokens[0].kind.is_open_round());
1316 assert!(doc.tokens[1].kind.is_unlintable());
1317 assert!(doc.tokens[2].kind.is_close_round());
1318 }
1319
1320 #[test]
1321 fn doesnt_condense_unknown_top_level_domains() {
1322 let doc = Document::new_plain_english_curated(".harper");
1323 assert!(doc.tokens.len() == 2);
1324 assert!(doc.tokens[0].kind.is_punctuation());
1325 assert!(doc.tokens[1].kind.is_word());
1326 }
1327
1328 #[test]
1329 fn condense_r_and_d_caps() {
1330 let doc = Document::new_plain_english_curated("R&D");
1331 assert!(doc.tokens.len() == 1);
1332 assert!(doc.tokens[0].kind.is_word());
1333 }
1334
1335 #[test]
1336 fn condense_r_and_d_mixed_case() {
1337 let doc = Document::new_plain_english_curated("R&d");
1338 assert!(doc.tokens.len() == 1);
1339 assert!(doc.tokens[0].kind.is_word());
1340 }
1341
1342 #[test]
1343 fn condense_r_and_d_lowercase() {
1344 let doc = Document::new_plain_english_curated("r&d");
1345 assert!(doc.tokens.len() == 1);
1346 assert!(doc.tokens[0].kind.is_word());
1347 }
1348
1349 #[test]
1350 fn dont_condense_r_and_d_with_spaces() {
1351 let doc = Document::new_plain_english_curated("R & D");
1352 assert!(doc.tokens.len() == 5);
1353 assert!(doc.tokens[0].kind.is_word());
1354 assert!(doc.tokens[1].kind.is_whitespace());
1355 assert!(doc.tokens[2].kind.is_ampersand());
1356 assert!(doc.tokens[3].kind.is_whitespace());
1357 assert!(doc.tokens[4].kind.is_word());
1358 }
1359
1360 #[test]
1361 fn condense_q_and_a() {
1362 let doc =
1363 Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
1364 assert!(doc.tokens.len() >= 3);
1365 assert!(doc.tokens[2].kind.is_word());
1366 assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
1367 }
1368
1369 #[test]
1370 fn dont_allow_mixed_r_and_d_with_q_and_a() {
1371 let doc = Document::new_plain_english_curated("R&A or Q&D");
1372 assert!(doc.tokens.len() == 9);
1373 assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
1374 }
1375
1376 #[test]
1377 fn condense_io() {
1378 let doc = Document::new_plain_english_curated("I/O");
1379 assert!(doc.tokens.len() == 1);
1380 assert!(doc.tokens[0].kind.is_word());
1381 }
1382
1383 #[test]
1384 fn finds_unmatched_quotes_in_document() {
1385 let raw = r#"
1386This is a paragraph with a single word "quoted."
1387
1388This is a second paragraph with no quotes.
1389
1390This is a third paragraph with a single erroneous "quote.
1391
1392This is a final paragraph with a weird "quote and a not-weird "quote".
1393 "#;
1394
1395 let doc = Document::new_markdown_default_curated(raw);
1396
1397 let quote_twins: Vec<_> = doc
1398 .iter_quotes()
1399 .map(|t| t.kind.as_quote().unwrap().twin_loc)
1400 .collect();
1401
1402 assert_eq!(
1403 quote_twins,
1404 vec![Some(19), Some(16), None, None, Some(89), Some(87)]
1405 )
1406 }
1407
1408 #[test]
1409 fn issue_1901() {
1410 let raw = r#"
1411"A quoted line"
1412"A quote without a closing mark
1413"Another quoted lined"
1414"The last quoted line"
1415 "#;
1416
1417 let doc = Document::new_markdown_default_curated(raw);
1418
1419 let quote_twins: Vec<_> = doc
1420 .iter_quotes()
1421 .map(|t| t.kind.as_quote().unwrap().twin_loc)
1422 .collect();
1423
1424 assert_eq!(
1425 quote_twins,
1426 vec![
1427 Some(6),
1428 Some(0),
1429 None,
1430 Some(27),
1431 Some(21),
1432 Some(37),
1433 Some(29)
1434 ]
1435 )
1436 }
1437}