1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7use paste::paste;
8
9use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
10use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
11use crate::patterns::WordSet;
12use crate::punctuation::Punctuation;
13use crate::spell::{Dictionary, FstDictionary};
14use crate::vec_ext::VecExt;
15use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
16use crate::{OrdinalSuffix, Span};
17
#[derive(Debug, Clone)]
pub struct Document {
    /// The document's raw characters; shared (via `Lrc`) so spans can be
    /// resolved against it cheaply wherever the document is cloned.
    source: Lrc<Vec<char>>,
    /// The parsed token stream; each token's span indexes into `source`.
    tokens: Vec<Token>,
}
24
25impl Default for Document {
26 fn default() -> Self {
27 Self::new("", &PlainEnglish, &FstDictionary::curated())
28 }
29}
30
31impl Document {
32 pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
36 self.tokens()
37 .enumerate()
38 .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
39 .collect()
40 }
41
42 pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
46 let indices = self.token_indices_intersecting(span);
47
48 indices
49 .into_iter()
50 .map(|i| self.tokens[i].to_fat(&self.source))
51 .collect()
52 }
53
54 pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
57 let source: Vec<_> = text.chars().collect();
58
59 Self::new_from_vec(Lrc::new(source), parser, dictionary)
60 }
61
62 pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
65 let source: Vec<_> = text.chars().collect();
66
67 Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
68 }
69
70 pub fn new_from_vec(
73 source: Lrc<Vec<char>>,
74 parser: &impl Parser,
75 dictionary: &impl Dictionary,
76 ) -> Self {
77 let tokens = parser.parse(&source);
78
79 let mut document = Self { source, tokens };
80 document.parse(dictionary);
81
82 document
83 }
84
85 pub fn new_plain_english_curated(text: &str) -> Self {
88 Self::new(text, &PlainEnglish, &FstDictionary::curated())
89 }
90
91 pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
97 let source = Lrc::new(text.chars().collect_vec());
98 let tokens = parser.parse(&source);
99 let mut document = Self { source, tokens };
100 document.apply_fixups();
101 document
102 }
103
    /// Parse `text` as plain English, using `dictionary` for word metadata.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }
109
110 pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
113 Self::new(
114 text,
115 &Markdown::new(markdown_options),
116 &FstDictionary::curated(),
117 )
118 }
119
    /// Parse `text` as Markdown with default options against the curated
    /// dictionary.
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }
125
126 pub fn new_markdown(
129 text: &str,
130 markdown_options: MarkdownOptions,
131 dictionary: &impl Dictionary,
132 ) -> Self {
133 Self::new(text, &Markdown::new(markdown_options), dictionary)
134 }
135
    /// Parse `text` as Markdown with default options, using `dictionary` for
    /// word metadata.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }
141
    /// Run every token-stream normalization pass in order, merging constructs
    /// the lexer emits as multiple tokens (initialisms, ellipses, TLDs, …)
    /// and pairing quotes.
    ///
    /// NOTE(review): the order appears deliberate — e.g. newline runs are
    /// condensed before `newlines_to_breaks` promotes counts >= 2 to
    /// paragraph breaks, and `match_quotes` runs last over the final token
    /// layout. Confirm before reordering.
    fn apply_fixups(&mut self) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_common_top_level_domains();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();
    }
157
    /// Run the fixup passes, then POS-tag and noun-phrase-chunk each sentence,
    /// attaching dictionary metadata to every word token.
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.apply_fixups();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            // The tagger and chunker see only the sentence's non-whitespace
            // tokens, rendered as strings.
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            // Position among non-whitespace tokens, kept in lockstep with
            // `token_tags`/`np_flags` (which were built from the same filter).
            let mut i = 0;

            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary
                        .get_word_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        // Prefer the tagger's POS tag; fall back to inferring
                        // one from the dictionary metadata.
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    // Non-word, non-whitespace tokens (punctuation, numbers)
                    // still occupy a slot in the tag/flag arrays.
                    i += 1;
                }
            }
        }
    }
200
201 fn newlines_to_breaks(&mut self) {
203 for token in &mut self.tokens {
204 if let TokenKind::Newline(n) = token.kind
205 && n >= 2
206 {
207 token.kind = TokenKind::ParagraphBreak;
208 }
209 }
210 }
211
    /// For each start index in `indices`, collapse the run of `stretch_len`
    /// tokens beginning there into the run's first token, widening its span
    /// over the whole run.
    ///
    /// NOTE(review): assumes `indices` is sorted ascending with
    /// non-overlapping runs — TODO confirm against callers.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // First pass: widen each run's start token to cover the run.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Second pass: rebuild the token list, dropping the tail of each run.
        let old = self.tokens.clone();
        self.tokens.clear();

        // Tokens before the first run. When `indices` is empty this is
        // `old[0..0]`; the tail copy below then restores all of `old`.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        // For each run: keep its (already widened) start token, then copy the
        // untouched tokens between this run's end and the next run's start.
        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Everything after the final run.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
253
    /// Find the token whose span contains `char_index`, if any.
    ///
    /// Binary-searches the token list (which is ordered by span position);
    /// the comparator reports `Equal` for any token overlapping the one-char
    /// span at `char_index`.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }
268
    /// Get the token at `index`, if it exists.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }
273
274 pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
276 match base.checked_add_signed(offset) {
277 None => None,
278 Some(idx) => self.get_token(idx),
279 }
280 }
281
    /// Iterate over the document's tokens in order.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }
286
    /// Iterate over maximal runs of tokens the chunker flagged as
    /// noun-phrase members, with surrounding whitespace trimmed off each run.
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        // True when the word's metadata carries `np_member == Some(true)`.
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        // Strip leading and trailing whitespace tokens from a run.
        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        // Split on any token that is neither an NP member nor whitespace,
        // then keep only fragments that still contain at least one NP member.
        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }
320
    /// Iterate over the tokens as [`FatToken`]s, each carrying its own copy
    /// of its source text.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }
325
    /// Starting from token `base`, step `offset` tokens to a position that
    /// must hold whitespace, then one more step in the same direction; return
    /// that token only if it is a word.
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        // The token at the offset itself must be whitespace.
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // One further step in the offset's direction (`signum`). Note that
        // `offset == 0` re-checks `base` itself — presumably callers always
        // pass a nonzero offset; TODO confirm.
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }
338
339 pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
341 self.fat_tokens().map(|t| t.into())
342 }
343
    /// Borrow the characters covered by `span`.
    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }
347
348 pub fn get_span_content_str(&self, span: &Span<char>) -> String {
349 String::from_iter(self.get_span_content(span))
350 }
351
352 pub fn get_full_string(&self) -> String {
353 self.get_span_content_str(&Span::new(0, self.source.len()))
354 }
355
    /// Borrow the document's entire source text.
    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }
359
    /// Borrow the document's entire source text (same as `get_full_content`).
    pub fn get_source(&self) -> &[char] {
        &self.source
    }
363
    /// Borrow the full token buffer as a slice.
    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }
367
368 fn match_quotes(&mut self) {
374 let mut pg_indices: Vec<_> = vec![0];
375 pg_indices.extend(self.iter_paragraph_break_indices());
376 pg_indices.push(self.tokens.len());
377
378 let mut quote_indices = Vec::new();
380 let mut open_quote_indices = Vec::new();
381
382 for (start, end) in pg_indices.into_iter().tuple_windows() {
383 let pg = &mut self.tokens[start..end];
384
385 quote_indices.clear();
386 quote_indices.extend(pg.iter_quote_indices());
387 open_quote_indices.clear();
388
389 for quote in "e_indices {
391 let is_open = *quote == 0
392 || pg[0..*quote].iter_word_likes().next().is_none()
393 || pg[quote - 1].kind.is_whitespace()
394 || matches!(
395 pg[quote - 1].kind.as_punctuation(),
396 Some(Punctuation::LessThan)
397 | Some(Punctuation::OpenRound)
398 | Some(Punctuation::OpenSquare)
399 | Some(Punctuation::OpenCurly)
400 | Some(Punctuation::Apostrophe)
401 );
402
403 if is_open {
404 open_quote_indices.push(*quote);
405 }
406 }
407
408 while let Some(open_idx) = open_quote_indices.pop() {
409 let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
410 continue;
411 };
412
413 if pg[close_idx + open_idx + 1]
414 .kind
415 .as_quote()
416 .unwrap()
417 .twin_loc
418 .is_some()
419 {
420 continue;
421 }
422
423 pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
424 Some(close_idx + open_idx + start + 1);
425 pg[close_idx + open_idx + 1]
426 .kind
427 .as_mut_quote()
428 .unwrap()
429 .twin_loc = Some(open_idx + start);
430 }
431 }
432 }
433
    /// Fold ordinal suffixes ("1st", "2nd", "3rd") into the preceding number
    /// token, then merge each (number, suffix-word) pair into one token.
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                // Record the suffix on the number token; the word token is
                // merged away by `condense_indices` below (runs of length 2).
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        self.condense_indices(&replace_starts, 2);
    }
459
460 fn condense_spaces(&mut self) {
463 let mut cursor = 0;
464 let copy = self.tokens.clone();
465
466 let mut remove_these = VecDeque::new();
467
468 while cursor < self.tokens.len() {
469 let start_tok = &mut self.tokens[cursor];
471
472 if let TokenKind::Space(start_count) = &mut start_tok.kind {
473 loop {
474 cursor += 1;
475
476 if cursor >= copy.len() {
477 break;
478 }
479
480 let child_tok = ©[cursor];
481
482 if start_tok.span.end != child_tok.span.start {
484 break;
485 }
486
487 if let TokenKind::Space(n) = child_tok.kind {
488 *start_count += n;
489 start_tok.span.end = child_tok.span.end;
490 remove_these.push_back(cursor);
491 cursor += 1;
492 } else {
493 break;
494 };
495 }
496 }
497
498 cursor += 1;
499 }
500
501 self.tokens.remove_indices(remove_these);
502 }
503
    thread_local! {
        // Per-thread cache of the Latin-abbreviation expression so repeated
        // parses don't rebuild it.
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }
507
    /// Build the (uncached) expression matching Latin abbreviations:
    /// "etc." / "vs." followed by a period, or the two-word "et al." form.
    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                // `aco`/`t_aco` presumably match any capitalization of the
                // given word — TODO confirm against `SequenceExpr`'s docs.
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }
523
    /// Replace each match of `expr` with a single token: the match's first
    /// token survives, its span widened over the whole match, then `edit` is
    /// applied to it.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Every token after the first in the match is removed.
            remove_indices.extend(m.start + 1..m.end);
            // `m.into_iter()` presumably converts the match into a range
            // index over the token slice, whose overall `span()` becomes the
            // surviving token's span — TODO confirm the `Index` impl.
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }
542
543 fn condense_latin(&mut self) {
544 self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
545 }
546
547 fn condense_newlines(&mut self) {
550 let mut cursor = 0;
551 let copy = self.tokens.clone();
552
553 let mut remove_these = VecDeque::new();
554
555 while cursor < self.tokens.len() {
556 let start_tok = &mut self.tokens[cursor];
558
559 if let TokenKind::Newline(start_count) = &mut start_tok.kind {
560 loop {
561 cursor += 1;
562
563 if cursor >= copy.len() {
564 break;
565 }
566
567 let child_tok = ©[cursor];
568 if let TokenKind::Newline(n) = child_tok.kind {
569 *start_count += n;
570 start_tok.span.end = child_tok.span.end;
571 remove_these.push_back(cursor);
572 cursor += 1;
573 } else {
574 break;
575 };
576 }
577 }
578
579 cursor += 1;
580 }
581
582 self.tokens.remove_indices(remove_these);
583 }
584
    /// Merge dotted initialisms such as "N.S.A." into a single word token.
    ///
    /// A run is built from consecutive (single-letter word, period) pairs;
    /// the run's first letter token survives with its span stretched over the
    /// whole run.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        // `cursor` points at the candidate period; `cursor - 1` at the letter.
        let mut cursor = 1;

        // Index of the first letter of the run currently being collected.
        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    // Letters after the first are folded into the start token.
                    to_remove.push_back(cursor - 1);
                }

                // Every period in the run is removed.
                to_remove.push_back(cursor);
                // Extra advance so the next iteration lands on the next
                // (letter, period) pair.
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    // Run ended: stretch the surviving token's span over it
                    // (`cursor - 2` is the run's final period).
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            // NOTE(review): a run reaching the last token pair never hits the
            // span-stretching branch above — confirm trailing initialisms are
            // handled as intended.
            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
632
    /// Merge ".exe"-style filename extensions — a period plus a word of at
    /// most three uniformly lower- or uppercase letters — into a single
    /// [`TokenKind::Unlintable`] token.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        // `cursor` points at the extension word; `cursor - 1` at the period.
        let mut cursor = 1;

        // Start index of the chunk currently being merged, if any.
        let mut ext_start = None;

        loop {
            // Context: `l` is the token before the period, `r` the token
            // after the extension word.
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // The pair must be isolated by whitespace/document edges, or
            // wrapped in parentheses, and the letters must not be mixed-case.
            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    // The surviving token (the period) becomes unlintable so
                    // later passes ignore it.
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    // Chunk ended: stretch the surviving token's span over it.
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
694
695 fn condense_common_top_level_domains(&mut self) {
697 const COMMON_TOP_LEVEL_DOMAINS: &[&str; 106] = &[
698 "ai", "app", "blog", "co", "com", "dev", "edu", "gov", "info", "io", "me", "mil",
699 "net", "org", "shop", "tech", "uk", "us", "xyz", "jp", "de", "fr", "br", "it", "ru",
700 "es", "pl", "ca", "au", "cn", "in", "nl", "eu", "ch", "id", "at", "kr", "cz", "mx",
701 "be", "tv", "se", "tr", "tw", "al", "ua", "ir", "vn", "cl", "sk", "ly", "cc", "to",
702 "no", "fi", "pt", "dk", "ar", "hu", "tk", "gr", "il", "news", "ro", "my", "biz", "ie",
703 "za", "nz", "sg", "ee", "th", "pe", "bg", "hk", "rs", "lt", "link", "ph", "club", "si",
704 "site", "mobi", "by", "cat", "wiki", "la", "ga", "xxx", "cf", "hr", "ng", "jobs",
705 "online", "kz", "ug", "gq", "ae", "is", "lv", "pro", "fm", "tips", "ms", "sa", "int",
706 ];
707
708 if self.tokens.len() < 2 {
709 return;
710 }
711
712 let mut to_remove = VecDeque::new();
713 for cursor in 1..self.tokens.len() {
714 let l = self.get_token_offset(cursor, -2);
716 let d = &self.tokens[cursor - 1];
717 let tld = &self.tokens[cursor];
718 let r = self.get_token_offset(cursor, 1);
719
720 let is_tld_chunk = d.kind.is_period()
721 && tld.kind.is_word()
722 && tld
723 .span
724 .get_content(&self.source)
725 .iter()
726 .all(|c| c.is_ascii_alphabetic())
727 && tld
728 .span
729 .get_content(&self.source)
730 .eq_any_ignore_ascii_case_str(COMMON_TOP_LEVEL_DOMAINS)
731 && ((l.is_none_or(|t| t.kind.is_whitespace())
732 && r.is_none_or(|t| t.kind.is_whitespace()))
733 || (l.is_some_and(|t| t.kind.is_open_round())
734 && r.is_some_and(|t| t.kind.is_close_round())));
735
736 if is_tld_chunk {
737 self.tokens[cursor - 1].kind = TokenKind::Unlintable;
738 self.tokens[cursor - 1].span.end = self.tokens[cursor].span.end;
739 to_remove.push_back(cursor);
740 }
741 }
742
743 self.tokens.remove_indices(to_remove);
744 }
745
746 fn condense_tldr(&mut self) {
748 if self.tokens.len() < 3 {
749 return;
750 }
751
752 let mut to_remove = VecDeque::new();
753 let mut cursor = 2;
754
755 loop {
756 let tl = &self.tokens[cursor - 2];
757 let simicolon = &self.tokens[cursor - 1];
758 let dr = &self.tokens[cursor];
759
760 let is_tldr_chunk = tl.kind.is_word()
761 && tl.span.len() == 2
762 && tl
763 .span
764 .get_content(&self.source)
765 .eq_ignore_ascii_case_chars(&['t', 'l'])
766 && simicolon.kind.is_semicolon()
767 && dr.kind.is_word()
768 && dr.span.len() >= 2
769 && dr.span.len() <= 3
770 && dr
771 .span
772 .get_content(&self.source)
773 .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
774
775 if is_tldr_chunk {
776 self.tokens[cursor - 2].span = Span::new(
778 self.tokens[cursor - 2].span.start,
779 self.tokens[cursor].span.end,
780 );
781
782 to_remove.push_back(cursor - 1);
784 to_remove.push_back(cursor);
785 }
786
787 cursor += 1;
789
790 if cursor >= self.tokens.len() {
791 break;
792 }
793 }
794
795 self.tokens.remove_indices(to_remove);
797 }
798
799 fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
807 where
808 F: Fn(&TokenKind) -> bool,
809 {
810 if self.tokens.len() < 3 {
811 return;
812 }
813
814 let mut to_remove = VecDeque::new();
815 let mut cursor = 2;
816
817 loop {
818 let l1 = &self.tokens[cursor - 2];
819 let delim = &self.tokens[cursor - 1];
820 let l2 = &self.tokens[cursor];
821
822 let is_delimited_chunk = l1.kind.is_word()
823 && l1.span.len() == 1
824 && is_delimiter(&delim.kind)
825 && l2.kind.is_word()
826 && l2.span.len() == 1;
827
828 if is_delimited_chunk {
829 let (l1, l2) = (
830 l1.span.get_content(&self.source).first(),
831 l2.span.get_content(&self.source).first(),
832 );
833
834 let is_valid_pair = match (l1, l2) {
835 (Some(l1), Some(l2)) => {
836 let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
837 valid_pairs.contains(&pair)
838 }
839 _ => false,
840 };
841
842 if is_valid_pair {
843 self.tokens[cursor - 2].span = Span::new(
844 self.tokens[cursor - 2].span.start,
845 self.tokens[cursor].span.end,
846 );
847 to_remove.push_back(cursor - 1);
848 to_remove.push_back(cursor);
849 }
850 }
851
852 cursor += 1;
853 if cursor >= self.tokens.len() {
854 break;
855 }
856 }
857
858 self.tokens.remove_indices(to_remove);
859 }
860
    /// Merge single-letter pairs joined by "&" into one token — presumably
    /// covering common abbreviations like Q&A, R&D, R&B, B&W, K&R, S&P
    /// (TODO confirm intended expansions).
    fn condense_ampersand_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_ampersand(),
            &[
                ('b', 'b'),
                ('b', 'w'),
                ('g', 't'),
                ('k', 'r'),
                ('q', 'a'),
                ('r', 'b'),
                ('r', 'd'),
                ('r', 'r'),
                ('s', 'p'),
            ],
        );
    }
878
    /// Merge single-letter pairs joined by "/" into one token — presumably
    /// covering common abbreviations like A/C, I/O, N/A, Y/N
    /// (TODO confirm intended expansions).
    fn condense_slash_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_slash(),
            &[
                ('a', 'c'),
                ('b', 'w'),
                ('c', 'o'),
                ('d', 'c'),
                ('d', 'l'),
                ('i', 'o'),
                ('j', 'k'),
                ('n', 'a'),
                ('r', 'c'),
                ('s', 'n'),
                ('y', 'n'),
                ('y', 'o'),
            ],
        );
    }
899
    /// Build the (uncached) expression matching two or more consecutive
    /// period tokens — an ellipsis written as "..", "...", and so on.
    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }
904
    thread_local! {
        // Per-thread cache of the ellipsis expression so repeated parses
        // don't rebuild it.
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
908
909 fn condense_ellipsis(&mut self) {
910 let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
911 self.condense_expr(&expr, |tok| {
912 tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
913 });
914 }
915}
916
/// Generate the `first_*`, `last_*`, `last_*_index`, `iter_*_indices`, and
/// `iter_*s` members of `TokenStringExt` for `Document` by delegating to the
/// inner token buffer.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
943
// Forward the `TokenStringExt` query helpers to the underlying token buffer,
// so a `Document` can be queried like a token slice.
impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(heading_start);

    // The members below don't fit the macro's naming scheme, so they are
    // delegated by hand.

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_headings()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}
1010
1011impl Display for Document {
1012 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1013 for token in &self.tokens {
1014 write!(f, "{}", self.get_span_content_str(&token.span))?;
1015 }
1016
1017 Ok(())
1018 }
1019}
1020
1021#[cfg(test)]
1022mod tests {
1023 use itertools::Itertools;
1024
1025 use super::Document;
1026 use crate::TokenStringExt;
1027 use crate::{Span, parsers::MarkdownOptions};
1028
    /// Assert that `text` tokenizes to exactly `final_tok_count` tokens under
    /// both the plain-English and Markdown parsers.
    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }
1038
1039 #[test]
1040 fn simple_contraction() {
1041 assert_condensed_contractions("isn't", 1);
1042 }
1043
1044 #[test]
1045 fn simple_contraction2() {
1046 assert_condensed_contractions("wasn't", 1);
1047 }
1048
1049 #[test]
1050 fn simple_contraction3() {
1051 assert_condensed_contractions("There's", 1);
1052 }
1053
1054 #[test]
1055 fn simple_contraction4() {
1056 assert_condensed_contractions("doesn't", 1);
1057 }
1058
1059 #[test]
1060 fn medium_contraction() {
1061 assert_condensed_contractions("isn't wasn't", 3);
1062 }
1063
1064 #[test]
1065 fn medium_contraction2() {
1066 assert_condensed_contractions("There's no way", 5);
1067 }
1068
1069 #[test]
1070 fn selects_token_at_char_index() {
1071 let text = "There were three little pigs. They built three little homes.";
1072 let document = Document::new_plain_english_curated(text);
1073
1074 let got = document.get_token_at_char_index(19).unwrap();
1075
1076 assert!(got.kind.is_word());
1077 assert_eq!(got.span, Span::new(17, 23));
1078 }
1079
    /// Assert that plain-English parsing of `source` yields exactly `count`
    /// tokens, dumping the token kinds for diagnosis.
    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        // `dbg!` output is captured by the libtest harness and surfaced only
        // when the assertion below fails.
        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }
1086
1087 #[test]
1088 fn condenses_number_suffixes() {
1089 assert_token_count("1st", 1);
1090 assert_token_count("This is the 2nd test", 9);
1091 assert_token_count("This is the 3rd test", 9);
1092 assert_token_count(
1093 "It works even with weird capitalization like this: 600nD",
1094 18,
1095 );
1096 }
1097
1098 #[test]
1099 fn condenses_ie() {
1100 assert_token_count("There is a thing (i.e. that one)", 15);
1101 assert_token_count("We are trying to condense \"i.e.\"", 13);
1102 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1103 }
1104
1105 #[test]
1106 fn condenses_eg() {
1107 assert_token_count("We are trying to condense \"e.g.\"", 13);
1108 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1109 }
1110
1111 #[test]
1112 fn condenses_nsa() {
1113 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1114 }
1115
1116 #[test]
1117 fn parses_ellipsis() {
1118 assert_token_count("...", 1);
1119 }
1120
1121 #[test]
1122 fn parses_long_ellipsis() {
1123 assert_token_count(".....", 1);
1124 }
1125
1126 #[test]
1127 fn parses_short_ellipsis() {
1128 assert_token_count("..", 1);
1129 }
1130
1131 #[test]
1132 fn selects_token_at_offset() {
1133 let doc = Document::new_plain_english_curated("Foo bar baz");
1134
1135 let tok = doc.get_token_offset(1, -1).unwrap();
1136
1137 assert_eq!(tok.span, Span::new(0, 3));
1138 }
1139
1140 #[test]
1141 fn cant_select_token_before_start() {
1142 let doc = Document::new_plain_english_curated("Foo bar baz");
1143
1144 let tok = doc.get_token_offset(0, -1);
1145
1146 assert!(tok.is_none());
1147 }
1148
1149 #[test]
1150 fn select_next_word_pos_offset() {
1151 let doc = Document::new_plain_english_curated("Foo bar baz");
1152
1153 let bar = doc.get_next_word_from_offset(0, 1).unwrap();
1154 let bar = doc.get_span_content(&bar.span);
1155 assert_eq!(bar, ['b', 'a', 'r']);
1156 }
1157
1158 #[test]
1159 fn select_next_word_neg_offset() {
1160 let doc = Document::new_plain_english_curated("Foo bar baz");
1161
1162 let bar = doc.get_next_word_from_offset(2, -1).unwrap();
1163 let bar = doc.get_span_content(&bar.span);
1164 assert_eq!(bar, ['F', 'o', 'o']);
1165 }
1166
1167 #[test]
1168 fn cant_select_next_word_not_from_whitespace() {
1169 let doc = Document::new_plain_english_curated("Foo bar baz");
1170
1171 let tok = doc.get_next_word_from_offset(0, 2);
1172
1173 assert!(tok.is_none());
1174 }
1175
1176 #[test]
1177 fn cant_select_next_word_before_start() {
1178 let doc = Document::new_plain_english_curated("Foo bar baz");
1179
1180 let tok = doc.get_next_word_from_offset(0, -1);
1181
1182 assert!(tok.is_none());
1183 }
1184
1185 #[test]
1186 fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
1187 let doc = Document::new_plain_english_curated("Foo, bar, baz");
1188
1189 let tok = doc.get_next_word_from_offset(0, 1);
1190
1191 assert!(tok.is_none());
1192 }
1193
1194 #[test]
1195 fn cant_select_next_word_with_punctuation_after_whitespace() {
1196 let doc = Document::new_plain_english_curated("Foo \"bar\", baz");
1197
1198 let tok = doc.get_next_word_from_offset(0, 1);
1199
1200 assert!(tok.is_none());
1201 }
1202
1203 #[test]
1204 fn condenses_filename_extensions() {
1205 let doc = Document::new_plain_english_curated(".c and .exe and .js");
1206 assert!(doc.tokens[0].kind.is_unlintable());
1207 assert!(doc.tokens[4].kind.is_unlintable());
1208 assert!(doc.tokens[8].kind.is_unlintable());
1209 }
1210
1211 #[test]
1212 fn condense_filename_extension_ok_at_start_and_end() {
1213 let doc = Document::new_plain_english_curated(".c and .EXE");
1214 assert!(doc.tokens.len() == 5);
1215 assert!(doc.tokens[0].kind.is_unlintable());
1216 assert!(doc.tokens[4].kind.is_unlintable());
1217 }
1218
1219 #[test]
1220 fn doesnt_condense_filename_extensions_with_mixed_case() {
1221 let doc = Document::new_plain_english_curated(".c and .Exe");
1222 assert!(doc.tokens.len() == 6);
1223 assert!(doc.tokens[0].kind.is_unlintable());
1224 assert!(doc.tokens[4].kind.is_punctuation());
1225 assert!(doc.tokens[5].kind.is_word());
1226 }
1227
1228 #[test]
1229 fn doesnt_condense_filename_extensions_with_non_letters() {
1230 let doc = Document::new_plain_english_curated(".COM and .C0M");
1231 assert!(doc.tokens.len() == 6);
1232 assert!(doc.tokens[0].kind.is_unlintable());
1233 assert!(doc.tokens[4].kind.is_punctuation());
1234 assert!(doc.tokens[5].kind.is_word());
1235 }
1236
1237 #[test]
1238 fn doesnt_condense_filename_extensions_longer_than_three() {
1239 let doc = Document::new_plain_english_curated(".dll and .dlls");
1240 assert!(doc.tokens.len() == 6);
1241 assert!(doc.tokens[0].kind.is_unlintable());
1242 assert!(doc.tokens[4].kind.is_punctuation());
1243 assert!(doc.tokens[5].kind.is_word());
1244 }
1245
1246 #[test]
1247 fn condense_filename_extension_in_parens() {
1248 let doc = Document::new_plain_english_curated(
1249 "true for the manual installation when trying to run the executable(.exe) after a manual download",
1250 );
1251 assert!(doc.tokens.len() > 23);
1252 assert!(doc.tokens[21].kind.is_open_round());
1253 assert!(doc.tokens[22].kind.is_unlintable());
1254 assert!(doc.tokens[23].kind.is_close_round());
1255 }
1256
1257 #[test]
1258 fn condense_tldr_uppercase() {
1259 let doc = Document::new_plain_english_curated("TL;DR");
1260 assert!(doc.tokens.len() == 1);
1261 assert!(doc.tokens[0].kind.is_word());
1262 assert!(doc.tokens[0].span.len() == 5);
1263 }
1264
1265 #[test]
1266 fn condense_tldr_lowercase() {
1267 let doc = Document::new_plain_english_curated("tl;dr");
1268 assert!(doc.tokens.len() == 1);
1269 assert!(doc.tokens[0].kind.is_word());
1270 }
1271
1272 #[test]
1273 fn condense_tldr_mixed_case_1() {
1274 let doc = Document::new_plain_english_curated("tl;DR");
1275 assert!(doc.tokens.len() == 1);
1276 assert!(doc.tokens[0].kind.is_word());
1277 }
1278
1279 #[test]
1280 fn condense_tldr_mixed_case_2() {
1281 let doc = Document::new_plain_english_curated("TL;Dr");
1282 assert!(doc.tokens.len() == 1);
1283 assert!(doc.tokens[0].kind.is_word());
1284 }
1285
1286 #[test]
1287 fn condense_tldr_pural() {
1288 let doc = Document::new_plain_english_curated(
1289 "managing the flow between components to produce relevant TL;DRs of current news articles",
1290 );
1291 assert!(
1293 doc.tokens
1294 .iter()
1295 .all(|t| t.kind.is_word() || t.kind.is_whitespace())
1296 );
1297 let tldrs = doc
1299 .tokens
1300 .iter()
1301 .filter(|t| t.span.get_content(&doc.source).contains(&';'))
1302 .collect_vec();
1303 assert!(tldrs.len() == 1);
1304 assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
1305 }
1306
1307 #[test]
1308 fn condense_common_top_level_domains() {
1309 let doc = Document::new_plain_english_curated(".blog and .com and .NET");
1310 assert!(doc.tokens.len() == 9);
1311 assert!(doc.tokens[0].kind.is_unlintable());
1312 assert!(doc.tokens[4].kind.is_unlintable());
1313 assert!(doc.tokens[8].kind.is_unlintable());
1314 }
1315
1316 #[test]
1317 fn condense_common_top_level_domains_in_parens() {
1318 let doc = Document::new_plain_english_curated("(.blog)");
1319 assert!(doc.tokens.len() == 3);
1320 assert!(doc.tokens[0].kind.is_open_round());
1321 assert!(doc.tokens[1].kind.is_unlintable());
1322 assert!(doc.tokens[2].kind.is_close_round());
1323 }
1324
1325 #[test]
1326 fn doesnt_condense_unknown_top_level_domains() {
1327 let doc = Document::new_plain_english_curated(".harper");
1328 assert!(doc.tokens.len() == 2);
1329 assert!(doc.tokens[0].kind.is_punctuation());
1330 assert!(doc.tokens[1].kind.is_word());
1331 }
1332
1333 #[test]
1334 fn condense_r_and_d_caps() {
1335 let doc = Document::new_plain_english_curated("R&D");
1336 assert!(doc.tokens.len() == 1);
1337 assert!(doc.tokens[0].kind.is_word());
1338 }
1339
1340 #[test]
1341 fn condense_r_and_d_mixed_case() {
1342 let doc = Document::new_plain_english_curated("R&d");
1343 assert!(doc.tokens.len() == 1);
1344 assert!(doc.tokens[0].kind.is_word());
1345 }
1346
1347 #[test]
1348 fn condense_r_and_d_lowercase() {
1349 let doc = Document::new_plain_english_curated("r&d");
1350 assert!(doc.tokens.len() == 1);
1351 assert!(doc.tokens[0].kind.is_word());
1352 }
1353
1354 #[test]
1355 fn dont_condense_r_and_d_with_spaces() {
1356 let doc = Document::new_plain_english_curated("R & D");
1357 assert!(doc.tokens.len() == 5);
1358 assert!(doc.tokens[0].kind.is_word());
1359 assert!(doc.tokens[1].kind.is_whitespace());
1360 assert!(doc.tokens[2].kind.is_ampersand());
1361 assert!(doc.tokens[3].kind.is_whitespace());
1362 assert!(doc.tokens[4].kind.is_word());
1363 }
1364
1365 #[test]
1366 fn condense_q_and_a() {
1367 let doc =
1368 Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
1369 assert!(doc.tokens.len() >= 3);
1370 assert!(doc.tokens[2].kind.is_word());
1371 assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
1372 }
1373
1374 #[test]
1375 fn dont_allow_mixed_r_and_d_with_q_and_a() {
1376 let doc = Document::new_plain_english_curated("R&A or Q&D");
1377 assert!(doc.tokens.len() == 9);
1378 assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
1379 }
1380
1381 #[test]
1382 fn condense_io() {
1383 let doc = Document::new_plain_english_curated("I/O");
1384 assert!(doc.tokens.len() == 1);
1385 assert!(doc.tokens[0].kind.is_word());
1386 }
1387
    #[test]
    fn finds_unmatched_quotes_in_document() {
        // Each quote token records a `twin_loc`: the token index of its
        // matching partner, or `None` when the quote is unmatched.
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        // Collect the twin index of every quote token, in document order.
        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        // Paragraph 1's pair point at each other (16 <-> 19); the third
        // paragraph's lone quote and the fourth paragraph's first quote are
        // unmatched; the fourth paragraph's final pair is 87 <-> 89.
        // NOTE(review): these indices depend on the exact tokenization of
        // `raw` — re-derive them if the fixture text is ever edited.
        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }
1412
    #[test]
    fn issue_1901() {
        // Regression test: a line with an unclosed quote must not steal the
        // opening quote of a later, properly matched pair.
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        // Twin index of every quote token, in document order.
        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        // Line 1 pairs internally (0 <-> 6), line 2's quote stays unmatched
        // (`None`), and lines 3 and 4 each pair internally (21 <-> 27,
        // 29 <-> 37).
        // NOTE(review): indices depend on the exact tokenization of `raw`.
        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
1442}