1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7use paste::paste;
8
9use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
10use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
11use crate::punctuation::Punctuation;
12use crate::spell::{Dictionary, FstDictionary};
13use crate::vec_ext::VecExt;
14use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
15use crate::{OrdinalSuffix, Span};
16
/// A parsed text: the raw source characters together with the token stream
/// produced from them by a [`Parser`].
#[derive(Debug, Clone)]
pub struct Document {
    // Original text as chars, so `Span<char>` indices address it directly.
    source: Lrc<[char]>,
    // Tokens referencing `source` via spans, kept in document order.
    tokens: Vec<Token>,
}
23
impl Default for Document {
    /// An empty document, parsed as plain English with the curated dictionary.
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}
29
30impl Document {
31 pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
35 self.tokens()
36 .enumerate()
37 .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
38 .collect()
39 }
40
41 pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
45 let indices = self.token_indices_intersecting(span);
46
47 indices
48 .into_iter()
49 .map(|i| self.tokens[i].to_fat(&self.source))
50 .collect()
51 }
52
53 pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
56 let source: Lrc<_> = text.chars().collect();
57
58 Self::new_from_chars(source, parser, dictionary)
59 }
60
61 pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
64 let source: Lrc<_> = text.chars().collect();
65
66 Self::new_from_chars(source, parser, &FstDictionary::curated())
67 }
68
69 pub fn new_from_chars(
72 source: Lrc<[char]>,
73 parser: &impl Parser,
74 dictionary: &impl Dictionary,
75 ) -> Self {
76 let tokens = parser.parse(&source);
77
78 let mut document = Self { source, tokens };
79 document.parse(dictionary);
80
81 document
82 }
83
84 pub fn new_plain_english_curated_chars(source: &[char]) -> Self {
87 Self::new_from_chars(Lrc::from(source), &PlainEnglish, &FstDictionary::curated())
88 }
89
90 pub fn new_plain_english_curated(text: &str) -> Self {
93 Self::new(text, &PlainEnglish, &FstDictionary::curated())
94 }
95
96 pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
102 let source: Lrc<_> = text.chars().collect();
103 let tokens = parser.parse(&source);
104 let mut document = Self { source, tokens };
105 document.apply_fixups();
106 document
107 }
108
109 pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
112 Self::new(text, &PlainEnglish, dictionary)
113 }
114
115 pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
118 Self::new(
119 text,
120 &Markdown::new(markdown_options),
121 &FstDictionary::curated(),
122 )
123 }
124
125 pub fn new_markdown_default_curated_chars(chars: &[char]) -> Self {
128 Self::new_from_chars(
129 chars.to_vec().into(),
130 &Markdown::default(),
131 &FstDictionary::curated(),
132 )
133 }
134
135 pub fn new_markdown_default_curated(text: &str) -> Self {
138 Self::new_markdown_curated(text, MarkdownOptions::default())
139 }
140
141 pub fn new_markdown(
144 text: &str,
145 markdown_options: MarkdownOptions,
146 dictionary: &impl Dictionary,
147 ) -> Self {
148 Self::new(text, &Markdown::new(markdown_options), dictionary)
149 }
150
151 pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
154 Self::new_markdown(text, MarkdownOptions::default(), dictionary)
155 }
156
    /// Runs the token-stream normalization passes. Order matters: newline
    /// runs are condensed before paragraph breaks are derived from them, and
    /// quote matching runs last, over the fully condensed stream.
    fn apply_fixups(&mut self) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_dotted_truncations();
        self.condense_common_top_level_domains();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();
    }
172
    /// Normalizes the token stream, then annotates word tokens with
    /// dictionary metadata, POS tags, and noun-phrase membership.
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.apply_fixups();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            // The tagger and chunker see only non-whitespace tokens.
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.get_str(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            // Word tokens only, used for dictionary lookups below.
            let word_sources: Vec<_> = sent
                .iter()
                .filter(|t| matches!(t.kind, TokenKind::Word(_)))
                .map(|t| t.get_ch(&self.source))
                .collect();

            // `ti` indexes the tag/chunk arrays (non-whitespace tokens);
            // `wi` indexes `word_sources` (word tokens only).
            let mut ti = 0;
            let mut wi = 0;
            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = word_sources[wi];
                    let mut found_meta = dictionary
                        .get_word_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        // Prefer the tagger's POS tag; fall back to the
                        // metadata's own inference when the tagger abstains.
                        inner.pos_tag = token_tags[ti].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[ti]);
                    }

                    *meta = found_meta;
                    ti += 1;
                    wi += 1;
                } else if !token.kind.is_whitespace() {
                    // Non-word, non-whitespace tokens still occupy a slot in
                    // the tag/chunk arrays.
                    ti += 1;
                }
            }
        }
    }
222
223 fn newlines_to_breaks(&mut self) {
225 for token in &mut self.tokens {
226 if let TokenKind::Newline(n) = token.kind
227 && n >= 2
228 {
229 token.kind = TokenKind::ParagraphBreak;
230 }
231 }
232 }
233
    /// For each index in `indices`, collapses the run of `stretch_len` tokens
    /// starting there into the single token at that index, widening its span
    /// over the whole run.
    ///
    /// NOTE(review): the rebuild below assumes `indices` is sorted ascending
    /// and the runs do not overlap — confirm at call sites.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // First, stretch each run's leading token over the whole run.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Then rebuild the token list, skipping the absorbed tokens.
        let old = self.tokens.clone();
        self.tokens.clear();

        // Everything before the first run. With empty `indices` this copies
        // `old[0..0]`; the tail copy below then restores the whole list.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            // The condensed token itself…
            self.tokens.push(old[*a_idx].clone());

            // …then the untouched tokens between this run and the next.
            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Finally, everything after the last run.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
275
    /// Finds the token whose span contains `char_index`, if any.
    ///
    /// Uses a binary search whose comparator reports `Equal` for any token
    /// overlapping the one-character span at `char_index`; this relies on the
    /// tokens being ordered by span position.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }
290
291 pub fn get_token(&self, index: usize) -> Option<&Token> {
293 self.tokens.get(index)
294 }
295
296 pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
298 match base.checked_add_signed(offset) {
299 None => None,
300 Some(idx) => self.get_token(idx),
301 }
302 }
303
304 pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
306 self.tokens.iter()
307 }
308
    /// Iterates over maximal runs of tokens whose word metadata was flagged
    /// as noun-phrase members by the chunker, with surrounding whitespace
    /// trimmed off each run.
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        // True when the chunker marked this word as part of a noun phrase.
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        // Strips leading and trailing whitespace tokens from a slice.
        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        // Split on tokens that can't belong to a phrase (neither a flagged
        // member nor whitespace), then keep only segments that actually
        // contain a flagged member.
        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }
342
343 pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
345 self.tokens().map(|token| token.to_fat(&self.source))
346 }
347
    /// Starting from whitespace at `base + offset`, steps one token further
    /// in the direction of `offset` and returns that token if it is a word.
    ///
    /// Returns `None` when the token at the offset isn't whitespace, when an
    /// index is out of bounds, or when the following token isn't a word.
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // NOTE(review): with `offset == 0` the signum is 0, so this re-reads
        // the same whitespace token and the word check below fails — confirm
        // callers never pass 0.
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }
360
361 pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
363 self.fat_tokens().map(|t| t.into())
364 }
365
    /// The characters of `source` covered by `span`.
    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }
369
370 pub fn get_span_content_str(&self, span: &Span<char>) -> String {
371 String::from_iter(self.get_span_content(span))
372 }
373
374 pub fn get_full_string(&self) -> String {
375 self.get_span_content_str(&Span::new(0, self.source.len()))
376 }
377
    /// The entire source text as a character slice.
    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }
381
    /// The raw source characters backing this document.
    pub fn get_source(&self) -> &[char] {
        &self.source
    }
385
    /// The document's tokens as a slice, in order.
    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }
389
390 fn match_quotes(&mut self) {
396 let mut pg_indices: Vec<_> = vec![0];
397 pg_indices.extend(self.iter_paragraph_break_indices());
398 pg_indices.push(self.tokens.len());
399
400 let mut quote_indices = Vec::new();
402 let mut open_quote_indices = Vec::new();
403
404 for (start, end) in pg_indices.into_iter().tuple_windows() {
405 let pg = &mut self.tokens[start..end];
406
407 quote_indices.clear();
408 quote_indices.extend(pg.iter_quote_indices());
409 open_quote_indices.clear();
410
411 for quote in "e_indices {
413 let is_open = *quote == 0
414 || pg[0..*quote].iter_word_likes().next().is_none()
415 || pg[quote - 1].kind.is_whitespace()
416 || matches!(
417 pg[quote - 1].kind.as_punctuation(),
418 Some(Punctuation::LessThan)
419 | Some(Punctuation::OpenRound)
420 | Some(Punctuation::OpenSquare)
421 | Some(Punctuation::OpenCurly)
422 | Some(Punctuation::Apostrophe)
423 );
424
425 if is_open {
426 open_quote_indices.push(*quote);
427 }
428 }
429
430 while let Some(open_idx) = open_quote_indices.pop() {
431 let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
432 continue;
433 };
434
435 if pg[close_idx + open_idx + 1]
436 .kind
437 .as_quote()
438 .unwrap()
439 .twin_loc
440 .is_some()
441 {
442 continue;
443 }
444
445 pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
446 Some(close_idx + open_idx + start + 1);
447 pg[close_idx + open_idx + 1]
448 .kind
449 .as_mut_quote()
450 .unwrap()
451 .twin_loc = Some(open_idx + start);
452 }
453 }
454 }
455
    /// Merges a number token followed by an ordinal-suffix word (as
    /// recognized by `OrdinalSuffix::from_chars`, e.g. "1" + "st") into a
    /// single number token carrying the suffix.
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // A number immediately followed by a word that parses as an
            // ordinal suffix.
            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        // Each match collapses two tokens (number + suffix word) into one.
        self.condense_indices(&replace_starts, 2);
    }
481
482 fn condense_spaces(&mut self) {
485 let mut cursor = 0;
486 let copy = self.tokens.clone();
487
488 let mut remove_these = VecDeque::new();
489
490 while cursor < self.tokens.len() {
491 let start_tok = &mut self.tokens[cursor];
493
494 if let TokenKind::Space(start_count) = &mut start_tok.kind {
495 loop {
496 cursor += 1;
497
498 if cursor >= copy.len() {
499 break;
500 }
501
502 let child_tok = ©[cursor];
503
504 if start_tok.span.end != child_tok.span.start {
506 break;
507 }
508
509 if let TokenKind::Space(n) = child_tok.kind {
510 *start_count += n;
511 start_tok.span.end = child_tok.span.end;
512 remove_these.push_back(cursor);
513 cursor += 1;
514 } else {
515 break;
516 };
517 }
518 }
519
520 cursor += 1;
521 }
522
523 self.tokens.remove_indices(remove_these);
524 }
525
    thread_local! {
        // Built once per thread so callers don't rebuild the expression on
        // every fixup pass.
        static DOTTED_TRUNCATION_EXPR: Lrc<FirstMatchOf> = Document::uncached_dotted_truncation_expr();
    }
529
    /// Builds the expression matching dotted truncations: "esp.", "etc.",
    /// "vs.", and the two-word "et al.".
    fn uncached_dotted_truncation_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(SequenceExpr::word_set(&["esp", "etc", "vs"]).then_period()),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }
541
    /// Collapses every match of `expr` into the match's first token, widening
    /// that token's span over the whole match and applying `edit` to it.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        // Collect first: matching borrows the document immutably.
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Drop everything after the match's first token…
            remove_indices.extend(m.start + 1..m.end);
            // …and stretch that token's span over the whole match.
            // NOTE(review): indexing with `m.into_iter()` relies on the match
            // converting into a `usize` range — confirm against the match
            // type's `IntoIterator` impl.
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }
560
561 fn condense_dotted_truncations(&mut self) {
562 self.condense_expr(&Self::DOTTED_TRUNCATION_EXPR.with(|v| v.clone()), |_| {})
563 }
564
565 fn condense_newlines(&mut self) {
568 let mut cursor = 0;
569 let copy = self.tokens.clone();
570
571 let mut remove_these = VecDeque::new();
572
573 while cursor < self.tokens.len() {
574 let start_tok = &mut self.tokens[cursor];
576
577 if let TokenKind::Newline(start_count) = &mut start_tok.kind {
578 loop {
579 cursor += 1;
580
581 if cursor >= copy.len() {
582 break;
583 }
584
585 let child_tok = ©[cursor];
586 if let TokenKind::Newline(n) = child_tok.kind {
587 *start_count += n;
588 start_tok.span.end = child_tok.span.end;
589 remove_these.push_back(cursor);
590 cursor += 1;
591 } else {
592 break;
593 };
594 }
595 }
596
597 cursor += 1;
598 }
599
600 self.tokens.remove_indices(remove_these);
601 }
602
    /// Merges dotted initialisms such as "N.S.A." into a single word token.
    ///
    /// A chunk is a one-character word followed by a period; consecutive
    /// chunks are folded into the run's first word token, which keeps its
    /// `Word` kind and grows its span.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Index of the word token that starts the current initialism run.
        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    // Letters after the first are absorbed into the start.
                    to_remove.push_back(cursor - 1);
                }

                // The period is always absorbed.
                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                // Run ended: stretch the start token over the whole
                // initialism.
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
650
    /// Merges filename extensions such as ".exe" into a single `Unlintable`
    /// token so they aren't flagged by later passes.
    ///
    /// An extension is a period plus a word of at most three characters that
    /// is entirely lowercase or entirely uppercase ASCII, and the pair must
    /// be isolated: bounded by whitespace/document edges on both sides, or
    /// wrapped in round brackets.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Start index of the extension currently being absorbed, if any.
        let mut ext_start = None;

        loop {
            // l = token before the period, d = period, x = candidate
            // extension word, r = token after it.
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    // Reject mixed case (".Exe") and non-letters (".C0M"):
                    // both `all` checks fail for digits/symbols.
                    let ext_chars = x.get_ch(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    // The merged token shouldn't be spell-checked.
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                // Run ended: stretch the leading token over the extension.
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
712
    /// Merges a period followed by a well-known top-level domain (".com",
    /// ".blog", …) into a single `Unlintable` token.
    ///
    /// Like filename extensions, the pair must be isolated: bounded by
    /// whitespace/document edges on both sides, or wrapped in round brackets.
    /// The TLD comparison is ASCII-case-insensitive.
    fn condense_common_top_level_domains(&mut self) {
        const COMMON_TOP_LEVEL_DOMAINS: &[&str; 106] = &[
            "ai", "app", "blog", "co", "com", "dev", "edu", "gov", "info", "io", "me", "mil",
            "net", "org", "shop", "tech", "uk", "us", "xyz", "jp", "de", "fr", "br", "it", "ru",
            "es", "pl", "ca", "au", "cn", "in", "nl", "eu", "ch", "id", "at", "kr", "cz", "mx",
            "be", "tv", "se", "tr", "tw", "al", "ua", "ir", "vn", "cl", "sk", "ly", "cc", "to",
            "no", "fi", "pt", "dk", "ar", "hu", "tk", "gr", "il", "news", "ro", "my", "biz", "ie",
            "za", "nz", "sg", "ee", "th", "pe", "bg", "hk", "rs", "lt", "link", "ph", "club", "si",
            "site", "mobi", "by", "cat", "wiki", "la", "ga", "xxx", "cf", "hr", "ng", "jobs",
            "online", "kz", "ug", "gq", "ae", "is", "lv", "pro", "fm", "tips", "ms", "sa", "int",
        ];

        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();
        for cursor in 1..self.tokens.len() {
            // l = token before the period, d = period, tld = candidate
            // domain word, r = token after it.
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let tld = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_tld_chunk = d.kind.is_period()
                && tld.kind.is_word()
                && tld
                    .get_ch(&self.source)
                    .iter()
                    .all(|c| c.is_ascii_alphabetic())
                && tld
                    .get_ch(&self.source)
                    .eq_any_ignore_ascii_case_str(COMMON_TOP_LEVEL_DOMAINS)
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())));

            if is_tld_chunk {
                // Absorb the domain word into the period token.
                self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                self.tokens[cursor - 1].span.end = self.tokens[cursor].span.end;
                to_remove.push_back(cursor);
            }
        }

        self.tokens.remove_indices(to_remove);
    }
761
762 fn condense_tldr(&mut self) {
764 if self.tokens.len() < 3 {
765 return;
766 }
767
768 let mut to_remove = VecDeque::new();
769 let mut cursor = 2;
770
771 loop {
772 let tl = &self.tokens[cursor - 2];
773 let simicolon = &self.tokens[cursor - 1];
774 let dr = &self.tokens[cursor];
775
776 let is_tldr_chunk = tl.kind.is_word()
777 && tl.span.len() == 2
778 && tl.get_ch(&self.source).eq_ch(&['t', 'l'])
779 && simicolon.kind.is_semicolon()
780 && dr.kind.is_word()
781 && dr.span.len() >= 2
782 && dr.span.len() <= 3
783 && dr
784 .get_ch(&self.source)
785 .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
786
787 if is_tldr_chunk {
788 self.tokens[cursor - 2].span = Span::new(
790 self.tokens[cursor - 2].span.start,
791 self.tokens[cursor].span.end,
792 );
793
794 to_remove.push_back(cursor - 1);
796 to_remove.push_back(cursor);
797 }
798
799 cursor += 1;
801
802 if cursor >= self.tokens.len() {
803 break;
804 }
805 }
806
807 self.tokens.remove_indices(to_remove);
809 }
810
    /// Merges two single-letter words joined by a delimiter token (e.g.
    /// "R&D", "I/O") into one word token, when the lowercased letter pair
    /// appears in `valid_pairs`.
    ///
    /// `is_delimiter` selects which token kind joins the letters; the letter
    /// comparison is ASCII-case-insensitive.
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        // The sequence is three tokens; nothing to do with fewer.
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            // Shape check: letter, delimiter, letter.
            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.get_ch(&self.source).first(),
                    l2.get_ch(&self.source).first(),
                );

                // Content check: the lowercased pair must be allow-listed.
                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    // Stretch the first letter's token over all three tokens
                    // and drop the delimiter and second letter.
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
872
873 fn condense_ampersand_pairs(&mut self) {
875 self.condense_delimited_pairs(
876 |kind| kind.is_ampersand(),
877 &[
878 ('b', 'b'), ('b', 'w'), ('g', 't'), ('k', 'r'), ('q', 'a'), ('r', 'b'), ('r', 'd'), ('r', 'r'), ('s', 'p'), ],
888 );
889 }
890
891 fn condense_slash_pairs(&mut self) {
893 self.condense_delimited_pairs(
894 |kind| kind.is_slash(),
895 &[
896 ('a', 'c'), ('b', 'w'), ('c', 'o'), ('d', 'c'), ('d', 'l'), ('i', 'o'), ('j', 'k'), ('n', 'a'), ('r', 'c'), ('s', 'n'), ('y', 'n'), ('y', 'o'), ],
909 );
910 }
911
912 fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
913 let period = SequenceExpr::default().then_period();
914 Lrc::new(Repeating::new(Box::new(period), 2))
915 }
916
    thread_local! {
        // Built once per thread so callers don't rebuild the expression on
        // every fixup pass.
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
920
921 fn condense_ellipsis(&mut self) {
922 let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
923 self.condense_expr(&expr, |tok| {
924 tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
925 });
926 }
927}
928
/// Generates, for one token-predicate name, the family of delegating
/// accessors (`first_*`, `last_*`, `last_*_index`, `iter_*_indices`,
/// `iter_*s`), each forwarding to the equivalent method on the inner token
/// list. Identifiers are concatenated with `paste!`.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
955
// Exposes the full `TokenStringExt` surface on `Document` by delegating to
// the inner token vector; the bulk of the members are generated by
// `create_fns_on_doc!` above.
impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(heading_start);

    // The remaining members don't fit the macro's shape and are delegated
    // by hand.
    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_headings()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}
1022
1023impl Display for Document {
1024 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1025 for token in &self.tokens {
1026 write!(f, "{}", self.get_span_content_str(&token.span))?;
1027 }
1028
1029 Ok(())
1030 }
1031}
1032
1033#[cfg(test)]
1034mod tests {
1035 use itertools::Itertools;
1036
1037 use super::Document;
1038 use crate::TokenStringExt;
1039 use crate::{Span, parsers::MarkdownOptions};
1040
1041 fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
1042 let document = Document::new_plain_english_curated(text);
1043
1044 assert_eq!(document.tokens.len(), final_tok_count);
1045
1046 let document = Document::new_markdown_curated(text, MarkdownOptions::default());
1047
1048 assert_eq!(document.tokens.len(), final_tok_count);
1049 }
1050
1051 #[test]
1052 fn simple_contraction() {
1053 assert_condensed_contractions("isn't", 1);
1054 }
1055
1056 #[test]
1057 fn simple_contraction2() {
1058 assert_condensed_contractions("wasn't", 1);
1059 }
1060
1061 #[test]
1062 fn simple_contraction3() {
1063 assert_condensed_contractions("There's", 1);
1064 }
1065
1066 #[test]
1067 fn simple_contraction4() {
1068 assert_condensed_contractions("doesn't", 1);
1069 }
1070
1071 #[test]
1072 fn medium_contraction() {
1073 assert_condensed_contractions("isn't wasn't", 3);
1074 }
1075
1076 #[test]
1077 fn medium_contraction2() {
1078 assert_condensed_contractions("There's no way", 5);
1079 }
1080
1081 #[test]
1082 fn selects_token_at_char_index() {
1083 let text = "There were three little pigs. They built three little homes.";
1084 let document = Document::new_plain_english_curated(text);
1085
1086 let got = document.get_token_at_char_index(19).unwrap();
1087
1088 assert!(got.kind.is_word());
1089 assert_eq!(got.span, Span::new(17, 23));
1090 }
1091
1092 fn assert_token_count(source: &str, count: usize) {
1093 let document = Document::new_plain_english_curated(source);
1094
1095 dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
1096 assert_eq!(document.tokens.len(), count);
1097 }
1098
1099 #[test]
1100 fn condenses_number_suffixes() {
1101 assert_token_count("1st", 1);
1102 assert_token_count("This is the 2nd test", 9);
1103 assert_token_count("This is the 3rd test", 9);
1104 assert_token_count(
1105 "It works even with weird capitalization like this: 600nD",
1106 18,
1107 );
1108 }
1109
1110 #[test]
1111 fn condenses_ie() {
1112 assert_token_count("There is a thing (i.e. that one)", 15);
1113 assert_token_count("We are trying to condense \"i.e.\"", 13);
1114 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1115 }
1116
1117 #[test]
1118 fn condenses_eg() {
1119 assert_token_count("We are trying to condense \"e.g.\"", 13);
1120 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1121 }
1122
1123 #[test]
1124 fn condenses_nsa() {
1125 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1126 }
1127
1128 #[test]
1129 fn parses_ellipsis() {
1130 assert_token_count("...", 1);
1131 }
1132
1133 #[test]
1134 fn parses_long_ellipsis() {
1135 assert_token_count(".....", 1);
1136 }
1137
1138 #[test]
1139 fn parses_short_ellipsis() {
1140 assert_token_count("..", 1);
1141 }
1142
1143 #[test]
1144 fn selects_token_at_offset() {
1145 let doc = Document::new_plain_english_curated("Foo bar baz");
1146
1147 let tok = doc.get_token_offset(1, -1).unwrap();
1148
1149 assert_eq!(tok.span, Span::new(0, 3));
1150 }
1151
1152 #[test]
1153 fn cant_select_token_before_start() {
1154 let doc = Document::new_plain_english_curated("Foo bar baz");
1155
1156 let tok = doc.get_token_offset(0, -1);
1157
1158 assert!(tok.is_none());
1159 }
1160
1161 #[test]
1162 fn select_next_word_pos_offset() {
1163 let doc = Document::new_plain_english_curated("Foo bar baz");
1164
1165 let bar = doc.get_next_word_from_offset(0, 1).unwrap();
1166 let bar = doc.get_span_content(&bar.span);
1167 assert_eq!(bar, ['b', 'a', 'r']);
1168 }
1169
1170 #[test]
1171 fn select_next_word_neg_offset() {
1172 let doc = Document::new_plain_english_curated("Foo bar baz");
1173
1174 let bar = doc.get_next_word_from_offset(2, -1).unwrap();
1175 let bar = doc.get_span_content(&bar.span);
1176 assert_eq!(bar, ['F', 'o', 'o']);
1177 }
1178
1179 #[test]
1180 fn cant_select_next_word_not_from_whitespace() {
1181 let doc = Document::new_plain_english_curated("Foo bar baz");
1182
1183 let tok = doc.get_next_word_from_offset(0, 2);
1184
1185 assert!(tok.is_none());
1186 }
1187
1188 #[test]
1189 fn cant_select_next_word_before_start() {
1190 let doc = Document::new_plain_english_curated("Foo bar baz");
1191
1192 let tok = doc.get_next_word_from_offset(0, -1);
1193
1194 assert!(tok.is_none());
1195 }
1196
1197 #[test]
1198 fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
1199 let doc = Document::new_plain_english_curated("Foo, bar, baz");
1200
1201 let tok = doc.get_next_word_from_offset(0, 1);
1202
1203 assert!(tok.is_none());
1204 }
1205
1206 #[test]
1207 fn cant_select_next_word_with_punctuation_after_whitespace() {
1208 let doc = Document::new_plain_english_curated("Foo \"bar\", baz");
1209
1210 let tok = doc.get_next_word_from_offset(0, 1);
1211
1212 assert!(tok.is_none());
1213 }
1214
1215 #[test]
1216 fn condenses_filename_extensions() {
1217 let doc = Document::new_plain_english_curated(".c and .exe and .js");
1218 assert!(doc.tokens[0].kind.is_unlintable());
1219 assert!(doc.tokens[4].kind.is_unlintable());
1220 assert!(doc.tokens[8].kind.is_unlintable());
1221 }
1222
1223 #[test]
1224 fn condense_filename_extension_ok_at_start_and_end() {
1225 let doc = Document::new_plain_english_curated(".c and .EXE");
1226 assert!(doc.tokens.len() == 5);
1227 assert!(doc.tokens[0].kind.is_unlintable());
1228 assert!(doc.tokens[4].kind.is_unlintable());
1229 }
1230
1231 #[test]
1232 fn doesnt_condense_filename_extensions_with_mixed_case() {
1233 let doc = Document::new_plain_english_curated(".c and .Exe");
1234 assert!(doc.tokens.len() == 6);
1235 assert!(doc.tokens[0].kind.is_unlintable());
1236 assert!(doc.tokens[4].kind.is_punctuation());
1237 assert!(doc.tokens[5].kind.is_word());
1238 }
1239
1240 #[test]
1241 fn doesnt_condense_filename_extensions_with_non_letters() {
1242 let doc = Document::new_plain_english_curated(".COM and .C0M");
1243 assert!(doc.tokens.len() == 6);
1244 assert!(doc.tokens[0].kind.is_unlintable());
1245 assert!(doc.tokens[4].kind.is_punctuation());
1246 assert!(doc.tokens[5].kind.is_word());
1247 }
1248
1249 #[test]
1250 fn doesnt_condense_filename_extensions_longer_than_three() {
1251 let doc = Document::new_plain_english_curated(".dll and .dlls");
1252 assert!(doc.tokens.len() == 6);
1253 assert!(doc.tokens[0].kind.is_unlintable());
1254 assert!(doc.tokens[4].kind.is_punctuation());
1255 assert!(doc.tokens[5].kind.is_word());
1256 }
1257
1258 #[test]
1259 fn condense_filename_extension_in_parens() {
1260 let doc = Document::new_plain_english_curated(
1261 "true for the manual installation when trying to run the executable(.exe) after a manual download",
1262 );
1263 assert!(doc.tokens.len() > 23);
1264 assert!(doc.tokens[21].kind.is_open_round());
1265 assert!(doc.tokens[22].kind.is_unlintable());
1266 assert!(doc.tokens[23].kind.is_close_round());
1267 }
1268
1269 #[test]
1270 fn condense_tldr_uppercase() {
1271 let doc = Document::new_plain_english_curated("TL;DR");
1272 assert!(doc.tokens.len() == 1);
1273 assert!(doc.tokens[0].kind.is_word());
1274 assert!(doc.tokens[0].span.len() == 5);
1275 }
1276
1277 #[test]
1278 fn condense_tldr_lowercase() {
1279 let doc = Document::new_plain_english_curated("tl;dr");
1280 assert!(doc.tokens.len() == 1);
1281 assert!(doc.tokens[0].kind.is_word());
1282 }
1283
1284 #[test]
1285 fn condense_tldr_mixed_case_1() {
1286 let doc = Document::new_plain_english_curated("tl;DR");
1287 assert!(doc.tokens.len() == 1);
1288 assert!(doc.tokens[0].kind.is_word());
1289 }
1290
1291 #[test]
1292 fn condense_tldr_mixed_case_2() {
1293 let doc = Document::new_plain_english_curated("TL;Dr");
1294 assert!(doc.tokens.len() == 1);
1295 assert!(doc.tokens[0].kind.is_word());
1296 }
1297
1298 #[test]
1299 fn condense_tldr_pural() {
1300 let doc = Document::new_plain_english_curated(
1301 "managing the flow between components to produce relevant TL;DRs of current news articles",
1302 );
1303 assert!(
1305 doc.tokens
1306 .iter()
1307 .all(|t| t.kind.is_word() || t.kind.is_whitespace())
1308 );
1309 let tldrs = doc
1311 .tokens
1312 .iter()
1313 .filter(|t| t.get_ch(&doc.source).contains(&';'))
1314 .collect_vec();
1315 assert!(tldrs.len() == 1);
1316 assert!(tldrs[0].get_str(&doc.source) == "TL;DRs");
1317 }
1318
1319 #[test]
1320 fn condense_common_top_level_domains() {
1321 let doc = Document::new_plain_english_curated(".blog and .com and .NET");
1322 assert!(doc.tokens.len() == 9);
1323 assert!(doc.tokens[0].kind.is_unlintable());
1324 assert!(doc.tokens[4].kind.is_unlintable());
1325 assert!(doc.tokens[8].kind.is_unlintable());
1326 }
1327
1328 #[test]
1329 fn condense_common_top_level_domains_in_parens() {
1330 let doc = Document::new_plain_english_curated("(.blog)");
1331 assert!(doc.tokens.len() == 3);
1332 assert!(doc.tokens[0].kind.is_open_round());
1333 assert!(doc.tokens[1].kind.is_unlintable());
1334 assert!(doc.tokens[2].kind.is_close_round());
1335 }
1336
1337 #[test]
1338 fn doesnt_condense_unknown_top_level_domains() {
1339 let doc = Document::new_plain_english_curated(".harper");
1340 assert!(doc.tokens.len() == 2);
1341 assert!(doc.tokens[0].kind.is_punctuation());
1342 assert!(doc.tokens[1].kind.is_word());
1343 }
1344
1345 #[test]
1346 fn condense_r_and_d_caps() {
1347 let doc = Document::new_plain_english_curated("R&D");
1348 assert!(doc.tokens.len() == 1);
1349 assert!(doc.tokens[0].kind.is_word());
1350 }
1351
1352 #[test]
1353 fn condense_r_and_d_mixed_case() {
1354 let doc = Document::new_plain_english_curated("R&d");
1355 assert!(doc.tokens.len() == 1);
1356 assert!(doc.tokens[0].kind.is_word());
1357 }
1358
1359 #[test]
1360 fn condense_r_and_d_lowercase() {
1361 let doc = Document::new_plain_english_curated("r&d");
1362 assert!(doc.tokens.len() == 1);
1363 assert!(doc.tokens[0].kind.is_word());
1364 }
1365
1366 #[test]
1367 fn dont_condense_r_and_d_with_spaces() {
1368 let doc = Document::new_plain_english_curated("R & D");
1369 assert!(doc.tokens.len() == 5);
1370 assert!(doc.tokens[0].kind.is_word());
1371 assert!(doc.tokens[1].kind.is_whitespace());
1372 assert!(doc.tokens[2].kind.is_ampersand());
1373 assert!(doc.tokens[3].kind.is_whitespace());
1374 assert!(doc.tokens[4].kind.is_word());
1375 }
1376
1377 #[test]
1378 fn condense_q_and_a() {
1379 let doc =
1380 Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
1381 assert!(doc.tokens.len() >= 3);
1382 assert!(doc.tokens[2].kind.is_word());
1383 assert!(doc.tokens[2].get_str(&doc.source) == "Q&A");
1384 }
1385
1386 #[test]
1387 fn dont_allow_mixed_r_and_d_with_q_and_a() {
1388 let doc = Document::new_plain_english_curated("R&A or Q&D");
1389 assert!(doc.tokens.len() == 9);
1390 assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
1391 }
1392
1393 #[test]
1394 fn condense_io() {
1395 let doc = Document::new_plain_english_curated("I/O");
1396 assert!(doc.tokens.len() == 1);
1397 assert!(doc.tokens[0].kind.is_word());
1398 }
1399
    #[test]
    fn finds_unmatched_quotes_in_document() {
        // NOTE(review): the expected `twin_loc` values below are offsets into
        // the parsed document (presumably token indices — confirm against
        // `as_quote`'s definition). Do not re-indent or reflow the raw string:
        // its exact bytes determine those offsets.
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        // Collect each quote token's link to its partner, in document order.
        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        // Matched pairs point at each other; the lone quote in the third
        // paragraph and the "weird" quote in the fourth have no twin.
        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }
1424
    #[test]
    fn issue_1901() {
        // Regression test for issue #1901: an unmatched quote on one line
        // must not derail the pairing of quotes on the following lines.
        // Do not re-indent the raw string — the expected offsets depend on
        // its exact bytes.
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        // Collect each quote token's link to its partner, in document order.
        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        // Only the opening quote on the second line is twinless; the pairs on
        // the first, third, and fourth lines all point at each other.
        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
1454}