1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7use paste::paste;
8
9use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
10use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
11use crate::punctuation::Punctuation;
12use crate::spell::{Dictionary, FstDictionary};
13use crate::vec_ext::VecExt;
14use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
15use crate::{OrdinalSuffix, Span};
16
/// A parsed text: the raw source characters plus the token stream derived
/// from them.
#[derive(Debug, Clone)]
pub struct Document {
    // The raw text as characters; shared cheaply via `Lrc`.
    source: Lrc<Vec<char>>,
    // Tokens referencing `source` via character spans.
    tokens: Vec<Token>,
}
23
impl Default for Document {
    /// An empty document, parsed as plain English with the curated dictionary.
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}
29
impl Document {
    /// Collect the indices of every token whose span overlaps `span`.
    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
        let mut hits = Vec::new();
        for (index, token) in self.tokens().enumerate() {
            if token.span.overlaps_with(span) {
                hits.push(index);
            }
        }
        hits
    }
40
41 pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
45 let indices = self.token_indices_intersecting(span);
46
47 indices
48 .into_iter()
49 .map(|i| self.tokens[i].to_fat(&self.source))
50 .collect()
51 }
52
53 pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
56 let source: Vec<_> = text.chars().collect();
57
58 Self::new_from_vec(Lrc::new(source), parser, dictionary)
59 }
60
61 pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
64 let source: Vec<_> = text.chars().collect();
65
66 Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
67 }
68
69 pub fn new_from_vec(
72 source: Lrc<Vec<char>>,
73 parser: &impl Parser,
74 dictionary: &impl Dictionary,
75 ) -> Self {
76 let tokens = parser.parse(&source);
77
78 let mut document = Self { source, tokens };
79 document.parse(dictionary);
80
81 document
82 }
83
84 pub fn new_plain_english_curated_chars(source: &[char]) -> Self {
87 Self::new_from_vec(
88 Lrc::new(source.to_vec()),
89 &PlainEnglish,
90 &FstDictionary::curated(),
91 )
92 }
93
94 pub fn new_plain_english_curated(text: &str) -> Self {
97 Self::new(text, &PlainEnglish, &FstDictionary::curated())
98 }
99
100 pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
106 let source = Lrc::new(text.chars().collect_vec());
107 let tokens = parser.parse(&source);
108 let mut document = Self { source, tokens };
109 document.apply_fixups();
110 document
111 }
112
    /// Parse `text` as plain English with a caller-supplied dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }
118
119 pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
122 Self::new(
123 text,
124 &Markdown::new(markdown_options),
125 &FstDictionary::curated(),
126 )
127 }
128
129 pub fn new_markdown_default_curated_chars(chars: &[char]) -> Self {
132 Self::new_from_vec(
133 chars.to_vec().into(),
134 &Markdown::default(),
135 &FstDictionary::curated(),
136 )
137 }
138
    /// Parse `text` as Markdown with default options and the curated dictionary.
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }
144
145 pub fn new_markdown(
148 text: &str,
149 markdown_options: MarkdownOptions,
150 dictionary: &impl Dictionary,
151 ) -> Self {
152 Self::new(text, &Markdown::new(markdown_options), dictionary)
153 }
154
    /// Parse `text` as Markdown with default options and a caller-supplied
    /// dictionary.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }
160
    /// Run every token-stream normalization pass.
    ///
    /// NOTE: ordering matters — e.g. newlines must be condensed into single
    /// `Newline(n)` tokens before `newlines_to_breaks` can promote them.
    fn apply_fixups(&mut self) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_dotted_truncations();
        self.condense_common_top_level_domains();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();
    }
176
    /// Annotate word tokens sentence-by-sentence: dictionary metadata,
    /// part-of-speech tags (Brill tagger), and noun-phrase membership
    /// (chunker).
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.apply_fixups();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            // The tagger/chunker only see non-whitespace tokens, so their
            // outputs are indexed by `i` below, which also skips whitespace.
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            let mut i = 0;

            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary
                        .get_word_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        // Prefer the tagger's POS tag, falling back to a tag
                        // inferred from the dictionary metadata.
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    // Non-word, non-whitespace tokens still consumed a slot in
                    // the tagger/chunker outputs.
                    i += 1;
                }
            }
        }
    }
219
220 fn newlines_to_breaks(&mut self) {
222 for token in &mut self.tokens {
223 if let TokenKind::Newline(n) = token.kind
224 && n >= 2
225 {
226 token.kind = TokenKind::ParagraphBreak;
227 }
228 }
229 }
230
    /// For each index in `indices`, merge the `stretch_len` tokens starting
    /// there into the first one (its span is stretched over the run) and drop
    /// the rest. `indices` must be sorted ascending and the runs must not
    /// overlap — assumed from callers; TODO confirm.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // First pass: stretch each run's head token over the whole run.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Second pass: rebuild the token vector, skipping each run's tail.
        let old = self.tokens.clone();
        self.tokens.clear();

        // Everything before the first run. When `indices` is empty,
        // `indices.len()` is 0, so nothing is copied here (the final extend
        // below copies the whole vector instead).
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            // The (already stretched) head token of this run...
            self.tokens.push(old[*a_idx].clone());

            // ...followed by the untouched tokens up to the next run.
            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Everything after the last run (or everything, when no runs).
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
272
    /// Find the token containing the given character index, if any.
    ///
    /// Relies on tokens being sorted by span start. The comparator reports
    /// `Equal` for a token overlapping the one-character probe span and
    /// otherwise orders by span start, which keeps the ordering consistent
    /// enough for binary search.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }
287
    /// The token at `index`, or `None` when out of bounds.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }
292
293 pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
295 match base.checked_add_signed(offset) {
296 None => None,
297 Some(idx) => self.get_token(idx),
298 }
299 }
300
    /// An iterator over all tokens in document order.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }
305
    /// Iterate over contiguous runs of noun-phrase tokens (whitespace allowed
    /// inside a run, trimmed from its edges). Runs containing no actual
    /// noun-phrase member are skipped.
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        // True when the word's metadata marks it as part of a noun phrase.
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        // Strip leading and trailing whitespace tokens from a run.
        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }
339
    /// All tokens as owned fat tokens (token plus its source content).
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }
344
345 pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
348 if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
350 return None;
351 }
352 let word_token = self.get_token_offset(base, offset + offset.signum());
354 let word_token = word_token?;
355 word_token.kind.is_word().then_some(word_token)
356 }
357
    /// All tokens as fat tokens whose content is a `String`.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }
362
    /// The source characters covered by `span`.
    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }
366
367 pub fn get_span_content_str(&self, span: &Span<char>) -> String {
368 String::from_iter(self.get_span_content(span))
369 }
370
371 pub fn get_full_string(&self) -> String {
372 self.get_span_content_str(&Span::new(0, self.source.len()))
373 }
374
    /// The entire source text as characters.
    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }
378
    /// The entire source text as characters (alias of `get_full_content`).
    pub fn get_source(&self) -> &[char] {
        &self.source
    }
382
    /// The document's tokens as a slice.
    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }
386
387 fn match_quotes(&mut self) {
393 let mut pg_indices: Vec<_> = vec![0];
394 pg_indices.extend(self.iter_paragraph_break_indices());
395 pg_indices.push(self.tokens.len());
396
397 let mut quote_indices = Vec::new();
399 let mut open_quote_indices = Vec::new();
400
401 for (start, end) in pg_indices.into_iter().tuple_windows() {
402 let pg = &mut self.tokens[start..end];
403
404 quote_indices.clear();
405 quote_indices.extend(pg.iter_quote_indices());
406 open_quote_indices.clear();
407
408 for quote in "e_indices {
410 let is_open = *quote == 0
411 || pg[0..*quote].iter_word_likes().next().is_none()
412 || pg[quote - 1].kind.is_whitespace()
413 || matches!(
414 pg[quote - 1].kind.as_punctuation(),
415 Some(Punctuation::LessThan)
416 | Some(Punctuation::OpenRound)
417 | Some(Punctuation::OpenSquare)
418 | Some(Punctuation::OpenCurly)
419 | Some(Punctuation::Apostrophe)
420 );
421
422 if is_open {
423 open_quote_indices.push(*quote);
424 }
425 }
426
427 while let Some(open_idx) = open_quote_indices.pop() {
428 let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
429 continue;
430 };
431
432 if pg[close_idx + open_idx + 1]
433 .kind
434 .as_quote()
435 .unwrap()
436 .twin_loc
437 .is_some()
438 {
439 continue;
440 }
441
442 pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
443 Some(close_idx + open_idx + start + 1);
444 pg[close_idx + open_idx + 1]
445 .kind
446 .as_mut_quote()
447 .unwrap()
448 .twin_loc = Some(open_idx + start);
449 }
450 }
451 }
452
453 fn condense_number_suffixes(&mut self) {
455 if self.tokens.len() < 2 {
456 return;
457 }
458
459 let mut replace_starts = Vec::new();
460
461 for idx in 0..self.tokens.len() - 1 {
462 let b = &self.tokens[idx + 1];
463 let a = &self.tokens[idx];
464
465 if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
468 && let Some(found_suffix) =
469 OrdinalSuffix::from_chars(self.get_span_content(&b.span))
470 {
471 self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
472 replace_starts.push(idx);
473 }
474 }
475
476 self.condense_indices(&replace_starts, 2);
477 }
478
479 fn condense_spaces(&mut self) {
482 let mut cursor = 0;
483 let copy = self.tokens.clone();
484
485 let mut remove_these = VecDeque::new();
486
487 while cursor < self.tokens.len() {
488 let start_tok = &mut self.tokens[cursor];
490
491 if let TokenKind::Space(start_count) = &mut start_tok.kind {
492 loop {
493 cursor += 1;
494
495 if cursor >= copy.len() {
496 break;
497 }
498
499 let child_tok = ©[cursor];
500
501 if start_tok.span.end != child_tok.span.start {
503 break;
504 }
505
506 if let TokenKind::Space(n) = child_tok.kind {
507 *start_count += n;
508 start_tok.span.end = child_tok.span.end;
509 remove_these.push_back(cursor);
510 cursor += 1;
511 } else {
512 break;
513 };
514 }
515 }
516
517 cursor += 1;
518 }
519
520 self.tokens.remove_indices(remove_these);
521 }
522
    thread_local! {
        // Per-thread cache of the dotted-truncation expression ("etc.", "et al.", …).
        static DOTTED_TRUNCATION_EXPR: Lrc<FirstMatchOf> = Document::uncached_dotted_truncation_expr();
    }
526
    /// Builds the expression matching dotted truncations: either one of
    /// "esp."/"etc."/"vs.", or the two-word form "et al.".
    fn uncached_dotted_truncation_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(SequenceExpr::word_set(&["esp", "etc", "vs"]).then_period()),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }
538
    /// Collapse every match of `expr` into the match's first token: that
    /// token's span is stretched over the whole match, `edit` may rewrite its
    /// kind, and the remaining tokens of the match are removed.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Drop everything after the head token of the match.
            remove_indices.extend(m.start + 1..m.end);
            // Indexing by the match's index range takes the combined span of
            // all tokens in the match.
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }
557
    /// Merge dotted truncations like "etc." and "et al." into single tokens.
    fn condense_dotted_truncations(&mut self) {
        self.condense_expr(&Self::DOTTED_TRUNCATION_EXPR.with(|v| v.clone()), |_| {})
    }
561
562 fn condense_newlines(&mut self) {
565 let mut cursor = 0;
566 let copy = self.tokens.clone();
567
568 let mut remove_these = VecDeque::new();
569
570 while cursor < self.tokens.len() {
571 let start_tok = &mut self.tokens[cursor];
573
574 if let TokenKind::Newline(start_count) = &mut start_tok.kind {
575 loop {
576 cursor += 1;
577
578 if cursor >= copy.len() {
579 break;
580 }
581
582 let child_tok = ©[cursor];
583 if let TokenKind::Newline(n) = child_tok.kind {
584 *start_count += n;
585 start_tok.span.end = child_tok.span.end;
586 remove_these.push_back(cursor);
587 cursor += 1;
588 } else {
589 break;
590 };
591 }
592 }
593
594 cursor += 1;
595 }
596
597 self.tokens.remove_indices(remove_these);
598 }
599
    /// Merge dotted initialisms like "N.S.A." into a single word token by
    /// scanning for consecutive (single-letter word, period) pairs.
    ///
    /// NOTE(review): the head token's span is only stretched in the `else`
    /// branch, i.e. when a non-chunk token follows — an initialism ending
    /// exactly at the last token may be left with an unstretched span; confirm
    /// whether that case can occur in practice.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Index of the first letter of the initialism currently being merged.
        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            // One "chunk" is a single-letter word followed by a period.
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    // Later letters are merged into the head and removed.
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                // Skip past the period so the next window starts on the
                // following (letter, period) pair.
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    // Stretch the head letter's span over the whole initialism.
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
647
    /// Merge filename extensions like ".exe" into a single `Unlintable` token.
    ///
    /// An extension is a period followed by a word of at most three letters in
    /// uniform case (all lower or all upper), and the pair must be bounded by
    /// whitespace/document edges or wrapped in round brackets.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Index of the period token heading the extension currently merging.
        let mut ext_start = None;

        loop {
            // Window: l | d (period) | x (extension word) | r.
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    // Reject mixed case like ".Exe".
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    // The period becomes the (unlintable) head of the run.
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                // Skip the extension word so the next window starts after it.
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    // Stretch the head token's span over the whole ".ext" run.
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
709
    /// Merge a period followed by a well-known top-level domain (".com",
    /// ".org", …) into a single `Unlintable` token, when bounded by
    /// whitespace/document edges or wrapped in round brackets.
    fn condense_common_top_level_domains(&mut self) {
        const COMMON_TOP_LEVEL_DOMAINS: &[&str; 106] = &[
            "ai", "app", "blog", "co", "com", "dev", "edu", "gov", "info", "io", "me", "mil",
            "net", "org", "shop", "tech", "uk", "us", "xyz", "jp", "de", "fr", "br", "it", "ru",
            "es", "pl", "ca", "au", "cn", "in", "nl", "eu", "ch", "id", "at", "kr", "cz", "mx",
            "be", "tv", "se", "tr", "tw", "al", "ua", "ir", "vn", "cl", "sk", "ly", "cc", "to",
            "no", "fi", "pt", "dk", "ar", "hu", "tk", "gr", "il", "news", "ro", "my", "biz", "ie",
            "za", "nz", "sg", "ee", "th", "pe", "bg", "hk", "rs", "lt", "link", "ph", "club", "si",
            "site", "mobi", "by", "cat", "wiki", "la", "ga", "xxx", "cf", "hr", "ng", "jobs",
            "online", "kz", "ug", "gq", "ae", "is", "lv", "pro", "fm", "tips", "ms", "sa", "int",
        ];

        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();
        for cursor in 1..self.tokens.len() {
            // Window: l | d (period) | tld | r.
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let tld = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_tld_chunk = d.kind.is_period()
                && tld.kind.is_word()
                && tld
                    .span
                    .get_content(&self.source)
                    .iter()
                    .all(|c| c.is_ascii_alphabetic())
                && tld
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_str(COMMON_TOP_LEVEL_DOMAINS)
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())));

            if is_tld_chunk {
                // The period absorbs the TLD word and becomes unlintable.
                self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                self.tokens[cursor - 1].span.end = self.tokens[cursor].span.end;
                to_remove.push_back(cursor);
            }
        }

        self.tokens.remove_indices(to_remove);
    }
760
761 fn condense_tldr(&mut self) {
763 if self.tokens.len() < 3 {
764 return;
765 }
766
767 let mut to_remove = VecDeque::new();
768 let mut cursor = 2;
769
770 loop {
771 let tl = &self.tokens[cursor - 2];
772 let simicolon = &self.tokens[cursor - 1];
773 let dr = &self.tokens[cursor];
774
775 let is_tldr_chunk = tl.kind.is_word()
776 && tl.span.len() == 2
777 && tl
778 .span
779 .get_content(&self.source)
780 .eq_ignore_ascii_case_chars(&['t', 'l'])
781 && simicolon.kind.is_semicolon()
782 && dr.kind.is_word()
783 && dr.span.len() >= 2
784 && dr.span.len() <= 3
785 && dr
786 .span
787 .get_content(&self.source)
788 .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
789
790 if is_tldr_chunk {
791 self.tokens[cursor - 2].span = Span::new(
793 self.tokens[cursor - 2].span.start,
794 self.tokens[cursor].span.end,
795 );
796
797 to_remove.push_back(cursor - 1);
799 to_remove.push_back(cursor);
800 }
801
802 cursor += 1;
804
805 if cursor >= self.tokens.len() {
806 break;
807 }
808 }
809
810 self.tokens.remove_indices(to_remove);
812 }
813
    /// Merge two single-letter words joined by a delimiter (e.g. "R&D",
    /// "I/O") into one token, but only when the lowercased letter pair appears
    /// in `valid_pairs`.
    ///
    /// `is_delimiter` decides which punctuation counts as the joiner.
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        // The pattern needs at least three tokens: letter, delimiter, letter.
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                // Case-insensitive membership test against the allow-list.
                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    // Stretch the first letter's token over the whole pair and
                    // drop the delimiter and second letter.
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
875
    /// Merge known single-letter ampersand pairs (e.g. "R&D", "B&B", "Q&A")
    /// into single tokens.
    fn condense_ampersand_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_ampersand(),
            &[
                ('b', 'b'),
                ('b', 'w'),
                ('g', 't'),
                ('k', 'r'),
                ('q', 'a'),
                ('r', 'b'),
                ('r', 'd'),
                ('r', 'r'),
                ('s', 'p'),
            ],
        );
    }
893
    /// Merge known single-letter slash pairs (e.g. "I/O", "A/C", "N/A") into
    /// single tokens.
    fn condense_slash_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_slash(),
            &[
                ('a', 'c'),
                ('b', 'w'),
                ('c', 'o'),
                ('d', 'c'),
                ('d', 'l'),
                ('i', 'o'),
                ('j', 'k'),
                ('n', 'a'),
                ('r', 'c'),
                ('s', 'n'),
                ('y', 'n'),
                ('y', 'o'),
            ],
        );
    }
914
    /// Builds an expression matching a run of at least two period tokens.
    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }
919
    thread_local! {
        // Per-thread cache of the ellipsis (two-or-more periods) expression.
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
923
    /// Replace each run of two or more periods with a single ellipsis token.
    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }
}
931
// Generates the delegating accessors (`first_*`, `last_*`, `last_*_index`,
// `iter_*_indices`, `iter_*s`) that forward to the same-named methods on the
// underlying token vector.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
958
// Forward the entire `TokenStringExt` API to the underlying token vector so a
// `Document` can be queried like a token slice.
impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(heading_start);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_headings()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}
1025
1026impl Display for Document {
1027 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1028 for token in &self.tokens {
1029 write!(f, "{}", self.get_span_content_str(&token.span))?;
1030 }
1031
1032 Ok(())
1033 }
1034}
1035
1036#[cfg(test)]
1037mod tests {
1038 use itertools::Itertools;
1039
1040 use super::Document;
1041 use crate::TokenStringExt;
1042 use crate::{Span, parsers::MarkdownOptions};
1043
1044 fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
1045 let document = Document::new_plain_english_curated(text);
1046
1047 assert_eq!(document.tokens.len(), final_tok_count);
1048
1049 let document = Document::new_markdown_curated(text, MarkdownOptions::default());
1050
1051 assert_eq!(document.tokens.len(), final_tok_count);
1052 }
1053
1054 #[test]
1055 fn simple_contraction() {
1056 assert_condensed_contractions("isn't", 1);
1057 }
1058
1059 #[test]
1060 fn simple_contraction2() {
1061 assert_condensed_contractions("wasn't", 1);
1062 }
1063
1064 #[test]
1065 fn simple_contraction3() {
1066 assert_condensed_contractions("There's", 1);
1067 }
1068
1069 #[test]
1070 fn simple_contraction4() {
1071 assert_condensed_contractions("doesn't", 1);
1072 }
1073
1074 #[test]
1075 fn medium_contraction() {
1076 assert_condensed_contractions("isn't wasn't", 3);
1077 }
1078
1079 #[test]
1080 fn medium_contraction2() {
1081 assert_condensed_contractions("There's no way", 5);
1082 }
1083
1084 #[test]
1085 fn selects_token_at_char_index() {
1086 let text = "There were three little pigs. They built three little homes.";
1087 let document = Document::new_plain_english_curated(text);
1088
1089 let got = document.get_token_at_char_index(19).unwrap();
1090
1091 assert!(got.kind.is_word());
1092 assert_eq!(got.span, Span::new(17, 23));
1093 }
1094
1095 fn assert_token_count(source: &str, count: usize) {
1096 let document = Document::new_plain_english_curated(source);
1097
1098 dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
1099 assert_eq!(document.tokens.len(), count);
1100 }
1101
1102 #[test]
1103 fn condenses_number_suffixes() {
1104 assert_token_count("1st", 1);
1105 assert_token_count("This is the 2nd test", 9);
1106 assert_token_count("This is the 3rd test", 9);
1107 assert_token_count(
1108 "It works even with weird capitalization like this: 600nD",
1109 18,
1110 );
1111 }
1112
1113 #[test]
1114 fn condenses_ie() {
1115 assert_token_count("There is a thing (i.e. that one)", 15);
1116 assert_token_count("We are trying to condense \"i.e.\"", 13);
1117 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1118 }
1119
1120 #[test]
1121 fn condenses_eg() {
1122 assert_token_count("We are trying to condense \"e.g.\"", 13);
1123 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1124 }
1125
1126 #[test]
1127 fn condenses_nsa() {
1128 assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1129 }
1130
1131 #[test]
1132 fn parses_ellipsis() {
1133 assert_token_count("...", 1);
1134 }
1135
1136 #[test]
1137 fn parses_long_ellipsis() {
1138 assert_token_count(".....", 1);
1139 }
1140
1141 #[test]
1142 fn parses_short_ellipsis() {
1143 assert_token_count("..", 1);
1144 }
1145
1146 #[test]
1147 fn selects_token_at_offset() {
1148 let doc = Document::new_plain_english_curated("Foo bar baz");
1149
1150 let tok = doc.get_token_offset(1, -1).unwrap();
1151
1152 assert_eq!(tok.span, Span::new(0, 3));
1153 }
1154
1155 #[test]
1156 fn cant_select_token_before_start() {
1157 let doc = Document::new_plain_english_curated("Foo bar baz");
1158
1159 let tok = doc.get_token_offset(0, -1);
1160
1161 assert!(tok.is_none());
1162 }
1163
1164 #[test]
1165 fn select_next_word_pos_offset() {
1166 let doc = Document::new_plain_english_curated("Foo bar baz");
1167
1168 let bar = doc.get_next_word_from_offset(0, 1).unwrap();
1169 let bar = doc.get_span_content(&bar.span);
1170 assert_eq!(bar, ['b', 'a', 'r']);
1171 }
1172
1173 #[test]
1174 fn select_next_word_neg_offset() {
1175 let doc = Document::new_plain_english_curated("Foo bar baz");
1176
1177 let bar = doc.get_next_word_from_offset(2, -1).unwrap();
1178 let bar = doc.get_span_content(&bar.span);
1179 assert_eq!(bar, ['F', 'o', 'o']);
1180 }
1181
1182 #[test]
1183 fn cant_select_next_word_not_from_whitespace() {
1184 let doc = Document::new_plain_english_curated("Foo bar baz");
1185
1186 let tok = doc.get_next_word_from_offset(0, 2);
1187
1188 assert!(tok.is_none());
1189 }
1190
1191 #[test]
1192 fn cant_select_next_word_before_start() {
1193 let doc = Document::new_plain_english_curated("Foo bar baz");
1194
1195 let tok = doc.get_next_word_from_offset(0, -1);
1196
1197 assert!(tok.is_none());
1198 }
1199
1200 #[test]
1201 fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
1202 let doc = Document::new_plain_english_curated("Foo, bar, baz");
1203
1204 let tok = doc.get_next_word_from_offset(0, 1);
1205
1206 assert!(tok.is_none());
1207 }
1208
1209 #[test]
1210 fn cant_select_next_word_with_punctuation_after_whitespace() {
1211 let doc = Document::new_plain_english_curated("Foo \"bar\", baz");
1212
1213 let tok = doc.get_next_word_from_offset(0, 1);
1214
1215 assert!(tok.is_none());
1216 }
1217
1218 #[test]
1219 fn condenses_filename_extensions() {
1220 let doc = Document::new_plain_english_curated(".c and .exe and .js");
1221 assert!(doc.tokens[0].kind.is_unlintable());
1222 assert!(doc.tokens[4].kind.is_unlintable());
1223 assert!(doc.tokens[8].kind.is_unlintable());
1224 }
1225
1226 #[test]
1227 fn condense_filename_extension_ok_at_start_and_end() {
1228 let doc = Document::new_plain_english_curated(".c and .EXE");
1229 assert!(doc.tokens.len() == 5);
1230 assert!(doc.tokens[0].kind.is_unlintable());
1231 assert!(doc.tokens[4].kind.is_unlintable());
1232 }
1233
1234 #[test]
1235 fn doesnt_condense_filename_extensions_with_mixed_case() {
1236 let doc = Document::new_plain_english_curated(".c and .Exe");
1237 assert!(doc.tokens.len() == 6);
1238 assert!(doc.tokens[0].kind.is_unlintable());
1239 assert!(doc.tokens[4].kind.is_punctuation());
1240 assert!(doc.tokens[5].kind.is_word());
1241 }
1242
1243 #[test]
1244 fn doesnt_condense_filename_extensions_with_non_letters() {
1245 let doc = Document::new_plain_english_curated(".COM and .C0M");
1246 assert!(doc.tokens.len() == 6);
1247 assert!(doc.tokens[0].kind.is_unlintable());
1248 assert!(doc.tokens[4].kind.is_punctuation());
1249 assert!(doc.tokens[5].kind.is_word());
1250 }
1251
1252 #[test]
1253 fn doesnt_condense_filename_extensions_longer_than_three() {
1254 let doc = Document::new_plain_english_curated(".dll and .dlls");
1255 assert!(doc.tokens.len() == 6);
1256 assert!(doc.tokens[0].kind.is_unlintable());
1257 assert!(doc.tokens[4].kind.is_punctuation());
1258 assert!(doc.tokens[5].kind.is_word());
1259 }
1260
1261 #[test]
1262 fn condense_filename_extension_in_parens() {
1263 let doc = Document::new_plain_english_curated(
1264 "true for the manual installation when trying to run the executable(.exe) after a manual download",
1265 );
1266 assert!(doc.tokens.len() > 23);
1267 assert!(doc.tokens[21].kind.is_open_round());
1268 assert!(doc.tokens[22].kind.is_unlintable());
1269 assert!(doc.tokens[23].kind.is_close_round());
1270 }
1271
1272 #[test]
1273 fn condense_tldr_uppercase() {
1274 let doc = Document::new_plain_english_curated("TL;DR");
1275 assert!(doc.tokens.len() == 1);
1276 assert!(doc.tokens[0].kind.is_word());
1277 assert!(doc.tokens[0].span.len() == 5);
1278 }
1279
1280 #[test]
1281 fn condense_tldr_lowercase() {
1282 let doc = Document::new_plain_english_curated("tl;dr");
1283 assert!(doc.tokens.len() == 1);
1284 assert!(doc.tokens[0].kind.is_word());
1285 }
1286
1287 #[test]
1288 fn condense_tldr_mixed_case_1() {
1289 let doc = Document::new_plain_english_curated("tl;DR");
1290 assert!(doc.tokens.len() == 1);
1291 assert!(doc.tokens[0].kind.is_word());
1292 }
1293
1294 #[test]
1295 fn condense_tldr_mixed_case_2() {
1296 let doc = Document::new_plain_english_curated("TL;Dr");
1297 assert!(doc.tokens.len() == 1);
1298 assert!(doc.tokens[0].kind.is_word());
1299 }
1300
1301 #[test]
1302 fn condense_tldr_pural() {
1303 let doc = Document::new_plain_english_curated(
1304 "managing the flow between components to produce relevant TL;DRs of current news articles",
1305 );
1306 assert!(
1308 doc.tokens
1309 .iter()
1310 .all(|t| t.kind.is_word() || t.kind.is_whitespace())
1311 );
1312 let tldrs = doc
1314 .tokens
1315 .iter()
1316 .filter(|t| t.span.get_content(&doc.source).contains(&';'))
1317 .collect_vec();
1318 assert!(tldrs.len() == 1);
1319 assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
1320 }
1321
1322 #[test]
1323 fn condense_common_top_level_domains() {
1324 let doc = Document::new_plain_english_curated(".blog and .com and .NET");
1325 assert!(doc.tokens.len() == 9);
1326 assert!(doc.tokens[0].kind.is_unlintable());
1327 assert!(doc.tokens[4].kind.is_unlintable());
1328 assert!(doc.tokens[8].kind.is_unlintable());
1329 }
1330
1331 #[test]
1332 fn condense_common_top_level_domains_in_parens() {
1333 let doc = Document::new_plain_english_curated("(.blog)");
1334 assert!(doc.tokens.len() == 3);
1335 assert!(doc.tokens[0].kind.is_open_round());
1336 assert!(doc.tokens[1].kind.is_unlintable());
1337 assert!(doc.tokens[2].kind.is_close_round());
1338 }
1339
1340 #[test]
1341 fn doesnt_condense_unknown_top_level_domains() {
1342 let doc = Document::new_plain_english_curated(".harper");
1343 assert!(doc.tokens.len() == 2);
1344 assert!(doc.tokens[0].kind.is_punctuation());
1345 assert!(doc.tokens[1].kind.is_word());
1346 }
1347
1348 #[test]
1349 fn condense_r_and_d_caps() {
1350 let doc = Document::new_plain_english_curated("R&D");
1351 assert!(doc.tokens.len() == 1);
1352 assert!(doc.tokens[0].kind.is_word());
1353 }
1354
1355 #[test]
1356 fn condense_r_and_d_mixed_case() {
1357 let doc = Document::new_plain_english_curated("R&d");
1358 assert!(doc.tokens.len() == 1);
1359 assert!(doc.tokens[0].kind.is_word());
1360 }
1361
1362 #[test]
1363 fn condense_r_and_d_lowercase() {
1364 let doc = Document::new_plain_english_curated("r&d");
1365 assert!(doc.tokens.len() == 1);
1366 assert!(doc.tokens[0].kind.is_word());
1367 }
1368
1369 #[test]
1370 fn dont_condense_r_and_d_with_spaces() {
1371 let doc = Document::new_plain_english_curated("R & D");
1372 assert!(doc.tokens.len() == 5);
1373 assert!(doc.tokens[0].kind.is_word());
1374 assert!(doc.tokens[1].kind.is_whitespace());
1375 assert!(doc.tokens[2].kind.is_ampersand());
1376 assert!(doc.tokens[3].kind.is_whitespace());
1377 assert!(doc.tokens[4].kind.is_word());
1378 }
1379
1380 #[test]
1381 fn condense_q_and_a() {
1382 let doc =
1383 Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
1384 assert!(doc.tokens.len() >= 3);
1385 assert!(doc.tokens[2].kind.is_word());
1386 assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
1387 }
1388
1389 #[test]
1390 fn dont_allow_mixed_r_and_d_with_q_and_a() {
1391 let doc = Document::new_plain_english_curated("R&A or Q&D");
1392 assert!(doc.tokens.len() == 9);
1393 assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
1394 }
1395
1396 #[test]
1397 fn condense_io() {
1398 let doc = Document::new_plain_english_curated("I/O");
1399 assert!(doc.tokens.len() == 1);
1400 assert!(doc.tokens[0].kind.is_word());
1401 }
1402
    #[test]
    fn finds_unmatched_quotes_in_document() {
        // Mixed paragraphs: one properly quoted word, one quote with no
        // closing mark, and one paragraph with both an unmatched and a
        // matched pair. The raw-string content is position-sensitive: the
        // expected twin indices below depend on it byte-for-byte.
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        // `twin_loc` appears to hold the token index of the paired quote
        // (matched quotes reference each other; unmatched ones are `None`)
        // — inferred from the symmetric pairs asserted below.
        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        // Pair (16, 19) is "quoted."; the third paragraph's quote and the
        // "weird" quote are unmatched; pair (87, 89) is the final "quote".
        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }
1427
    #[test]
    fn issue_1901() {
        // Regression test: an unclosed quote on one line must not poison
        // quote matching on the following lines. The raw-string content is
        // position-sensitive: the expected twin indices depend on it.
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        // `twin_loc` appears to hold the token index of the paired quote
        // (matched quotes reference each other; unmatched ones are `None`)
        // — inferred from the symmetric pairs asserted below.
        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        // Line 1 pairs (0, 6); line 2's opener is unmatched (None); lines
        // 3 and 4 pair (21, 27) and (29, 37) respectively.
        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
1457}