1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7
8use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
9use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
10use crate::punctuation::Punctuation;
11use crate::spell::{Dictionary, FstDictionary};
12use crate::vec_ext::VecExt;
13use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
14use crate::{OrdinalSuffix, Span};
15
/// A piece of text together with the tokens derived from it.
///
/// The tokens are produced by a [`Parser`] and then post-processed by the
/// condensing passes implemented on this type.
#[derive(Debug, Clone)]
pub struct Document {
    /// The characters the document was built from; token spans are resolved against this.
    source: Lrc<[char]>,
    /// The document's tokens, in the order left by the parser and the condensing passes.
    tokens: Vec<Token>,
}
22
23impl Default for Document {
24 fn default() -> Self {
25 Self::new("", &PlainEnglish, &FstDictionary::curated())
26 }
27}
28
29impl Document {
30 pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
34 self.tokens()
35 .enumerate()
36 .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
37 .collect()
38 }
39
40 pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
44 let indices = self.token_indices_intersecting(span);
45
46 indices
47 .into_iter()
48 .map(|i| self.tokens[i].to_fat(&self.source))
49 .collect()
50 }
51
52 pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
55 let source: Lrc<_> = text.chars().collect();
56
57 Self::new_from_chars(source, parser, dictionary)
58 }
59
60 pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
63 let source: Lrc<_> = text.chars().collect();
64
65 Self::new_from_chars(source, parser, &FstDictionary::curated())
66 }
67
68 pub fn new_from_chars(
71 source: Lrc<[char]>,
72 parser: &impl Parser,
73 dictionary: &impl Dictionary,
74 ) -> Self {
75 let tokens = parser.parse(&source);
76
77 let mut document = Self { source, tokens };
78 document.parse(dictionary);
79
80 document
81 }
82
83 pub fn new_plain_english_curated_chars(source: &[char]) -> Self {
86 Self::new_from_chars(Lrc::from(source), &PlainEnglish, &FstDictionary::curated())
87 }
88
89 pub fn new_plain_english_curated(text: &str) -> Self {
92 Self::new(text, &PlainEnglish, &FstDictionary::curated())
93 }
94
95 pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
101 let source: Lrc<_> = text.chars().collect();
102 let tokens = parser.parse(&source);
103 let mut document = Self { source, tokens };
104 document.apply_fixups();
105 document
106 }
107
108 pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
111 Self::new(text, &PlainEnglish, dictionary)
112 }
113
114 pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
117 Self::new(
118 text,
119 &Markdown::new(markdown_options),
120 &FstDictionary::curated(),
121 )
122 }
123
124 pub fn new_markdown_default_curated_chars(chars: &[char]) -> Self {
127 Self::new_from_chars(
128 chars.to_vec().into(),
129 &Markdown::default(),
130 &FstDictionary::curated(),
131 )
132 }
133
134 pub fn new_markdown_default_curated(text: &str) -> Self {
137 Self::new_markdown_curated(text, MarkdownOptions::default())
138 }
139
140 pub fn new_markdown(
143 text: &str,
144 markdown_options: MarkdownOptions,
145 dictionary: &impl Dictionary,
146 ) -> Self {
147 Self::new(text, &Markdown::new(markdown_options), dictionary)
148 }
149
150 pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
153 Self::new_markdown(text, MarkdownOptions::default(), dictionary)
154 }
155
    /// Runs every token-condensing pass over the freshly parsed tokens.
    ///
    /// The passes run in the fixed order below. Later passes see the token
    /// stream as rewritten by earlier ones, so the order is part of the behavior.
    fn apply_fixups(&mut self) {
        // Merge runs of whitespace tokens first.
        self.condense_spaces();
        self.condense_newlines();
        // Must follow `condense_newlines`: it inspects the merged newline counts.
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_dotted_truncations();
        // Top-level domains are handled before generic filename extensions.
        self.condense_common_top_level_domains();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        // Quote twins store token indices, so this runs after all removals.
        self.match_quotes();
    }
171
172 fn parse(&mut self, dictionary: &impl Dictionary) {
176 self.apply_fixups();
177
178 let chunker = burn_chunker();
179 let tagger = brill_tagger();
180
181 for sent in self.tokens.iter_sentences_mut() {
182 let token_strings: Vec<_> = sent
183 .iter()
184 .filter(|t| !t.kind.is_whitespace())
185 .map(|t| t.get_str(&self.source))
186 .collect();
187
188 let token_tags = tagger.tag_sentence(&token_strings);
189 let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);
190
191 let word_sources: Vec<_> = sent
193 .iter()
194 .filter(|t| matches!(t.kind, TokenKind::Word(_)))
195 .map(|t| t.get_ch(&self.source))
196 .collect();
197
198 let mut ti = 0; let mut wi = 0; for token in sent.iter_mut() {
201 if let TokenKind::Word(meta) = &mut token.kind {
202 let word_source = word_sources[wi];
203 let mut found_meta = dictionary
204 .get_word_metadata(word_source)
205 .map(|c| c.into_owned());
206
207 if let Some(inner) = &mut found_meta {
208 inner.pos_tag = token_tags[ti].or_else(|| inner.infer_pos_tag());
209 inner.np_member = Some(np_flags[ti]);
210 }
211
212 *meta = found_meta;
213 ti += 1;
214 wi += 1;
215 } else if !token.kind.is_whitespace() {
216 ti += 1;
217 }
218 }
219 }
220 }
221
222 fn newlines_to_breaks(&mut self) {
224 for token in &mut self.tokens {
225 if let TokenKind::Newline(n) = token.kind
226 && n >= 2
227 {
228 token.kind = TokenKind::ParagraphBreak;
229 }
230 }
231 }
232
233 fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
239 for idx in indices {
241 let end_tok = self.tokens[idx + stretch_len - 1].clone();
242 let start_tok = &mut self.tokens[*idx];
243
244 start_tok.span.end = end_tok.span.end;
245 }
246
247 let old = self.tokens.clone();
249 self.tokens.clear();
250
251 self.tokens
253 .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);
254
255 let mut iter = indices.iter().peekable();
256
257 while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
258 self.tokens.push(old[*a_idx].clone());
259
260 if let Some(b_idx) = b {
261 self.tokens
262 .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
263 }
264 }
265
266 self.tokens.extend_from_slice(
268 &old[indices
269 .last()
270 .map(|v| v + stretch_len)
271 .unwrap_or(indices.len())..],
272 );
273 }
274
275 pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
276 let index = self
277 .tokens
278 .binary_search_by(|t| {
279 if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
280 Ordering::Equal
281 } else {
282 t.span.start.cmp(&char_index)
283 }
284 })
285 .ok()?;
286
287 Some(&self.tokens[index])
288 }
289
290 pub fn get_token(&self, index: usize) -> Option<&Token> {
292 self.tokens.get(index)
293 }
294
295 pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
297 match base.checked_add_signed(offset) {
298 None => None,
299 Some(idx) => self.get_token(idx),
300 }
301 }
302
303 pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
305 self.tokens.iter()
306 }
307
308 pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
309 fn is_np_member(t: &Token) -> bool {
310 t.kind
311 .as_word()
312 .and_then(|x| x.as_ref())
313 .and_then(|w| w.np_member)
314 .unwrap_or(false)
315 }
316
317 fn trim(slice: &[Token]) -> &[Token] {
318 let mut start = 0;
319 let mut end = slice.len();
320 while start < end && slice[start].kind.is_whitespace() {
321 start += 1;
322 }
323 while end > start && slice[end - 1].kind.is_whitespace() {
324 end -= 1;
325 }
326 &slice[start..end]
327 }
328
329 self.tokens
330 .as_slice()
331 .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
332 .filter_map(|s| {
333 let s = trim(s);
334 if s.iter().any(is_np_member) {
335 Some(s)
336 } else {
337 None
338 }
339 })
340 }
341
342 pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
344 self.tokens().map(|token| token.to_fat(&self.source))
345 }
346
347 pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
350 if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
352 return None;
353 }
354 let word_token = self.get_token_offset(base, offset + offset.signum());
356 let word_token = word_token?;
357 word_token.kind.is_word().then_some(word_token)
358 }
359
360 pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
362 self.fat_tokens().map(|t| t.into())
363 }
364
365 pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
366 span.get_content(&self.source)
367 }
368
369 pub fn get_span_content_str(&self, span: &Span<char>) -> String {
370 String::from_iter(self.get_span_content(span))
371 }
372
373 pub fn get_full_string(&self) -> String {
374 self.get_span_content_str(&Span::new(0, self.source.len()))
375 }
376
377 pub fn get_full_content(&self) -> &[char] {
378 &self.source
379 }
380
381 pub fn get_source(&self) -> &[char] {
382 &self.source
383 }
384
385 pub fn get_tokens(&self) -> &[Token] {
386 &self.tokens
387 }
388
389 fn match_quotes(&mut self) {
395 let mut pg_indices: Vec<_> = vec![0];
396 pg_indices.extend(self.iter_paragraph_break_indices());
397 pg_indices.push(self.tokens.len());
398
399 let mut quote_indices = Vec::new();
401 let mut open_quote_indices = Vec::new();
402
403 for (start, end) in pg_indices.into_iter().tuple_windows() {
404 let pg = &mut self.tokens[start..end];
405
406 quote_indices.clear();
407 quote_indices.extend(pg.iter_quote_indices());
408 open_quote_indices.clear();
409
410 for quote in "e_indices {
412 let is_open = *quote == 0
413 || pg[0..*quote].iter_word_likes().next().is_none()
414 || pg[quote - 1].kind.is_whitespace()
415 || matches!(
416 pg[quote - 1].kind.as_punctuation(),
417 Some(Punctuation::LessThan)
418 | Some(Punctuation::OpenRound)
419 | Some(Punctuation::OpenSquare)
420 | Some(Punctuation::OpenCurly)
421 | Some(Punctuation::EmDash)
422 | Some(Punctuation::EnDash)
423 | Some(Punctuation::Apostrophe)
424 );
425
426 if is_open {
427 open_quote_indices.push(*quote);
428 }
429 }
430
431 while let Some(open_idx) = open_quote_indices.pop() {
432 let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
433 continue;
434 };
435
436 if pg[close_idx + open_idx + 1]
437 .kind
438 .as_quote()
439 .unwrap()
440 .twin_loc
441 .is_some()
442 {
443 continue;
444 }
445
446 pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
447 Some(close_idx + open_idx + start + 1);
448 pg[close_idx + open_idx + 1]
449 .kind
450 .as_mut_quote()
451 .unwrap()
452 .twin_loc = Some(open_idx + start);
453 }
454 }
455 }
456
457 fn condense_number_suffixes(&mut self) {
459 if self.tokens.len() < 2 {
460 return;
461 }
462
463 let mut replace_starts = Vec::new();
464
465 for idx in 0..self.tokens.len() - 1 {
466 let b = &self.tokens[idx + 1];
467 let a = &self.tokens[idx];
468
469 if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
472 && let Some(found_suffix) =
473 OrdinalSuffix::from_chars(self.get_span_content(&b.span))
474 {
475 self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
476 replace_starts.push(idx);
477 }
478 }
479
480 self.condense_indices(&replace_starts, 2);
481 }
482
483 fn condense_spaces(&mut self) {
486 let mut cursor = 0;
487 let copy = self.tokens.clone();
488
489 let mut remove_these = VecDeque::new();
490
491 while cursor < self.tokens.len() {
492 let start_tok = &mut self.tokens[cursor];
494
495 if let TokenKind::Space(start_count) = &mut start_tok.kind {
496 loop {
497 cursor += 1;
498
499 if cursor >= copy.len() {
500 break;
501 }
502
503 let child_tok = ©[cursor];
504
505 if start_tok.span.end != child_tok.span.start {
507 break;
508 }
509
510 if let TokenKind::Space(n) = child_tok.kind {
511 *start_count += n;
512 start_tok.span.end = child_tok.span.end;
513 remove_these.push_back(cursor);
514 cursor += 1;
515 } else {
516 break;
517 };
518 }
519 }
520
521 cursor += 1;
522 }
523
524 self.tokens.remove_indices(remove_these);
525 }
526
    // Per-thread cache of the expression built by `uncached_dotted_truncation_expr`,
    // so it is built on first use in a thread rather than on every call.
    thread_local! {
        static DOTTED_TRUNCATION_EXPR: Lrc<FirstMatchOf> = Document::uncached_dotted_truncation_expr();
    }
530
    /// Builds the expression used to find dotted truncations.
    ///
    /// It matches either one of the words "esp", "etc" or "vs" followed by a
    /// period, or "et", whitespace, "al" and a period.
    /// NOTE(review): `aco`/`t_aco` presumably match regardless of capitalization;
    /// confirm against `SequenceExpr`.
    fn uncached_dotted_truncation_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(SequenceExpr::word_set(&["esp", "etc", "vs"]).then_period()),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }
542
543 fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
546 where
547 F: Fn(&mut Token),
548 {
549 let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();
550
551 let mut remove_indices = VecDeque::with_capacity(matches.len());
552
553 for m in matches {
554 remove_indices.extend(m.start + 1..m.end);
555 self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
556 edit(&mut self.tokens[m.start]);
557 }
558
559 self.tokens.remove_indices(remove_indices);
560 }
561
562 fn condense_dotted_truncations(&mut self) {
563 self.condense_expr(&Self::DOTTED_TRUNCATION_EXPR.with(|v| v.clone()), |_| {})
564 }
565
566 fn condense_newlines(&mut self) {
569 let mut cursor = 0;
570 let copy = self.tokens.clone();
571
572 let mut remove_these = VecDeque::new();
573
574 while cursor < self.tokens.len() {
575 let start_tok = &mut self.tokens[cursor];
577
578 if let TokenKind::Newline(start_count) = &mut start_tok.kind {
579 loop {
580 cursor += 1;
581
582 if cursor >= copy.len() {
583 break;
584 }
585
586 let child_tok = ©[cursor];
587 if let TokenKind::Newline(n) = child_tok.kind {
588 *start_count += n;
589 start_tok.span.end = child_tok.span.end;
590 remove_these.push_back(cursor);
591 cursor += 1;
592 } else {
593 break;
594 };
595 }
596 }
597
598 cursor += 1;
599 }
600
601 self.tokens.remove_indices(remove_these);
602 }
603
604 fn condense_dotted_initialisms(&mut self) {
607 if self.tokens.len() < 2 {
608 return;
609 }
610
611 let mut to_remove = VecDeque::new();
612
613 let mut cursor = 1;
614
615 let mut initialism_start = None;
616
617 loop {
618 let a = &self.tokens[cursor - 1];
619 let b = &self.tokens[cursor];
620
621 let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();
622
623 if is_initialism_chunk {
624 if initialism_start.is_none() {
625 initialism_start = Some(cursor - 1);
626 } else {
627 to_remove.push_back(cursor - 1);
628 }
629
630 to_remove.push_back(cursor);
631 cursor += 1;
632 } else {
633 if let Some(start) = initialism_start {
634 let end = self.tokens[cursor - 2].span.end;
635 let start_tok: &mut Token = &mut self.tokens[start];
636 start_tok.span.end = end;
637 }
638
639 initialism_start = None;
640 }
641
642 cursor += 1;
643
644 if cursor >= self.tokens.len() - 1 {
645 break;
646 }
647 }
648
649 self.tokens.remove_indices(to_remove);
650 }
651
652 fn condense_filename_extensions(&mut self) {
654 if self.tokens.len() < 2 {
655 return;
656 }
657
658 let mut to_remove = VecDeque::new();
659
660 let mut cursor = 1;
661
662 let mut ext_start = None;
663
664 loop {
665 let l = self.get_token_offset(cursor, -2);
667 let d = &self.tokens[cursor - 1];
668 let x = &self.tokens[cursor];
669 let r = self.get_token_offset(cursor, 1);
670
671 let is_ext_chunk = d.kind.is_period()
672 && x.kind.is_word()
673 && x.span.len() <= 3
674 && ((l.is_none_or(|t| t.kind.is_whitespace())
675 && r.is_none_or(|t| t.kind.is_whitespace()))
676 || (l.is_some_and(|t| t.kind.is_open_round())
677 && r.is_some_and(|t| t.kind.is_close_round())))
678 && {
679 let ext_chars = x.get_ch(&self.source);
680 ext_chars.iter().all(|c| c.is_ascii_lowercase())
681 || ext_chars.iter().all(|c| c.is_ascii_uppercase())
682 };
683
684 if is_ext_chunk {
685 if ext_start.is_none() {
686 ext_start = Some(cursor - 1);
687 self.tokens[cursor - 1].kind = TokenKind::Unlintable;
688 } else {
689 to_remove.push_back(cursor - 1);
690 }
691
692 to_remove.push_back(cursor);
693 cursor += 1;
694 } else {
695 if let Some(start) = ext_start {
696 let end = self.tokens[cursor - 2].span.end;
697 let start_tok: &mut Token = &mut self.tokens[start];
698 start_tok.span.end = end;
699 }
700
701 ext_start = None;
702 }
703
704 cursor += 1;
705
706 if cursor >= self.tokens.len() {
707 break;
708 }
709 }
710
711 self.tokens.remove_indices(to_remove);
712 }
713
714 fn condense_common_top_level_domains(&mut self) {
716 const COMMON_TOP_LEVEL_DOMAINS: &[&str; 106] = &[
717 "ai", "app", "blog", "co", "com", "dev", "edu", "gov", "info", "io", "me", "mil",
718 "net", "org", "shop", "tech", "uk", "us", "xyz", "jp", "de", "fr", "br", "it", "ru",
719 "es", "pl", "ca", "au", "cn", "in", "nl", "eu", "ch", "id", "at", "kr", "cz", "mx",
720 "be", "tv", "se", "tr", "tw", "al", "ua", "ir", "vn", "cl", "sk", "ly", "cc", "to",
721 "no", "fi", "pt", "dk", "ar", "hu", "tk", "gr", "il", "news", "ro", "my", "biz", "ie",
722 "za", "nz", "sg", "ee", "th", "pe", "bg", "hk", "rs", "lt", "link", "ph", "club", "si",
723 "site", "mobi", "by", "cat", "wiki", "la", "ga", "xxx", "cf", "hr", "ng", "jobs",
724 "online", "kz", "ug", "gq", "ae", "is", "lv", "pro", "fm", "tips", "ms", "sa", "int",
725 ];
726
727 if self.tokens.len() < 2 {
728 return;
729 }
730
731 let mut to_remove = VecDeque::new();
732 for cursor in 1..self.tokens.len() {
733 let l = self.get_token_offset(cursor, -2);
735 let d = &self.tokens[cursor - 1];
736 let tld = &self.tokens[cursor];
737 let r = self.get_token_offset(cursor, 1);
738
739 let is_tld_chunk = d.kind.is_period()
740 && tld.kind.is_word()
741 && tld
742 .get_ch(&self.source)
743 .iter()
744 .all(|c| c.is_ascii_alphabetic())
745 && tld
746 .get_ch(&self.source)
747 .eq_any_ignore_ascii_case_str(COMMON_TOP_LEVEL_DOMAINS)
748 && ((l.is_none_or(|t| t.kind.is_whitespace())
749 && r.is_none_or(|t| t.kind.is_whitespace()))
750 || (l.is_some_and(|t| t.kind.is_open_round())
751 && r.is_some_and(|t| t.kind.is_close_round())));
752
753 if is_tld_chunk {
754 self.tokens[cursor - 1].kind = TokenKind::Unlintable;
755 self.tokens[cursor - 1].span.end = self.tokens[cursor].span.end;
756 to_remove.push_back(cursor);
757 }
758 }
759
760 self.tokens.remove_indices(to_remove);
761 }
762
763 fn condense_tldr(&mut self) {
765 if self.tokens.len() < 3 {
766 return;
767 }
768
769 let mut to_remove = VecDeque::new();
770 let mut cursor = 2;
771
772 loop {
773 let tl = &self.tokens[cursor - 2];
774 let simicolon = &self.tokens[cursor - 1];
775 let dr = &self.tokens[cursor];
776
777 let is_tldr_chunk = tl.kind.is_word()
778 && tl.span.len() == 2
779 && tl.get_ch(&self.source).eq_ch(&['t', 'l'])
780 && simicolon.kind.is_semicolon()
781 && dr.kind.is_word()
782 && dr.span.len() >= 2
783 && dr.span.len() <= 3
784 && dr
785 .get_ch(&self.source)
786 .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
787
788 if is_tldr_chunk {
789 self.tokens[cursor - 2].span = Span::new(
791 self.tokens[cursor - 2].span.start,
792 self.tokens[cursor].span.end,
793 );
794
795 to_remove.push_back(cursor - 1);
797 to_remove.push_back(cursor);
798 }
799
800 cursor += 1;
802
803 if cursor >= self.tokens.len() {
804 break;
805 }
806 }
807
808 self.tokens.remove_indices(to_remove);
810 }
811
812 fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
820 where
821 F: Fn(&TokenKind) -> bool,
822 {
823 if self.tokens.len() < 3 {
824 return;
825 }
826
827 let mut to_remove = VecDeque::new();
828 let mut cursor = 2;
829
830 loop {
831 let l1 = &self.tokens[cursor - 2];
832 let delim = &self.tokens[cursor - 1];
833 let l2 = &self.tokens[cursor];
834
835 let is_delimited_chunk = l1.kind.is_word()
836 && l1.span.len() == 1
837 && is_delimiter(&delim.kind)
838 && l2.kind.is_word()
839 && l2.span.len() == 1;
840
841 if is_delimited_chunk {
842 let (l1, l2) = (
843 l1.get_ch(&self.source).first(),
844 l2.get_ch(&self.source).first(),
845 );
846
847 let is_valid_pair = match (l1, l2) {
848 (Some(l1), Some(l2)) => {
849 let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
850 valid_pairs.contains(&pair)
851 }
852 _ => false,
853 };
854
855 if is_valid_pair {
856 self.tokens[cursor - 2].span = Span::new(
857 self.tokens[cursor - 2].span.start,
858 self.tokens[cursor].span.end,
859 );
860 to_remove.push_back(cursor - 1);
861 to_remove.push_back(cursor);
862 }
863 }
864
865 cursor += 1;
866 if cursor >= self.tokens.len() {
867 break;
868 }
869 }
870
871 self.tokens.remove_indices(to_remove);
872 }
873
874 fn condense_ampersand_pairs(&mut self) {
876 self.condense_delimited_pairs(
877 |kind| kind.is_ampersand(),
878 &[
879 ('b', 'b'), ('b', 'w'), ('g', 't'), ('k', 'r'), ('q', 'a'), ('r', 'b'), ('r', 'd'), ('r', 'r'), ('s', 'p'), ],
889 );
890 }
891
892 fn condense_slash_pairs(&mut self) {
894 self.condense_delimited_pairs(
895 |kind| kind.is_slash(),
896 &[
897 ('a', 'c'), ('b', 'w'), ('c', 'o'), ('d', 'c'), ('d', 'l'), ('i', 'o'), ('j', 'k'), ('n', 'a'), ('r', 'c'), ('s', 'n'), ('y', 'n'), ('y', 'o'), ],
910 );
911 }
912
913 fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
914 let period = SequenceExpr::default().then_period();
915 Lrc::new(Repeating::new(Box::new(period), 2))
916 }
917
    // Per-thread cache of the expression built by `uncached_ellipsis_pattern`,
    // so it is built on first use in a thread rather than on every call.
    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
921
922 fn condense_ellipsis(&mut self) {
923 let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
924 self.condense_expr(&expr, |tok| {
925 tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
926 });
927 }
928}
929
930impl TokenStringExt for Document {
931 fn tokens(&self) -> &[Token] {
932 &self.tokens
933 }
934
935 fn tokens_mut(&mut self) -> &mut [Token] {
936 &mut self.tokens
937 }
938}
939
940impl Display for Document {
941 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
942 for token in &self.tokens {
943 write!(f, "{}", self.get_span_content_str(&token.span))?;
944 }
945
946 Ok(())
947 }
948}
949
// Unit tests for `Document`: token counts after condensing, token lookup
// helpers, and quote matching.
#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::TokenStringExt;
    use crate::{Span, parsers::MarkdownOptions};

    /// Asserts that `text` yields `final_tok_count` tokens with both the plain
    /// English and the Markdown parser.
    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn simple_contraction4() {
        assert_condensed_contractions("doesn't", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    /// Asserts that `source`, parsed as plain English, yields `count` tokens.
    /// The token kinds are printed via `dbg!` to help diagnose failures.
    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[0].span.len() == 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_pural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        // Nothing but words and whitespace may remain: no stray semicolon token.
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        // Exactly one token contains the semicolon, and it spans the whole "TL;DRs".
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.get_ch(&doc.source).contains(&';'))
            .collect_vec();
        assert!(tldrs.len() == 1);
        assert!(tldrs[0].get_str(&doc.source) == "TL;DRs");
    }

    #[test]
    fn condense_common_top_level_domains() {
        let doc = Document::new_plain_english_curated(".blog and .com and .NET");
        assert!(doc.tokens.len() == 9);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_common_top_level_domains_in_parens() {
        let doc = Document::new_plain_english_curated("(.blog)");
        assert!(doc.tokens.len() == 3);
        assert!(doc.tokens[0].kind.is_open_round());
        assert!(doc.tokens[1].kind.is_unlintable());
        assert!(doc.tokens[2].kind.is_close_round());
    }

    #[test]
    fn doesnt_condense_unknown_top_level_domains() {
        let doc = Document::new_plain_english_curated(".harper");
        assert!(doc.tokens.len() == 2);
        assert!(doc.tokens[0].kind.is_punctuation());
        assert!(doc.tokens[1].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert!(doc.tokens[2].get_str(&doc.source) == "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert!(doc.tokens.len() == 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn finds_unmatched_quotes_in_document() {
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        // One entry per quote token, in document order; `None` marks an unmatched quote.
        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }

    #[test]
    fn issue_1901() {
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        // One entry per quote token, in document order; `None` marks an unmatched quote.
        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
}