use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
use itertools::Itertools;
use paste::paste;

use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

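/// A parsed document: the source text as a shared buffer of characters, plus the tokens
/// lexed and tagged from it.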
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
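    /// Get the indices of all tokens whose spans overlap the given character span.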
    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

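    /// Get owned copies of all tokens whose spans overlap the given character span.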
    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

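    /// Lex and parse `text` with the given parser and dictionary.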
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

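    /// Lex and parse `text` with the given parser and the curated dictionary.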
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

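    /// Lex and parse an already-collected character buffer.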
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

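    /// Parse `text` as plain English with the curated dictionary.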
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

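    /// Parse `text` as plain English with a custom dictionary.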
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

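    /// Parse `text` as Markdown with the curated dictionary.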
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

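    /// Parse `text` as Markdown with default options and the curated dictionary.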
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

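    /// Parse `text` as Markdown with a custom dictionary.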
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

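    /// Parse `text` as Markdown with default options and a custom dictionary.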
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

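    /// Re-run everything that comes after lexing: condense multi-token constructs, pair up
    /// quotes, then attach POS tags, noun-phrase membership, and dictionary metadata to each
    /// word token.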
    fn parse(&mut self, dictionary: &impl Dictionary) {
        // Condense multi-token constructs before tagging.
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            // Index into the non-whitespace tokens tagged above.
            let mut i = 0;

            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary
                        .get_lexeme_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    i += 1;
                }
            }
        }
    }

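    /// Convert runs of two or more newlines into paragraph breaks.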
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind
                && n >= 2
            {
                token.kind = TokenKind::ParagraphBreak;
            }
        }
    }

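    /// Condense each stretch of `stretch_len` tokens starting at the given (sorted,
    /// non-overlapping) indices into the stretch's first token.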
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Extend the span of the first token in each stretch to cover the whole stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Rebuild the token stream, skipping the tokens each stretch absorbed.
        let old = self.tokens.clone();
        self.tokens.clear();

        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

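    /// Pair up quotation marks within each paragraph, recording each quote's partner in its
    /// `twin_loc`.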
    fn match_quotes(&mut self) {
        // Quote pairing is scoped to paragraphs: collect the indices that delimit them.
        let mut pg_indices: Vec<_> = vec![0];
        pg_indices.extend(self.iter_paragraph_break_indices());
        pg_indices.push(self.tokens.len());

        let mut quote_indices = Vec::new();
        let mut open_quote_indices = Vec::new();

        for (start, end) in pg_indices.into_iter().tuple_windows() {
            let pg = &mut self.tokens[start..end];

            quote_indices.clear();
            quote_indices.extend(pg.iter_quote_indices());
            open_quote_indices.clear();

            for quote in &quote_indices {
                // A quote is "open" if it starts the paragraph, nothing word-like precedes
                // it, or it directly follows whitespace or opening punctuation.
                let is_open = *quote == 0
                    || pg[0..*quote].iter_word_likes().next().is_none()
                    || pg[quote - 1].kind.is_whitespace()
                    || matches!(
                        pg[quote - 1].kind.as_punctuation(),
                        Some(Punctuation::LessThan)
                            | Some(Punctuation::OpenRound)
                            | Some(Punctuation::OpenSquare)
                            | Some(Punctuation::OpenCurly)
                            | Some(Punctuation::Apostrophe)
                    );

                if is_open {
                    open_quote_indices.push(*quote);
                }
            }

            while let Some(open_idx) = open_quote_indices.pop() {
                let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
                    continue;
                };

                // `close_idx` is relative to the slice that starts after the open quote.
                if pg[close_idx + open_idx + 1]
                    .kind
                    .as_quote()
                    .unwrap()
                    .twin_loc
                    .is_some()
                {
                    continue;
                }

                pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
                    Some(close_idx + open_idx + start + 1);
                pg[close_idx + open_idx + 1]
                    .kind
                    .as_mut_quote()
                    .unwrap()
                    .twin_loc = Some(open_idx + start);
            }
        }
    }

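    /// Merge each number token with a following ordinal suffix (e.g. "1st") into a single token.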
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

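    /// Condense runs of adjacent space tokens into single tokens.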
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                // Absorb every space token that follows.
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense tokens whose spans actually touch.
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

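    /// Condense every match of `expr` into its first token, merging the spans and applying
    /// `edit` to the surviving token.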
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.start..m.end].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

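    /// Condense runs of newline tokens into single tokens.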
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                // Absorb every newline token that follows.
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

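    /// Condense dotted initialisms (e.g. "N.S.A.") into single word tokens.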
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

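    /// Condense isolated filename extensions (e.g. ".exe") into single tokens and mark them
    /// unlintable.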
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut ext_start = None;

        loop {
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

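    /// Condense "TL;DR" and "TL;DRs" (in any case) into single word tokens.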
    fn condense_tldr(&mut self) {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let tl = &self.tokens[cursor - 2];
            let semicolon = &self.tokens[cursor - 1];
            let dr = &self.tokens[cursor];

            let is_tldr_chunk = tl.kind.is_word()
                && tl.span.len() == 2
                && tl
                    .span
                    .get_content(&self.source)
                    .eq_ignore_ascii_case_chars(&['t', 'l'])
                && semicolon.kind.is_semicolon()
                && dr.kind.is_word()
                && dr.span.len() >= 2
                && dr.span.len() <= 3
                && dr
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);

            if is_tldr_chunk {
                // Merge the three tokens into the first one.
                self.tokens[cursor - 2].span = Span::new(
                    self.tokens[cursor - 2].span.start,
                    self.tokens[cursor].span.end,
                );

                to_remove.push_back(cursor - 1);
                to_remove.push_back(cursor);
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

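    /// Condense two single-letter words joined by a delimiter (e.g. "R&D" or "I/O") into one
    /// word token, but only for pairs listed (lowercase) in `valid_pairs`.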
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

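    /// Condense common ampersand abbreviations such as "R&D" and "Q&A".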
    fn condense_ampersand_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_ampersand(),
            &[
                ('b', 'b'),
                ('b', 'w'),
                ('g', 't'),
                ('k', 'r'),
                ('q', 'a'),
                ('r', 'b'),
                ('r', 'd'),
                ('r', 'r'),
                ('s', 'p'),
            ],
        );
    }

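    /// Condense common slash abbreviations such as "I/O" and "N/A".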
    fn condense_slash_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_slash(),
            &[
                ('a', 'c'),
                ('b', 'w'),
                ('c', 'o'),
                ('d', 'c'),
                ('d', 'l'),
                ('i', 'o'),
                ('j', 'k'),
                ('n', 'a'),
                ('r', 'c'),
                ('s', 'n'),
                ('y', 'n'),
                ('y', 'o'),
            ],
        );
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {})
    }
}

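/// Generates methods on `Document` that delegate to the `TokenStringExt` methods of the same
/// name on its token slice.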
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::TokenStringExt;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[0].span.len() == 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert!(tldrs.len() == 1);
        assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert!(doc.tokens.len() == 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn finds_unmatched_quotes_in_document() {
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }

    #[test]
    fn issue_1901() {
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
        "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
}