use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
use paste::paste;

use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

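/// A document containing lexed and parsed English text, along with the source
/// characters it was produced from.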
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
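    /// Get the indices of all tokens whose spans overlap the provided character [`Span`].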
    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

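    /// Get the [`FatToken`]s whose spans overlap the provided character [`Span`].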
    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

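    /// Lex and parse a string into a new [`Document`] using the provided parser and dictionary.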
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

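    /// Re-parse the token stream: run the token-condensing passes, then tag and chunk each
    /// sentence, attaching dictionary metadata to every word token.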
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            let mut i = 0;

            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary
                        .get_lexeme_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    i += 1;
                }
            }
        }
    }

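    /// Convert runs of two or more newlines into paragraph breaks.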
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind
                && n >= 2
            {
                token.kind = TokenKind::ParagraphBreak;
            }
        }
    }

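    /// Condense stretches of `stretch_len` tokens, each beginning at one of the provided
    /// indices, into single tokens spanning the whole stretch.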
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        let old = self.tokens.clone();
        self.tokens.clear();

        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

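    /// Get the token whose span contains the provided character index, if any.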
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

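    /// Iterate over the document's nominal phrases, yielding each as a slice of tokens with
    /// surrounding whitespace trimmed away.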
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

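    /// Get the word next to the whitespace at `base + offset`, searching in the direction of
    /// the offset. Returns `None` if the token at the offset is not whitespace or its
    /// neighbor is not a word.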
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

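    /// Pair up quotation marks, recording the index of each one's twin.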
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

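    /// Attach ordinal suffixes (like the `st` in `1st`) to the number tokens that precede them.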
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

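    /// Merge runs of adjacent space tokens into single tokens.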
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

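    /// Condense every match of the provided expression into a single token, applying `edit`
    /// to the surviving token.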
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

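    /// Condense dotted initialisms (like `N.S.A.`) into single word tokens.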
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

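    /// Condense filename extensions (like `.exe`) into single unlintable tokens.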
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut ext_start = None;

        loop {
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

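    /// Condense `TL;DR` and its plural forms into single word tokens, regardless of case.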
    fn condense_tldr(&mut self) {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let tl = &self.tokens[cursor - 2];
            let semicolon = &self.tokens[cursor - 1];
            let dr = &self.tokens[cursor];

            let is_tldr_chunk = tl.kind.is_word()
                && tl.span.len() == 2
                && tl
                    .span
                    .get_content(&self.source)
                    .eq_ignore_ascii_case_chars(&['t', 'l'])
                && semicolon.kind.is_semicolon()
                && dr.kind.is_word()
                && dr.span.len() >= 2
                && dr.span.len() <= 3
                && dr
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);

            if is_tldr_chunk {
                self.tokens[cursor - 2].span = Span::new(
                    self.tokens[cursor - 2].span.start,
                    self.tokens[cursor].span.end,
                );

                to_remove.push_back(cursor - 1);
                to_remove.push_back(cursor);
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

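    /// Condense two single-letter words joined by a delimiter (such as `&` or `/`) into one
    /// token, provided the letter pair appears in `valid_pairs`.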
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn condense_ampersand_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_ampersand(),
            &[
                ('b', 'b'),
                ('b', 'w'),
                ('g', 't'),
                ('k', 'r'),
                ('q', 'a'),
                ('r', 'b'),
                ('r', 'd'),
                ('r', 'r'),
                ('s', 'p'),
            ],
        );
    }

    fn condense_slash_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_slash(),
            &[
                ('a', 'c'),
                ('b', 'w'),
                ('c', 'o'),
                ('d', 'c'),
                ('d', 'l'),
                ('i', 'o'),
                ('j', 'k'),
                ('n', 'a'),
                ('r', 'c'),
                ('s', 'n'),
                ('y', 'n'),
                ('y', 'o'),
            ],
        );
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

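    /// Condense contractions (like `isn't`) into single word tokens.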
    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {})
    }
}

macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[0].span.len() == 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert!(tldrs.len() == 1);
        assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert!(doc.tokens.len() == 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }
}