use std::fmt::Display;
use itertools::Itertools;
use crate::linting::Suggestion;
use crate::parsers::{Markdown, Parser, PlainEnglish};
use crate::span::Span;
use crate::{FatToken, Punctuation, Token, TokenKind, TokenStringExt};
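/// A document of natural language, lexed into [`Token`]s by a [`Parser`].
/// The source text is kept alongside the tokens so that spans can be
/// resolved back to text and suggestions applied in place.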
pub struct Document {
source: Vec<char>,
tokens: Vec<Token>,
parser: Box<dyn Parser>
}
impl Default for Document {
fn default() -> Self {
Self::new("", Box::new(PlainEnglish))
}
}
impl Document {
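    /// Lexes and parses the given text with the provided parser, then runs
    /// the post-processing passes (contraction condensing and quote
    /// matching).
    ///
    /// A minimal usage sketch:
    ///
    /// ```ignore
    /// let doc = Document::new_plain_english("There were three little pigs.");
    /// assert_eq!(doc.get_full_string(), "There were three little pigs.");
    /// ```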
pub fn new(text: &str, parser: Box<dyn Parser>) -> Self {
let source: Vec<_> = text.chars().collect();
let mut doc = Self {
source,
tokens: Vec::new(),
parser
};
doc.parse();
doc
}
pub fn new_plain_english(text: &str) -> Self {
Self::new(text, Box::new(PlainEnglish))
}
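    /// Parses the given text as Markdown.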
pub fn new_markdown(text: &str) -> Self {
Self::new(text, Box::new(Markdown))
}
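    /// Re-lexes the source text and runs the token post-processing passes.
    /// Called whenever `self.source` changes.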
fn parse(&mut self) {
self.tokens = self.parser.parse(&self.source);
self.condense_contractions();
self.match_quotes();
}
pub fn get_token(&self, index: usize) -> Option<Token> {
self.tokens.get(index).copied()
}
pub fn tokens(&self) -> impl Iterator<Item = Token> + '_ {
self.tokens.iter().copied()
}
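    /// Iterates over the document's tokens converted to [`FatToken`]s,
    /// which carry their content with them.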
pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
self.tokens().map(|token| token.to_fat(&self.source))
}
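    /// Indices of all sentence-terminating tokens.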
    fn sentence_terminators(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens
            .iter()
            .enumerate()
            .filter_map(|(index, token)| is_sentence_terminator(&token.kind).then_some(index))
    }
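    /// Index of the last sentence-terminating token, if any.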
    fn last_sentence_terminator(&self) -> Option<usize> {
        self.tokens
            .iter()
            .enumerate()
            .rev()
            .find_map(|(index, token)| is_sentence_terminator(&token.kind).then_some(index))
    }
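    /// Iterates over the sentences of the document, where a sentence is a
    /// run of tokens ending in a terminator (see [`is_sentence_terminator`]).
    /// Tokens after the last terminator (or the whole document, if there is
    /// none) are yielded as a final, unterminated sentence.
    ///
    /// A sketch of the resulting behavior:
    ///
    /// ```ignore
    /// let doc = Document::new_plain_english("I came. I saw.");
    /// assert_eq!(doc.sentences().count(), 2);
    /// ```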
pub fn sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
let first_sentence = self
.sentence_terminators()
.next()
.map(|first_term| &self.tokens[0..=first_term]);
let rest = self
.sentence_terminators()
.tuple_windows()
.map(move |(a, b)| &self.tokens[a + 1..=b]);
let last = if let Some(last_i) = self.last_sentence_terminator() {
if last_i + 1 < self.tokens.len() {
Some(&self.tokens[last_i + 1..])
} else {
None
}
} else {
Some(self.tokens.as_slice())
};
first_sentence.into_iter().chain(rest).chain(last)
}
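    /// Iterates over all word tokens in the document.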
    pub fn words(&self) -> impl Iterator<Item = Token> + '_ {
        self.tokens
            .iter()
            .filter(|token| token.kind.is_word())
            .copied()
    }
pub fn get_span_content(&self, span: Span) -> &[char] {
span.get_content(&self.source)
}
pub fn get_span_content_str(&self, span: Span) -> String {
String::from_iter(self.get_span_content(span))
}
pub fn get_full_string(&self) -> String {
self.get_span_content_str(Span {
start: 0,
end: self.source.len()
})
}
pub fn get_full_content(&self) -> &[char] {
&self.source
}
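    /// Applies a [`Suggestion`] to the given span of the source text, then
    /// re-parses the document. Equal-length replacements are written in
    /// place; otherwise the source is spliced.
    ///
    /// A rough sketch of intended use (in practice the span and suggestion
    /// would come from a lint rather than be built by hand):
    ///
    /// ```ignore
    /// let mut doc = Document::new_plain_english("teh cat");
    /// doc.apply_suggestion(
    ///     &Suggestion::ReplaceWith(vec!['t', 'h', 'e']),
    ///     Span { start: 0, end: 3 }
    /// );
    /// assert_eq!(doc.get_full_string(), "the cat");
    /// ```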
pub fn apply_suggestion(&mut self, suggestion: &Suggestion, span: Span) {
match suggestion {
Suggestion::ReplaceWith(chars) => {
if chars.len() == span.len() {
for (index, c) in chars.iter().enumerate() {
self.source[index + span.start] = *c
}
} else {
let popped = self.source.split_off(span.start);
self.source.extend(chars);
self.source.extend(popped.into_iter().skip(span.len()));
}
}
Suggestion::Remove => {
for i in span.end..self.source.len() {
self.source[i - span.len()] = self.source[i];
}
self.source.truncate(self.source.len() - span.len());
}
}
self.parse();
}
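    /// Pairs quotation marks in order of appearance: the first quote token
    /// is twinned with the second, the third with the fourth, and so on.
    /// With an odd number of quotes, the final one is left unpaired.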
fn match_quotes(&mut self) {
let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();
for i in 0..quote_indices.len() / 2 {
let a_i = quote_indices[i * 2];
let b_i = quote_indices[i * 2 + 1];
{
let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
a.twin_loc = Some(b_i);
}
{
let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
b.twin_loc = Some(a_i);
}
}
}
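    /// Condenses contractions lexed as three tokens (`Word`, `Apostrophe`,
    /// `Word`) into a single `Word` token spanning all three, so that text
    /// like "isn't" is treated as one word downstream.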
fn condense_contractions(&mut self) {
if self.tokens.len() < 3 {
return;
}
let mut replace_starts = Vec::new();
for idx in 0..self.tokens.len() - 2 {
let a = self.tokens[idx];
let b = self.tokens[idx + 1];
let c = self.tokens[idx + 2];
if matches!(
(a.kind, b.kind, c.kind),
(
TokenKind::Word,
TokenKind::Punctuation(Punctuation::Apostrophe),
TokenKind::Word
)
) {
                // Only condense if this match doesn't overlap the previous
                // one; each condensed contraction consumes three tokens.
                // (Written as an addition to avoid `usize` underflow.)
                let should_replace = match replace_starts.last() {
                    Some(last_idx) => *last_idx + 2 < idx,
                    None => true
                };
if should_replace {
replace_starts.push(idx);
self.tokens[idx].span.end = c.span.end;
}
}
}
        // Rebuild the token stream, dropping the apostrophe and second word
        // of each condensed contraction.
        let old = self.tokens.clone();
        self.tokens.clear();
        self.tokens
            .extend_from_slice(&old[0..replace_starts.first().copied().unwrap_or(0)]);
        let mut iter = replace_starts.iter().peekable();
        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx]);
            if let Some(b_idx) = b {
                self.tokens.extend_from_slice(&old[a_idx + 3..**b_idx]);
            }
        }
        // When nothing was condensed, this copies the entire old stream.
        self.tokens
            .extend_from_slice(&old[replace_starts.last().map(|v| v + 3).unwrap_or(0)..]);
}
}
impl Display for Document {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
for token in &self.tokens {
write!(f, "{}", self.get_span_content_str(token.span))?;
}
Ok(())
}
}
impl TokenStringExt for Document {
fn first_word(&self) -> Option<Token> {
self.tokens.first_word()
}
fn first_sentence_word(&self) -> Option<Token> {
self.tokens.first_sentence_word()
}
fn first_non_whitespace(&self) -> Option<Token> {
self.tokens.first_non_whitespace()
}
fn iter_word_indices(&self) -> impl Iterator<Item = usize> + '_ {
self.tokens.iter_word_indices()
}
fn iter_words(&self) -> impl Iterator<Item = &Token> + '_ {
self.tokens.iter_words()
}
fn iter_space_indices(&self) -> impl Iterator<Item = usize> + '_ {
self.tokens.iter_space_indices()
}
fn iter_spaces(&self) -> impl Iterator<Item = &Token> + '_ {
self.tokens.iter_spaces()
}
fn iter_apostrophe_indices(&self) -> impl Iterator<Item = usize> + '_ {
self.tokens.iter_apostrophe_indices()
}
fn iter_apostrophes(&self) -> impl Iterator<Item = &Token> + '_ {
self.tokens.iter_apostrophes()
}
fn span(&self) -> Option<Span> {
self.tokens.span()
}
fn iter_quote_indices(&self) -> impl Iterator<Item = usize> + '_ {
self.tokens.iter_quote_indices()
}
fn iter_quotes(&self) -> impl Iterator<Item = Token> + '_ {
self.tokens.iter_quotes()
}
}
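/// Whether a token ends a sentence: terminal punctuation (period, bang, or
/// question mark) or a newline.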
fn is_sentence_terminator(token: &TokenKind) -> bool {
match token {
TokenKind::Punctuation(punct) => [
Punctuation::Period,
Punctuation::Bang,
Punctuation::Question
]
.contains(punct),
TokenKind::Newline(_) => true,
_ => false
}
}
#[cfg(test)]
mod tests {
use super::Document;
use crate::parsers::{Markdown, PlainEnglish};
use crate::token::TokenStringExt;
#[test]
fn parses_sentences_correctly() {
let text = "There were three little pigs. They built three little homes.";
let document = Document::new(text, Box::new(PlainEnglish));
let mut sentence_strs = vec![];
for sentence in document.sentences() {
if let Some(span) = sentence.span() {
sentence_strs.push(document.get_span_content_str(span));
}
}
assert_eq!(
sentence_strs,
vec![
"There were three little pigs.",
" They built three little homes."
]
)
}
fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
let document = Document::new(text, Box::new(PlainEnglish));
assert_eq!(document.tokens.len(), final_tok_count);
let markdown_parser = Markdown;
let document = Document::new(text, Box::new(markdown_parser));
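        // The Markdown parser yields one extra token for the same text
        // (assumption: a trailing newline appended by the Markdown parser).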
assert_eq!(document.tokens.len(), final_tok_count + 1);
}
#[test]
fn simple_contraction() {
assert_condensed_contractions("isn't", 1);
}
#[test]
fn simple_contraction2() {
assert_condensed_contractions("wasn't", 1);
}
#[test]
fn simple_contraction3() {
assert_condensed_contractions("There's", 1);
}
#[test]
fn medium_contraction() {
assert_condensed_contractions("isn't wasn't", 3);
}
#[test]
fn medium_contraction2() {
assert_condensed_contractions("There's no way", 5);
}
}