mod document;
mod markdown;
pub mod phrase_search;
mod tokenizer;
pub use document::{Directive, Document, ListItem, Paragraph, Section, Sentence};
pub use markdown::parse_markdown;
pub use tokenizer::{split_sentences, word_count};
use crate::types::SourceFile;
/// Parses plain text into a [`Document`] that holds exactly one [`Section`].
///
/// Line endings are unified first: every `\r\n` and every remaining lone `\r`
/// becomes `\n` (the input is only copied when it actually contains a `\r`).
/// The unified text is then cut at every `"\n\n"`; each piece is trimmed,
/// pieces that are empty after trimming are dropped, and every surviving
/// piece becomes a [`Paragraph`] carrying its trimmed text and the 1-based
/// line number reported by `count_lines_until` for that piece.
///
/// The single section is built with `None` and `0` as its first two
/// arguments (presumably "no heading" at level 0 — confirm against
/// `Section::new`).
#[must_use]
pub fn parse_plain(text: &str, source: SourceFile) -> Document {
    // Borrow the input untouched unless a carriage return forces a rewrite.
    let unified: std::borrow::Cow<'_, str> = if text.contains('\r') {
        std::borrow::Cow::Owned(text.replace("\r\n", "\n").replace('\r', "\n"))
    } else {
        std::borrow::Cow::Borrowed(text)
    };
    let body: &str = &unified;

    let mut paragraphs: Vec<Paragraph> = Vec::new();
    for (chunk_index, raw) in body.split("\n\n").enumerate() {
        let content = raw.trim();
        if content.is_empty() {
            // Runs of blank lines produce empty pieces; they carry no text.
            continue;
        }
        let start_line = count_lines_until(body, chunk_index);
        paragraphs.push(Paragraph::new(content.to_string(), start_line));
    }

    Document::new(source, vec![Section::new(None, 0, paragraphs)])
}
/// Returns the 1-based line number on which the content of chunk
/// `chunk_index` begins, where chunks are the pieces produced by
/// `text.split("\n\n")`.
///
/// The chunks are walked with the very same `split("\n\n")` call the caller
/// uses, so indices always agree with the caller's enumeration — including
/// for runs of three or more newlines, where `split` matches separators
/// without overlap. Newlines inside the chunk's leading whitespace are added
/// to the result, so the returned line is where the trimmed text actually
/// starts rather than where the raw chunk starts.
///
/// Special cases:
/// - a chunk that is empty or whitespace-only yields the line on which the
///   raw chunk starts;
/// - a `chunk_index` past the last chunk yields the number of the last line
///   of `text` (one plus the total count of `\n`).
///
/// `text` is expected to use `\n` line endings only. All arithmetic
/// saturates at `u32::MAX` instead of overflowing.
fn count_lines_until(text: &str, chunk_index: usize) -> u32 {
    // Number of `\n` bytes in `s`, saturating at `u32::MAX`.
    fn newlines_in(s: &str) -> u32 {
        s.bytes()
            .filter(|&b| b == b'\n')
            .fold(0, |n: u32, _| n.saturating_add(1))
    }

    // Line on which the chunk currently being visited starts.
    let mut line: u32 = 1;
    for (idx, chunk) in text.split("\n\n").enumerate() {
        if idx == chunk_index {
            let content = chunk.trim_start();
            if content.is_empty() {
                return line;
            }
            // `content` is a suffix of `chunk`, so this slice ends on a
            // character boundary and covers exactly the leading whitespace.
            let leading = &chunk[..chunk.len() - content.len()];
            return line.saturating_add(newlines_in(leading));
        }
        // Skip the chunk's own newlines plus the two in the separator.
        line = line.saturating_add(newlines_in(chunk)).saturating_add(2);
    }

    // No such chunk: report the last line of the text.
    newlines_in(text).saturating_add(1)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `parse_plain` on `input` with an anonymous source.
    fn parse(input: &str) -> Document {
        parse_plain(input, SourceFile::Anonymous)
    }

    /// Blank-line separated text yields one section with one paragraph per block.
    #[test]
    fn parse_plain_splits_on_blank_lines() {
        let doc = parse("First paragraph.\n\nSecond paragraph.\n\nThird.");
        assert_eq!(doc.sections.len(), 1);
        assert_eq!(doc.sections[0].paragraphs.len(), 3);
    }

    /// Leading and repeated blank lines do not create empty paragraphs.
    #[test]
    fn parse_plain_ignores_empty_chunks() {
        let doc = parse("\n\n\nFirst.\n\n\n\nSecond.");
        assert_eq!(doc.sections[0].paragraphs.len(), 2);
    }

    /// CRLF input splits like LF input and leaves no `\r` in paragraph text.
    #[test]
    fn parse_plain_handles_crlf_line_endings() {
        let doc = parse("First paragraph.\r\n\r\nSecond paragraph.\r\n\r\nThird.");
        let paragraphs = &doc.sections[0].paragraphs;
        assert_eq!(paragraphs.len(), 3);
        assert!(!paragraphs.iter().any(|p| p.text.contains('\r')));
    }

    /// Text without any blank line is a single paragraph.
    #[test]
    fn parse_plain_single_paragraph() {
        let doc = parse("Just one paragraph, with one sentence.");
        assert_eq!(doc.sections[0].paragraphs.len(), 1);
    }
}