use regex::Regex;
use std::sync::LazyLock;
use crate::wrapping::atomic_patterns::ATOMIC_CONSTRUCT_PATTERN;
use crate::wrapping::tag_handling::{denormalize_adjacent_tags, normalize_adjacent_tags};
/// Leading marker of every placeholder produced by `extract_atomic_constructs`.
/// It begins with a NUL byte, which `restore_atomic_constructs` uses as a cheap
/// "does this token contain a placeholder?" check.
const PLACEHOLDER_PREFIX: &str = "\x00AC";
/// Trailing marker that terminates every placeholder.
const PLACEHOLDER_SUFFIX: &str = "\x00";
/// Padding character inserted between the index and the suffix so that a
/// placeholder has the same character count as the construct it stands for
/// (only when the construct is longer than the unpadded placeholder).
const PLACEHOLDER_FILLER: char = '\x01';
/// Matches a whole word that is exactly one of `-`, `*`, `+`, `>` or a run of
/// one or more `#` characters.
static MD_SPECIALS_PAT: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^([-*+>]|#+)$").expect("valid MD_SPECIALS_PAT regex"));
/// Matches a whole word made of one or more ASCII digits followed by `.` or `)`.
static MD_NUMERAL_PAT: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^[0-9]+[.)]$").expect("valid MD_NUMERAL_PAT regex"));
/// Matches a run of one or more whitespace characters.
static WHITESPACE_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\s+").expect("valid WHITESPACE_RE regex"));
fn extract_atomic_constructs(text: &str) -> (Vec<String>, Vec<String>, String) {
let mut constructs: Vec<String> = Vec::new();
let mut placeholders: Vec<String> = Vec::new();
let result = ATOMIC_CONSTRUCT_PATTERN.replace_all(text, |caps: ®ex::Captures<'_>| {
let construct = caps.get(0).expect("group 0 always exists").as_str().to_string();
let idx = constructs.len();
let construct_len = construct.chars().count();
let core = format!("{PLACEHOLDER_PREFIX}{idx}{PLACEHOLDER_SUFFIX}");
let core_len = core.chars().count();
let placeholder = if construct_len > core_len {
let padding: String =
std::iter::repeat_n(PLACEHOLDER_FILLER, construct_len - core_len).collect();
format!("{PLACEHOLDER_PREFIX}{idx}{padding}{PLACEHOLDER_SUFFIX}")
} else {
core
};
constructs.push(construct);
placeholders.push(placeholder.clone());
placeholder
});
(constructs, placeholders, result.into_owned())
}
/// Substitutes placeholders in `tokens` back with the constructs they stand for.
///
/// `placeholders[i]` is expected to correspond to `constructs[i]`. Tokens
/// without a NUL byte are copied unchanged. A token that is exactly one
/// placeholder is swapped via a map lookup. Any other token has every
/// placeholder it contains replaced, in placeholder order.
///
/// Returns a new vector with the same length and order as `tokens`.
fn restore_atomic_constructs(
    tokens: &[String],
    constructs: &[String],
    placeholders: &[String],
) -> Vec<String> {
    if constructs.is_empty() {
        return tokens.to_vec();
    }

    let mut by_placeholder: std::collections::HashMap<&str, &str> =
        std::collections::HashMap::new();
    for (marker, original) in placeholders.iter().zip(constructs.iter()) {
        by_placeholder.insert(marker.as_str(), original.as_str());
    }

    let mut restored = Vec::with_capacity(tokens.len());
    for token in tokens {
        let rebuilt = if !token.contains('\x00') {
            // Fast path: no placeholder can be present without a NUL byte.
            token.clone()
        } else if let Some(&original) = by_placeholder.get(token.as_str()) {
            original.to_owned()
        } else {
            // The placeholder is embedded in surrounding text (e.g. punctuation).
            placeholders.iter().zip(constructs.iter()).fold(
                token.clone(),
                |acc, (marker, original)| {
                    if acc.contains(marker.as_str()) {
                        acc.replace(marker.as_str(), original)
                    } else {
                        acc
                    }
                },
            )
        };
        restored.push(rebuilt);
    }
    restored
}
/// Splits `text` into whitespace-separated words while keeping every match of
/// `ATOMIC_CONSTRUCT_PATTERN` together as a single word.
///
/// The text is first passed through `normalize_adjacent_tags` (defined in
/// `tag_handling`; its exact effect is not visible here). Atomic constructs
/// are then swapped for placeholders, the result is split on whitespace, and
/// the placeholders are swapped back.
pub fn html_md_word_split(text: &str) -> Vec<String> {
    let normalized = normalize_adjacent_tags(text);
    let (constructs, placeholders, shielded) = extract_atomic_constructs(&normalized);

    let mut tokens: Vec<String> = Vec::new();
    for piece in shielded.split_whitespace() {
        tokens.push(piece.to_owned());
    }

    restore_atomic_constructs(&tokens, &constructs, &placeholders)
}
/// Splits `text` on runs of whitespace and returns the non-empty pieces as
/// owned strings. No special handling is applied to any construct.
pub fn simple_word_split(text: &str) -> Vec<String> {
    let mut words = Vec::new();
    words.extend(text.split_whitespace().map(str::to_owned));
    words
}
/// Backslash-escapes `word` when it matches one of the Markdown marker patterns.
///
/// * A word matching `MD_NUMERAL_PAT` (digits followed by `.` or `)`) gets a
///   backslash inserted before its final character: `1.` becomes `1\.`.
/// * A word matching `MD_SPECIALS_PAT` (`-`, `*`, `+`, `>` or a run of `#`)
///   gets a backslash prepended: `-` becomes `\-`.
/// * Any other word is returned unchanged.
///
/// NOTE(review): presumably this stops a wrapped word from being parsed as a
/// list, quote or heading marker at the start of a line; confirm with callers.
pub fn markdown_escape_word(word: &str) -> String {
    if MD_NUMERAL_PAT.is_match(word) {
        // Split just before the last character (the `.` or `)` delimiter).
        let cut = word.char_indices().next_back().map_or(word.len(), |(at, _)| at);
        let (digits, delimiter) = word.split_at(cut);
        return format!("{digits}\\{delimiter}");
    }
    if MD_SPECIALS_PAT.is_match(word) {
        return format!("\\{word}");
    }
    word.to_owned()
}
/// Greedily wraps `text` into lines whose tracked width does not exceed `width`.
///
/// Widths are measured in `char`s. A word that is longer than the available
/// space is never split; it is placed on a line by itself.
///
/// # Parameters
/// * `width` - maximum line width; `0` disables wrapping and yields at most
///   one line.
/// * `initial_column` - columns already occupied on the first line.
/// * `subsequent_offset` - columns already occupied on every later line.
/// * `replace_whitespace` - collapse every whitespace run to a single space
///   before splitting.
/// * `drop_whitespace` - trim leading/trailing whitespace from each line.
/// * `splitter` - word splitter; defaults to `html_md_word_split`.
/// * `is_markdown` - when `true`, a word that starts any line other than the
///   first is passed through `markdown_escape_word`.
///
/// # Returns
/// The wrapped lines without any indentation applied. The result is empty
/// when there are no words (or, for `width == 0`, when the text is empty
/// after the requested whitespace handling).
#[allow(clippy::too_many_arguments, clippy::fn_params_excessive_bools, clippy::type_complexity)]
pub fn wrap_paragraph_lines(
    text: &str,
    width: usize,
    initial_column: usize,
    subsequent_offset: usize,
    replace_whitespace: bool,
    drop_whitespace: bool,
    splitter: Option<&dyn Fn(&str) -> Vec<String>>,
    is_markdown: bool,
) -> Vec<String> {
    let text = if replace_whitespace {
        WHITESPACE_RE.replace_all(text, " ").into_owned()
    } else {
        text.to_string()
    };

    // Width 0 means "do not wrap": emit the text as a single line.
    if width == 0 {
        let text = if drop_whitespace { text.trim().to_string() } else { text };
        return if text.is_empty() { Vec::new() } else { vec![text] };
    }

    let default_splitter = html_md_word_split;
    let splitter = splitter.unwrap_or(&default_splitter);
    let words = splitter(&text);

    // Joins the words collected for one line, trimming if requested.
    let finish_line = |parts: &[String]| -> String {
        let joined = parts.join(" ");
        if drop_whitespace { joined.trim().to_string() } else { joined }
    };

    let mut lines: Vec<String> = Vec::new();
    let mut current_line: Vec<String> = Vec::new();
    let mut current_width = initial_column;
    let mut first_line = true;

    for word in words {
        let word_width = word.chars().count();
        let space_width = usize::from(!current_line.is_empty());

        if current_width + word_width + space_width <= width {
            current_width += word_width + space_width;
            current_line.push(word);
            continue;
        }

        // The word does not fit: flush what we have and start a new line.
        if !current_line.is_empty() {
            lines.push(finish_line(&current_line));
            current_line.clear();
            first_line = false;
        }

        // If nothing was flushed we are still on the first line (its very first
        // word is too long), so the line begins at `initial_column`, not at
        // `subsequent_offset`. Using the wrong base here would let later words
        // be appended past `width`.
        let (line_start, word) = if first_line {
            (initial_column, word)
        } else if is_markdown {
            (subsequent_offset, markdown_escape_word(&word))
        } else {
            (subsequent_offset, word)
        };
        current_width = line_start + word.chars().count();
        current_line.push(word);
    }

    if !current_line.is_empty() {
        lines.push(finish_line(&current_line));
    }
    lines
}
/// Wraps `text` with `wrap_paragraph_lines`, applies indentation, and joins
/// the lines with `\n`.
///
/// The first line is budgeted `initial_column` plus the character count of
/// `initial_indent`; later lines are budgeted the character count of
/// `subsequent_indent`. `initial_indent` is only written into the output when
/// `initial_column == 0`. `subsequent_indent` is prepended to every line after
/// the first. The joined result is passed through `denormalize_adjacent_tags`
/// (defined in `tag_handling`) before being returned.
///
/// The remaining parameters are forwarded unchanged to `wrap_paragraph_lines`.
#[allow(clippy::too_many_arguments, clippy::type_complexity)]
pub fn wrap_paragraph(
    text: &str,
    width: usize,
    initial_indent: &str,
    subsequent_indent: &str,
    initial_column: usize,
    replace_whitespace: bool,
    drop_whitespace: bool,
    splitter: Option<&dyn Fn(&str) -> Vec<String>>,
    is_markdown: bool,
) -> String {
    let first_line_start = initial_column + initial_indent.chars().count();
    let later_line_start = subsequent_indent.chars().count();

    let wrapped = wrap_paragraph_lines(
        text,
        width,
        first_line_start,
        later_line_start,
        replace_whitespace,
        drop_whitespace,
        splitter,
        is_markdown,
    );

    // The initial indent is only emitted when the paragraph starts at column 0.
    let first_prefix = if initial_column == 0 { initial_indent } else { "" };

    let indented: Vec<String> = wrapped
        .iter()
        .enumerate()
        .map(|(row, content)| {
            let prefix = if row == 0 { first_prefix } else { subsequent_indent };
            format!("{prefix}{content}")
        })
        .collect();

    denormalize_adjacent_tags(&indented.join("\n"))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Numbered-list markers and single Markdown marker words are escaped;
    /// ordinary words are left alone.
    #[test]
    fn test_markdown_escape_word() {
        assert_eq!(markdown_escape_word("1."), "1\\.");
        assert_eq!(markdown_escape_word("10."), "10\\.");
        assert_eq!(markdown_escape_word("1)"), "1\\)");
        assert_eq!(markdown_escape_word("-"), "\\-");
        assert_eq!(markdown_escape_word("*"), "\\*");
        assert_eq!(markdown_escape_word("+"), "\\+");
        assert_eq!(markdown_escape_word(">"), "\\>");
        assert_eq!(markdown_escape_word("#"), "\\#");
        assert_eq!(markdown_escape_word("##"), "\\##");
        assert_eq!(markdown_escape_word("hello"), "hello");
    }

    /// Every wrapped line fits the width unless it consists of one overlong word.
    #[test]
    fn test_simple_wrapping() {
        let lines = wrap_paragraph_lines(
            "Hello world this is a test",
            10,
            0,
            0,
            true,
            true,
            Some(&simple_word_split),
            false,
        );
        assert!(!lines.is_empty());
        for line in &lines {
            assert!(line.chars().count() <= 10 || line.split_whitespace().count() == 1);
        }
    }

    /// A width of zero disables wrapping and returns the text as one line.
    #[test]
    fn test_no_wrap() {
        let lines =
            wrap_paragraph_lines("Hello world this is a test", 0, 0, 0, true, true, None, false);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], "Hello world this is a test");
    }

    /// Inline code spans stay intact as single words.
    #[test]
    fn test_html_md_word_split() {
        let words = html_md_word_split("Hello `code` world");
        assert_eq!(words, vec!["Hello", "`code`", "world"]);
    }

    /// A Markdown link is kept together as one word.
    #[test]
    fn test_html_md_word_split_links() {
        let words = html_md_word_split("See [link](url) here");
        assert_eq!(words, vec!["See", "[link](url)", "here"]);
    }
}