use crate::text_utils::{char_len, leading_whitespace};
use super::layout_signals::looks_like_git_log_graph_line;
use super::should_keep_pdf_line_layout;
/// Splits a list-item line into `(indent, marker, content)`.
///
/// `indent` is whatever `leading_whitespace` reports for `line`, and the
/// returned marker always ends with a single space. Marker shapes are tried
/// in this order:
///
/// 1. a bullet (`•`, `-`, `*`, `◦`) followed by a space; the content is
///    trimmed on both sides and may be empty,
/// 2. an option flag accepted by `parse_option_flag`, followed by a space and
///    non-blank text,
/// 3. a format specifier accepted by `parse_format_specifier`, followed by a
///    space and non-blank text,
/// 4. one or more ASCII digits, then `.` or `)`, then a space.
///
/// Returns `None` when no shape matches, or when the line (with leading
/// spaces and tabs removed) is accepted by `looks_like_git_log_graph_line`.
pub(crate) fn parse_list_marker(
    line: &str,
) -> Option<(String, String, String)> {
    let indent = leading_whitespace(line).to_string();
    let body = line.trim_start_matches([' ', '\t']);
    if looks_like_git_log_graph_line(body) {
        return None;
    }

    // Plain bullets: first bullet glyph that is followed by a space wins.
    let bullet_hit = ["•", "-", "*", "◦"].iter().find_map(|bullet| {
        let marker = format!("{bullet} ");
        let rest = body.strip_prefix(marker.as_str())?;
        Some((marker, rest))
    });
    if let Some((marker, rest)) = bullet_hit {
        return Some((indent, marker, rest.trim().to_string()));
    }

    // Option flags and format specifiers share one shape: the token must be
    // followed by a space and then some non-blank text. Only the leading side
    // of that text is trimmed.
    let token_item = |token: &str| -> Option<(String, String)> {
        let text = body[token.len()..].strip_prefix(' ')?.trim_start();
        if text.is_empty() {
            None
        } else {
            Some((format!("{token} "), text.to_string()))
        }
    };
    let token_hit = parse_option_flag(body)
        .and_then(|flag| token_item(flag))
        .or_else(|| parse_format_specifier(body).and_then(|spec| token_item(spec)));
    if let Some((marker, content)) = token_hit {
        return Some((indent, marker, content));
    }

    // Numbered items: `<digits>.` or `<digits>)` followed by exactly one space
    // before the content.
    let digits_end = body
        .find(|ch: char| !ch.is_ascii_digit())
        .unwrap_or(body.len());
    if digits_end == 0 {
        return None;
    }
    let (number, tail) = body.split_at(digits_end);
    let delimiter = tail
        .chars()
        .next()
        .filter(|ch| matches!(*ch, '.' | ')'))?;
    let content = tail[delimiter.len_utf8()..].strip_prefix(' ')?;
    Some((
        indent,
        format!("{number}{delimiter} "),
        content.trim().to_string(),
    ))
}
/// Returns the command-line style flag at the start of `trimmed`, if any.
///
/// A flag is one or two `-` characters, then an ASCII letter, then any run of
/// ASCII letters, digits, or `-`. The returned slice is the matched prefix of
/// `trimmed`; anything after it (for example `=value` or a space) is left out.
/// Returns `None` when `trimmed` does not start with such a flag.
fn parse_option_flag(trimmed: &str) -> Option<&str> {
    let after_first_dash = trimmed.strip_prefix('-')?;
    // A second dash is optional (`-v` versus `--verbose`).
    let name = after_first_dash
        .strip_prefix('-')
        .unwrap_or(after_first_dash);
    if !name.as_bytes().first()?.is_ascii_alphabetic() {
        return None;
    }

    // The first byte is a letter, so this count is always at least one.
    let name_len = name
        .bytes()
        .take_while(|byte| byte.is_ascii_alphanumeric() || *byte == b'-')
        .count();
    let dashes_len = trimmed.len() - name.len();
    Some(&trimmed[..dashes_len + name_len])
}
/// Returns the printf-style specifier at the start of `trimmed`, if any.
///
/// A specifier is `%`, then an ASCII letter, then any run of ASCII letters or
/// digits. The returned slice is the matched prefix of `trimmed`. Returns
/// `None` when `trimmed` does not start with such a specifier.
fn parse_format_specifier(trimmed: &str) -> Option<&str> {
    let name = trimmed.strip_prefix('%')?;
    if !name.as_bytes().first()?.is_ascii_alphabetic() {
        return None;
    }

    // The first byte is a letter, so this count is always at least one.
    let name_len = name
        .bytes()
        .take_while(u8::is_ascii_alphanumeric)
        .count();
    // `%` is a single byte, hence the `1 +`.
    Some(&trimmed[..1 + name_len])
}
/// Decides whether `line` continues the list item described by `list_indent`
/// and `marker`.
///
/// Never a continuation: a blank line, a line that `parse_list_marker`
/// accepts as a list item of its own, or a line for which
/// `should_keep_pdf_line_layout` returns true.
///
/// Otherwise the line is a continuation when its count of leading spaces and
/// tabs is at least the width of indent plus marker (widths measured with
/// `char_len`), or when it is at least the indent width and the first
/// non-whitespace character is lowercase.
pub(crate) fn is_list_continuation_line(
    line: &str,
    list_indent: &str,
    marker: &str,
) -> bool {
    let text = line.trim();
    if text.is_empty()
        || parse_list_marker(line).is_some()
        || should_keep_pdf_line_layout(line)
    {
        return false;
    }

    let indent_chars = line
        .chars()
        .take_while(|ch| matches!(*ch, ' ' | '\t'))
        .count();
    let item_indent = char_len(list_indent);
    // Column where the item's text starts, i.e. just past the marker.
    let text_column = item_indent + char_len(marker);

    indent_chars >= text_column
        || (indent_chars >= item_indent
            && text.chars().next().is_some_and(char::is_lowercase))
}
/// Reports whether `trimmed` reads like a caption such as `Table 3: Results`.
///
/// The text is split on whitespace and must have at least three words:
/// the first is exactly `Table`, `Figure`, `Plate`, or `Diagram`; the second,
/// after removing any trailing `.`, `:` or `)` characters, is non-empty and
/// made up only of ASCII digits and `.`; the third only has to exist.
pub(crate) fn looks_like_table_or_figure_caption(trimmed: &str) -> bool {
    let mut tokens = trimmed.split_whitespace();
    match (tokens.next(), tokens.next()) {
        (Some(label), Some(number))
            if matches!(label, "Table" | "Figure" | "Plate" | "Diagram") =>
        {
            let numeral =
                number.trim_end_matches(|ch: char| matches!(ch, '.' | ':' | ')'));
            !numeral.is_empty()
                && numeral.chars().all(|ch| matches!(ch, '0'..='9' | '.'))
                // A bare "Table 3" with no caption text does not count.
                && tokens.next().is_some()
        }
        _ => false,
    }
}
/// Decides whether `line` opens a new paragraph rather than continuing the
/// one that `previous_line` belongs to.
///
/// `current_indent` is the indent string of the paragraph in progress; the
/// indent of `line` is obtained from `leading_whitespace` and widths are
/// compared using `char_len`.
///
/// Rules, in order:
///
/// * A line accepted by `looks_like_table_or_figure_caption` starts a new
///   paragraph.
/// * With an identical indent string, a new paragraph starts only when the
///   line begins with `"( "`, is exactly `(`, `)` or `( )`, or the previous
///   line ends with a backslash.
/// * With a narrower (or differently composed but not wider) indent, a new
///   paragraph always starts.
/// * With a wider indent, a new paragraph starts unless the previous line is
///   non-empty, lacks closing punctuation (`.`, `?`, `!`, `:`), and the line
///   either looks like a continuation fragment or is indented by at most two
///   extra characters.
pub(crate) fn should_start_new_pdf_paragraph(
    current_indent: &str,
    previous_line: &str,
    line: &str,
) -> bool {
    if looks_like_table_or_figure_caption(line.trim()) {
        return true;
    }

    let prev = previous_line.trim_end();
    let next_indent = leading_whitespace(line);

    if next_indent == current_indent {
        let text = line.trim_start();
        return text.starts_with("( ")
            || matches!(text, "(" | ")" | "( )")
            || prev.ends_with('\\');
    }

    let current_width = char_len(current_indent);
    let next_width = char_len(next_indent);
    if next_width <= current_width {
        return true;
    }

    // From here on the line is indented deeper than the current paragraph.
    // A finished (or missing) previous sentence means the deeper line stands
    // on its own.
    if prev.is_empty() || prev.ends_with(['.', '?', '!', ':']) {
        return true;
    }

    let text = line.trim_start_matches([' ', '\t']);
    let Some(first) = text.chars().next() else {
        // Nothing but spaces and tabs on the line.
        return true;
    };

    // Lowercase starts, closing/joining punctuation, and very short lines are
    // treated as the tail of the previous line.
    let continues_previous = first.is_lowercase()
        || matches!(
            first,
            '(' | ')' | ']' | '}' | ',' | '.' | ':' | ';' | '!' | '?' | '-' | '—' | '–'
                | '/' | '\\' | '~'
        )
        || text.chars().count() <= 4;

    // A bump of one or two characters is too small to signal a new paragraph.
    !continues_previous && next_width - current_width > 2
}