html_to_markdown_rs/
wrapper.rs

//! Text wrapping functionality for Markdown output.
//!
//! This module provides text wrapping capabilities similar to Python's `textwrap.fill()`,
//! specifically designed to work with Markdown content while preserving formatting.
5
6use crate::options::ConversionOptions;
7
8/// Wrap text at specified width while preserving Markdown formatting.
9///
10/// This function wraps paragraphs of text at the specified width, but:
11/// - Does not break long words
12/// - Does not break on hyphens
13/// - Preserves Markdown formatting (links, bold, etc.)
14/// - Only wraps paragraph content, not headers, lists, code blocks, etc.
15pub fn wrap_markdown(markdown: &str, options: &ConversionOptions) -> String {
16    if !options.wrap {
17        return markdown.to_string();
18    }
19
20    let mut result = String::with_capacity(markdown.len());
21    let mut in_code_block = false;
22    let mut in_paragraph = false;
23    let mut paragraph_buffer = String::new();
24
25    for line in markdown.lines() {
26        let trimmed = line.trim_start();
27        let is_code_fence = trimmed.starts_with("```");
28        let is_indented_code = line.starts_with("    ")
29            && !is_list_like(trimmed)
30            && !is_numbered_list(trimmed)
31            && !is_heading(trimmed)
32            && !trimmed.starts_with('>')
33            && !trimmed.starts_with('|');
34
35        if is_code_fence || is_indented_code {
36            if in_paragraph && !paragraph_buffer.is_empty() {
37                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
38                result.push_str("\n\n");
39                paragraph_buffer.clear();
40                in_paragraph = false;
41            }
42
43            if is_code_fence {
44                in_code_block = !in_code_block;
45            }
46            result.push_str(line);
47            result.push('\n');
48            continue;
49        }
50
51        if in_code_block {
52            result.push_str(line);
53            result.push('\n');
54            continue;
55        }
56
57        // Try to parse as a list item (unordered or ordered)
58        if let Some((indent, marker, content)) = parse_list_item(line) {
59            // Flush any pending paragraph
60            if in_paragraph && !paragraph_buffer.is_empty() {
61                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
62                result.push_str("\n\n");
63                paragraph_buffer.clear();
64                in_paragraph = false;
65            }
66
67            // Wrap the list item while preserving structure
68            result.push_str(&wrap_list_item(&indent, &marker, &content, options.wrap_width));
69            continue;
70        }
71
72        // Check other structural elements (headings, blockquotes, tables, horizontal rules)
73        let is_structural =
74            is_heading(trimmed) || trimmed.starts_with('>') || trimmed.starts_with('|') || trimmed.starts_with('=');
75
76        if is_structural {
77            if in_paragraph && !paragraph_buffer.is_empty() {
78                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
79                result.push_str("\n\n");
80                paragraph_buffer.clear();
81                in_paragraph = false;
82            }
83
84            result.push_str(line);
85            result.push('\n');
86            continue;
87        }
88
89        if line.trim().is_empty() {
90            if in_paragraph && !paragraph_buffer.is_empty() {
91                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
92                result.push_str("\n\n");
93                paragraph_buffer.clear();
94                in_paragraph = false;
95            } else if !in_paragraph {
96                result.push('\n');
97            }
98            continue;
99        }
100
101        if in_paragraph {
102            paragraph_buffer.push(' ');
103        }
104        paragraph_buffer.push_str(line.trim());
105        in_paragraph = true;
106    }
107
108    if in_paragraph && !paragraph_buffer.is_empty() {
109        result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
110        result.push_str("\n\n");
111    }
112
113    result
114}
115
/// Report whether `trimmed` begins with one of the bullet characters `-`, `*` or `+`.
///
/// Only the first character is inspected; no following space is required.
fn is_list_like(trimmed: &str) -> bool {
    trimmed.starts_with(|first: char| first == '-' || first == '*' || first == '+')
}
119
/// Report whether the first whitespace-delimited token of `trimmed` looks like an
/// ordered-list marker: ASCII digits followed by trailing `.` / `)` characters.
fn is_numbered_list(trimmed: &str) -> bool {
    match trimmed.split_whitespace().next() {
        Some(token) if matches!(token.chars().last(), Some('.') | Some(')')) => {
            // Whatever precedes the trailing delimiters must be a non-empty run of digits.
            let number = token.trim_end_matches(|c: char| c == '.' || c == ')');
            !number.is_empty() && number.bytes().all(|b| b.is_ascii_digit())
        }
        _ => false,
    }
}
129
/// Report whether `trimmed` begins with `#`, i.e. looks like an ATX heading line.
fn is_heading(trimmed: &str) -> bool {
    matches!(trimmed.as_bytes().first(), Some(b'#'))
}
133
/// Parse a list item into its components: (indent, marker, content)
///
/// Returns `Some((indent, marker, content))` if the line is a list item, `None` otherwise.
/// - `indent` is the line's leading whitespace, copied verbatim.
/// - `marker` is the list marker normalised to end in exactly one space.
/// - `content` is the remaining text with leading whitespace removed (may be empty).
///
/// An unordered marker is `-`, `*` or `+` followed by a space or the end of the line.
/// An ordered marker is one or more ASCII digits followed by exactly one `.` or `)`;
/// tokens such as `1..` or `1.)` are not list markers.
///
/// Examples:
/// - "- text" -> ("", "- ", "text")
/// - "  - text" -> ("  ", "- ", "text")
/// - "1. text" -> ("", "1. ", "text")
/// - "  42) text" -> ("  ", "42) ", "text")
fn parse_list_item(line: &str) -> Option<(String, String, String)> {
    let trimmed = line.trim_start();
    let indent = &line[..line.len() - trimmed.len()];

    // Unordered list markers. The space requirement keeps `**bold**` or `-word` from matching.
    for bullet in ['-', '*', '+'] {
        if let Some(rest) = trimmed.strip_prefix(bullet) {
            if rest.is_empty() || rest.starts_with(' ') {
                return Some((
                    indent.to_string(),
                    format!("{} ", bullet),
                    rest.trim_start().to_string(),
                ));
            }
        }
    }

    // Ordered list markers (e.g. "1. ", "42) "): strip exactly one delimiter so that
    // anything other than pure digits in front of it disqualifies the token.
    let first_token = trimmed.split_whitespace().next()?;
    let digits = first_token
        .strip_suffix('.')
        .or_else(|| first_token.strip_suffix(')'))?;
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }

    // `trimmed` has no leading whitespace, so it starts with `first_token`.
    let rest = trimmed[first_token.len()..].trim_start();
    Some((
        indent.to_string(),
        format!("{} ", first_token),
        rest.to_string(),
    ))
}
182
/// Wrap a list item while preserving its structure.
///
/// The first line of output will be: `<indent><marker><content_start>`
/// Continuation lines will be: `<indent><spaces_matching_marker><content_continued>`
///
/// Widths are measured in characters (Unicode scalar values) rather than bytes, so
/// non-ASCII text is not wrapped prematurely. Words are never split: a word longer than
/// the available width is placed on a line of its own. Runs of whitespace inside
/// `content` collapse to single spaces. Every output line ends with `\n`.
///
/// # Arguments
/// - `indent`: The leading whitespace (for nested lists)
/// - `marker`: The list marker (e.g., "- ", "1. ")
/// - `content`: The text content after the marker
/// - `width`: The maximum line width, including indent and marker
fn wrap_list_item(indent: &str, marker: &str, content: &str, width: usize) -> String {
    if content.is_empty() {
        return format!("{}{}\n", indent, marker.trim_end());
    }

    let full_marker = format!("{}{}", indent, marker);
    let continuation_indent = format!("{}{}", indent, " ".repeat(marker.chars().count()));

    // The marker prefix and the continuation indent are equally wide, so a single text
    // width applies to every line. If the prefix alone uses up the whole width, fall back
    // to the full width so the text is still laid out sensibly.
    let prefix_len = full_marker.chars().count();
    let text_width = if width > prefix_len {
        width - prefix_len
    } else {
        width
    };

    let mut result = String::with_capacity(content.len() + full_marker.len());
    let mut prefix: &str = &full_marker;
    let mut current_line = String::new();
    // Length of `current_line` in characters, tracked incrementally.
    let mut current_len = 0usize;

    for word in content.split_whitespace() {
        let word_len = word.chars().count();

        // Start a new line when the word (plus its separating space) no longer fits.
        if current_len > 0 && current_len + 1 + word_len > text_width {
            result.push_str(prefix);
            result.push_str(&current_line);
            result.push('\n');
            prefix = &continuation_indent;
            current_line.clear();
            current_len = 0;
        }

        if current_len > 0 {
            current_line.push(' ');
            current_len += 1;
        }
        current_line.push_str(word);
        current_len += word_len;
    }

    if current_line.is_empty() {
        // Content consisted of whitespace only: emit just the marker.
        return format!("{}\n", full_marker.trim_end());
    }

    result.push_str(prefix);
    result.push_str(&current_line);
    result.push('\n');
    result
}
267
/// Wrap a single line of text at the specified width.
///
/// This function wraps text without breaking long words or on hyphens,
/// similar to Python's textwrap.fill() with break_long_words=False and break_on_hyphens=False.
///
/// Width is measured in characters (Unicode scalar values), not bytes, so non-ASCII text
/// is not wrapped earlier than necessary. Text that already fits within `width` is
/// returned unchanged. Otherwise it is split on whitespace and re-joined with single
/// spaces, with `\n` between output lines and no trailing newline. A word longer than
/// `width` is kept whole on its own line.
fn wrap_line(text: &str, width: usize) -> String {
    if text.chars().count() <= width {
        return text.to_string();
    }

    let mut result = String::with_capacity(text.len());
    // Character length of the output line currently being filled.
    let mut line_len = 0usize;

    for word in text.split_whitespace() {
        let word_len = word.chars().count();

        if !result.is_empty() {
            if line_len + 1 + word_len <= width {
                result.push(' ');
                line_len += 1;
            } else {
                result.push('\n');
                line_len = 0;
            }
        }

        result.push_str(word);
        line_len += word_len;
    }

    result
}
306
#[cfg(test)]
mod tests {
    use super::*;
    use crate::options::ConversionOptions;

    /// Options with wrapping enabled at `width`; every other field keeps its default.
    fn wrapping_at(width: usize) -> ConversionOptions {
        ConversionOptions {
            wrap: true,
            wrap_width: width,
            ..Default::default()
        }
    }

    /// True when every line of `text` is at most `limit` bytes long or is blank.
    fn fits_within(text: &str, limit: usize) -> bool {
        text.lines()
            .all(|line| line.len() <= limit || line.trim().is_empty())
    }

    // --- wrap_line -------------------------------------------------------------------

    #[test]
    fn test_wrap_line_short() {
        assert_eq!(wrap_line("Short text", 80), "Short text");
    }

    #[test]
    fn test_wrap_line_long() {
        assert_eq!(wrap_line("123456789 123456789", 10), "123456789\n123456789");
    }

    #[test]
    fn test_wrap_line_no_break_long_words() {
        assert_eq!(wrap_line("12345678901 12345", 10), "12345678901\n12345");
    }

    // --- wrap_markdown: paragraphs, code and headings ----------------------------------

    #[test]
    fn test_wrap_markdown_disabled() {
        let input = "This is a very long line that would normally be wrapped at 40 characters";
        let no_wrap = ConversionOptions {
            wrap: false,
            ..Default::default()
        };
        assert_eq!(wrap_markdown(input, &no_wrap), input);
    }

    #[test]
    fn test_wrap_markdown_paragraph() {
        let input = "This is a very long line that would normally be wrapped at 40 characters\n\n";
        let output = wrap_markdown(input, &wrapping_at(40));
        assert!(fits_within(&output, 40));
    }

    #[test]
    fn test_wrap_markdown_preserves_code() {
        let input = "```\nThis is a very long line in a code block that should not be wrapped\n```\n";
        let output = wrap_markdown(input, &wrapping_at(40));
        assert!(output.contains("This is a very long line in a code block that should not be wrapped"));
    }

    #[test]
    fn test_wrap_markdown_preserves_headings() {
        let input = "# This is a very long heading that should not be wrapped even if it exceeds the width\n\n";
        let output = wrap_markdown(input, &wrapping_at(40));
        assert!(
            output.contains("# This is a very long heading that should not be wrapped even if it exceeds the width")
        );
    }

    // --- wrap_markdown: lists ------------------------------------------------------------

    #[test]
    fn wrap_markdown_wraps_long_list_items() {
        let input = "- This is a very long list item that should definitely be wrapped when it exceeds the specified wrap width\n- Short item\n";
        let result = wrap_markdown(input, &wrapping_at(60));

        // The long item is broken with a hanging indent.
        assert!(
            result.contains("- This is a very long list item that should definitely be\n  wrapped"),
            "First list item not properly wrapped. Got: {}",
            result
        );
        // The short item is left on a single line.
        assert!(
            result.contains("- Short item"),
            "Short list item incorrectly modified. Got: {}",
            result
        );
    }

    #[test]
    fn wrap_markdown_wraps_ordered_lists() {
        let input = "1. This is a numbered list item with a very long text that should be wrapped at the specified width\n2. Short\n";
        let result = wrap_markdown(input, &wrapping_at(60));

        // No line may exceed the configured width.
        assert!(
            fits_within(&result, 60),
            "Some lines exceed wrap width. Got: {}",
            result
        );
        // Both ordered markers must survive.
        assert!(result.contains("1."), "Lost ordered list marker. Got: {}", result);
        assert!(
            result.contains("2."),
            "Lost second ordered list marker. Got: {}",
            result
        );
    }

    #[test]
    fn wrap_markdown_preserves_nested_list_structure() {
        let input = "- Item one with some additional text that will need to be wrapped across multiple lines\n  - Nested item with long text that also needs wrapping at the specified width\n  - Short nested\n";
        let result = wrap_markdown(input, &wrapping_at(50));

        // Markers at both nesting levels are kept.
        assert!(result.contains("- Item"), "Lost top-level list marker. Got: {}", result);
        assert!(
            result.contains("  - Nested"),
            "Lost nested list structure. Got: {}",
            result
        );
        // No line may exceed the configured width.
        assert!(
            fits_within(&result, 50),
            "Some lines exceed wrap width. Got: {}",
            result
        );
    }

    #[test]
    fn wrap_markdown_handles_list_with_links() {
        let input = "- [A](#a) with additional text that is long enough to require wrapping at the configured width\n  - [B](#b) also has more content that needs wrapping\n  - [C](#c)\n";
        let result = wrap_markdown(input, &wrapping_at(50));

        // Links are never split.
        assert!(result.contains("[A](#a)"), "Lost link in list. Got: {}", result);
        assert!(result.contains("[B](#b)"), "Lost nested link. Got: {}", result);
        assert!(result.contains("[C](#c)"), "Lost short nested link. Got: {}", result);
        // Each link still directly follows its list marker.
        assert!(
            result.contains("- [A](#a)"),
            "Lost list structure with link. Got: {}",
            result
        );
        assert!(
            result.contains("  - [B](#b)"),
            "Lost nested list structure. Got: {}",
            result
        );
    }

    #[test]
    fn wrap_markdown_handles_empty_list_items() {
        let input = "- \n- Item with text\n- \n";
        let result = wrap_markdown(input, &wrapping_at(40));

        // Empty items must not disturb their neighbours.
        assert!(result.contains("- "), "Lost list markers. Got: {}", result);
        assert!(result.contains("Item with text"), "Lost item text. Got: {}", result);
    }

    #[test]
    fn wrap_markdown_preserves_indented_lists_with_wrapping() {
        let input = "- [A](#a) with some additional text that makes this line very long and should be wrapped\n  - [B](#b)\n  - [C](#c) with more text that is also quite long and needs wrapping\n";
        let result = wrap_markdown(input, &wrapping_at(50));

        // Structure of every item is intact.
        assert!(result.contains("- [A](#a)"), "Lost top-level link. Got: {}", result);
        assert!(result.contains("  - [B](#b)"), "Lost nested link B. Got: {}", result);
        assert!(result.contains("  - [C](#c)"), "Lost nested link C. Got: {}", result);
        // Every line, blank or not, respects the width.
        assert!(
            result.lines().all(|line| line.len() <= 50),
            "Some lines exceed wrap width:\n{}",
            result
        );
    }
}