html_to_markdown_rs/
wrapper.rs

1//! Text wrapping functionality for Markdown output.
2//!
3//! This module provides text wrapping capabilities similar to Python's textwrap.fill(),
4//! specifically designed to work with Markdown content while preserving formatting.
5
6use crate::options::ConversionOptions;
7
8/// Wrap text at specified width while preserving Markdown formatting.
9///
10/// This function wraps paragraphs of text at the specified width, but:
11/// - Does not break long words
12/// - Does not break on hyphens
13/// - Preserves Markdown formatting (links, bold, etc.)
14/// - Only wraps paragraph content, not headers, lists, code blocks, etc.
15pub fn wrap_markdown(markdown: &str, options: &ConversionOptions) -> String {
16    if !options.wrap {
17        return markdown.to_string();
18    }
19
20    let mut result = String::with_capacity(markdown.len());
21    let mut in_code_block = false;
22    let mut in_paragraph = false;
23    let mut paragraph_buffer = String::new();
24
25    for line in markdown.lines() {
26        let trimmed = line.trim_start();
27        let is_code_fence = trimmed.starts_with("```");
28        let is_indented_code = line.starts_with("    ")
29            && !is_list_like(trimmed)
30            && !is_numbered_list(trimmed)
31            && !is_heading(trimmed)
32            && !trimmed.starts_with('>')
33            && !trimmed.starts_with('|');
34
35        if is_code_fence || is_indented_code {
36            if in_paragraph && !paragraph_buffer.is_empty() {
37                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
38                result.push_str("\n\n");
39                paragraph_buffer.clear();
40                in_paragraph = false;
41            }
42
43            if is_code_fence {
44                in_code_block = !in_code_block;
45            }
46            result.push_str(line);
47            result.push('\n');
48            continue;
49        }
50
51        if in_code_block {
52            result.push_str(line);
53            result.push('\n');
54            continue;
55        }
56
57        if let Some((indent, marker, content)) = parse_list_item(line) {
58            if in_paragraph && !paragraph_buffer.is_empty() {
59                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
60                result.push_str("\n\n");
61                paragraph_buffer.clear();
62                in_paragraph = false;
63            }
64
65            result.push_str(&wrap_list_item(&indent, &marker, &content, options.wrap_width));
66            continue;
67        }
68
69        let is_structural =
70            is_heading(trimmed) || trimmed.starts_with('>') || trimmed.starts_with('|') || trimmed.starts_with('=');
71
72        if is_structural {
73            if in_paragraph && !paragraph_buffer.is_empty() {
74                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
75                result.push_str("\n\n");
76                paragraph_buffer.clear();
77                in_paragraph = false;
78            }
79
80            result.push_str(line);
81            result.push('\n');
82            continue;
83        }
84
85        if line.trim().is_empty() {
86            if in_paragraph && !paragraph_buffer.is_empty() {
87                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
88                result.push_str("\n\n");
89                paragraph_buffer.clear();
90                in_paragraph = false;
91            } else if !in_paragraph {
92                result.push('\n');
93            }
94            continue;
95        }
96
97        if in_paragraph {
98            paragraph_buffer.push(' ');
99        }
100        paragraph_buffer.push_str(line.trim());
101        in_paragraph = true;
102    }
103
104    if in_paragraph && !paragraph_buffer.is_empty() {
105        result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
106        result.push_str("\n\n");
107    }
108
109    result
110}
111
/// Returns `true` when the text begins with a bullet character (`-`, `*` or `+`).
///
/// Only the first character is inspected; no space after the bullet is required.
fn is_list_like(trimmed: &str) -> bool {
    trimmed.starts_with(['-', '*', '+'])
}
115
/// Returns `true` when the first whitespace-delimited token looks like an ordered
/// list marker: ASCII digits followed by at least one `.` or `)`.
fn is_numbered_list(trimmed: &str) -> bool {
    let Some(first) = trimmed.split_whitespace().next() else {
        return false;
    };

    let number = first.trim_end_matches(|c: char| c == '.' || c == ')');
    // Something was stripped exactly when the token ended with a delimiter.
    let has_delimiter = number.len() < first.len();

    has_delimiter && !number.is_empty() && number.bytes().all(|b| b.is_ascii_digit())
}
125
/// Returns `true` when the text starts with `#`.
///
/// Only the first character is checked, so `#tag` counts as well as `# Title`.
fn is_heading(trimmed: &str) -> bool {
    matches!(trimmed.as_bytes().first(), Some(b'#'))
}
129
/// Split a list item line into `(indent, marker, content)`.
///
/// - `indent` is the line's leading whitespace.
/// - `marker` is the list marker normalised to end with exactly one space.
/// - `content` is the remaining text with leading whitespace removed.
///
/// Bullet markers (`-`, `*`, `+`) must be followed by a space or the end of the
/// line. Ordered markers are ASCII digits followed by `.` or `)`, terminated by
/// whitespace or the end of the line. Anything else yields `None`.
///
/// Examples:
/// - "- text" -> ("", "- ", "text")
/// - "  - text" -> ("  ", "- ", "text")
/// - "1. text" -> ("", "1. ", "text")
/// - "  42) text" -> ("  ", "42) ", "text")
fn parse_list_item(line: &str) -> Option<(String, String, String)> {
    let body = line.trim_start();
    let (indent, _) = line.split_at(line.len() - body.len());

    // Bullet item: a single bullet character, then a space or nothing at all.
    let mut chars = body.chars();
    if let Some(bullet @ ('-' | '*' | '+')) = chars.next() {
        let after = chars.as_str();
        if after.is_empty() || after.starts_with(' ') {
            return Some((
                indent.to_owned(),
                format!("{} ", bullet),
                after.trim_start().to_owned(),
            ));
        }
    }

    // Ordered item: the first token must be digits plus a trailing `.` / `)`.
    let token = body.split_whitespace().next()?;
    let number = token.trim_end_matches(['.', ')']);
    let has_delimiter = number.len() != token.len();
    if !has_delimiter || number.is_empty() || !number.chars().all(|c| c.is_ascii_digit()) {
        return None;
    }

    // `body` has no leading whitespace, so the token is its prefix.
    let (marker, tail) = body.split_at(token.len());
    Some((
        indent.to_owned(),
        format!("{} ", marker),
        tail.trim_start().to_owned(),
    ))
}
176
177/// Wrap a list item while preserving its structure.
178///
179/// The first line of output will be: `<indent><marker><content_start>`
180/// Continuation lines will be: `<indent><spaces_matching_marker><content_continued>`
181///
182/// # Arguments
183/// - `indent`: The leading whitespace (for nested lists)
184/// - `marker`: The list marker (e.g., "- ", "1. ")
185/// - `content`: The text content after the marker
186/// - `width`: The maximum line width
187fn wrap_list_item(indent: &str, marker: &str, content: &str, width: usize) -> String {
188    if content.is_empty() {
189        return format!("{}{}\n", indent, marker.trim_end());
190    }
191
192    if is_single_inline_link(content) {
193        return format!("{}{}{}\n", indent, marker, content.trim());
194    }
195
196    let full_marker = format!("{}{}", indent, marker);
197    let continuation_indent = format!("{}{}", indent, " ".repeat(marker.len()));
198
199    let first_line_prefix_len = full_marker.len();
200    let first_line_width = if width > first_line_prefix_len {
201        width - first_line_prefix_len
202    } else {
203        width
204    };
205
206    let cont_line_prefix_len = continuation_indent.len();
207    let cont_line_width = if width > cont_line_prefix_len {
208        width - cont_line_prefix_len
209    } else {
210        width
211    };
212
213    let words: Vec<&str> = content.split_whitespace().collect();
214    if words.is_empty() {
215        return format!("{}\n", full_marker.trim_end());
216    }
217
218    let mut result = String::new();
219    let mut current_line = String::new();
220    let mut current_width = first_line_width;
221    let mut is_first_line = true;
222
223    for word in words {
224        let word_len = word.len();
225        let space_needed = if current_line.is_empty() { 0 } else { 1 };
226
227        if !current_line.is_empty() && current_line.len() + space_needed + word_len > current_width {
228            if is_first_line {
229                result.push_str(&full_marker);
230                is_first_line = false;
231            } else {
232                result.push_str(&continuation_indent);
233            }
234            result.push_str(&current_line);
235            result.push('\n');
236            current_line.clear();
237            current_width = cont_line_width;
238        }
239
240        if !current_line.is_empty() {
241            current_line.push(' ');
242        }
243        current_line.push_str(word);
244    }
245
246    if !current_line.is_empty() {
247        if is_first_line {
248            result.push_str(&full_marker);
249        } else {
250            result.push_str(&continuation_indent);
251        }
252        result.push_str(&current_line);
253        result.push('\n');
254    }
255
256    result
257}
258
/// Returns `true` when the trimmed content has the shape of exactly one inline
/// link, `[label](destination)`.
///
/// The check is textual: the content must start with `[`, end with `)`, contain a
/// `](` separator, have no whitespace between that separator and the final `)`,
/// and contain no second `](` after the first one.
fn is_single_inline_link(content: &str) -> bool {
    let candidate = content.trim();
    if !candidate.starts_with('[') || !candidate.ends_with(')') {
        return false;
    }

    match candidate.split_once("](") {
        None => false,
        Some((_label, tail)) => {
            // `tail` is a suffix of `candidate` starting after `(`, so it is
            // non-empty and ends with the closing `)` checked above.
            let destination = &tail[..tail.len() - 1];
            !destination.contains(char::is_whitespace) && !tail.contains("](")
        }
    }
}
276
/// Wrap a single line of text at the specified width.
///
/// This function wraps text without breaking long words or on hyphens,
/// similar to Python's textwrap.fill() with break_long_words=False and break_on_hyphens=False.
///
/// Width is measured in characters (as Python's `len()` does for `str`), not in
/// UTF-8 bytes, so non-ASCII text is not wrapped earlier than ASCII text of the
/// same visible length.
///
/// Text that already fits within `width` is returned unchanged. Otherwise the
/// text is split on whitespace and re-joined with single spaces, with `\n`
/// between output lines and no trailing newline. A word longer than `width` is
/// placed on a line of its own.
fn wrap_line(text: &str, width: usize) -> String {
    if text.chars().count() <= width {
        return text.to_string();
    }

    let mut result = String::with_capacity(text.len());
    // Characters already placed on the current output line; 0 means the line is empty.
    let mut line_chars = 0usize;

    for word in text.split_whitespace() {
        let word_chars = word.chars().count();

        if line_chars > 0 {
            if line_chars + 1 + word_chars <= width {
                result.push(' ');
                line_chars += 1;
            } else {
                result.push('\n');
                line_chars = 0;
            }
        }

        result.push_str(word);
        line_chars += word_chars;
    }

    result
}
315
#[cfg(test)]
mod tests {
    use super::*;
    use crate::options::ConversionOptions;

    /// Options with wrapping enabled at `width` columns and everything else defaulted.
    fn wrapping_at(width: usize) -> ConversionOptions {
        ConversionOptions {
            wrap: true,
            wrap_width: width,
            ..Default::default()
        }
    }

    /// True when every line is blank or at most `limit` bytes long.
    fn non_blank_lines_fit(text: &str, limit: usize) -> bool {
        text.lines().all(|line| line.trim().is_empty() || line.len() <= limit)
    }

    // --- wrap_line -------------------------------------------------------

    #[test]
    fn test_wrap_line_short() {
        assert_eq!(wrap_line("Short text", 80), "Short text");
    }

    #[test]
    fn test_wrap_line_long() {
        assert_eq!(wrap_line("123456789 123456789", 10), "123456789\n123456789");
    }

    #[test]
    fn test_wrap_line_no_break_long_words() {
        // The first word is wider than the limit and must stay in one piece.
        assert_eq!(wrap_line("12345678901 12345", 10), "12345678901\n12345");
    }

    // --- wrap_markdown: paragraphs and verbatim blocks ------------------

    #[test]
    fn test_wrap_markdown_disabled() {
        let input = "This is a very long line that would normally be wrapped at 40 characters";
        let disabled = ConversionOptions {
            wrap: false,
            ..Default::default()
        };
        assert_eq!(wrap_markdown(input, &disabled), input);
    }

    #[test]
    fn test_wrap_markdown_paragraph() {
        let input = "This is a very long line that would normally be wrapped at 40 characters\n\n";
        let output = wrap_markdown(input, &wrapping_at(40));
        assert!(non_blank_lines_fit(&output, 40));
    }

    #[test]
    fn test_wrap_markdown_preserves_code() {
        let input = "```\nThis is a very long line in a code block that should not be wrapped\n```\n";
        let output = wrap_markdown(input, &wrapping_at(40));
        assert!(output.contains("This is a very long line in a code block that should not be wrapped"));
    }

    #[test]
    fn test_wrap_markdown_preserves_headings() {
        let input = "# This is a very long heading that should not be wrapped even if it exceeds the width\n\n";
        let output = wrap_markdown(input, &wrapping_at(40));
        assert!(
            output.contains("# This is a very long heading that should not be wrapped even if it exceeds the width")
        );
    }

    // --- wrap_markdown: lists -------------------------------------------

    #[test]
    fn wrap_markdown_wraps_long_list_items() {
        let input = "- This is a very long list item that should definitely be wrapped when it exceeds the specified wrap width\n- Short item\n";
        let output = wrap_markdown(input, &wrapping_at(60));

        assert!(
            output.contains("- This is a very long list item that should definitely be\n  wrapped"),
            "First list item not properly wrapped. Got: {}",
            output
        );
        assert!(
            output.contains("- Short item"),
            "Short list item incorrectly modified. Got: {}",
            output
        );
    }

    #[test]
    fn wrap_markdown_wraps_ordered_lists() {
        let input = "1. This is a numbered list item with a very long text that should be wrapped at the specified width\n2. Short\n";
        let output = wrap_markdown(input, &wrapping_at(60));

        assert!(
            non_blank_lines_fit(&output, 60),
            "Some lines exceed wrap width. Got: {}",
            output
        );
        assert!(output.contains("1."), "Lost ordered list marker. Got: {}", output);
        assert!(
            output.contains("2."),
            "Lost second ordered list marker. Got: {}",
            output
        );
    }

    #[test]
    fn wrap_markdown_preserves_nested_list_structure() {
        let input = "- Item one with some additional text that will need to be wrapped across multiple lines\n  - Nested item with long text that also needs wrapping at the specified width\n  - Short nested\n";
        let output = wrap_markdown(input, &wrapping_at(50));

        assert!(output.contains("- Item"), "Lost top-level list marker. Got: {}", output);
        assert!(
            output.contains("  - Nested"),
            "Lost nested list structure. Got: {}",
            output
        );
        assert!(
            non_blank_lines_fit(&output, 50),
            "Some lines exceed wrap width. Got: {}",
            output
        );
    }

    #[test]
    fn wrap_markdown_handles_list_with_links() {
        let input = "- [A](#a) with additional text that is long enough to require wrapping at the configured width\n  - [B](#b) also has more content that needs wrapping\n  - [C](#c)\n";
        let output = wrap_markdown(input, &wrapping_at(50));

        assert!(output.contains("[A](#a)"), "Lost link in list. Got: {}", output);
        assert!(output.contains("[B](#b)"), "Lost nested link. Got: {}", output);
        assert!(output.contains("[C](#c)"), "Lost short nested link. Got: {}", output);
        assert!(
            output.contains("- [A](#a)"),
            "Lost list structure with link. Got: {}",
            output
        );
        assert!(
            output.contains("  - [B](#b)"),
            "Lost nested list structure. Got: {}",
            output
        );
    }

    #[test]
    fn wrap_markdown_handles_empty_list_items() {
        let input = "- \n- Item with text\n- \n";
        let output = wrap_markdown(input, &wrapping_at(40));

        assert!(output.contains("- "), "Lost list markers. Got: {}", output);
        assert!(output.contains("Item with text"), "Lost item text. Got: {}", output);
    }

    #[test]
    fn wrap_markdown_preserves_indented_lists_with_wrapping() {
        let input = "- [A](#a) with some additional text that makes this line very long and should be wrapped\n  - [B](#b)\n  - [C](#c) with more text that is also quite long and needs wrapping\n";
        let output = wrap_markdown(input, &wrapping_at(50));

        assert!(output.contains("- [A](#a)"), "Lost top-level link. Got: {}", output);
        assert!(output.contains("  - [B](#b)"), "Lost nested link B. Got: {}", output);
        assert!(output.contains("  - [C](#c)"), "Lost nested link C. Got: {}", output);
        // Strict check: here no line at all, blank or not, may exceed the width.
        assert!(
            output.lines().all(|line| line.len() <= 50),
            "Some lines exceed wrap width:\n{}",
            output
        );
    }

    #[test]
    fn wrap_markdown_does_not_wrap_link_only_items() {
        let input = "- [A very long link label that would exceed wrap width](#a-very-long-link-label)\n  - [Nested very long link label that would also exceed](#nested)\n";
        let output = wrap_markdown(input, &wrapping_at(30));

        assert!(output.contains("- [A very long link label that would exceed wrap width](#a-very-long-link-label)"));
        assert!(output.contains("  - [Nested very long link label that would also exceed](#nested)"));
    }
}