html_to_markdown_rs/
wrapper.rs

1//! Text wrapping functionality for Markdown output.
2//!
3//! This module provides text wrapping capabilities similar to Python's textwrap.fill(),
4//! specifically designed to work with Markdown content while preserving formatting.
5
6use crate::options::ConversionOptions;
7
8/// Wrap text at specified width while preserving Markdown formatting.
9///
10/// This function wraps paragraphs of text at the specified width, but:
11/// - Does not break long words
12/// - Does not break on hyphens
13/// - Preserves Markdown formatting (links, bold, etc.)
14/// - Only wraps paragraph content, not headers, lists, code blocks, etc.
15pub fn wrap_markdown(markdown: &str, options: &ConversionOptions) -> String {
16    if !options.wrap {
17        return markdown.to_string();
18    }
19
20    let mut result = String::with_capacity(markdown.len());
21    let mut in_code_block = false;
22    let mut in_paragraph = false;
23    let mut paragraph_buffer = String::new();
24    let mut in_blockquote_paragraph = false;
25    let mut blockquote_prefix = String::new();
26    let mut blockquote_buffer = String::new();
27
28    for line in markdown.lines() {
29        let trimmed = line.trim_start();
30        let is_code_fence = trimmed.starts_with("```");
31        let is_indented_code = line.starts_with("    ")
32            && !is_list_like(trimmed)
33            && !is_numbered_list(trimmed)
34            && !is_heading(trimmed)
35            && !trimmed.starts_with('>')
36            && !trimmed.starts_with('|');
37
38        if is_code_fence || is_indented_code {
39            if in_paragraph && !paragraph_buffer.is_empty() {
40                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
41                result.push_str("\n\n");
42                paragraph_buffer.clear();
43                in_paragraph = false;
44            }
45
46            if is_code_fence {
47                in_code_block = !in_code_block;
48            }
49            result.push_str(line);
50            result.push('\n');
51            continue;
52        }
53
54        if in_code_block {
55            result.push_str(line);
56            result.push('\n');
57            continue;
58        }
59
60        if let Some((prefix, content)) = parse_blockquote_line(line) {
61            if in_paragraph && !paragraph_buffer.is_empty() {
62                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
63                result.push_str("\n\n");
64                paragraph_buffer.clear();
65                in_paragraph = false;
66            }
67
68            let mut normalized_prefix = prefix;
69            if !normalized_prefix.ends_with(' ') {
70                normalized_prefix.push(' ');
71            }
72
73            if content.is_empty() {
74                if in_blockquote_paragraph && !blockquote_buffer.is_empty() {
75                    result.push_str(&wrap_blockquote_paragraph(
76                        &blockquote_prefix,
77                        &blockquote_buffer,
78                        options.wrap_width,
79                    ));
80                    result.push('\n');
81                    blockquote_buffer.clear();
82                    in_blockquote_paragraph = false;
83                }
84                result.push_str(normalized_prefix.trim_end());
85                result.push('\n');
86                continue;
87            }
88
89            if in_blockquote_paragraph && normalized_prefix != blockquote_prefix {
90                result.push_str(&wrap_blockquote_paragraph(
91                    &blockquote_prefix,
92                    &blockquote_buffer,
93                    options.wrap_width,
94                ));
95                result.push('\n');
96                blockquote_buffer.clear();
97                in_blockquote_paragraph = false;
98            }
99
100            if !in_blockquote_paragraph {
101                blockquote_prefix = normalized_prefix;
102                blockquote_buffer.push_str(&content);
103                in_blockquote_paragraph = true;
104            } else {
105                blockquote_buffer.push(' ');
106                blockquote_buffer.push_str(&content);
107            }
108            continue;
109        } else if in_blockquote_paragraph && !blockquote_buffer.is_empty() {
110            result.push_str(&wrap_blockquote_paragraph(
111                &blockquote_prefix,
112                &blockquote_buffer,
113                options.wrap_width,
114            ));
115            result.push('\n');
116            blockquote_buffer.clear();
117            in_blockquote_paragraph = false;
118        }
119
120        if let Some((indent, marker, content)) = parse_list_item(line) {
121            if in_paragraph && !paragraph_buffer.is_empty() {
122                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
123                result.push_str("\n\n");
124                paragraph_buffer.clear();
125                in_paragraph = false;
126            }
127
128            result.push_str(&wrap_list_item(&indent, &marker, &content, options.wrap_width));
129            continue;
130        }
131
132        let is_structural =
133            is_heading(trimmed) || trimmed.starts_with('>') || trimmed.starts_with('|') || trimmed.starts_with('=');
134
135        if is_structural {
136            if in_paragraph && !paragraph_buffer.is_empty() {
137                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
138                result.push_str("\n\n");
139                paragraph_buffer.clear();
140                in_paragraph = false;
141            }
142
143            result.push_str(line);
144            result.push('\n');
145            continue;
146        }
147
148        if line.trim().is_empty() {
149            if in_paragraph && !paragraph_buffer.is_empty() {
150                result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
151                result.push_str("\n\n");
152                paragraph_buffer.clear();
153                in_paragraph = false;
154            } else if !in_paragraph {
155                result.push('\n');
156            }
157            continue;
158        }
159
160        if in_paragraph {
161            paragraph_buffer.push(' ');
162        }
163        paragraph_buffer.push_str(line.trim());
164        in_paragraph = true;
165    }
166
167    if in_blockquote_paragraph && !blockquote_buffer.is_empty() {
168        result.push_str(&wrap_blockquote_paragraph(
169            &blockquote_prefix,
170            &blockquote_buffer,
171            options.wrap_width,
172        ));
173        result.push('\n');
174    }
175
176    if in_paragraph && !paragraph_buffer.is_empty() {
177        result.push_str(&wrap_line(&paragraph_buffer, options.wrap_width));
178        result.push_str("\n\n");
179    }
180
181    result
182}
183
/// Split a blockquote line into its `>` prefix and its trimmed content.
///
/// Returns `None` when the first non-whitespace character is not `>`.
/// The prefix keeps the leading indentation, every `>` marker (nested quotes
/// included), and at most one space after each marker; the content is the
/// remainder with surrounding whitespace removed.
fn parse_blockquote_line(line: &str) -> Option<(String, String)> {
    let body = line.trim_start();
    if !body.starts_with('>') {
        return None;
    }

    let raw = line.as_bytes();
    // Start scanning right after the indentation.
    let mut cursor = line.len() - body.len();

    while raw.get(cursor) == Some(&b'>') {
        cursor += 1;
        if raw.get(cursor) == Some(&b' ') {
            cursor += 1;
        }
        // Swallow one extra space when it directly precedes another marker.
        while raw.get(cursor) == Some(&b' ') && raw.get(cursor + 1) == Some(&b'>') {
            cursor += 1;
        }
    }

    // Only ASCII bytes were stepped over, so `cursor` is a char boundary.
    let (prefix, content) = line.split_at(cursor);
    Some((prefix.to_string(), content.trim().to_string()))
}
211
212fn wrap_blockquote_paragraph(prefix: &str, content: &str, width: usize) -> String {
213    let prefix_len = prefix.len();
214    let inner_width = if width > prefix_len { width - prefix_len } else { 1 };
215
216    let wrapped = wrap_line(content, inner_width);
217    let mut out = String::new();
218    for (idx, part) in wrapped.split('\n').enumerate() {
219        if idx > 0 {
220            out.push('\n');
221        }
222        out.push_str(prefix);
223        out.push_str(part);
224    }
225    out
226}
227
/// Report whether `trimmed` begins with a bullet character (`-`, `*` or `+`).
fn is_list_like(trimmed: &str) -> bool {
    trimmed.starts_with(['-', '*', '+'])
}
231
/// Report whether the first whitespace-delimited token of `trimmed` looks like
/// an ordered-list marker: ASCII digits followed by `.` and/or `)`.
fn is_numbered_list(trimmed: &str) -> bool {
    let Some(first) = trimmed.split_whitespace().next() else {
        return false;
    };

    let number = first.trim_end_matches(['.', ')']);
    // Something must have been stripped, i.e. the token ended with a delimiter.
    let has_delimiter = number.len() < first.len();

    has_delimiter && !number.is_empty() && number.bytes().all(|b| b.is_ascii_digit())
}
241
/// Report whether `trimmed` begins with `#`.
fn is_heading(trimmed: &str) -> bool {
    matches!(trimmed.chars().next(), Some('#'))
}
245
/// Break a list-item line into `(indent, marker, content)`.
///
/// Recognized markers are a single `-`, `*` or `+` followed by a space or the
/// end of the line, and an ASCII-digit run ending in `.` and/or `)`. The
/// returned marker always ends with exactly one space; the content has its
/// leading whitespace removed. Returns `None` for any other line.
///
/// Examples:
/// - "- text" -> ("", "- ", "text")
/// - "  - text" -> ("  ", "- ", "text")
/// - "1. text" -> ("", "1. ", "text")
/// - "  42) text" -> ("  ", "42) ", "text")
fn parse_list_item(line: &str) -> Option<(String, String, String)> {
    let body = line.trim_start();
    let (indent, _) = line.split_at(line.len() - body.len());

    // Bullet markers: the character must stand alone or be followed by a space.
    for bullet in ['-', '*', '+'] {
        if let Some(tail) = body.strip_prefix(bullet) {
            if tail.is_empty() || tail.starts_with(' ') {
                return Some((
                    indent.to_string(),
                    format!("{} ", bullet),
                    tail.trim_start().to_string(),
                ));
            }
        }
    }

    // Ordered markers: digits terminated by '.' and/or ')'.
    let token = body.split_whitespace().next()?;
    let number = token.trim_end_matches(['.', ')']);
    let has_delimiter = number.len() < token.len();

    if has_delimiter && !number.is_empty() && number.bytes().all(|b| b.is_ascii_digit()) {
        let (marker, tail) = body.split_at(token.len());
        return Some((
            indent.to_string(),
            format!("{} ", marker),
            tail.trim_start().to_string(),
        ));
    }

    None
}
292
293/// Wrap a list item while preserving its structure.
294///
295/// The first line of output will be: `<indent><marker><content_start>`
296/// Continuation lines will be: `<indent><spaces_matching_marker><content_continued>`
297///
298/// # Arguments
299/// - `indent`: The leading whitespace (for nested lists)
300/// - `marker`: The list marker (e.g., "- ", "1. ")
301/// - `content`: The text content after the marker
302/// - `width`: The maximum line width
303fn wrap_list_item(indent: &str, marker: &str, content: &str, width: usize) -> String {
304    if content.is_empty() {
305        return format!("{}{}\n", indent, marker.trim_end());
306    }
307
308    if is_single_inline_link(content) {
309        return format!("{}{}{}\n", indent, marker, content.trim());
310    }
311
312    let full_marker = format!("{}{}", indent, marker);
313    let continuation_indent = format!("{}{}", indent, " ".repeat(marker.len()));
314
315    let first_line_prefix_len = full_marker.len();
316    let first_line_width = if width > first_line_prefix_len {
317        width - first_line_prefix_len
318    } else {
319        width
320    };
321
322    let cont_line_prefix_len = continuation_indent.len();
323    let cont_line_width = if width > cont_line_prefix_len {
324        width - cont_line_prefix_len
325    } else {
326        width
327    };
328
329    let words: Vec<&str> = content.split_whitespace().collect();
330    if words.is_empty() {
331        return format!("{}\n", full_marker.trim_end());
332    }
333
334    let mut result = String::new();
335    let mut current_line = String::new();
336    let mut current_width = first_line_width;
337    let mut is_first_line = true;
338
339    for word in words {
340        let word_len = word.len();
341        let space_needed = if current_line.is_empty() { 0 } else { 1 };
342
343        if !current_line.is_empty() && current_line.len() + space_needed + word_len > current_width {
344            if is_first_line {
345                result.push_str(&full_marker);
346                is_first_line = false;
347            } else {
348                result.push_str(&continuation_indent);
349            }
350            result.push_str(&current_line);
351            result.push('\n');
352            current_line.clear();
353            current_width = cont_line_width;
354        }
355
356        if !current_line.is_empty() {
357            current_line.push(' ');
358        }
359        current_line.push_str(word);
360    }
361
362    if !current_line.is_empty() {
363        if is_first_line {
364            result.push_str(&full_marker);
365        } else {
366            result.push_str(&continuation_indent);
367        }
368        result.push_str(&current_line);
369        result.push('\n');
370    }
371
372    result
373}
374
/// Report whether `content` (ignoring surrounding whitespace) consists of one
/// inline link of the form `[label](target)`: it starts with `[`, ends with
/// `)`, has a `](` separator, and the text after the first separator contains
/// neither whitespace nor a second `](`.
fn is_single_inline_link(content: &str) -> bool {
    let candidate = content.trim();

    // Peel off the opening '[' and the closing ')'.
    let Some(inner) = candidate
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(')'))
    else {
        return false;
    };

    // Everything after the first "](" is the link target.
    let Some((_label, target)) = inner.split_once("](") else {
        return false;
    };

    !target.contains(char::is_whitespace) && !target.contains("](")
}
392
/// Wrap a single line of text at the specified width.
///
/// Words are separated on whitespace and re-joined with single spaces; output
/// lines are separated by `\n` with no trailing newline. Long words are never
/// broken and hyphens are not break points, similar to Python's
/// textwrap.fill() with break_long_words=False and break_on_hyphens=False.
///
/// `width` is measured in characters (Unicode scalar values), not bytes, so
/// non-ASCII text is not wrapped earlier than ASCII text of the same length.
///
/// Text that already fits within `width` is returned unchanged, including any
/// internal runs of whitespace.
fn wrap_line(text: &str, width: usize) -> String {
    if text.chars().count() <= width {
        return text.to_string();
    }

    let mut result = String::with_capacity(text.len());
    // Characters already placed on the current output line.
    let mut line_cols = 0usize;

    for word in text.split_whitespace() {
        let word_cols = word.chars().count();

        if !result.is_empty() {
            if line_cols + 1 + word_cols <= width {
                result.push(' ');
                line_cols += 1;
            } else {
                result.push('\n');
                line_cols = 0;
            }
        }

        result.push_str(word);
        line_cols += word_cols;
    }

    result
}
431
#[cfg(test)]
mod tests {
    use super::*;
    use crate::options::ConversionOptions;

    /// Options with wrapping switched on at `width` columns.
    fn wrapping_at(width: usize) -> ConversionOptions {
        ConversionOptions {
            wrap: true,
            wrap_width: width,
            ..Default::default()
        }
    }

    /// True when every non-blank line of `output` is at most `width` bytes long.
    fn fits_within(output: &str, width: usize) -> bool {
        output
            .lines()
            .all(|line| line.len() <= width || line.trim().is_empty())
    }

    #[test]
    fn test_wrap_line_short() {
        assert_eq!(wrap_line("Short text", 80), "Short text");
    }

    #[test]
    fn test_wrap_line_long() {
        assert_eq!(wrap_line("123456789 123456789", 10), "123456789\n123456789");
    }

    #[test]
    fn test_wrap_line_no_break_long_words() {
        assert_eq!(wrap_line("12345678901 12345", 10), "12345678901\n12345");
    }

    #[test]
    fn test_wrap_markdown_disabled() {
        let input = "This is a very long line that would normally be wrapped at 40 characters";
        let no_wrap = ConversionOptions {
            wrap: false,
            ..Default::default()
        };
        assert_eq!(wrap_markdown(input, &no_wrap), input);
    }

    #[test]
    fn test_wrap_markdown_paragraph() {
        let input = "This is a very long line that would normally be wrapped at 40 characters\n\n";
        let output = wrap_markdown(input, &wrapping_at(40));
        assert!(fits_within(&output, 40));
    }

    #[test]
    fn test_wrap_markdown_blockquote_paragraph() {
        let input = "> This is a very long blockquote line that should wrap at 30 characters\n";
        let output = wrap_markdown(input, &wrapping_at(30));

        assert!(
            fits_within(&output, 30),
            "Some lines exceed wrap width. Got: {}",
            output
        );
        assert!(
            output.contains("> This is a very"),
            "Missing expected wrapped content. Got: {}",
            output
        );
        let quoted_lines = output.lines().filter(|l| l.starts_with("> ")).count();
        assert!(
            quoted_lines >= 2,
            "Expected multiple wrapped blockquote lines. Got: {}",
            output
        );
    }

    #[test]
    fn test_wrap_markdown_preserves_code() {
        let input = "```\nThis is a very long line in a code block that should not be wrapped\n```\n";
        let output = wrap_markdown(input, &wrapping_at(40));
        assert!(output.contains("This is a very long line in a code block that should not be wrapped"));
    }

    #[test]
    fn test_wrap_markdown_preserves_headings() {
        let input = "# This is a very long heading that should not be wrapped even if it exceeds the width\n\n";
        let output = wrap_markdown(input, &wrapping_at(40));
        assert!(
            output.contains("# This is a very long heading that should not be wrapped even if it exceeds the width")
        );
    }

    #[test]
    fn wrap_markdown_wraps_long_list_items() {
        let input = "- This is a very long list item that should definitely be wrapped when it exceeds the specified wrap width\n- Short item\n";
        let output = wrap_markdown(input, &wrapping_at(60));

        assert!(
            output.contains("- This is a very long list item that should definitely be\n  wrapped"),
            "First list item not properly wrapped. Got: {}",
            output
        );
        assert!(
            output.contains("- Short item"),
            "Short list item incorrectly modified. Got: {}",
            output
        );
    }

    #[test]
    fn wrap_markdown_wraps_ordered_lists() {
        let input = "1. This is a numbered list item with a very long text that should be wrapped at the specified width\n2. Short\n";
        let output = wrap_markdown(input, &wrapping_at(60));

        assert!(
            fits_within(&output, 60),
            "Some lines exceed wrap width. Got: {}",
            output
        );
        assert!(output.contains("1."), "Lost ordered list marker. Got: {}", output);
        assert!(
            output.contains("2."),
            "Lost second ordered list marker. Got: {}",
            output
        );
    }

    #[test]
    fn wrap_markdown_preserves_nested_list_structure() {
        let input = "- Item one with some additional text that will need to be wrapped across multiple lines\n  - Nested item with long text that also needs wrapping at the specified width\n  - Short nested\n";
        let output = wrap_markdown(input, &wrapping_at(50));

        assert!(output.contains("- Item"), "Lost top-level list marker. Got: {}", output);
        assert!(
            output.contains("  - Nested"),
            "Lost nested list structure. Got: {}",
            output
        );
        assert!(
            fits_within(&output, 50),
            "Some lines exceed wrap width. Got: {}",
            output
        );
    }

    #[test]
    fn wrap_markdown_handles_list_with_links() {
        let input = "- [A](#a) with additional text that is long enough to require wrapping at the configured width\n  - [B](#b) also has more content that needs wrapping\n  - [C](#c)\n";
        let output = wrap_markdown(input, &wrapping_at(50));

        assert!(output.contains("[A](#a)"), "Lost link in list. Got: {}", output);
        assert!(output.contains("[B](#b)"), "Lost nested link. Got: {}", output);
        assert!(output.contains("[C](#c)"), "Lost short nested link. Got: {}", output);
        assert!(
            output.contains("- [A](#a)"),
            "Lost list structure with link. Got: {}",
            output
        );
        assert!(
            output.contains("  - [B](#b)"),
            "Lost nested list structure. Got: {}",
            output
        );
    }

    #[test]
    fn wrap_markdown_handles_empty_list_items() {
        let input = "- \n- Item with text\n- \n";
        let output = wrap_markdown(input, &wrapping_at(40));

        assert!(output.contains("- "), "Lost list markers. Got: {}", output);
        assert!(output.contains("Item with text"), "Lost item text. Got: {}", output);
    }

    #[test]
    fn wrap_markdown_preserves_indented_lists_with_wrapping() {
        let input = "- [A](#a) with some additional text that makes this line very long and should be wrapped\n  - [B](#b)\n  - [C](#c) with more text that is also quite long and needs wrapping\n";
        let output = wrap_markdown(input, &wrapping_at(50));

        assert!(output.contains("- [A](#a)"), "Lost top-level link. Got: {}", output);
        assert!(output.contains("  - [B](#b)"), "Lost nested link B. Got: {}", output);
        assert!(output.contains("  - [C](#c)"), "Lost nested link C. Got: {}", output);
        // Strict check: blank lines get no exemption here.
        assert!(
            output.lines().all(|line| line.len() <= 50),
            "Some lines exceed wrap width:\n{}",
            output
        );
    }

    #[test]
    fn wrap_markdown_does_not_wrap_link_only_items() {
        let input = "- [A very long link label that would exceed wrap width](#a-very-long-link-label)\n  - [Nested very long link label that would also exceed](#nested)\n";
        let output = wrap_markdown(input, &wrapping_at(30));

        assert!(output.contains("- [A very long link label that would exceed wrap width](#a-very-long-link-label)"));
        assert!(output.contains("  - [Nested very long link label that would also exceed](#nested)"));
    }
}