Skip to main content

markdown_strip/
lib.rs

1//! # markdown-strip
2//!
3//! Reduce Markdown text to plain text. Conservative — keeps semantic
4//! content intact, drops only formatting markers. Intended for piping
5//! LLM output into TTS, keyword matching, or analytics.
6//!
7//! Handles: ATX headers (`# `, `## ` …), bold/italic (`**x**`, `*x*`,
8//! `__x__`, `_x_`), inline code (backtick spans), fenced code blocks,
9//! links (`[text](url)` → `text`), images (`![alt](url)` → `alt`),
10//! blockquote markers (`> `), and bullet/number list markers.
11//!
12//! ## Example
13//!
14//! ```
15//! use markdown_strip::strip_markdown;
16//! let md = "## Hello\n\n**bold** and *italic* with `code` and [a link](https://x).";
17//! let plain = strip_markdown(md);
18//! assert_eq!(plain, "Hello\n\nbold and italic with code and a link.");
19//! ```
20
21#![deny(missing_docs)]
22
23/// Strip Markdown formatting from `s`.
24pub fn strip_markdown(s: &str) -> String {
25    let mut out = String::with_capacity(s.len());
26    let mut in_fence = false;
27
28    for line in s.lines() {
29        // Fenced code blocks: skip the fences themselves; keep the body
30        // as-is (LLM tool output is often code we want to preserve).
31        let trimmed = line.trim_start();
32        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
33            in_fence = !in_fence;
34            continue;
35        }
36        if in_fence {
37            out.push_str(line);
38            out.push('\n');
39            continue;
40        }
41
42        let stripped = strip_line(line);
43        out.push_str(&stripped);
44        out.push('\n');
45    }
46
47    // Trim the final pushed newline if input didn't end with one.
48    if !s.ends_with('\n') {
49        if out.ends_with('\n') {
50            out.pop();
51        }
52    }
53    out
54}
55
56fn strip_line(line: &str) -> String {
57    let mut s = line.to_string();
58
59    // ATX headers: leading `#{1,6}\s+`
60    s = strip_atx_header(&s);
61
62    // Blockquote: leading `>` optionally followed by space.
63    s = strip_blockquote(&s);
64
65    // Bullet/number list markers at the start.
66    s = strip_list_marker(&s);
67
68    // Inline tokens (links, images, bold, italic, code).
69    s = strip_inline(&s);
70
71    s
72}
73
/// Remove a leading ATX header marker (`#` through `######`) from `s`.
///
/// The marker must be followed by at least one space or tab; the whole run of
/// spaces/tabs after the hashes is removed along with them. Leading
/// indentation is preserved. Lines that are not headers (`#hashtag`, seven or
/// more hashes, a bare `#`) are returned unchanged.
fn strip_atx_header(s: &str) -> String {
    let rest = s.trim_start();
    let indent = &s[..s.len() - rest.len()];

    let after_hashes = rest.trim_start_matches('#');
    let hashes = rest.len() - after_hashes.len();

    // The separator is `\s+`, not a single space.
    let title = after_hashes.trim_start_matches([' ', '\t']);
    let has_separator = title.len() < after_hashes.len();

    if (1..=6).contains(&hashes) && has_separator {
        format!("{indent}{title}")
    } else {
        s.to_string()
    }
}
91
/// Remove leading blockquote markers (`>`) from `s`.
///
/// All nesting levels are removed (`> > text` and `>> text` both become
/// `text`). One optional space after each marker is consumed with it; any
/// further spacing after the last marker is kept. Leading indentation is
/// preserved, and lines without a marker are returned unchanged.
fn strip_blockquote(s: &str) -> String {
    let rest = s.trim_start();
    let indent = &s[..s.len() - rest.len()];

    let mut body = rest;
    // Peek past spacing between nested markers without committing to it, so
    // that text after the final marker keeps its own leading spaces.
    while let Some(after) = body.trim_start_matches([' ', '\t']).strip_prefix('>') {
        body = after.strip_prefix(' ').unwrap_or(after);
    }

    if body.len() == rest.len() {
        s.to_string()
    } else {
        format!("{indent}{body}")
    }
}
103
/// Remove a leading list marker from `s`, keeping any indentation.
///
/// Recognised markers are the bullets `- `, `* `, `+ ` and ordered markers
/// made of one or more ASCII digits followed by `. ` or `) `. The single
/// space after the marker is part of it. Anything else is returned unchanged.
fn strip_list_marker(s: &str) -> String {
    let body = s.trim_start();
    let indent = &s[..s.len() - body.len()];

    let item_text = body
        .strip_prefix("- ")
        .or_else(|| body.strip_prefix("* "))
        .or_else(|| body.strip_prefix("+ "))
        .or_else(|| {
            // Ordered marker: at least one digit, then `. ` or `) `.
            let tail = body.trim_start_matches(|c: char| c.is_ascii_digit());
            if tail.len() == body.len() {
                None
            } else {
                tail.strip_prefix(". ").or_else(|| tail.strip_prefix(") "))
            }
        });

    match item_text {
        Some(text) => format!("{indent}{text}"),
        None => s.to_string(),
    }
}
129
130fn strip_inline(s: &str) -> String {
131    let mut out = String::with_capacity(s.len());
132    let bytes = s.as_bytes();
133    let mut i = 0;
134    while i < bytes.len() {
135        // Image: ![alt](url) -> alt
136        if i + 1 < bytes.len() && bytes[i] == b'!' && bytes[i + 1] == b'[' {
137            if let Some((alt, end)) = parse_link(&s[i + 1..]) {
138                out.push_str(&alt);
139                i += 1 + end;
140                continue;
141            }
142        }
143        // Link: [text](url) -> text
144        if bytes[i] == b'[' {
145            if let Some((text, end)) = parse_link(&s[i..]) {
146                out.push_str(&text);
147                i += end;
148                continue;
149            }
150        }
151        // Inline code `code` -> code
152        if bytes[i] == b'`' {
153            if let Some(end_rel) = s[i + 1..].find('`') {
154                out.push_str(&s[i + 1..i + 1 + end_rel]);
155                i += 2 + end_rel;
156                continue;
157            }
158        }
159        // Bold **x** or __x__ → x. Italic *x* or _x_ → x. We handle the
160        // bold case first so we don't strip half a bold pair as italic.
161        if i + 1 < bytes.len() && (bytes[i] == b'*' && bytes[i + 1] == b'*') {
162            if let Some(end_rel) = s[i + 2..].find("**") {
163                out.push_str(&s[i + 2..i + 2 + end_rel]);
164                i += 4 + end_rel;
165                continue;
166            }
167        }
168        if i + 1 < bytes.len() && (bytes[i] == b'_' && bytes[i + 1] == b'_') {
169            if let Some(end_rel) = s[i + 2..].find("__") {
170                out.push_str(&s[i + 2..i + 2 + end_rel]);
171                i += 4 + end_rel;
172                continue;
173            }
174        }
175        if bytes[i] == b'*' {
176            if let Some(end_rel) = s[i + 1..].find('*') {
177                out.push_str(&s[i + 1..i + 1 + end_rel]);
178                i += 2 + end_rel;
179                continue;
180            }
181        }
182        if bytes[i] == b'_' && is_word_boundary(bytes, i) {
183            if let Some(end_rel) = s[i + 1..].find('_') {
184                out.push_str(&s[i + 1..i + 1 + end_rel]);
185                i += 2 + end_rel;
186                continue;
187            }
188        }
189        // Default: copy one byte (safe because we only branched on ASCII).
190        out.push(bytes[i] as char);
191        i += 1;
192    }
193    out
194}
195
/// Parse an inline link of the form `[text](destination)` at the start of `s`.
///
/// Returns the link text and the number of bytes the whole construct occupies
/// in `s`, or `None` if `s` does not start with a well-formed inline link.
///
/// Brackets in the text and parentheses in the destination are matched with
/// nesting, so `[a [b] c](u)` and `[t](https://x/y_(z))` are consumed in
/// full, and `[note] and [t](u)` is *not* mistaken for one link whose text
/// runs from the first `[` to the last `](`.
fn parse_link(s: &str) -> Option<(String, usize)> {
    let inner = s.strip_prefix('[')?;
    let text_len = find_balanced_close(inner, b'[', b']')?;

    // The destination must follow the closing bracket immediately.
    let dest = inner[text_len + 1..].strip_prefix('(')?;
    let dest_len = find_balanced_close(dest, b'(', b')')?;

    // '[' + text + ']' + '(' + destination + ')'
    let consumed = 1 + text_len + 1 + 1 + dest_len + 1;
    Some((inner[..text_len].to_string(), consumed))
}

/// Byte offset in `s` of the `close` delimiter that balances an already
/// consumed `open` delimiter, honouring nested `open`/`close` pairs.
fn find_balanced_close(s: &str, open: u8, close: u8) -> Option<usize> {
    let mut depth = 0usize;
    for (idx, byte) in s.bytes().enumerate() {
        if byte == close {
            if depth == 0 {
                return Some(idx);
            }
            depth -= 1;
        } else if byte == open {
            depth += 1;
        }
    }
    None
}
208
/// Report whether position `i` in `bytes` is at the start of a word, i.e. it
/// is the very beginning or the character before it is not alphanumeric.
///
/// `bytes` is expected to be UTF-8 text. The preceding character is decoded
/// in full so that non-ASCII letters (as in `café_au_lait`) count as word
/// characters; if the preceding bytes are not valid UTF-8 the position is
/// treated as a boundary.
fn is_word_boundary(bytes: &[u8], i: usize) -> bool {
    // Walk back over UTF-8 continuation bytes (0b10xx_xxxx) to the first
    // byte of the previous character.
    let mut start = i;
    while start > 0 {
        start -= 1;
        if bytes[start] & 0xC0 != 0x80 {
            break;
        }
    }

    let prev = std::str::from_utf8(&bytes[start..i])
        .ok()
        .and_then(|text| text.chars().next_back());
    match prev {
        Some(ch) => !ch.is_alphanumeric(),
        None => true,
    }
}
215}