Skip to main content

web_capture/
postprocess.rs

//! Markdown post-processing pipeline (R1).
//!
//! Applies a series of text transformations to improve markdown quality:
//! - Unicode normalization (non-breaking spaces, curly quotes, dashes, ellipses)
//! - LaTeX formula spacing fixes for GitHub rendering
//! - Bold formatting cleanup
//! - Percent sign fix for GitHub `KaTeX`
//! - Whitespace cleanup (repeated spaces, em-dash spacing, stray `$` lines)
//!
//! Based on the reference implementation from:
//! <https://github.com/link-foundation/meta-theory/blob/main/scripts/download-article.mjs>
11
12use regex::Regex;
13
/// Switches selecting which optional post-processing passes run.
///
/// Each flag gates exactly one pass of `post_process_markdown`.
// Four independent on/off switches are the whole purpose of this type, so the
// "too many bools" lint is deliberately silenced.
#[allow(clippy::struct_excessive_bools)]
#[derive(Clone, Debug)]
pub struct PostProcessOptions {
    /// Run `apply_unicode_normalization`.
    pub normalize_unicode: bool,
    /// Run `apply_latex_spacing_fixes`.
    pub fix_latex_spacing: bool,
    /// Run `apply_bold_formatting_fixes`.
    pub fix_bold_formatting: bool,
    /// Run `apply_percent_sign_fix`.
    pub fix_percent_sign: bool,
}
23
24impl Default for PostProcessOptions {
25    fn default() -> Self {
26        Self {
27            normalize_unicode: true,
28            fix_latex_spacing: true,
29            fix_bold_formatting: true,
30            fix_percent_sign: true,
31        }
32    }
33}
34
35/// Apply all post-processing transformations to markdown text.
36#[must_use]
37pub fn post_process_markdown(markdown: &str, options: &PostProcessOptions) -> String {
38    let mut result = markdown.to_string();
39
40    if options.normalize_unicode {
41        result = apply_unicode_normalization(&result);
42    }
43
44    if options.fix_latex_spacing {
45        result = apply_latex_spacing_fixes(&result);
46    }
47
48    if options.fix_percent_sign {
49        result = apply_percent_sign_fix(&result);
50    }
51
52    if options.fix_bold_formatting {
53        result = apply_bold_formatting_fixes(&result);
54    }
55
56    // Fix double spaces (but not in code blocks)
57    if let Ok(re) = Regex::new(r"([^\n`]) +") {
58        result = re
59            .replace_all(&result, |caps: &regex::Captures| format!("{} ", &caps[1]))
60            .to_string();
61    }
62
63    // Clean up extra spaces around em-dashes
64    if let Ok(re) = Regex::new(r"\s+\u{2014}\s+") {
65        result = re.replace_all(&result, " \u{2014} ").to_string();
66    }
67
68    // Fix stray standalone $ signs on their own line
69    if let Ok(re) = Regex::new(r"(?m)^\$\s*$") {
70        result = re.replace_all(&result, "").to_string();
71    }
72
73    result
74}
75
/// Replace typographic unicode characters with plainer equivalents.
///
/// Mapping applied to every character of `text`:
/// - non-breaking space (U+00A0) becomes the literal entity `&nbsp;`
/// - curly single quotes (U+2018, U+2019) become `'`
/// - curly double quotes (U+201C, U+201D) become `"`
/// - em-dash (U+2014) is kept but wrapped in one space on each side
/// - en-dash (U+2013) becomes `-`
/// - ellipsis (U+2026) becomes `...`
///
/// All other characters are copied through unchanged.
#[must_use]
pub fn apply_unicode_normalization(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    for ch in text.chars() {
        match ch {
            // Keep non-breaking spaces visible as an explicit entity.
            '\u{00A0}' => normalized.push_str("&nbsp;"),
            '\u{2018}' | '\u{2019}' => normalized.push('\''),
            '\u{201C}' | '\u{201D}' => normalized.push('"'),
            // The spaces are added unconditionally, even next to existing ones.
            '\u{2014}' => normalized.push_str(" \u{2014} "),
            '\u{2013}' => normalized.push('-'),
            '\u{2026}' => normalized.push_str("..."),
            other => normalized.push(other),
        }
    }

    normalized
}
99
100/// Fix spacing around inline LaTeX formulas for GitHub rendering.
101///
102/// Uses a line-by-line token-based approach to correctly identify
103/// opening/closing `$` delimiters.
104#[must_use]
105pub fn apply_latex_spacing_fixes(text: &str) -> String {
106    text.lines()
107        .map(|line| {
108            // Skip block formula lines ($$...$$) and blockquote block formulas
109            let trimmed = line.trim_start_matches('>').trim_start();
110            if trimmed.starts_with("$$") && trimmed.ends_with("$$") {
111                return line.to_string();
112            }
113
114            // Find all inline formula spans by tracking $ delimiters
115            let chars: Vec<char> = line.chars().collect();
116            let mut formulas = Vec::new();
117            let mut i = 0;
118
119            while i < chars.len() {
120                if chars[i] == '$' && (i == 0 || chars[i - 1] != '\\') {
121                    // Skip $$ block delimiters
122                    if i + 1 < chars.len() && chars[i + 1] == '$' {
123                        i += 2;
124                        continue;
125                    }
126                    // Found opening $, find closing $
127                    let start = i;
128                    i += 1;
129                    while i < chars.len() && (chars[i] != '$' || chars[i - 1] == '\\') {
130                        i += 1;
131                    }
132                    if i < chars.len() {
133                        formulas.push((start, i));
134                        i += 1;
135                    }
136                } else {
137                    i += 1;
138                }
139            }
140
141            if formulas.is_empty() {
142                return line.to_string();
143            }
144
145            // Build the line with fixes applied
146            let mut fixed = String::new();
147            let mut pos = 0;
148
149            for (start, end) in &formulas {
150                // Append text before formula
151                let before: String = chars[pos..*start].iter().collect();
152                fixed.push_str(&before);
153
154                let raw_inner: String = chars[start + 1..*end].iter().collect();
155                let inner = raw_inner.trim();
156
157                // Add space before formula if preceded by word char, comma, colon, etc.
158                if !fixed.is_empty() {
159                    let last_char = fixed.chars().last().unwrap_or(' ');
160                    if is_pre_formula_char(last_char) {
161                        fixed.push(' ');
162                    }
163                }
164
165                fixed.push('$');
166                fixed.push_str(inner);
167                fixed.push('$');
168
169                // Add space after formula if followed by word character
170                let after_pos = end + 1;
171                if after_pos < chars.len() && is_post_formula_char(chars[after_pos]) {
172                    fixed.push(' ');
173                }
174
175                pos = end + 1;
176            }
177            // Append remaining text
178            let remaining: String = chars[pos..].iter().collect();
179            fixed.push_str(&remaining);
180
181            fixed
182        })
183        .collect::<Vec<_>>()
184        .join("\n")
185}
186
/// Decide whether a space is needed between `c` and a following opening `$`.
///
/// True for ASCII letters and digits, Russian letters (including ё/Ё), and
/// the punctuation `,` `:` `;` `»` `)` `]`.
fn is_pre_formula_char(c: char) -> bool {
    matches!(
        c,
        'a'..='z'
            | 'A'..='Z'
            | '0'..='9'
            | '\u{0410}'..='\u{044F}' // Russian А–я (upper and lower case)
            | '\u{0451}' // ё
            | '\u{0401}' // Ё
            | ','
            | ':'
            | ';'
            | '\u{00BB}' // »
            | ')'
            | ']'
    )
}
201
/// Decide whether a space is needed between a closing `$` and a following `c`.
///
/// True only for letters: ASCII alphabetic characters and Russian letters
/// (including ё/Ё). Digits and punctuation do not qualify.
fn is_post_formula_char(c: char) -> bool {
    matches!(
        c,
        'a'..='z'
            | 'A'..='Z'
            | '\u{0410}'..='\u{044F}' // Russian А–я (upper and lower case)
            | '\u{0451}' // ё
            | '\u{0401}' // Ё
    )
}
210
211/// Fix percent sign in inline formulas for GitHub `KaTeX` rendering.
212///
213/// GitHub's `KaTeX` treats `%` as a LaTeX comment character.
214/// Workaround: use `\\%` which GitHub's preprocessor converts to `\%`.
215#[must_use]
216pub fn apply_percent_sign_fix(text: &str) -> String {
217    let mut result = text.to_string();
218    if let Ok(re) = Regex::new(r"\$(\d+)\\+%\$") {
219        result = re.replace_all(&result, r"$$$1\\%$$").to_string();
220    }
221    if let Ok(re) = Regex::new(r"\$(\d+)\\text\{%\}\$") {
222        result = re.replace_all(&result, r"$$$1\\%$$").to_string();
223    }
224    result
225}
226
227/// Clean up bold formatting artifacts from HTML-to-markdown conversion.
228#[must_use]
229pub fn apply_bold_formatting_fixes(text: &str) -> String {
230    let mut result = text.to_string();
231
232    // Remove empty bold markers
233    if let Ok(re) = Regex::new(r"(\S)\*\*[^\S\n]*\*\*(\S)") {
234        result = re.replace_all(&result, "$1 $2").to_string();
235    }
236    if let Ok(re) = Regex::new(r"\*\*[^\S\n]*\*\*") {
237        result = re.replace_all(&result, "").to_string();
238    }
239
240    // Fix bold marker spacing: trim content inside **...**
241    result = result
242        .lines()
243        .map(fix_bold_line)
244        .collect::<Vec<_>>()
245        .join("\n");
246
247    result
248}
249
250/// Fix bold formatting on a single line.
251fn fix_bold_line(line: &str) -> String {
252    enum Part {
253        Text(String),
254        Bold(String),
255    }
256
257    let Ok(bold_re) = Regex::new(r"\*\*(.+?)\*\*") else {
258        return line.to_string();
259    };
260
261    if !bold_re.is_match(line) {
262        return line.to_string();
263    }
264    let mut parts: Vec<Part> = Vec::new();
265    let mut last_end = 0;
266
267    for cap in bold_re.captures_iter(line) {
268        let m = cap.get(0).unwrap();
269        if m.start() > last_end {
270            parts.push(Part::Text(line[last_end..m.start()].to_string()));
271        }
272        parts.push(Part::Bold(cap[1].trim().to_string()));
273        last_end = m.end();
274    }
275    if last_end < line.len() {
276        parts.push(Part::Text(line[last_end..].to_string()));
277    }
278
279    // Rebuild line
280    let mut rebuilt = String::new();
281    let parts_len = parts.len();
282    for (idx, part) in parts.into_iter().enumerate() {
283        match part {
284            Part::Text(s) => rebuilt.push_str(&s),
285            Part::Bold(content) => {
286                if content.is_empty() {
287                    continue;
288                }
289                if !rebuilt.is_empty() {
290                    let last = rebuilt.chars().last().unwrap_or(' ');
291                    if last.is_alphanumeric()
292                        || ('\u{0430}'..='\u{044F}').contains(&last)
293                        || ('\u{0410}'..='\u{042F}').contains(&last)
294                        || last == ')'
295                        || last == '.'
296                    {
297                        rebuilt.push(' ');
298                    }
299                }
300                rebuilt.push_str("**");
301                rebuilt.push_str(&content);
302                rebuilt.push_str("**");
303                // Check if next part starts with word character
304                if idx + 1 < parts_len {
305                    // Peek is hard here, but the JS just checks next part content
306                    // We'll handle this by checking rebuilt state in next iteration
307                }
308            }
309        }
310    }
311
312    rebuilt
313}