// web_capture/postprocess.rs
use regex::Regex;
13
/// Switches that select which clean-up passes `post_process_markdown` runs.
///
/// Each flag gates exactly one pass; [`Default`] turns every pass on.
// Four independent on/off toggles are the natural shape for this type, so the
// pedantic "too many bools" lint is silenced deliberately.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PostProcessOptions {
    /// Run `apply_unicode_normalization` on the text.
    pub normalize_unicode: bool,
    /// Run `apply_latex_spacing_fixes` on the text.
    pub fix_latex_spacing: bool,
    /// Run `apply_bold_formatting_fixes` on the text.
    pub fix_bold_formatting: bool,
    /// Run `apply_percent_sign_fix` on the text.
    pub fix_percent_sign: bool,
}

impl Default for PostProcessOptions {
    /// Returns options with every pass enabled.
    fn default() -> Self {
        Self {
            normalize_unicode: true,
            fix_latex_spacing: true,
            fix_bold_formatting: true,
            fix_percent_sign: true,
        }
    }
}
34
35#[must_use]
37pub fn post_process_markdown(markdown: &str, options: &PostProcessOptions) -> String {
38 let mut result = markdown.to_string();
39
40 if options.normalize_unicode {
41 result = apply_unicode_normalization(&result);
42 }
43
44 if options.fix_latex_spacing {
45 result = apply_latex_spacing_fixes(&result);
46 }
47
48 if options.fix_percent_sign {
49 result = apply_percent_sign_fix(&result);
50 }
51
52 if options.fix_bold_formatting {
53 result = apply_bold_formatting_fixes(&result);
54 }
55
56 if let Ok(re) = Regex::new(r"([^\n`]) +") {
58 result = re
59 .replace_all(&result, |caps: ®ex::Captures| format!("{} ", &caps[1]))
60 .to_string();
61 }
62
63 if let Ok(re) = Regex::new(r"\s+\u{2014}\s+") {
65 result = re.replace_all(&result, " \u{2014} ").to_string();
66 }
67
68 if let Ok(re) = Regex::new(r"(?m)^\$\s*$") {
70 result = re.replace_all(&result, "").to_string();
71 }
72
73 result
74}
75
/// Replaces a fixed set of typographic characters with plainer equivalents.
///
/// | Input                          | Output                         |
/// |--------------------------------|--------------------------------|
/// | no-break space (U+00A0)        | ASCII space                    |
/// | single quotes (U+2018, U+2019) | `'`                            |
/// | double quotes (U+201C, U+201D) | `"`                            |
/// | em dash (U+2014)               | em dash with a space each side |
/// | en dash (U+2013)               | `-`                            |
/// | ellipsis (U+2026)              | `...`                          |
///
/// Every other character is copied through unchanged.
#[must_use]
pub fn apply_unicode_normalization(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    for ch in text.chars() {
        match ch {
            '\u{00A0}' => normalized.push(' '),
            '\u{2018}' | '\u{2019}' => normalized.push('\''),
            '\u{201C}' | '\u{201D}' => normalized.push('"'),
            // Padding is added unconditionally, even if spaces are already there.
            '\u{2014}' => normalized.push_str(" \u{2014} "),
            '\u{2013}' => normalized.push('-'),
            '\u{2026}' => normalized.push_str("..."),
            other => normalized.push(other),
        }
    }

    normalized
}
99
100#[must_use]
105pub fn apply_latex_spacing_fixes(text: &str) -> String {
106 text.lines()
107 .map(|line| {
108 let trimmed = line.trim_start_matches('>').trim_start();
110 if trimmed.starts_with("$$") && trimmed.ends_with("$$") {
111 return line.to_string();
112 }
113
114 let chars: Vec<char> = line.chars().collect();
116 let mut formulas = Vec::new();
117 let mut i = 0;
118
119 while i < chars.len() {
120 if chars[i] == '$' && (i == 0 || chars[i - 1] != '\\') {
121 if i + 1 < chars.len() && chars[i + 1] == '$' {
123 i += 2;
124 continue;
125 }
126 let start = i;
128 i += 1;
129 while i < chars.len() && (chars[i] != '$' || chars[i - 1] == '\\') {
130 i += 1;
131 }
132 if i < chars.len() {
133 formulas.push((start, i));
134 i += 1;
135 }
136 } else {
137 i += 1;
138 }
139 }
140
141 if formulas.is_empty() {
142 return line.to_string();
143 }
144
145 let mut fixed = String::new();
147 let mut pos = 0;
148
149 for (start, end) in &formulas {
150 let before: String = chars[pos..*start].iter().collect();
152 fixed.push_str(&before);
153
154 let raw_inner: String = chars[start + 1..*end].iter().collect();
155 let inner = raw_inner.trim();
156
157 if !fixed.is_empty() {
159 let last_char = fixed.chars().last().unwrap_or(' ');
160 if is_pre_formula_char(last_char) {
161 fixed.push(' ');
162 }
163 }
164
165 fixed.push('$');
166 fixed.push_str(inner);
167 fixed.push('$');
168
169 let after_pos = end + 1;
171 if after_pos < chars.len() && is_post_formula_char(chars[after_pos]) {
172 fixed.push(' ');
173 }
174
175 pos = end + 1;
176 }
177 let remaining: String = chars[pos..].iter().collect();
179 fixed.push_str(&remaining);
180
181 fixed
182 })
183 .collect::<Vec<_>>()
184 .join("\n")
185}
186
/// Reports whether `c`, sitting directly before an inline formula, should be
/// separated from it by a space.
///
/// True for ASCII letters and digits, Cyrillic letters U+0410–U+044F plus
/// U+0401 / U+0451, the punctuation `,` `:` `;` `)` `]`, and the closing
/// guillemet U+00BB.
fn is_pre_formula_char(c: char) -> bool {
    matches!(
        c,
        'a'..='z'
            | 'A'..='Z'
            | '0'..='9'
            // U+0410–U+042F and U+0430–U+044F are adjacent, so one range covers both.
            | '\u{0410}'..='\u{044F}'
            | '\u{0401}'
            | '\u{0451}'
            | ','
            | ':'
            | ';'
            | ')'
            | ']'
            | '\u{00BB}'
    )
}
201
/// Reports whether `c`, sitting directly after an inline formula, should be
/// separated from it by a space.
///
/// True for ASCII letters and for Cyrillic letters U+0410–U+044F plus
/// U+0401 / U+0451; digits and punctuation do not qualify.
fn is_post_formula_char(c: char) -> bool {
    matches!(
        c,
        'a'..='z'
            | 'A'..='Z'
            // U+0410–U+042F and U+0430–U+044F are adjacent, so one range covers both.
            | '\u{0410}'..='\u{044F}'
            | '\u{0401}'
            | '\u{0451}'
    )
}
210
211#[must_use]
216pub fn apply_percent_sign_fix(text: &str) -> String {
217 let mut result = text.to_string();
218 if let Ok(re) = Regex::new(r"\$(\d+)\\+%\$") {
219 result = re.replace_all(&result, r"$$$1\\%$$").to_string();
220 }
221 if let Ok(re) = Regex::new(r"\$(\d+)\\text\{%\}\$") {
222 result = re.replace_all(&result, r"$$$1\\%$$").to_string();
223 }
224 result
225}
226
227#[must_use]
229pub fn apply_bold_formatting_fixes(text: &str) -> String {
230 let mut result = text.to_string();
231
232 if let Ok(re) = Regex::new(r"(\S)\*\*[^\S\n]*\*\*(\S)") {
234 result = re.replace_all(&result, "$1 $2").to_string();
235 }
236 if let Ok(re) = Regex::new(r"\*\*[^\S\n]*\*\*") {
237 result = re.replace_all(&result, "").to_string();
238 }
239
240 result = result
242 .lines()
243 .map(fix_bold_line)
244 .collect::<Vec<_>>()
245 .join("\n");
246
247 result
248}
249
250fn fix_bold_line(line: &str) -> String {
252 enum Part {
253 Text(String),
254 Bold(String),
255 }
256
257 let Ok(bold_re) = Regex::new(r"\*\*(.+?)\*\*") else {
258 return line.to_string();
259 };
260
261 if !bold_re.is_match(line) {
262 return line.to_string();
263 }
264 let mut parts: Vec<Part> = Vec::new();
265 let mut last_end = 0;
266
267 for cap in bold_re.captures_iter(line) {
268 let m = cap.get(0).unwrap();
269 if m.start() > last_end {
270 parts.push(Part::Text(line[last_end..m.start()].to_string()));
271 }
272 parts.push(Part::Bold(cap[1].trim().to_string()));
273 last_end = m.end();
274 }
275 if last_end < line.len() {
276 parts.push(Part::Text(line[last_end..].to_string()));
277 }
278
279 let mut rebuilt = String::new();
281 let parts_len = parts.len();
282 for (idx, part) in parts.into_iter().enumerate() {
283 match part {
284 Part::Text(s) => rebuilt.push_str(&s),
285 Part::Bold(content) => {
286 if content.is_empty() {
287 continue;
288 }
289 if !rebuilt.is_empty() {
290 let last = rebuilt.chars().last().unwrap_or(' ');
291 if last.is_alphanumeric()
292 || ('\u{0430}'..='\u{044F}').contains(&last)
293 || ('\u{0410}'..='\u{042F}').contains(&last)
294 || last == ')'
295 || last == '.'
296 {
297 rebuilt.push(' ');
298 }
299 }
300 rebuilt.push_str("**");
301 rebuilt.push_str(&content);
302 rebuilt.push_str("**");
303 if idx + 1 < parts_len {
305 }
308 }
309 }
310 }
311
312 rebuilt
313}