Skip to main content

imessage_core/
formatting.rs

1/// Markdown-to-iMessage text formatting parser.
2///
3/// Converts markdown emphasis markers to iMessage `textFormatting` ranges.
4use fancy_regex::Regex;
5use serde::{Deserialize, Serialize};
6use serde_json::{Value, json};
7use std::sync::LazyLock;
8
/// A text formatting style supported by iMessage.
///
/// With serde, variants are (de)serialized under their camelCase names
/// (`rename_all = "camelCase"`), which for these single-word variants are the
/// same lowercase strings used by `as_str` / `from_str`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum TextFormattingStyle {
    /// Wire name `"bold"`.
    Bold,
    /// Wire name `"italic"`.
    Italic,
    /// Wire name `"underline"`.
    Underline,
    /// Wire name `"strikethrough"`.
    Strikethrough,
}
18
19impl TextFormattingStyle {
20    fn as_str(&self) -> &'static str {
21        match self {
22            Self::Bold => "bold",
23            Self::Italic => "italic",
24            Self::Underline => "underline",
25            Self::Strikethrough => "strikethrough",
26        }
27    }
28
29    fn from_str(s: &str) -> Option<Self> {
30        match s {
31            "bold" => Some(Self::Bold),
32            "italic" => Some(Self::Italic),
33            "underline" => Some(Self::Underline),
34            "strikethrough" => Some(Self::Strikethrough),
35            _ => None,
36        }
37    }
38}
39
/// A formatting range: start offset, length, and styles.
///
/// Ranges built by `parse_markdown_formatting` take `start` and `length` from
/// `String::len()`, i.e. they are counted in UTF-8 bytes.
// NOTE(review): the unit the receiving API expects (bytes vs. UTF-16 code
// units vs. characters) is not visible in this file — confirm for non-ASCII text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextFormattingRange {
    /// Offset of the first formatted unit within the message text.
    pub start: usize,
    /// Number of units covered by the range.
    pub length: usize,
    /// Styles applied to the whole range.
    pub styles: Vec<TextFormattingStyle>,
}
47
48impl TextFormattingRange {
49    pub fn to_json(&self) -> Value {
50        json!({
51            "start": self.start,
52            "length": self.length,
53            "styles": self.styles.iter().map(|s| s.as_str()).collect::<Vec<_>>(),
54        })
55    }
56}
57
58/// Result of parsing markdown formatting.
59pub struct ParsedFormatting {
60    pub clean_text: String,
61    pub formatting: Vec<TextFormattingRange>,
62}
63
64impl ParsedFormatting {
65    /// Convert the formatting ranges to a JSON array suitable for the Private API.
66    pub fn formatting_json(&self) -> Value {
67        json!(
68            self.formatting
69                .iter()
70                .map(|r| r.to_json())
71                .collect::<Vec<_>>()
72        )
73    }
74}
75
76/// Validate client-provided text formatting ranges.
77pub fn validate_text_formatting(formatting: &Value, message: &str) -> Result<(), String> {
78    let arr = formatting
79        .as_array()
80        .ok_or("textFormatting must be an array")?;
81
82    if message.is_empty() {
83        return Err("A non-empty 'message' is required when using textFormatting".to_string());
84    }
85
86    let msg_len = message.len();
87    for (i, range) in arr.iter().enumerate() {
88        let obj = range
89            .as_object()
90            .ok_or(format!("textFormatting[{i}] must be an object"))?;
91
92        let start = obj
93            .get("start")
94            .and_then(|v| v.as_u64())
95            .ok_or(format!("textFormatting[{i}].start must be an integer >= 0"))?
96            as usize;
97
98        let length = obj
99            .get("length")
100            .and_then(|v| v.as_u64())
101            .filter(|&v| v > 0)
102            .ok_or(format!("textFormatting[{i}].length must be an integer > 0"))?
103            as usize;
104
105        if start + length > msg_len {
106            return Err(format!("textFormatting[{i}] range exceeds message length"));
107        }
108
109        let styles = obj
110            .get("styles")
111            .and_then(|v| v.as_array())
112            .filter(|a| !a.is_empty())
113            .ok_or(format!(
114                "textFormatting[{i}].styles must be a non-empty array"
115            ))?;
116
117        for style_val in styles {
118            let s = style_val.as_str().ok_or(format!(
119                "textFormatting[{i}].styles contains non-string value"
120            ))?;
121            if TextFormattingStyle::from_str(s).is_none() {
122                return Err(format!(
123                    "textFormatting[{i}].styles contains unsupported value: {s}"
124                ));
125            }
126        }
127    }
128
129    Ok(())
130}
131
132/// Check if a JSON value represents non-empty text formatting.
133pub fn has_text_formatting(formatting: Option<&Value>) -> bool {
134    formatting
135        .and_then(|v| v.as_array())
136        .map(|a| !a.is_empty())
137        .unwrap_or(false)
138}
139
// PUA character bases for protecting regions.
//
// `parse_markdown_formatting` derives one placeholder char per protected
// region / escaped character as `base + index`. The two bases are 0x1000
// apart, so the placeholder ranges would overlap once a single message
// contained 0x1000 (4096) or more protected regions.

/// Base placeholder for protected regions (fenced code, inline code, URLs).
const PUA_PROTECT: char = '\u{E000}';
/// Base placeholder for backslash-escaped characters.
const PUA_ESCAPE: char = '\u{F000}';
143
/// Emphasis pattern definition.
///
/// Pairs one pre-compiled marker regex with the styles that a match of that
/// regex applies to its captured content.
struct EmphasisPattern {
    /// Regex whose capture group 1 is the emphasized content.
    regex: &'static LazyLock<Regex>,
    /// Styles attached to every range produced by this pattern.
    styles: Vec<TextFormattingStyle>,
}
149
// Pre-compiled regex patterns.
// Uses fancy-regex for look-around support.
// Each pattern is compiled lazily on first use; `unwrap()` only fails if the
// literal pattern itself is invalid.

/// Triple-backtick fenced block; non-greedy, may span newlines.
static RE_FENCED_CODE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"```[\s\S]*?```").unwrap());
/// Single-backtick inline code with at least one non-backtick character.
static RE_INLINE_CODE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`[^`]+`").unwrap());
/// `http://` or `https://` URL, ending at whitespace or one of `)`, `>`, `]`.
static RE_URL: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://[^\s)>\]]+").unwrap());
/// Backslash followed by one escapable punctuation character (captured in group 1).
static RE_BACKSLASH_ESCAPE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"\\([\\`*_~\{}\[\]<>()\#+\-.!|])"#).unwrap());

// Emphasis patterns (longest markers first)
// Group 1 of every pattern is the content between the markers. The underscore
// variants use `(?<!\w)` / `(?!\w)` so markers inside a word do not match.

/// `***content***`
static RE_BOLD_ITALIC_STAR: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\*\*\*(.+?)\*\*\*").unwrap());
/// `___content___`, not adjacent to word characters.
static RE_BOLD_ITALIC_UNDER: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?<!\w)___(.+?)___(?!\w)").unwrap());
/// `**content**`
static RE_BOLD_STAR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*\*(.+?)\*\*").unwrap());
/// `__content__`, not adjacent to word characters.
static RE_BOLD_UNDER: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?<!\w)__(.+?)__(?!\w)").unwrap());
/// `*content*` where neither marker is part of a longer run of `*`.
static RE_ITALIC_STAR: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)").unwrap());
/// `_content_`, not adjacent to word characters and not part of `__`.
static RE_ITALIC_UNDER: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?<!\w)_(?!_)(.+?)(?<!_)_(?!\w)").unwrap());
/// `~~content~~`
static RE_STRIKETHROUGH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"~~(.+?)~~").unwrap());
171
172/// Collect all non-overlapping matches from a fancy_regex pattern.
173fn find_all_matches(re: &Regex, text: &str) -> Vec<(usize, usize, String)> {
174    let mut results = Vec::new();
175    let mut start = 0;
176    while start < text.len() {
177        match re.find_from_pos(text, start) {
178            Ok(Some(m)) => {
179                results.push((m.start(), m.end(), m.as_str().to_string()));
180                start = m.end();
181            }
182            _ => break,
183        }
184    }
185    results
186}
187
188/// Collect all captures from a fancy_regex pattern.
189fn captures_all(re: &Regex, text: &str) -> Vec<(usize, usize, String, String)> {
190    // Returns: (full_start, full_end, full_text, group1_text)
191    let mut results = Vec::new();
192    let mut start = 0;
193    while start < text.len() {
194        match re.captures_from_pos(text, start) {
195            Ok(Some(caps)) => {
196                let full = caps.get(0).unwrap();
197                let group1 = caps
198                    .get(1)
199                    .map(|m| m.as_str().to_string())
200                    .unwrap_or_default();
201                results.push((full.start(), full.end(), full.as_str().to_string(), group1));
202                start = full.end();
203            }
204            _ => break,
205        }
206    }
207    results
208}
209
210/// Parse markdown emphasis markers from message text and return cleaned text
211/// plus formatting ranges. Returns `None` if text is unchanged.
212///
213/// Algorithm (4 phases):
214/// 1. Protect code blocks, inline code, URLs with PUA character runs
215/// 2. Process backslash escapes
216/// 3. Sequential emphasis passes (longest markers first), adjusting offsets
217/// 4. Restore protected regions and escaped characters
218pub fn parse_markdown_formatting(text: &str) -> Option<ParsedFormatting> {
219    if text.is_empty() {
220        return None;
221    }
222
223    // --- Phase 1: Protect regions that must not be parsed for emphasis. ---
224    let mut protected_regions: Vec<String> = Vec::new();
225    let mut work = text.to_string();
226
227    let protect = |work: &mut String, regions: &mut Vec<String>, re: &Regex| {
228        let matches = find_all_matches(re, work);
229        if matches.is_empty() {
230            return;
231        }
232        let mut result = String::new();
233        let mut last = 0;
234        for (mstart, mend, mtext) in &matches {
235            result.push_str(&work[last..*mstart]);
236            let idx = regions.len();
237            regions.push(mtext.clone());
238            let pua = char::from_u32(PUA_PROTECT as u32 + idx as u32).unwrap_or(PUA_PROTECT);
239            for _ in 0..mtext.len() {
240                result.push(pua);
241            }
242            last = *mend;
243        }
244        result.push_str(&work[last..]);
245        *work = result;
246    };
247
248    protect(&mut work, &mut protected_regions, &RE_FENCED_CODE);
249    protect(&mut work, &mut protected_regions, &RE_INLINE_CODE);
250    protect(&mut work, &mut protected_regions, &RE_URL);
251
252    // --- Phase 2: Backslash escapes. ---
253    let mut escaped_chars: Vec<String> = Vec::new();
254    {
255        let caps = captures_all(&RE_BACKSLASH_ESCAPE, &work);
256        if !caps.is_empty() {
257            let mut result = String::new();
258            let mut last = 0;
259            for (fstart, fend, _, group1) in &caps {
260                result.push_str(&work[last..*fstart]);
261                let idx = escaped_chars.len();
262                escaped_chars.push(group1.clone());
263                let pua = char::from_u32(PUA_ESCAPE as u32 + idx as u32).unwrap_or(PUA_ESCAPE);
264                result.push(pua);
265                last = *fend;
266            }
267            result.push_str(&work[last..]);
268            work = result;
269        }
270    }
271
272    // --- Phase 3: Sequential emphasis passes (longest markers first). ---
273    let mut formatting: Vec<TextFormattingRange> = Vec::new();
274
275    let patterns = [
276        EmphasisPattern {
277            regex: &RE_BOLD_ITALIC_STAR,
278            styles: vec![TextFormattingStyle::Bold, TextFormattingStyle::Italic],
279        },
280        EmphasisPattern {
281            regex: &RE_BOLD_ITALIC_UNDER,
282            styles: vec![TextFormattingStyle::Bold, TextFormattingStyle::Italic],
283        },
284        EmphasisPattern {
285            regex: &RE_BOLD_STAR,
286            styles: vec![TextFormattingStyle::Bold],
287        },
288        EmphasisPattern {
289            regex: &RE_BOLD_UNDER,
290            styles: vec![TextFormattingStyle::Bold],
291        },
292        EmphasisPattern {
293            regex: &RE_ITALIC_STAR,
294            styles: vec![TextFormattingStyle::Italic],
295        },
296        EmphasisPattern {
297            regex: &RE_ITALIC_UNDER,
298            styles: vec![TextFormattingStyle::Italic],
299        },
300        EmphasisPattern {
301            regex: &RE_STRIKETHROUGH,
302            styles: vec![TextFormattingStyle::Strikethrough],
303        },
304    ];
305
306    for pattern in &patterns {
307        let caps_list = captures_all(pattern.regex, &work);
308        if caps_list.is_empty() {
309            continue;
310        }
311
312        let mut pass_ranges: Vec<TextFormattingRange> = Vec::new();
313        let mut removed_positions: Vec<usize> = Vec::new();
314
315        let mut result = String::new();
316        let mut last = 0;
317
318        for (fstart, fend, full_text, content) in &caps_list {
319            result.push_str(&work[last..*fstart]);
320
321            let marker_len = (full_text.len() - content.len()) / 2;
322
323            // Track positions of removed marker chars (in this pass's coordinates)
324            for j in 0..marker_len {
325                removed_positions.push(fstart + j);
326            }
327            for j in 0..marker_len {
328                removed_positions.push(fstart + marker_len + content.len() + j);
329            }
330
331            let start = result.len();
332            result.push_str(content);
333            pass_ranges.push(TextFormattingRange {
334                start,
335                length: content.len(),
336                styles: pattern.styles.clone(),
337            });
338
339            last = *fend;
340        }
341
342        result.push_str(&work[last..]);
343        work = result;
344
345        // Adjust all previously-computed ranges for markers removed in this pass
346        if !removed_positions.is_empty() {
347            removed_positions.sort();
348            for range in &mut formatting {
349                let mut start_shift = 0usize;
350                let mut length_reduction = 0usize;
351                for &pos in &removed_positions {
352                    if pos < range.start {
353                        start_shift += 1;
354                    } else if pos < range.start + range.length {
355                        length_reduction += 1;
356                    }
357                }
358                range.start -= start_shift;
359                range.length -= length_reduction;
360            }
361        }
362
363        formatting.extend(pass_ranges);
364    }
365
366    // --- Phase 4: Restore protected regions and escaped characters. ---
367    let mut clean_text = work;
368
369    // Code/URL regions (same-length PUA runs -> original text)
370    for i in (0..protected_regions.len()).rev() {
371        let pua = char::from_u32(PUA_PROTECT as u32 + i as u32).unwrap_or(PUA_PROTECT);
372        let pua_run: String = std::iter::repeat_n(pua, protected_regions[i].len()).collect();
373        clean_text = clean_text.replace(&pua_run, &protected_regions[i]);
374    }
375
376    // Escaped chars (single PUA char -> original literal char)
377    for (i, escaped) in escaped_chars.iter().enumerate() {
378        let pua = char::from_u32(PUA_ESCAPE as u32 + i as u32).unwrap_or(PUA_ESCAPE);
379        clean_text = clean_text.replace(pua, escaped);
380    }
381
382    // Nothing changed -> return None
383    if formatting.is_empty() && clean_text == text {
384        return None;
385    }
386
387    // Drop degenerate ranges and sort by position
388    formatting.retain(|r| r.length > 0);
389    formatting.sort_by_key(|r| r.start);
390
391    Some(ParsedFormatting {
392        clean_text,
393        formatting,
394    })
395}
396
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `input` and checks that it yields `clean` plus exactly one
    /// range starting at 0 with the given `length` and `styles`.
    fn assert_single_range(
        input: &str,
        clean: &str,
        length: usize,
        styles: Vec<TextFormattingStyle>,
    ) {
        let parsed = parse_markdown_formatting(input).unwrap();
        assert_eq!(parsed.clean_text, clean);
        assert_eq!(parsed.formatting.len(), 1);
        let only = &parsed.formatting[0];
        assert_eq!(only.start, 0);
        assert_eq!(only.length, length);
        assert_eq!(only.styles, styles);
    }

    #[test]
    fn null_empty_input() {
        assert!(parse_markdown_formatting("").is_none());
    }

    #[test]
    fn plain_text_unchanged() {
        assert!(parse_markdown_formatting("hello world").is_none());
    }

    #[test]
    fn bold_stars() {
        assert_single_range("**bold**", "bold", 4, vec![TextFormattingStyle::Bold]);
    }

    #[test]
    fn italic_stars() {
        assert_single_range("*italic*", "italic", 6, vec![TextFormattingStyle::Italic]);
    }

    #[test]
    fn strikethrough() {
        assert_single_range(
            "~~struck~~",
            "struck",
            6,
            vec![TextFormattingStyle::Strikethrough],
        );
    }

    #[test]
    fn bold_italic_stars() {
        assert_single_range(
            "***both***",
            "both",
            4,
            vec![TextFormattingStyle::Bold, TextFormattingStyle::Italic],
        );
    }

    #[test]
    fn bold_underscore() {
        assert_single_range("__bold__", "bold", 4, vec![TextFormattingStyle::Bold]);
    }

    #[test]
    fn italic_underscore() {
        assert_single_range("_italic_", "italic", 6, vec![TextFormattingStyle::Italic]);
    }

    #[test]
    fn bold_italic_underscore() {
        assert_single_range(
            "___both___",
            "both",
            4,
            vec![TextFormattingStyle::Bold, TextFormattingStyle::Italic],
        );
    }

    #[test]
    fn mid_word_underscores_preserved() {
        assert!(parse_markdown_formatting("some_var_name").is_none());
    }

    #[test]
    fn urls_protected() {
        let url = "https://en.wikipedia.org/wiki/Hong_Kong_Island";
        assert!(parse_markdown_formatting(url).is_none());
    }

    #[test]
    fn code_spans_protected() {
        assert!(parse_markdown_formatting("`*not italic*`").is_none());
    }

    #[test]
    fn backslash_escapes() {
        let parsed = parse_markdown_formatting("\\*literal\\*").unwrap();
        assert_eq!(parsed.clean_text, "*literal*");
        assert!(parsed.formatting.is_empty());
    }

    #[test]
    fn mixed_formatting_correct_offsets() {
        let parsed = parse_markdown_formatting("**bold** and *italic*").unwrap();
        assert_eq!(parsed.clean_text, "bold and italic");
        assert_eq!(parsed.formatting.len(), 2);

        let bold = &parsed.formatting[0];
        assert_eq!(bold.start, 0);
        assert_eq!(bold.length, 4);
        assert_eq!(bold.styles, vec![TextFormattingStyle::Bold]);

        let italic = &parsed.formatting[1];
        assert_eq!(italic.start, 9);
        assert_eq!(italic.length, 6);
        assert_eq!(italic.styles, vec![TextFormattingStyle::Italic]);
    }

    #[test]
    fn fenced_code_blocks_protected() {
        assert!(parse_markdown_formatting("```\n**not bold**\n```").is_none());
    }

    #[test]
    fn nested_bold_italic() {
        let parsed = parse_markdown_formatting("**_bold italic_**").unwrap();
        assert_eq!(parsed.clean_text, "bold italic");
        let all_styles: Vec<_> = parsed
            .formatting
            .iter()
            .flat_map(|range| &range.styles)
            .collect();
        assert!(all_styles.contains(&&TextFormattingStyle::Bold));
        assert!(all_styles.contains(&&TextFormattingStyle::Italic));
    }

    #[test]
    fn validate_valid_formatting() {
        let ranges = json!([{"start": 0, "length": 4, "styles": ["bold"]}]);
        assert!(validate_text_formatting(&ranges, "test").is_ok());
    }

    #[test]
    fn validate_empty_message() {
        let ranges = json!([{"start": 0, "length": 1, "styles": ["bold"]}]);
        assert!(validate_text_formatting(&ranges, "").is_err());
    }

    #[test]
    fn validate_range_exceeds() {
        let ranges = json!([{"start": 2, "length": 10, "styles": ["bold"]}]);
        assert!(validate_text_formatting(&ranges, "test").is_err());
    }

    #[test]
    fn validate_invalid_style() {
        let ranges = json!([{"start": 0, "length": 1, "styles": ["comic-sans"]}]);
        assert!(validate_text_formatting(&ranges, "test").is_err());
    }

    #[test]
    fn has_formatting_works() {
        assert!(!has_text_formatting(None));
        assert!(!has_text_formatting(Some(&json!([]))));
        assert!(has_text_formatting(Some(&json!([{"start": 0}]))));
    }
}