pter 0.1.0

Plain Text Email Renderer — convert HTML email bodies into readable markdown
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
use scraper::node::Node;
use scraper::ElementRef;

/// Decide whether `el` is a container that opens a quoted reply.
///
/// Every piece of client-specific reply knowledge is funnelled through this
/// single predicate, so the converter never needs per-client branches of its
/// own. An element that matches wraps quoted content from an earlier message
/// in the thread; the converter treats it like a `<blockquote>` and prefixes
/// its children with `>`.
///
/// `<blockquote>` itself is not tested here — the element classifier already
/// handles it. This predicate only catches the non-blockquote wrappers.
///
/// Returns `true` as soon as any of the checks below matches; they are
/// evaluated in order and short-circuit.
pub fn is_reply_boundary(el: ElementRef) -> bool {
    let node = el.value();

    // Structural marker: type="cite" (Apple Mail, some webmail).
    node.attr("type") == Some("cite")
        // Per-client class names.
        || node.attr("class").is_some_and(is_reply_class)
        // Per-client element ids.
        || node.attr("id").is_some_and(is_reply_id)
        // Heuristic: a <div> whose leading text is an attribution line
        // ("On ... wrote:") that is followed by a blockquote.
        || (node.name() == "div" && has_attribution_then_quote(el))
}

/// Look for an attribution line at the start of, or immediately before, `el`.
///
/// Two places are searched, in this order:
///
/// 1. The text nodes among `el`'s children that come before its first child
///    element. The first one that matches an attribution pattern wins.
/// 2. The text obtained from `el`'s previous sibling (see
///    `previous_sibling_text`).
///
/// Returns the trimmed attribution text (e.g. "On Mon, Jan 5, Alice wrote:")
/// so the caller can render it above the quoted block, or `None` when neither
/// location holds a recognised attribution.
pub fn find_attribution(el: ElementRef) -> Option<String> {
    // Attribution is leading text, so scanning stops at the first child element.
    let leading = el
        .children()
        .map(|child| child.value())
        .take_while(|node| !matches!(node, Node::Element(_)))
        .find_map(|node| match node {
            Node::Text(text) => {
                let candidate = text.text.trim();
                is_attribution_text(candidate).then(|| candidate.to_string())
            }
            _ => None,
        });

    // Fall back to whatever text sits just before the element.
    leading.or_else(|| {
        previous_sibling_text(el)
            .map(|raw| raw.trim().to_string())
            .filter(|candidate| is_attribution_text(candidate))
    })
}

/// Check if text matches common email attribution patterns.
///
/// These patterns are cross-client — every email client generates some
/// variant of "On [date], [person] wrote:" or "--- Forwarded message ---".
///
/// Recognised forms (after trimming surrounding whitespace):
///
/// * English: starts with `"On "` and ends with `"wrote:"`.
/// * French: starts with `"Le "` and ends with `"…crit"` + `:`
///   (e.g. "Le … a écrit :").
/// * Spanish: starts with `"El "` and ends with `"escribió"` + `:`
///   (e.g. "El … escribió:"). The older `"El … crit:"` form is still
///   accepted for backward compatibility.
/// * German: starts with `"Am "` and either ends with `"schrieb"` + `:` or
///   contains `" schrieb "` before the final colon
///   (e.g. "Am … schrieb Alice:").
/// * Forward separators: contains "Forwarded message",
///   "Begin forwarded message" or "Original Message" anywhere.
///
/// For the localized forms, any whitespace between the last word and the
/// closing colon is ignored, including the no-break spaces that French
/// typography places before a colon.
///
/// Returns `true` when `text` matches any of the forms above.
fn is_attribution_text(text: &str) -> bool {
    let t = text.trim();

    // "On ... wrote:" (Gmail, Apple Mail, Thunderbird, most clients)
    if t.starts_with("On ") && t.ends_with("wrote:") {
        return true;
    }

    // Localized variants all end in ':'; drop it together with any
    // (possibly non-breaking) whitespace that precedes it so that
    // "écrit:", "écrit :" and "écrit\u{a0}:" are treated alike.
    if let Some(body) = t.strip_suffix(':').map(str::trim_end) {
        // French "Le ... a écrit :" — "El " kept here for backward compatibility.
        if (body.starts_with("Le ") || body.starts_with("El ")) && body.ends_with("crit") {
            return true;
        }
        // Spanish "El ... escribió:"
        if body.starts_with("El ") && body.ends_with("escribió") {
            return true;
        }
        // German "Am ... schrieb:" and "Am ... schrieb Alice <a@b>:" — the
        // sender name usually follows the verb.
        if body.starts_with("Am ") && (body.ends_with("schrieb") || body.contains(" schrieb ")) {
            return true;
        }
    }

    // Forwarded message separators
    ["Forwarded message", "Begin forwarded message", "Original Message"]
        .iter()
        .any(|marker| t.contains(marker))
}

/// Report whether a `class` attribute value names a known reply wrapper.
///
/// The attribute is split on whitespace and every individual class name is
/// compared (exactly, case-sensitively) against the table below. Supporting a
/// new client is a one-line addition to that table.
fn is_reply_class(class: &str) -> bool {
    const REPLY_CLASSES: [&str; 8] = [
        "gmail_quote",
        "gmail_extra",
        "yahoo_quoted",
        "protonmail_quote",
        "tutanota_quote",
        "moz-cite-prefix", // Thunderbird
        "zmail_extra",     // Zoho
        "WordSection1",    // Outlook (sometimes wraps replies)
    ];

    class
        .split_whitespace()
        .any(|token| REPLY_CLASSES.contains(&token))
}

/// Report whether an `id` attribute value names a known reply wrapper.
///
/// The whole attribute value must equal one of the table entries exactly
/// (case-sensitive, no trimming).
fn is_reply_id(id: &str) -> bool {
    const REPLY_IDS: [&str; 3] = [
        "divRplyFwdMsg",        // Outlook
        "reply-message",        // Generic
        "OLK_SRC_BODY_SECTION", // Outlook Mac
    ];

    REPLY_IDS.iter().any(|&known| known == id)
}

/// Detect the unlabelled "attribution, then quote" layout inside `el`.
///
/// Targets markup that carries no identifying class or id but is shaped like
/// `<div>On ... wrote:<br><blockquote>...</blockquote></div>`.
///
/// The direct children are walked in order:
///
/// * a text node matching an attribution pattern arms the detector;
/// * once armed, the first `<blockquote>` child yields `true`;
/// * before it is armed, any child element other than `<br>` yields `false`;
/// * every other node is skipped.
///
/// Returns `false` if the children run out without a blockquote following an
/// attribution.
fn has_attribution_then_quote(el: ElementRef) -> bool {
    let mut armed = false;

    for node in el.children().map(|child| child.value()) {
        match node {
            Node::Text(text) if is_attribution_text(text.text.trim()) => armed = true,
            Node::Element(e) => match (armed, e.name()) {
                // Attribution already seen and the quote follows it.
                (true, "blockquote") => return true,
                // Real content ahead of any attribution: not this layout.
                (false, tag) if tag != "br" => return false,
                // <br> anywhere, or other elements after the attribution.
                _ => {}
            },
            _ => {}
        }
    }

    false
}

/// Get the text carried by the nearest meaningful previous sibling of `el`.
///
/// Whitespace-only text nodes directly before `el` are skipped first. Formatted
/// HTML puts a newline/indentation text node between adjacent tags, and
/// without this step that node would be taken as "the previous sibling" and
/// hide the element that actually holds the attribution.
///
/// After skipping, the sibling is handled as follows:
///
/// * a text node → its raw (untrimmed) text;
/// * a `<span>`, `<font>`, `<b>`, `<i>`, `<div>` or `<p>` element → the
///   concatenation of all its descendant text, provided that text is not
///   blank;
/// * anything else (other elements, comments, …) → `None`.
///
/// Returns `None` as well when there is no previous sibling besides
/// whitespace-only text.
fn previous_sibling_text(el: ElementRef) -> Option<String> {
    let mut prev = el.prev_sibling()?;

    // Step back over inter-tag whitespace.
    while let Node::Text(text) = prev.value() {
        if !text.text.trim().is_empty() {
            break;
        }
        prev = prev.prev_sibling()?;
    }

    match prev.value() {
        Node::Text(text) => Some(text.text.to_string()),
        // Wrapper elements that commonly hold the attribution line.
        Node::Element(e) if matches!(e.name(), "span" | "font" | "b" | "i" | "div" | "p") => {
            let text: String = ElementRef::wrap(prev)?.text().collect();
            (!text.trim().is_empty()).then_some(text)
        }
        _ => None,
    }
}

/// Detect an Outlook-style reply header block.
///
/// Outlook introduces replied/forwarded content with a block of the form
/// "From: ... Sent: ... To: ... Subject: ...". Only `<div>` and `<p>`
/// elements are considered. All descendant text of `el` is concatenated and
/// searched for the header labels.
///
/// Returns `true` when that text contains `From:` together with at least one
/// of `Sent:`, `Date:` or `Subject:`; `false` otherwise, including for every
/// element that is not a `<div>` or `<p>`.
pub fn is_outlook_separator(el: ElementRef) -> bool {
    if !matches!(el.value().name(), "div" | "p") {
        return false;
    }

    let body = el.text().collect::<String>();
    let mentions = |label: &str| body.contains(label);

    // "From:" is mandatory; any one of the other labels confirms the header.
    mentions("From:") && (mentions("Sent:") || mentions("Date:") || mentions("Subject:"))
}

#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parse `html` as a full document and compile the CSS `selector`.
    ///
    /// Both values are returned so the caller can run the selection while the
    /// document is still alive. Panics if `selector` is not valid.
    fn parse_and_select(html: &str, selector: &str) -> (Html, Selector) {
        let doc = Html::parse_document(html);
        let sel = Selector::parse(selector).unwrap();
        (doc, sel)
    }

    // -- Attribution detection --

    #[test]
    fn attribution_on_wrote() {
        assert!(is_attribution_text("On Mon, Jan 5, 2026 at 3:00 PM Alice <alice@example.com> wrote:"));
    }

    #[test]
    fn attribution_forwarded() {
        assert!(is_attribution_text("---------- Forwarded message ----------"));
    }

    #[test]
    fn attribution_original_message() {
        assert!(is_attribution_text("-----Original Message-----"));
    }

    #[test]
    fn attribution_begin_forwarded() {
        assert!(is_attribution_text("Begin forwarded message:"));
    }

    #[test]
    fn not_attribution() {
        assert!(!is_attribution_text("Hello, how are you?"));
        // Starts with "On " but does not end with "wrote:".
        assert!(!is_attribution_text("On the other hand, this is fine."));
    }

    // -- Reply class detection --

    #[test]
    fn gmail_quote_class() {
        assert!(is_reply_class("gmail_quote"));
    }

    #[test]
    fn multiple_classes_with_reply() {
        // The reply class is found among other whitespace-separated classes.
        assert!(is_reply_class("some-class gmail_quote another"));
    }

    #[test]
    fn non_reply_class() {
        assert!(!is_reply_class("regular-div content-wrapper"));
    }

    // -- Reply boundary detection --

    #[test]
    fn type_cite_is_boundary() {
        let html = r#"<div type="cite"><p>quoted</p></div>"#;
        let (doc, sel) = parse_and_select(html, r#"div[type="cite"]"#);
        let el = doc.select(&sel).next().unwrap();
        assert!(is_reply_boundary(el));
    }

    #[test]
    fn gmail_quote_is_boundary() {
        let html = r#"<div class="gmail_quote"><p>quoted</p></div>"#;
        let (doc, sel) = parse_and_select(html, "div.gmail_quote");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_reply_boundary(el));
    }

    #[test]
    fn outlook_id_is_boundary() {
        let html = r#"<div id="divRplyFwdMsg"><p>quoted</p></div>"#;
        let (doc, sel) = parse_and_select(html, "#divRplyFwdMsg");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_reply_boundary(el));
    }

    #[test]
    fn plain_div_not_boundary() {
        let html = r#"<div class="content"><p>not quoted</p></div>"#;
        let (doc, sel) = parse_and_select(html, "div.content");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_reply_boundary(el));
    }

    // -- Outlook separator --

    #[test]
    fn outlook_from_sent_subject() {
        let html = "<div>From: Alice\nSent: Monday\nTo: Bob\nSubject: Hello</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_outlook_separator(el));
    }

    #[test]
    fn regular_div_not_separator() {
        let html = "<div>Just a normal paragraph.</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_outlook_separator(el));
    }

    // -- Boundary tests for `is_attribution_text`: each arm needs both sides --

    #[test]
    fn attribution_on_without_wrote_is_false() {
        // "On ..." without "wrote:" — catches mutating the `&&` in the
        // English check to `||`.
        assert!(!is_attribution_text("On the bright side, this is fine."));
    }

    #[test]
    fn attribution_wrote_without_on_is_false() {
        // "... wrote:" without leading "On " — catches mutating the `&&` in
        // the English check to `||`.
        assert!(!is_attribution_text("Alice wrote:"));
    }

    #[test]
    fn attribution_french_le_with_colon_space() {
        assert!(is_attribution_text("Le lundi 5 janvier 2026, Alice a écrit :"));
    }

    #[test]
    fn attribution_french_le_no_space_before_colon() {
        // "écrit:" without a space — only the second accepted ending matches,
        // so this catches mutating the `||` between the two endings to `&&`.
        assert!(is_attribution_text("Le lundi, Alice a écrit:"));
    }

    // NOTE(review): the two "El ..." inputs below end in "a escrit", which
    // exercises the "El " prefix with the "crit" ending; this does not look
    // like the attribution a Spanish-locale client emits — confirm the
    // intended language/fixture.
    #[test]
    fn attribution_spanish_el_with_colon_space() {
        assert!(is_attribution_text("El lunes 5 de enero, Alice a escrit :"));
    }

    #[test]
    fn attribution_spanish_el_no_space_before_colon() {
        assert!(is_attribution_text("El lunes, Alice a escrit:"));
    }

    #[test]
    fn attribution_french_le_without_wrote_ending_is_false() {
        // "Le X" without an "écrit" ending — catches mutating the `&&`
        // between the prefix check and the ending check to `||`.
        assert!(!is_attribution_text("Le lundi, Alice est ici."));
    }

    #[test]
    fn attribution_starts_with_le_but_not_french_pattern() {
        // Starts with "Le " but isn't the French attribution form.
        assert!(!is_attribution_text("Le sigh."));
    }

    #[test]
    fn attribution_german_am_with_colon() {
        assert!(is_attribution_text("Am Montag, 5. Januar 2026, schrieb:"));
    }

    #[test]
    fn attribution_german_am_with_space_colon() {
        assert!(is_attribution_text("Am Montag schrieb :"));
    }

    #[test]
    fn attribution_german_am_without_schrieb_is_false() {
        // "Am X" without "schrieb" — catches mutating the `&&` in the German
        // check to `||`.
        assert!(!is_attribution_text("Am very fine, thanks."));
    }

    #[test]
    fn attribution_german_schrieb_without_am_is_false() {
        // "schrieb:" without leading "Am " — catches mutating the `&&` in the
        // German check to `||`.
        assert!(!is_attribution_text("Bob schrieb:"));
    }

    #[test]
    fn attribution_begin_forwarded_only() {
        // Only "Begin forwarded message" present — catches the || chain mutating to &&
        assert!(is_attribution_text("Begin forwarded message"));
    }

    #[test]
    fn attribution_original_message_only() {
        // Only "Original Message" present — catches the || chain mutating to &&
        assert!(is_attribution_text("-----Original Message-----"));
    }

    // -- Boundary tests for `is_reply_id` --

    #[test]
    fn reply_id_reply_message() {
        assert!(is_reply_id("reply-message"));
    }

    #[test]
    fn reply_id_olk_src_body_section() {
        assert!(is_reply_id("OLK_SRC_BODY_SECTION"));
    }

    #[test]
    fn reply_id_unknown_is_false() {
        // Catches `replace is_reply_id -> bool with true` mutant
        assert!(!is_reply_id("main-content"));
        assert!(!is_reply_id(""));
        // A prefix of a known id must not match.
        assert!(!is_reply_id("reply"));
    }

    // -- Boundary tests for `find_attribution` --

    #[test]
    fn find_attribution_in_leading_text() {
        let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        let attr = find_attribution(el);
        assert!(attr.is_some());
        assert!(attr.unwrap().contains("wrote:"));
    }

    #[test]
    fn find_attribution_none_when_no_match() {
        let html = r#"<div>Just regular text here, nothing fancy.</div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(find_attribution(el).is_none());
    }

    #[test]
    fn find_attribution_stops_at_first_element_child() {
        // Element-then-text: the first child is an element, so the leading-text
        // scan must `break` before it ever inspects the text that follows.
        // Catches "delete match arm Node::Element(_)".
        let html = r#"<div><span>hi</span>On Mon, Alice wrote:</div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        // Leading content is an element, not text — and the later text falls outside
        // the leading-text scan. So no attribution should be found from leading text.
        // Also, no preceding sibling. → None.
        assert!(find_attribution(el).is_none());
    }

    #[test]
    fn find_attribution_in_preceding_sibling() {
        // The attribution lives in the <p> immediately before the selected div.
        let html = r#"<div><p>On Mon, Alice wrote:</p><div class="quote">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.quote");
        let el = doc.select(&sel).next().unwrap();
        let attr = find_attribution(el);
        assert!(attr.is_some(), "expected attribution from preceding <p>");
    }

    // -- Boundary tests for `has_attribution_then_quote` --
    // These exercise the function via `is_reply_boundary` since it's private.

    #[test]
    fn boundary_div_with_attribution_then_blockquote() {
        let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_reply_boundary(el));
    }

    #[test]
    fn boundary_div_blockquote_without_attribution_is_false() {
        // A bare blockquote-wrapping div without attribution text is not a boundary.
        // Catches "replace has_attribution_then_quote -> bool with true". The
        // opposite mutant ("with false") is caught by the positive case above.
        let html = r#"<div><blockquote>quoted</blockquote></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_reply_boundary(el));
    }

    #[test]
    fn boundary_div_attribution_no_blockquote_is_false() {
        // Attribution text but no blockquote → not a boundary.
        // Catches mutating the `== "blockquote"` comparison to `!=` (which
        // would treat any other element as the quote).
        let html = r#"<div>On Mon, Alice wrote:<p>not a quote</p></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_reply_boundary(el));
    }

    #[test]
    fn boundary_div_attribution_br_blockquote() {
        // Attribution → <br> → blockquote. The <br> between them must not
        // stop the scan; the blockquote after it still makes this a boundary.
        let html = r#"<div>On Mon, Alice wrote:<br><blockquote>quoted</blockquote></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_reply_boundary(el));
    }

    #[test]
    fn boundary_div_non_br_element_before_attribution_is_false() {
        // Non-br element BEFORE finding attribution → early return false.
        // Catches deleting the `!` in the `!found_attribution` check.
        let html = r#"<div><p>preface</p>On Mon, Alice wrote:<blockquote>q</blockquote></div>"#;
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_reply_boundary(el));
    }

    // -- Boundary tests for `previous_sibling_text` --
    // Exercised via find_attribution since the function is private.

    #[test]
    fn prev_sibling_text_node() {
        // Raw Text node as preceding sibling. Inside a parent <div>, a leading
        // text run followed by a child <div class="q"> means the inner div's
        // `prev_sibling()` is a `Node::Text`. Catches `delete match arm Node::Text(text)`.
        let html = r#"<div>On Mon, Alice wrote:<div class="q">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.q");
        let el = doc.select(&sel).next().unwrap();
        assert!(find_attribution(el).is_some());
    }

    #[test]
    fn prev_sibling_inline_span_with_attribution() {
        let html = r#"<div><span>On Mon, Alice wrote:</span><div class="q">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.q");
        let el = doc.select(&sel).next().unwrap();
        assert!(find_attribution(el).is_some());
    }

    #[test]
    fn prev_sibling_inline_font_with_attribution() {
        // <font> is also accepted; covers a different alternative in the matches!.
        let html = r#"<div><font>On Mon, Alice wrote:</font><div class="q">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.q");
        let el = doc.select(&sel).next().unwrap();
        assert!(find_attribution(el).is_some());
    }

    #[test]
    fn prev_sibling_non_inline_element_returns_none() {
        // <table> is not in the accepted element list → preceding-sibling lookup fails.
        let html = r#"<div><table><tr><td>On Mon, Alice wrote:</td></tr></table><div class="q">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.q");
        let el = doc.select(&sel).next().unwrap();
        assert!(find_attribution(el).is_none());
    }

    #[test]
    fn prev_sibling_empty_inline_returns_none() {
        let html = r#"<div><span>   </span><div class="q">body</div></div>"#;
        let (doc, sel) = parse_and_select(html, "div.q");
        let el = doc.select(&sel).next().unwrap();
        // Whitespace-only preceding span → no attribution match.
        assert!(find_attribution(el).is_none());
    }

    // -- Boundary tests for `is_outlook_separator` --

    #[test]
    fn outlook_from_date_subject_is_separator() {
        // A header block that uses `Date:` in place of `Sent:` (with
        // `Subject:` also present) is still recognised as a separator.
        let html = "<div>From: Alice\nDate: Monday\nSubject: Hello</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_outlook_separator(el));
    }

    #[test]
    fn outlook_from_sent_no_subject_is_separator() {
        // From + Sent, no Subject → catches mutating `has_sent || has_subject` to `&&`
        let html = "<div>From: Alice\nSent: Monday</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_outlook_separator(el));
    }

    #[test]
    fn outlook_from_subject_no_sent_is_separator() {
        // From + Subject, no Sent/Date → catches mutating `has_sent || has_subject` to `&&`
        let html = "<div>From: Alice\nSubject: Hello</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(is_outlook_separator(el));
    }

    #[test]
    fn outlook_from_only_is_not_separator() {
        // From alone (no Sent/Date/Subject) → must be false.
        // Catches mutating the `&&` after `has_from` to `||`.
        let html = "<div>From: Alice</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_outlook_separator(el));
    }

    #[test]
    fn outlook_sent_subject_no_from_is_not_separator() {
        // No From → must be false regardless of Sent/Subject presence.
        let html = "<div>Sent: Monday\nSubject: Hello</div>";
        let (doc, sel) = parse_and_select(html, "div");
        let el = doc.select(&sel).next().unwrap();
        assert!(!is_outlook_separator(el));
    }
}