Skip to main content

ngit/
mbox_parser.rs

1//! Parser for mbox-format git patch content.
2//!
3//! This module is a **fallback path** used only when nostr patch events are
4//! missing optional tags (`author`, `committer`, `description`,
5//! `parent-commit`). When those tags are present they always take precedence —
6//! see [`crate::git::RepoActions::apply_patch_chain`].
7//!
8//! ## Why hand-rolled rather than a library?
9//!
10//! Neither libgit2 (via the `git2` crate) nor gitoxide (`gix`) exposes a
11//! mailinfo-style parser. libgit2's email API is output-only
12//! (`git_email_create_from_commit`); there is no `git_mailinfo` equivalent. The
13//! gitoxide monorepo has no `gix-patch` crate, not even as a placeholder. No
14//! production-quality standalone Rust mbox/git-patch parser crate exists.
15//!
16//! The genuinely hard parts of RFC 2822 parsing (header folding, RFC 2047 MIME
17//! encoded-words for non-ASCII author names and subjects) are delegated to the
18//! `mailparse` crate. The git-specific overlay (mbox envelope line, `[PATCH]`
19//! prefix stripping, commit-message body extraction up to the `---` diffstat
20//! separator) is implemented here, matching the behaviour of `git am`'s
21//! `patchbreak()` function in `mailinfo.c`.
22//!
23//! ## If edge cases are reported
24//!
25//! If real-world patches produce incorrect metadata through this parser, the
26//! escape hatch is to shell out to `git mailinfo` directly:
27//! ```text
28//! git mailinfo /tmp/msg /tmp/patch < input.patch
29//! ```
30//! This prints `Author:`, `Email:`, `Subject:`, `Date:` to stdout and writes
31//! the commit body to `/tmp/msg`. Since ngit already requires `git` in PATH (it
//! is a git plugin), this is always available. It is not the primary approach
//! because it requires two temp files and a process spawn per patch — an
//! acceptable cost, but an unnecessary one given that most patches in the ngit
//! `pr/` flow carry the optional nostr tags and never reach this code.
36//!
37//! ## Known limitation: `---` in commit message body
38//!
39//! The `---` line that separates the commit message from the diffstat is
40//! ambiguous when the commit message itself contains `---` (e.g. Markdown
41//! horizontal rules). This parser stops at the first `---`-only line, matching
42//! git am's own behaviour — `git am` has the same limitation and documents it.
43//! This is not a bug we can fix without lookahead into the diff structure.
44//!
45//! ## Commit ID from mbox envelope
46//!
47//! The SHA1 in the mbox `From <sha1> <date>` envelope line is extracted but
48//! **must not be assumed correct**. libgit2 generates this ID from the commit
49//! object, but if the original commit was GPG-signed, or if the patch was
50//! generated by a different tool, the reconstructed commit (applied via
51//! `apply_to_tree` + `commit_create_buffer`) will have a different OID.
52//! The `commit` nostr tag is the authoritative source for commit identity when
53//! present.
54
55use anyhow::{Context, Result, bail};
56use chrono::DateTime;
57use mailparse::{MailHeaderMap, parse_headers};
58
/// Commit metadata recovered from a single mbox-format git patch.
///
/// Produced by [`parse_mbox_patch`]. Every field is derived from the patch
/// text alone.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PatchMetadata {
    /// Object id taken from the mbox `From <id> <date>` envelope line. It is
    /// copied verbatim and not validated.
    pub commit_id: String,
    /// Display name from the `From:` header.
    pub author_name: String,
    /// Email address from the `From:` header.
    pub author_email: String,
    /// Author time from the `Date:` header, as seconds since the Unix epoch.
    pub author_timestamp: i64,
    /// UTC offset of the `Date:` header, in minutes (positive is east of UTC).
    pub author_offset_minutes: i32,
    /// Committer time, when known. [`parse_mbox_patch`] always leaves this
    /// as `None`.
    pub committer_timestamp: Option<i64>,
    /// Subject line with `Re:` and bracketed `[PATCH ...]` prefixes removed.
    pub subject: String,
    /// Commit message body (everything after the subject, before the diff).
    pub body: String,
}
70
71pub fn parse_mbox_patch(content: &str) -> Result<PatchMetadata> {
72    let commit_id = extract_commit_id_from_mbox(content)?;
73    let (author_name, author_email) = extract_author_from_from_header(content)?;
74    let (author_timestamp, author_offset_minutes) = extract_date_from_header(content)?;
75    let committer_timestamp = None;
76    let subject = extract_subject(content)?;
77    let body = extract_commit_message_body(content)?;
78
79    Ok(PatchMetadata {
80        commit_id,
81        author_name,
82        author_email,
83        author_timestamp,
84        author_offset_minutes,
85        committer_timestamp,
86        subject,
87        body,
88    })
89}
90
91/// Extract the SHA1 from the mbox `From <sha1> <date>` envelope line.
92///
93/// **This value should not be assumed correct for the reconstructed commit.**
94/// If the original commit was GPG-signed, or the patch was generated by a
95/// different tool (e.g. `git format-patch` vs libgit2), the commit recreated
96/// by applying this patch via `commit_create_buffer` will have a different OID.
97/// Use the `commit` nostr event tag as the authoritative commit identity when
98/// present.
99fn extract_commit_id_from_mbox(content: &str) -> Result<String> {
100    if !content.starts_with("From ") {
101        bail!("patch does not start with 'From ' - not a valid mbox format");
102    }
103
104    let first_line = content.lines().next().context("patch content is empty")?;
105
106    let parts: Vec<&str> = first_line.split_whitespace().collect();
107    if parts.len() < 2 {
108        bail!("mbox 'From ' line does not contain a commit id");
109    }
110
111    Ok(parts[1].to_string())
112}
113
/// Extract the header section from the mbox content: everything after the
/// envelope (first) line, up to and including the first blank line.
///
/// Both `\n` and `\r\n` line endings are recognised. If there is no blank
/// line, everything after the envelope line is returned; if the content has
/// no newline at all, the result is empty. When the line directly after the
/// envelope is already blank, only that blank line is returned, so body text
/// is never handed to the header parser.
fn extract_header_section(content: &str) -> &str {
    // Skip the mbox envelope line ("From <sha> <date>").
    let after_envelope = content.split_once('\n').map_or("", |(_, rest)| rest);

    // Walk line by line (terminators included) and stop right after the
    // first blank line, so the header parser never sees the diff body.
    let mut header_end = 0;
    for line in after_envelope.split_inclusive('\n') {
        header_end += line.len();
        if line == "\n" || line == "\r\n" {
            break;
        }
    }
    &after_envelope[..header_end]
}
131
132fn extract_author_from_from_header(content: &str) -> Result<(String, String)> {
133    let header_bytes = extract_header_section(content).as_bytes();
134    if let Ok((headers, _)) = parse_headers(header_bytes) {
135        if let Some(from_value) = headers.get_first_value("From") {
136            return parse_from_header_value(&from_value);
137        }
138    }
139
140    // Fallback: manual search
141    let from_line = content
142        .lines()
143        .find(|line| line.starts_with("From:"))
144        .context("patch does not contain a 'From:' header")?;
145
146    let from_value = from_line
147        .strip_prefix("From:")
148        .context("failed to strip 'From:' prefix")?
149        .trim();
150
151    parse_from_header_value(from_value)
152}
153
154fn parse_from_header_value(value: &str) -> Result<(String, String)> {
155    if let Some(start) = value.find('<') {
156        if let Some(end) = value.find('>') {
157            let email = value[start + 1..end].to_string();
158            let name_part = value[..start].trim();
159            let name = name_part.trim_matches('"').trim().to_string();
160            return Ok((name, email));
161        }
162    }
163
164    if value.contains('@') {
165        let email = value.trim().to_string();
166        let name = email.split('@').next().unwrap_or("unknown").to_string();
167        return Ok((name, email));
168    }
169
170    bail!("could not parse From header: {}", value)
171}
172
173fn extract_date_from_header(content: &str) -> Result<(i64, i32)> {
174    let date_line = content
175        .lines()
176        .find(|line| line.starts_with("Date:"))
177        .context("patch does not contain a 'Date:' header")?;
178
179    let date_value = date_line
180        .strip_prefix("Date:")
181        .context("failed to strip 'Date:' prefix")?
182        .trim();
183
184    parse_rfc2822_date(date_value)
185}
186
187fn parse_rfc2822_date(value: &str) -> Result<(i64, i32)> {
188    let parsed = DateTime::parse_from_rfc2822(value)
189        .context(format!("failed to parse RFC2822 date: {}", value))?;
190
191    let timestamp = parsed.timestamp();
192    let offset_minutes = parsed.offset().local_minus_utc() / 60;
193
194    Ok((timestamp, offset_minutes))
195}
196
197fn extract_subject(content: &str) -> Result<String> {
198    // Use mailparse to handle RFC 2047 encoded-words and RFC 2822 header folding.
199    let header_bytes = extract_header_section(content).as_bytes();
200    if let Ok((headers, _)) = parse_headers(header_bytes) {
201        if let Some(subject_value) = headers.get_first_value("Subject") {
202            return Ok(cleanup_subject(&subject_value));
203        }
204    }
205
206    // Fallback: manual single-line extraction.
207    let subject_line = content
208        .lines()
209        .find(|line| line.starts_with("Subject:"))
210        .context("patch does not contain a 'Subject:' header")?;
211
212    let subject_value = subject_line
213        .strip_prefix("Subject:")
214        .context("failed to strip 'Subject:' prefix")?
215        .trim();
216
217    Ok(cleanup_subject(subject_value))
218}
219
/// Strip reply and patch-series decorations from a subject line.
///
/// Repeatedly removes, from the front of the subject:
///
/// * `Re:` in any letter case (`Re:`, `RE:`, `re:`, ...), as `git mailinfo`
///   does;
/// * a stray leading `:`;
/// * a leading bracketed tag such as `[PATCH]` or `[PATCH v2 1/3]`. A `[`
///   without a matching `]` is left untouched.
///
/// The result is always trimmed of surrounding whitespace, including when no
/// prefix was removed.
fn cleanup_subject(subject: &str) -> String {
    // Work on slices of the input and allocate once at the end.
    let mut rest = subject.trim();

    loop {
        // `get(..3)` is `None` for short strings and when byte 3 is not a
        // character boundary, so non-ASCII subjects cannot cause a panic.
        if rest
            .get(..3)
            .is_some_and(|prefix| prefix.eq_ignore_ascii_case("re:"))
        {
            rest = rest[3..].trim_start();
            continue;
        }

        if let Some(stripped) = rest.strip_prefix(':') {
            rest = stripped.trim_start();
            continue;
        }

        if rest.starts_with('[') {
            if let Some(end) = rest.find(']') {
                rest = rest[end + 1..].trim_start();
                continue;
            }
        }

        break;
    }

    rest.to_string()
}
248
249fn extract_commit_message_body(content: &str) -> Result<String> {
250    let mut in_body = false;
251    let mut body_lines: Vec<String> = Vec::new();
252    let mut found_first_content = false;
253
254    for line in content.lines() {
255        if !in_body {
256            if line.is_empty() {
257                in_body = true;
258            }
259            continue;
260        }
261
262        if line.starts_with("diff --git ")
263            || line.starts_with("Index: ")
264            || line.starts_with("--- ")
265            || line.starts_with("From ")
266        {
267            break;
268        }
269
270        if line.starts_with("---") && line.trim().eq("---") {
271            break;
272        }
273
274        // The email signature separator is exactly "-- " (dash dash space, nothing
275        // after). Lines that merely start with "-- " followed by other text are
276        // body content.
277        if line == "-- " {
278            break;
279        }
280
281        if !found_first_content && line.trim().is_empty() {
282            continue;
283        }
284
285        found_first_content = true;
286        body_lines.push(line.to_string());
287    }
288
289    while body_lines.last().is_some_and(|l| l.trim().is_empty()) {
290        body_lines.pop();
291    }
292
293    Ok(body_lines.join("\n").trim().to_string())
294}
295
296pub fn extract_description_from_patch(content: &str) -> Result<String> {
297    let subject = extract_subject(content)?;
298    let body = extract_commit_message_body(content)?;
299
300    if body.is_empty() {
301        Ok(subject)
302    } else {
303        Ok(format!("{}\n\n{}", subject, body))
304    }
305}
306
// Unit tests for the mbox patch parser. Each test feeds a literal patch
// string to one extractor and checks the exact value it returns.
#[cfg(test)]
mod tests {
    use super::*;

    /// A complete patch fixture: envelope line, `From`/`Date`/`Subject`
    /// headers, a two-paragraph commit message, diffstat, diff and a trailing
    /// signature block.
    fn sample_patch() -> String {
        "\
From 431b84edc0d2fa118d63faa3c2db9c73d630a5ae Mon Sep 17 00:00:00 2001
From: Joe Bloggs <joe.bloggs@pm.me>
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: [PATCH] add t2.md

This is the commit message body.

It can have multiple lines.

---
 t2.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 t2.md

diff --git a/t2.md b/t2.md
new file mode 100644
index 0000000..a66525d
--- /dev/null
+++ b/t2.md
@@ -0,0 +1 @@
+some content1
\\ No newline at end of file
--
libgit2 1.9.2

"
        .to_string()
    }

    // The commit id is the second token of the envelope line.
    #[test]
    fn parse_commit_id() {
        let patch = sample_patch();
        let result = extract_commit_id_from_mbox(&patch).unwrap();
        assert_eq!(result, "431b84edc0d2fa118d63faa3c2db9c73d630a5ae");
    }

    // `Name <addr>` form.
    #[test]
    fn parse_author() {
        let patch = sample_patch();
        let (name, email) = extract_author_from_from_header(&patch).unwrap();
        assert_eq!(name, "Joe Bloggs");
        assert_eq!(email, "joe.bloggs@pm.me");
    }

    // Surrounding double quotes are removed from the display name.
    #[test]
    fn parse_author_with_quoted_name() {
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: \"John (nickname) Doe\" <john.doe@example.com>
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: test

Body
";
        let (name, email) = extract_author_from_from_header(patch).unwrap();
        assert_eq!(name, "John (nickname) Doe");
        assert_eq!(email, "john.doe@example.com");
    }

    // A bare address: the part before `@` doubles as the name.
    #[test]
    fn parse_author_email_only() {
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: john.doe@example.com
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: test

Body
";
        let (name, email) = extract_author_from_from_header(patch).unwrap();
        assert_eq!(name, "john.doe");
        assert_eq!(email, "john.doe@example.com");
    }

    // The Unix epoch in UTC yields timestamp 0 and offset 0.
    #[test]
    fn parse_date() {
        let patch = sample_patch();
        let (timestamp, offset) = extract_date_from_header(&patch).unwrap();
        assert_eq!(timestamp, 0);
        assert_eq!(offset, 0);
    }

    // Midnight at +05:00 is five hours before the epoch; offset is in minutes.
    #[test]
    fn parse_date_with_timezone() {
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: Joe <joe@example.com>
Date: Thu, 1 Jan 1970 00:00:00 +0500
Subject: test

Body
";
        let (timestamp, offset) = extract_date_from_header(patch).unwrap();
        assert_eq!(timestamp, -18000);
        assert_eq!(offset, 300);
    }

    // The `[PATCH]` prefix is stripped from the subject.
    #[test]
    fn parse_subject() {
        let patch = sample_patch();
        let subject = extract_subject(&patch).unwrap();
        assert_eq!(subject, "add t2.md");
    }

    // Version and series markers inside the bracket are stripped with it.
    #[test]
    fn parse_subject_with_patch_prefix() {
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: Joe <joe@example.com>
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: [PATCH v2 3/5] fix: important bug

Body
";
        let subject = extract_subject(patch).unwrap();
        assert_eq!(subject, "fix: important bug");
    }

    // A leading `Re:` is removed before the bracketed prefix.
    #[test]
    fn parse_subject_with_re_prefix() {
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: Joe <joe@example.com>
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: Re: [PATCH] fix: important bug

Body
";
        let subject = extract_subject(patch).unwrap();
        assert_eq!(subject, "fix: important bug");
    }

    #[test]
    fn parse_subject_folded_rfc2822() {
        // RFC 2822 header folding: continuation lines start with whitespace.
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: Joe <joe@example.com>
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: [PATCH] fix: this is a very long commit message subject line
 that has been folded across two lines by RFC 2822 rules

Body
";
        let subject = extract_subject(patch).unwrap();
        assert_eq!(
            subject,
            "fix: this is a very long commit message subject line that has been folded across two lines by RFC 2822 rules"
        );
    }

    #[test]
    fn parse_subject_mime_q_encoded() {
        // RFC 2047 Q-encoding: =?UTF-8?q?...?=
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: Joe <joe@example.com>
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: [PATCH] =?UTF-8?q?fix=3A_add_=E2=9C=93_check?=

Body
";
        let subject = extract_subject(patch).unwrap();
        // Q-decoded: "fix: add ✓ check"
        assert_eq!(subject, "fix: add \u{2713} check");
    }

    #[test]
    fn parse_subject_mime_b_encoded() {
        // RFC 2047 B-encoding: =?UTF-8?b?...?= (base64)
        // "fix: résumé" base64 encoded
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: Joe <joe@example.com>
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: [PATCH] =?UTF-8?b?Zml4OiByw6lzdW3DqQ==?=

Body
";
        let subject = extract_subject(patch).unwrap();
        // B-decoded: "fix: résumé"
        assert_eq!(subject, "fix: r\u{e9}sum\u{e9}");
    }

    // The body ends at the `---` diffstat separator; blank lines inside the
    // message are kept, those at the end are dropped.
    #[test]
    fn parse_body() {
        let patch = sample_patch();
        let body = extract_commit_message_body(&patch).unwrap();
        assert_eq!(
            body,
            "This is the commit message body.\n\nIt can have multiple lines."
        );
    }

    // `---` directly after the headers means there is no body.
    #[test]
    fn parse_body_empty() {
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: Joe <joe@example.com>
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: test

---
 file.txt | 1 +
diff --git a/file.txt b/file.txt
";
        let body = extract_commit_message_body(patch).unwrap();
        assert_eq!(body, "");
    }

    #[test]
    fn parse_body_stops_at_exact_email_sig_separator() {
        // "-- " (dash dash space, nothing after) is the email sig separator.
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: Joe <joe@example.com>
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: [PATCH] test

This is the body.
-- 
libgit2 1.9.2

diff --git a/file.txt b/file.txt
";
        let body = extract_commit_message_body(patch).unwrap();
        assert_eq!(body, "This is the body.");
    }

    #[test]
    fn parse_body_does_not_stop_at_double_dash_with_text() {
        // "-- some text" must NOT be treated as an email sig separator.
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: Joe <joe@example.com>
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: [PATCH] test

This is the body.
-- some CLI flag description
More body text.

---
diff --git a/file.txt b/file.txt
";
        let body = extract_commit_message_body(patch).unwrap();
        assert_eq!(
            body,
            "This is the body.\n-- some CLI flag description\nMore body text."
        );
    }

    // End-to-end check of every `PatchMetadata` field for the sample patch.
    #[test]
    fn parse_full_metadata() {
        let patch = sample_patch();
        let metadata = parse_mbox_patch(&patch).unwrap();

        assert_eq!(
            metadata.commit_id,
            "431b84edc0d2fa118d63faa3c2db9c73d630a5ae"
        );
        assert_eq!(metadata.author_name, "Joe Bloggs");
        assert_eq!(metadata.author_email, "joe.bloggs@pm.me");
        assert_eq!(metadata.author_timestamp, 0);
        assert_eq!(metadata.author_offset_minutes, 0);
        assert_eq!(metadata.committer_timestamp, None);
        assert_eq!(metadata.subject, "add t2.md");
        assert_eq!(
            metadata.body,
            "This is the commit message body.\n\nIt can have multiple lines."
        );
    }

    // Subject and body are joined with a blank line between them.
    #[test]
    fn extract_description_combines_subject_and_body() {
        let patch = sample_patch();
        let description = extract_description_from_patch(&patch).unwrap();
        assert_eq!(
            description,
            "add t2.md\n\nThis is the commit message body.\n\nIt can have multiple lines."
        );
    }

    // With an empty body the description is just the subject.
    #[test]
    fn extract_description_subject_only() {
        let patch = "\
From abc123 Mon Sep 17 00:00:00 2001
From: Joe <joe@example.com>
Date: Thu, 1 Jan 1970 00:00:00 +0000
Subject: [PATCH] simple fix

---
 file.txt | 1 +
";
        let description = extract_description_from_patch(patch).unwrap();
        assert_eq!(description, "simple fix");
    }

    // Direct checks of each prefix form `cleanup_subject` removes.
    #[test]
    fn cleanup_subject_strips_patch_prefixes() {
        assert_eq!(cleanup_subject("[PATCH] test"), "test");
        assert_eq!(cleanup_subject("[PATCH v2] test"), "test");
        assert_eq!(cleanup_subject("[PATCH 1/3] test"), "test");
        assert_eq!(cleanup_subject("[PATCH v2 1/3] test"), "test");
        assert_eq!(cleanup_subject("Re: [PATCH] test"), "test");
        assert_eq!(cleanup_subject("re: test"), "test");
        assert_eq!(cleanup_subject(":test"), "test");
    }
}