Skip to main content

libverify_core/
linkage.rs

1use serde::{Deserialize, Serialize};
2
/// The kind of issue reference found in a change request body.
///
/// Variant names are serialized in kebab-case (e.g. `numeric-issue`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum IssueRefKind {
    /// Numeric issue: `#123` (GitHub, GitLab, Bitbucket, Gitea)
    NumericIssue,
    /// Project ticket in KEY-123 format (Jira, Linear, Shortcut, etc.).
    ProjectTicket,
    /// URL containing a known issue-tracker pattern. Matches of
    /// caller-supplied custom patterns are reported with this kind as well.
    Url,
}
13
// Backward-compatible alias.
// The lint is silenced because the constant deliberately uses a
// variant-style name so that `IssueRefKind::JiraTicket` keeps resolving.
#[allow(non_upper_case_globals)]
impl IssueRefKind {
    /// Alias for [`IssueRefKind::ProjectTicket`].
    pub const JiraTicket: Self = Self::ProjectTicket;
}
19
/// A single issue reference extracted from change request text.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct IssueReference {
    /// Which recognizer produced this reference.
    pub kind: IssueRefKind,
    /// The reference text: `#123` or `<keyword> #123` for numeric issues,
    /// the ticket key (e.g. `PROJ-123`) for project tickets, and the URL or
    /// the matched custom pattern for [`IssueRefKind::Url`].
    pub value: String,
}
26
/// Closing keyword prefixes (GitHub, GitLab, Bitbucket) — case-insensitive matching handled by caller.
///
/// Entries are lowercase; a keyword only counts when it is followed by
/// optional spaces and `#<digits>`, so a shorter entry such as `fix` does
/// not shadow `fixes` or `fixed`.
const CLOSING_KEYWORDS: &[&str] = &[
    "fixes", "fix", "fixed", "closes", "close", "closed", "resolves", "resolve", "resolved",
];
31
/// Common acronym prefixes that should NOT be treated as project ticket keys.
///
/// Compared by exact equality against the uppercase prefix before the `-`,
/// so `UTF-8` or `HTTP-500` is not reported as a ticket.
const TICKET_BLOCKLIST: &[&str] = &[
    "UTF", "HTTP", "RFC", "CVE", "ISO", "SHA", "SSL", "TLS", "TCP", "UDP", "DNS", "SSH", "API",
    "URL", "URI", "XML", "JSON", "YAML", "TOML", "HTML", "CSS", "ANSI", "ASCII", "IEEE", "IETF",
    "SMTP", "IMAP", "LDAP", "SAML", "CORS", "CSRF", "ECDSA", "HMAC",
];
38
/// Known issue-tracker URL fragments that indicate issue linkage.
///
/// Each entry is a plain substring (a path segment or a host name followed
/// by `/`); a URL qualifies when it contains at least one of them.
const TRACKER_URL_PATTERNS: &[&str] = &[
    "/issues/",          // GitHub, GitLab
    "/browse/",          // Jira
    "linear.app/",       // Linear
    "app.shortcut.com/", // Shortcut
    "notion.so/",        // Notion
];
47
48/// Extract issue references from a change request body.
49///
50/// Recognized patterns:
51/// - Numeric issue: `#123`, `fixes #456`, `closes #789`, `resolves #012` (GitHub, GitLab, Bitbucket, Gitea)
52/// - Project ticket: `PROJ-123`, `ENG-456` (Jira, Linear, etc.)
53/// - Shortcut: `sc-12345` (case-insensitive two-letter prefix)
54/// - URL: URLs containing known tracker patterns (GitHub, Jira, Linear, Notion, Shortcut)
55/// - Custom patterns provided by the caller
56pub fn extract_issue_references(body: &str, custom_patterns: &[&str]) -> Vec<IssueReference> {
57    let mut refs = Vec::new();
58
59    // Extract URL references first (before other parsing mutates state)
60    extract_urls(body, &mut refs);
61
62    // Extract numeric issue references (#N and keyword #N)
63    extract_numeric_issues(body, &mut refs);
64
65    // Extract project ticket references (PROJ-123, ENG-456, sc-12345)
66    extract_project_tickets(body, &mut refs);
67
68    // Extract custom pattern matches
69    for pattern in custom_patterns {
70        extract_custom(body, pattern, &mut refs);
71    }
72
73    // Deduplicate by value
74    refs.dedup_by(|a, b| a.value == b.value);
75    refs
76}
77
78/// Returns true if the slice contains at least one issue reference.
79pub fn has_issue_linkage(refs: &[IssueReference]) -> bool {
80    !refs.is_empty()
81}
82
83/// Extract numeric issue references: bare `#123` and keyword-prefixed `fixes #123`.
84///
85/// All indexing operates on `Vec<char>` to avoid byte/char index confusion
86/// with non-ASCII input. The keyword text for the output is reconstructed
87/// from `body_chars` (original casing) rather than slicing `body` by byte.
88fn extract_numeric_issues(body: &str, refs: &mut Vec<IssueReference>) {
89    let lower = body.to_lowercase();
90    let chars: Vec<char> = lower.chars().collect();
91    let body_chars: Vec<char> = body.chars().collect();
92
93    let mut i = 0;
94    while i < chars.len() {
95        // Check for keyword + optional whitespace + #N
96        let mut matched_keyword = false;
97        for keyword in CLOSING_KEYWORDS {
98            let kw_chars: Vec<char> = keyword.chars().collect();
99            if i + kw_chars.len() < chars.len() && chars[i..i + kw_chars.len()] == kw_chars[..] {
100                let after_kw = i + kw_chars.len();
101                // Must be preceded by word boundary (start of string or non-alphanumeric)
102                if i > 0 && chars[i - 1].is_alphanumeric() {
103                    continue;
104                }
105                // Skip optional whitespace
106                let mut j = after_kw;
107                while j < chars.len() && chars[j] == ' ' {
108                    j += 1;
109                }
110                if j < chars.len()
111                    && chars[j] == '#'
112                    && let Some((num_str, end)) = parse_digits(&body_chars, j + 1)
113                {
114                    // Reconstruct keyword from original chars (preserves casing)
115                    let kw_original: String = body_chars[i..i + kw_chars.len()].iter().collect();
116                    let full = format!("{kw_original} #{num_str}");
117                    refs.push(IssueReference {
118                        kind: IssueRefKind::NumericIssue,
119                        value: full,
120                    });
121                    i = end;
122                    matched_keyword = true;
123                    break;
124                }
125            }
126        }
127
128        if matched_keyword {
129            continue;
130        }
131
132        // Bare #N (not preceded by alphanumeric or &)
133        if chars[i] == '#' {
134            let preceded_ok = i == 0 || (!chars[i - 1].is_alphanumeric() && chars[i - 1] != '&');
135            if preceded_ok && let Some((num_str, end)) = parse_digits(&body_chars, i + 1) {
136                refs.push(IssueReference {
137                    kind: IssueRefKind::NumericIssue,
138                    value: format!("#{num_str}"),
139                });
140                i = end;
141                continue;
142            }
143        }
144
145        i += 1;
146    }
147}
148
/// Read the run of ASCII digits that begins at index `start` of `chars`.
///
/// On success returns the digits as a `String` together with the index of
/// the first character after the run. Returns `None` when no digit is
/// present at `start` (including when `start` is at or past the end), or
/// when the run is directly followed by an alphanumeric character, `_`, or
/// `-` — so `#123abc` is rejected while `#123`, `#123 `, `#123.` and
/// `#123!` are accepted.
fn parse_digits(chars: &[char], start: usize) -> Option<(String, usize)> {
    let tail = chars.get(start..)?;
    let run = tail.iter().take_while(|c| c.is_ascii_digit()).count();
    if run == 0 {
        return None;
    }
    match tail.get(run) {
        // Digits glued to a word-like character are not an issue number.
        Some(&next) if next.is_alphanumeric() || next == '_' || next == '-' => None,
        _ => Some((tail[..run].iter().collect(), start + run)),
    }
}
173
/// Known lowercase ticket prefixes (e.g. Shortcut `sc-12345`).
///
/// Entries are written in lowercase; the body text is ASCII-lowercased
/// before comparison, so `SC-1` and `Sc-1` match the `sc` entry too.
const LOWERCASE_TICKET_PREFIXES: &[&str] = &["sc"];
176
/// Extract project ticket references.
///
/// Matches two patterns:
/// 1. Uppercase: `[A-Z]{2,}-\d+` — Jira (`PROJ-123`), Linear (`ENG-456`), etc.
/// 2. Known lowercase: `sc-\d+` — Shortcut (prefix compared case-insensitively)
///
/// The uppercase pass rejects prefixes in [`TICKET_BLOCKLIST`] (common
/// acronyms like UTF, HTTP, etc.). Both passes append to `refs` and skip
/// tickets that `refs` already covers.
fn extract_project_tickets(body: &str, refs: &mut Vec<IssueReference>) {
    // Pass 1: uppercase prefixes (Jira, Linear, etc.)
    extract_uppercase_tickets(body, refs);
    // Pass 2: known lowercase prefixes (Shortcut, etc.)
    extract_lowercase_tickets(body, refs);
}
190
191/// Extract uppercase ticket references: `[A-Z]{2,}-\d+`.
192fn extract_uppercase_tickets(body: &str, refs: &mut Vec<IssueReference>) {
193    let chars: Vec<char> = body.chars().collect();
194    let mut i = 0;
195
196    while i < chars.len() {
197        // Must start at word boundary
198        if i > 0 && (chars[i - 1].is_alphanumeric() || chars[i - 1] == '-') {
199            i += 1;
200            continue;
201        }
202
203        // Scan uppercase letters (need at least 2)
204        let alpha_start = i;
205        let mut j = i;
206        while j < chars.len() && chars[j].is_ascii_uppercase() {
207            j += 1;
208        }
209        let alpha_len = j - alpha_start;
210        if alpha_len < 2 {
211            i += 1;
212            continue;
213        }
214
215        // Must be followed by '-'
216        if j >= chars.len() || chars[j] != '-' {
217            i += 1;
218            continue;
219        }
220        j += 1;
221
222        // Must be followed by digits
223        let digit_start = j;
224        while j < chars.len() && chars[j].is_ascii_digit() {
225            j += 1;
226        }
227        if j == digit_start {
228            i += 1;
229            continue;
230        }
231
232        // Must end at word boundary
233        if j < chars.len() && (chars[j].is_alphanumeric() || chars[j] == '-') {
234            i += 1;
235            continue;
236        }
237
238        let prefix: String = chars[alpha_start..alpha_start + alpha_len].iter().collect();
239
240        // Reject well-known acronyms
241        if TICKET_BLOCKLIST.iter().any(|b| *b == prefix) {
242            i = j;
243            continue;
244        }
245
246        let ticket: String = chars[alpha_start..j].iter().collect();
247
248        // Skip if this was already captured as part of a URL
249        if !refs.iter().any(|r| r.value.contains(&ticket)) {
250            refs.push(IssueReference {
251                kind: IssueRefKind::ProjectTicket,
252                value: ticket,
253            });
254        }
255
256        i = j;
257    }
258}
259
260/// Extract known lowercase ticket references (e.g. Shortcut `sc-12345`).
261fn extract_lowercase_tickets(body: &str, refs: &mut Vec<IssueReference>) {
262    let chars: Vec<char> = body.chars().collect();
263    let mut i = 0;
264
265    while i < chars.len() {
266        // Must start at word boundary
267        if i > 0 && (chars[i - 1].is_alphanumeric() || chars[i - 1] == '-') {
268            i += 1;
269            continue;
270        }
271
272        for prefix in LOWERCASE_TICKET_PREFIXES {
273            let prefix_chars: Vec<char> = prefix.chars().collect();
274            let plen = prefix_chars.len();
275            if i + plen >= chars.len() {
276                continue;
277            }
278
279            // Match prefix (case-insensitive)
280            let body_slice: String = chars[i..i + plen].iter().collect();
281            if body_slice.to_ascii_lowercase() != *prefix {
282                continue;
283            }
284
285            // Must be followed by '-'
286            let mut j = i + plen;
287            if j >= chars.len() || chars[j] != '-' {
288                continue;
289            }
290            j += 1;
291
292            // Must be followed by digits
293            let digit_start = j;
294            while j < chars.len() && chars[j].is_ascii_digit() {
295                j += 1;
296            }
297            if j == digit_start {
298                continue;
299            }
300
301            // Must end at word boundary
302            if j < chars.len() && (chars[j].is_alphanumeric() || chars[j] == '-') {
303                continue;
304            }
305
306            let ticket: String = chars[i..j].iter().collect();
307            if !refs.iter().any(|r| r.value.contains(&ticket)) {
308                refs.push(IssueReference {
309                    kind: IssueRefKind::ProjectTicket,
310                    value: ticket,
311                });
312            }
313            i = j;
314            break;
315        }
316
317        i += 1;
318    }
319}
320
321/// Extract URL references pointing to known issue trackers.
322///
323/// Matches URLs containing patterns from [`TRACKER_URL_PATTERNS`]:
324/// GitHub/GitLab (`/issues/`), Jira (`/browse/`), Linear (`linear.app/`),
325/// Shortcut (`app.shortcut.com/`), Notion (`notion.so/`).
326///
327/// Handles both whitespace-delimited URLs and Markdown link syntax
328/// `[text](url)`.
329fn extract_urls(body: &str, refs: &mut Vec<IssueReference>) {
330    let mut search_start = 0;
331    while search_start < body.len() {
332        let rest = &body[search_start..];
333        let offset = rest.find("https://").or_else(|| rest.find("http://"));
334
335        let Some(pos) = offset else { break };
336        let url_start = search_start + pos;
337
338        // Determine end of URL: stop at whitespace, ')', '>', ']', or end of string
339        let url_end = body[url_start..]
340            .find(|c: char| c.is_whitespace() || c == ')' || c == '>' || c == ']')
341            .map(|e| url_start + e)
342            .unwrap_or(body.len());
343
344        let url = body[url_start..url_end].trim_end_matches(['.', ',']);
345
346        if TRACKER_URL_PATTERNS.iter().any(|p| url.contains(p)) {
347            refs.push(IssueReference {
348                kind: IssueRefKind::Url,
349                value: url.to_string(),
350            });
351        }
352
353        search_start = url_end;
354    }
355}
356
357/// Extract matches for a custom literal pattern.
358fn extract_custom(body: &str, pattern: &str, refs: &mut Vec<IssueReference>) {
359    if pattern.is_empty() {
360        return;
361    }
362    let mut start = 0;
363    while let Some(pos) = body[start..].find(pattern) {
364        let abs_pos = start + pos;
365        let end = abs_pos + pattern.len();
366        refs.push(IssueReference {
367            kind: IssueRefKind::Url, // custom patterns categorized as Url
368            value: body[abs_pos..end].to_string(),
369        });
370        start = end;
371    }
372}
373
// Unit tests for the extractors. Every test goes through the public
// `extract_issue_references` entry point with a literal body string.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Numeric issues: bare `#N` and closing keywords ---

    #[test]
    fn github_issue_bare_hash() {
        let refs = extract_issue_references("Related to #123", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].kind, IssueRefKind::NumericIssue);
        assert_eq!(refs[0].value, "#123");
    }

    #[test]
    fn github_issue_fixes_keyword() {
        let refs = extract_issue_references("fixes #456", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].value, "fixes #456");
    }

    #[test]
    fn github_issue_closes_keyword() {
        // The keyword keeps the casing used in the body.
        let refs = extract_issue_references("Closes #789", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].value, "Closes #789");
    }

    #[test]
    fn github_issue_resolves_keyword() {
        // Leading zeros are kept verbatim.
        let refs = extract_issue_references("resolves #012", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].value, "resolves #012");
    }

    // --- Project tickets and tracker URLs ---

    #[test]
    fn jira_ticket() {
        let refs = extract_issue_references("See PROJ-789 for details", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].kind, IssueRefKind::JiraTicket);
        assert_eq!(refs[0].value, "PROJ-789");
    }

    #[test]
    fn url_github_issues() {
        let refs = extract_issue_references("https://github.com/owner/repo/issues/1", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].kind, IssueRefKind::Url);
    }

    #[test]
    fn url_jira_browse() {
        let refs = extract_issue_references("See https://jira.example.com/browse/PROJ-123", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].kind, IssueRefKind::Url);
    }

    // --- Bodies without references ---

    #[test]
    fn empty_body_no_linkage() {
        let refs = extract_issue_references("", &[]);
        assert!(!has_issue_linkage(&refs));
    }

    #[test]
    fn no_references_in_body() {
        let refs = extract_issue_references("Just a regular PR description.", &[]);
        assert!(!has_issue_linkage(&refs));
    }

    #[test]
    fn multiple_mixed_patterns() {
        let body = "fixes #123\nAlso related to PROJ-789 and https://github.com/o/r/issues/5";
        let refs = extract_issue_references(body, &[]);
        assert!(has_issue_linkage(&refs));
        assert!(refs.len() >= 3);
        let kinds: Vec<&IssueRefKind> = refs.iter().map(|r| &r.kind).collect();
        assert!(kinds.contains(&&IssueRefKind::NumericIssue));
        assert!(kinds.contains(&&IssueRefKind::JiraTicket));
        assert!(kinds.contains(&&IssueRefKind::Url));
    }

    #[test]
    fn custom_pattern() {
        let refs = extract_issue_references("Ref: CUSTOM-42", &["CUSTOM-42"]);
        assert!(has_issue_linkage(&refs));
    }

    #[test]
    fn hash_in_html_entity_not_matched() {
        // &#123; should not match as a GitHub issue reference
        let refs = extract_issue_references("Use &#123; entity", &[]);
        assert!(!has_issue_linkage(&refs));
    }

    #[test]
    fn jira_single_letter_not_matched() {
        // Single letter prefix is not valid Jira
        let refs = extract_issue_references("X-123 should not match", &[]);
        assert!(!has_issue_linkage(&refs));
    }

    // --- P1: Non-ASCII safety ---

    #[test]
    fn non_ascii_body_with_issue_ref() {
        // Multi-byte chars before issue reference must not panic
        let refs = extract_issue_references("あいう fixes #12", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].value, "fixes #12");
    }

    #[test]
    fn non_ascii_body_bare_hash() {
        let refs = extract_issue_references("日本語テスト #99 です", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].value, "#99");
    }

    #[test]
    fn emoji_body_with_issue_ref() {
        let refs = extract_issue_references("🎉🎊 closes #42", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].value, "closes #42");
    }

    // --- P2: Markdown URL detection ---

    #[test]
    fn markdown_link_github_issues() {
        let body = "See [the issue](https://github.com/o/r/issues/1) for details";
        let refs = extract_issue_references(body, &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].kind, IssueRefKind::Url);
        assert!(refs[0].value.contains("/issues/1"));
    }

    #[test]
    fn markdown_link_jira_browse() {
        let body = "Related: [ticket](https://jira.example.com/browse/PROJ-456)";
        let refs = extract_issue_references(body, &[]);
        assert!(
            refs.iter()
                .any(|r| r.kind == IssueRefKind::Url && r.value.contains("/browse/"))
        );
    }

    // --- P3: Jira blocklist ---

    #[test]
    fn blocklist_utf8_not_jira() {
        let refs = extract_issue_references("Supports UTF-8 encoding", &[]);
        assert!(!refs.iter().any(|r| r.kind == IssueRefKind::JiraTicket));
    }

    #[test]
    fn blocklist_http_not_jira() {
        let refs = extract_issue_references("Returns HTTP-500 errors", &[]);
        assert!(!refs.iter().any(|r| r.kind == IssueRefKind::JiraTicket));
    }

    #[test]
    fn blocklist_rfc_not_jira() {
        let refs = extract_issue_references("Per RFC-9110 specification", &[]);
        assert!(!refs.iter().any(|r| r.kind == IssueRefKind::JiraTicket));
    }

    #[test]
    fn blocklist_cve_not_jira() {
        let refs = extract_issue_references("Fixes CVE-2024 vulnerability", &[]);
        assert!(!refs.iter().any(|r| r.kind == IssueRefKind::JiraTicket));
    }

    #[test]
    fn real_jira_ticket_still_works() {
        let refs = extract_issue_references("See PROJ-123 and MYAPP-456", &[]);
        assert_eq!(
            refs.iter()
                .filter(|r| r.kind == IssueRefKind::JiraTicket)
                .count(),
            2
        );
        assert!(refs.iter().any(|r| r.value == "PROJ-123"));
        assert!(refs.iter().any(|r| r.value == "MYAPP-456"));
    }

    // --- Trailing-character rejection (coderabbit fix) ---

    #[test]
    fn hash_followed_by_alpha_not_matched() {
        // #123abc is not a valid GitHub issue reference
        let refs = extract_issue_references("#123abc", &[]);
        assert!(!has_issue_linkage(&refs));
    }

    #[test]
    fn color_hex_not_matched() {
        // CSS hex color should not match
        let refs = extract_issue_references("color: #FF0000", &[]);
        assert!(!has_issue_linkage(&refs));
    }

    #[test]
    fn hash_followed_by_period_matched() {
        // Period is not alphanumeric, so #123. should match
        let refs = extract_issue_references("#123.", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].value, "#123");
    }

    #[test]
    fn keyword_hash_followed_by_exclamation_matched() {
        // Exclamation is not alphanumeric, so fixes #123! should match
        let refs = extract_issue_references("fixes #123!", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].value, "fixes #123");
    }

    // --- Biconditional property test ---

    /// Property: has_issue_linkage returns true iff extract_issue_references returns non-empty.
    ///
    /// Checked on one fixed example per direction, not on generated inputs.
    #[test]
    fn linkage_biconditional() {
        // Forward: references exist => linkage
        let with_refs = extract_issue_references("fixes #1", &[]);
        assert!(has_issue_linkage(&with_refs));

        // Backward: no references => no linkage
        let without_refs = extract_issue_references("plain text", &[]);
        assert!(!has_issue_linkage(&without_refs));
    }

    // --- Linear ---

    #[test]
    fn linear_ticket_matched() {
        let refs = extract_issue_references("Implements ENG-456", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].kind, IssueRefKind::ProjectTicket);
        assert_eq!(refs[0].value, "ENG-456");
    }

    #[test]
    fn linear_url_matched() {
        let refs = extract_issue_references(
            "https://linear.app/myteam/issue/ENG-456/implement-feature",
            &[],
        );
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].kind, IssueRefKind::Url);
    }

    // --- Shortcut ---

    #[test]
    fn shortcut_ticket_matched() {
        let refs = extract_issue_references("Fixes sc-12345", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].kind, IssueRefKind::ProjectTicket);
        assert_eq!(refs[0].value, "sc-12345");
    }

    #[test]
    fn shortcut_url_matched() {
        let refs =
            extract_issue_references("https://app.shortcut.com/myorg/story/12345/fix-bug", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].kind, IssueRefKind::Url);
    }

    // --- Notion ---

    #[test]
    fn notion_url_matched() {
        let refs = extract_issue_references("https://notion.so/myworkspace/Task-abc123def456", &[]);
        assert!(has_issue_linkage(&refs));
        assert_eq!(refs[0].kind, IssueRefKind::Url);
    }
}