tga 2.2.0

Developer productivity analytics — git commit collection, classification, and reporting
Documentation
//! Ticket-reference detection for commit messages.
//!
//! A commit is considered *ticketed* if its message contains any reference
//! to an external work-tracking system. We currently recognize:
//!
//! - **JIRA / Linear style**: `PROJ-123`, `ENG-456`, `ABC-9` —
//!   uppercase project key, hyphen, digits. The Linear identifier format
//!   (`ENG-123`, `FE-456`) is a subset of this pattern.
//! - **GitHub action-keyword refs**: `fixes #123`, `closes #45`,
//!   `resolves #7` (case-insensitive, also matches `fix`/`close`/`resolve`).
//! - **GitHub bare issue refs**: `#123` preceded by start-of-string or
//!   whitespace, so we don't false-positive on things like a hex color
//!   `#abc123` inside another token.
//! - **Azure DevOps work-item refs**: `AB#123`. Bare `#N` is intentionally
//!   excluded from the ADO pattern because it collides with GitHub PR/issue
//!   numbers (the existing GitHub bare-`#N` rule above still applies).
//!
//! Patterns are compiled exactly once on first use via [`OnceLock`].

use std::sync::OnceLock;

use regex::Regex;

/// Compiled regexes used by [`is_ticketed`].
struct TicketPatterns {
    jira: Regex,
    gh_action: Regex,
    gh_bare: Regex,
    azdo: Regex,
}

/// Global, lazily-initialized pattern set.
fn patterns() -> &'static TicketPatterns {
    static PATTERNS: OnceLock<TicketPatterns> = OnceLock::new();
    PATTERNS.get_or_init(|| {
        // SAFETY of unwrap: these literals are validated by the test
        // [`patterns_compile`] below — any regression is caught at test
        // time, not at runtime.
        TicketPatterns {
            // JIRA / Linear: uppercase letters (>=1), optional digits, '-', digits.
            // Word boundaries prevent matching inside `FOO-BAR-1`-style identifiers'
            // middle, while still catching the trailing `BAR-1` segment.
            jira: Regex::new(r"\b[A-Z][A-Z0-9]*-\d+\b").expect("jira pattern compiles"),
            // GitHub action keyword: fix(es|ed)?|close(s|d)?|resolve(s|d)?  #123
            gh_action: Regex::new(r"(?i)\b(?:fix(?:es|ed)?|close[sd]?|resolve[sd]?)\s+#\d+\b")
                .expect("gh_action pattern compiles"),
            // Bare `#123` preceded by start-of-line or whitespace.
            gh_bare: Regex::new(r"(?m)(?:^|\s)#\d+\b").expect("gh_bare pattern compiles"),
            // Azure DevOps work-item reference: AB#123.
            // Bare #N intentionally excluded — collides with GitHub PR/issue numbers.
            azdo: Regex::new(r"\bAB#\d+\b").expect("azdo pattern compiles"),
        }
    })
}

/// Compiled extraction patterns used by [`extract_ticket_id`], ordered from
/// most-specific to least-specific so the highest-fidelity match wins.
struct ExtractPatterns {
    /// Azure DevOps work-item reference: `AB#123`.
    azdo: Regex,
    /// JIRA / Linear style: `PROJ-123`, `ENG-456`, `DRE-405`.
    jira: Regex,
    /// GitHub bare issue reference: `#123`.
    gh_bare: Regex,
}

/// Global, lazily-initialized extraction pattern set.
fn extract_patterns() -> &'static ExtractPatterns {
    static EXTRACT: OnceLock<ExtractPatterns> = OnceLock::new();
    EXTRACT.get_or_init(|| {
        // SAFETY of unwrap: all literals are validated by the test
        // [`extract_patterns_compile`] — any regression is caught at test time.
        ExtractPatterns {
            azdo: Regex::new(r"\bAB#\d+\b").expect("azdo extract pattern compiles"),
            jira: Regex::new(r"\b[A-Z][A-Z0-9]*-\d+\b").expect("jira extract pattern compiles"),
            gh_bare: Regex::new(r"(?:^|\s)(#\d+)\b").expect("gh_bare extract pattern compiles"),
        }
    })
}

/// Return `true` if `message` contains any recognized ticket reference.
///
/// The check is performed against the full message (subject + body); a
/// reference anywhere in the text — including later lines of a multi-line
/// commit body — flags the commit as ticketed.
///
/// # Examples
///
/// ```
/// use tga::collect::ticket::is_ticketed;
///
/// assert!(is_ticketed("ENG-123: add feature"));
/// assert!(is_ticketed("Fix login (closes #42)"));
/// assert!(!is_ticketed("misc cleanup"));
/// ```
pub fn is_ticketed(message: &str) -> bool {
    let p = patterns();
    p.jira.is_match(message)
        || p.gh_action.is_match(message)
        || p.gh_bare.is_match(message)
        || p.azdo.is_match(message)
}

/// Extract the first recognizable ticket identifier from a commit message.
///
/// Why: `commits.ticket_id` must be populated at insert time so that JIRA
/// classification and ticket-rate metrics work without requiring a separate
/// `tga backfill ticket-ids` pass. Issue #316 identified that 32% of
/// uncategorized commits had clearly extractable JIRA IDs (e.g. `BB-2746`,
/// `SRE-3104`, `DRE-405`) but NULL `ticket_id` because this extraction
/// only happened during backfill, not during `tga collect`.
/// What: tests the message against ADO (`AB#N`), JIRA/Linear (`PROJ-N`),
/// and GitHub bare (`#N`) patterns in that priority order; returns the
/// first match as `Some(String)`, or `None` when no ticket ref is found.
/// Test: `tests::extract_ticket_id_*` below; also exercised by
/// `collect::git::extractor` tests that verify `ticket_id` is populated
/// at INSERT time during `tga collect`.
///
/// # Examples
///
/// ```
/// use tga::collect::ticket::extract_ticket_id;
///
/// assert_eq!(extract_ticket_id("BB-2746: refactor auth"), Some("BB-2746".to_string()));
/// assert_eq!(extract_ticket_id("SRE-3104: increase RDS timeout"), Some("SRE-3104".to_string()));
/// assert_eq!(extract_ticket_id("DRE-405 fix demand calculation"), Some("DRE-405".to_string()));
/// assert_eq!(extract_ticket_id("fixes #99"), Some("#99".to_string()));
/// assert_eq!(extract_ticket_id("misc cleanup"), None);
/// ```
pub fn extract_ticket_id(message: &str) -> Option<String> {
    let p = extract_patterns();

    // ADO: AB#123 — most specific, checked first.
    if let Some(m) = p.azdo.find(message) {
        return Some(m.as_str().to_string());
    }

    // JIRA / Linear: PROJ-123.
    if let Some(m) = p.jira.find(message) {
        return Some(m.as_str().to_string());
    }

    // GitHub bare: #123 — the capture group strips the leading whitespace
    // that the pattern uses as a left-boundary guard.
    if let Some(caps) = p.gh_bare.captures(message) {
        if let Some(m) = caps.get(1) {
            return Some(m.as_str().to_string());
        }
    }

    None
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn patterns_compile() {
        // Force lazy init; if any pattern is malformed this will panic.
        let _ = patterns();
    }

    #[test]
    fn extract_patterns_compile() {
        // Force lazy init; if any pattern is malformed this will panic.
        let _ = extract_patterns();
    }

    // ── extract_ticket_id: issue #316 sample commits ──────────────────────

    #[test]
    fn extract_ticket_id_bb_2746() {
        // Sample from issue #316 — was producing NULL ticket_id before fix.
        assert_eq!(
            extract_ticket_id("BB-2746: refactor auth service"),
            Some("BB-2746".to_string())
        );
    }

    #[test]
    fn extract_ticket_id_sre_3104() {
        // Sample from issue #316 — was producing NULL ticket_id before fix.
        assert_eq!(
            extract_ticket_id("SRE-3104: increase RDS connection timeout"),
            Some("SRE-3104".to_string())
        );
    }

    #[test]
    fn extract_ticket_id_dre_405() {
        // Sample from issue #316 — note: no colon separator, space only.
        assert_eq!(
            extract_ticket_id("DRE-405 fix demand calculation"),
            Some("DRE-405".to_string())
        );
    }

    #[test]
    fn extract_ticket_id_returns_none_for_plain_message() {
        assert_eq!(extract_ticket_id("misc cleanup"), None);
        assert_eq!(extract_ticket_id("update README"), None);
        assert_eq!(extract_ticket_id("bump version to 1.2.3"), None);
    }

    #[test]
    fn extract_ticket_id_github_bare_ref() {
        assert_eq!(extract_ticket_id("fixes #99"), Some("#99".to_string()));
    }

    #[test]
    fn extract_ticket_id_azdo_ref() {
        assert_eq!(
            extract_ticket_id("AB#42 implement feature"),
            Some("AB#42".to_string())
        );
    }

    #[test]
    fn extract_ticket_id_azdo_preferred_over_jira() {
        // When both AB# and JIRA patterns appear, ADO wins (more specific).
        assert_eq!(
            extract_ticket_id("AB#10 fixes PROJ-99"),
            Some("AB#10".to_string())
        );
    }

    #[test]
    fn extract_ticket_id_jira_preferred_over_gh_bare() {
        // JIRA is checked before bare GitHub ref.
        assert_eq!(
            extract_ticket_id("ENG-7 closes #10"),
            Some("ENG-7".to_string())
        );
    }

    #[test]
    fn extract_ticket_id_multiline_body() {
        let msg = "Refactor module structure\n\nRelates to SRE-999.\n";
        assert_eq!(extract_ticket_id(msg), Some("SRE-999".to_string()));
    }

    #[test]
    fn jira_style_is_ticketed() {
        assert!(is_ticketed("ENG-123: add feature"));
        assert!(is_ticketed("PROJ-1 initial commit"));
        assert!(is_ticketed("Backport from upstream (ABC-4567)"));
    }

    #[test]
    fn linear_style_is_ticketed() {
        // Linear identifiers are a subset of the JIRA pattern.
        assert!(is_ticketed("FE-456 fix login"));
        assert!(is_ticketed("API-9 add endpoint"));
    }

    #[test]
    fn github_action_keyword_is_ticketed() {
        assert!(is_ticketed("Fix race condition, fixes #123"));
        assert!(is_ticketed("closes #45"));
        assert!(is_ticketed("Resolves #7 by reworking auth"));
        assert!(is_ticketed("CLOSED #99")); // case-insensitive
    }

    #[test]
    fn github_bare_hash_ref_is_ticketed() {
        assert!(is_ticketed("Bug from #123 still present"));
        assert!(is_ticketed("#42 follow-up"));
    }

    #[test]
    fn plain_message_is_not_ticketed() {
        assert!(!is_ticketed("misc cleanup"));
        assert!(!is_ticketed("update README"));
        assert!(!is_ticketed("bump version to 1.2.3"));
        // Hex color shouldn't false-positive — `#abc123` is not preceded by
        // whitespace+digits-only.
        assert!(!is_ticketed("set color to #abc123"));
        // Lowercase project key is not a JIRA identifier.
        assert!(!is_ticketed("eng-123 lowercase doesn't count"));
    }

    #[test]
    fn multiline_body_with_ticket_is_ticketed() {
        let msg = "Refactor module structure\n\nMoves things around.\nRelates to PROJ-789.\n";
        assert!(is_ticketed(msg));

        let msg2 = "First line no ticket\n\nSecond paragraph mentions #321 explicitly.";
        assert!(is_ticketed(msg2));
    }

    #[test]
    fn azdo_ab_ref_is_ticketed() {
        assert!(is_ticketed("AB#1234 implement new feature"));
        assert!(is_ticketed("Refactor module (AB#42)"));
        assert!(is_ticketed("First line\n\nbody mentions AB#7 explicitly"));
    }

    #[test]
    fn bare_hash_without_ab_prefix_is_not_azdo() {
        // GitHub bare `#N` still matches via the gh_bare pattern, but it
        // must NOT match the ADO `AB#` pattern specifically.
        let p = patterns();
        assert!(!p.azdo.is_match("#1234 some work"));
        assert!(!p.azdo.is_match("fixes #99"));
        // And the existing JIRA pattern must not accidentally fire on AB#N
        // (different separator: `#` vs `-`).
        assert!(!p.jira.is_match("AB#1234"));
    }

    #[test]
    fn empty_message_is_not_ticketed() {
        assert!(!is_ticketed(""));
        assert!(!is_ticketed("\n\n"));
    }
}