Skip to main content

tj_core/
artifacts.rs

1//! Artifact extraction — regex-based scrape of structured references
2//! out of free-form event text. Captures the bits that turn a journal
3//! entry into a real ledger of what shipped: commit hashes, PR URLs,
4//! ticket IDs, branch names, file paths.
5//!
//! Intentionally regex-only and side-effect free: the classifier may
//! still emit a richer JSON payload in the future, but any such payload
//! will be merged into the same shape via `Artifacts::merge`. Keeping
//! the extractor pure means `reclassify` can run it offline over
//! historic events without spawning the model.
11
12use regex::Regex;
13use serde::{Deserialize, Serialize};
14
15/// Structured artifacts collected from one or many events. All vectors
16/// are deduplicated (case-sensitive) by the `merge` constructor — the
17/// extractor itself emits raw matches.
18#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
19pub struct Artifacts {
20    #[serde(default, skip_serializing_if = "Vec::is_empty")]
21    pub commit_hashes: Vec<String>,
22    #[serde(default, skip_serializing_if = "Vec::is_empty")]
23    pub pr_urls: Vec<String>,
24    #[serde(default, skip_serializing_if = "Vec::is_empty")]
25    pub linked_issues: Vec<String>,
26    #[serde(default, skip_serializing_if = "Vec::is_empty")]
27    pub files: Vec<String>,
28    #[serde(default, skip_serializing_if = "Vec::is_empty")]
29    pub branch_names: Vec<String>,
30}
31
32impl Artifacts {
33    pub fn is_empty(&self) -> bool {
34        self.commit_hashes.is_empty()
35            && self.pr_urls.is_empty()
36            && self.linked_issues.is_empty()
37            && self.files.is_empty()
38            && self.branch_names.is_empty()
39    }
40
41    /// Merge another `Artifacts` into self, preserving insertion order
42    /// and deduplicating exact-match strings.
43    pub fn merge(&mut self, other: Artifacts) {
44        for (dst, src) in [
45            (&mut self.commit_hashes, other.commit_hashes),
46            (&mut self.pr_urls, other.pr_urls),
47            (&mut self.linked_issues, other.linked_issues),
48            (&mut self.files, other.files),
49            (&mut self.branch_names, other.branch_names),
50        ] {
51            for s in src {
52                if !dst.iter().any(|x| x == &s) {
53                    dst.push(s);
54                }
55            }
56        }
57    }
58}
59
60/// Extract artifacts from a single piece of text (event body, prompt,
61/// tool output — anything stringly-typed). Idempotent and free of I/O.
62pub fn extract(text: &str) -> Artifacts {
63    let mut a = Artifacts::default();
64
65    // Commit hashes — 7 to 40 hex chars surrounded by word boundaries.
66    // Word boundary on \b avoids matching inside longer non-hex tokens
67    // (e.g. ULIDs are base32, but adjacent digits + letters could
68    // technically pass — the boundary keeps matches clean).
69    static_re(
70        r"\b[0-9a-f]{7,40}\b",
71        |m| {
72            // Reject if all-digits (could be a year, an ID, a port).
73            // A real abbreviated commit always has at least one letter.
74            if m.chars().all(|c| c.is_ascii_digit()) {
75                return;
76            }
77            a.commit_hashes.push(m.to_string());
78        },
79        text,
80    );
81
82    // GitHub / GitLab PR URLs.
83    static_re(
84        r"https?://[A-Za-z0-9.\-]+/[A-Za-z0-9_./\-]+/(?:pull|merge_requests)/\d+",
85        |m| a.pr_urls.push(m.to_string()),
86        text,
87    );
88
89    // Ticket IDs: ABC-123. At least 2 letters to avoid matching version
90    // strings like v1-2 and minimum 1 digit.
91    static_re(
92        r"\b[A-Z]{2,}-\d+\b",
93        |m| a.linked_issues.push(m.to_string()),
94        text,
95    );
96
97    // File paths — heuristic: path-like tokens with at least one slash
98    // (and an extension) OR a leading ./ . Path segments allow a
99    // leading dot so `.docs/specs/auth.md`, `.github/workflows/ci.yml`
100    // etc are captured as artifacts. Tight enough to skip prose, loose
101    // enough to catch the common cases (src/foo.rs, ./bar.ts,
102    // crates/tj-core/src/db.rs).
103    static_re(
104        r"(?:\./|\.?[A-Za-z0-9_\-]+/)+[A-Za-z0-9_.\-]+\.[A-Za-z0-9]{1,8}\b",
105        |m| a.files.push(m.to_string()),
106        text,
107    );
108
109    // Branch names from explicit git commands. v0.6.1: anchor the
110    // pattern to `git ...` so that prose like "branches: commits, PRs,
111    // files, branches names" does not capture the next word as a
112    // branch. The bare-`branch <name>` form is intentionally dropped —
113    // it caused too many false positives in journal events that
114    // mention the word "branch" without naming one.
115    if let Ok(re) =
116        Regex::new(r"\bgit\s+(?:checkout\s+-b|switch\s+-c|branch)\s+([A-Za-z0-9._/\-]+)")
117    {
118        for cap in re.captures_iter(text) {
119            if let Some(m) = cap.get(1) {
120                a.branch_names.push(m.as_str().to_string());
121            }
122        }
123    }
124
125    // Dedup in place — emit-time order matters for stable test output.
126    dedup(&mut a.commit_hashes);
127    dedup(&mut a.pr_urls);
128    dedup(&mut a.linked_issues);
129    dedup(&mut a.files);
130    dedup(&mut a.branch_names);
131    a
132}
133
134fn dedup(v: &mut Vec<String>) {
135    let mut seen = std::collections::HashSet::new();
136    v.retain(|x| seen.insert(x.clone()));
137}
138
139fn static_re(pat: &str, mut f: impl FnMut(&str), text: &str) {
140    if let Ok(re) = Regex::new(pat) {
141        for m in re.find_iter(text) {
142            f(m.as_str());
143        }
144    }
145}
146
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn extracts_commit_hash() {
        let found = extract("fixed in commit abc1234 and 9012abcdef");
        assert_eq!(found.commit_hashes, ["abc1234", "9012abcdef"]);
    }

    #[test]
    fn rejects_all_digit_commit_lookalikes() {
        // Digit-only runs such as ports or years must not be reported.
        let found = extract("ran tests on port 12345 in 2026");
        assert!(found.commit_hashes.is_empty());
    }

    #[test]
    fn extracts_github_pr_url() {
        let found = extract("see https://github.com/Digital-Threads/Task-Journal/pull/42");
        assert_eq!(
            found.pr_urls,
            ["https://github.com/Digital-Threads/Task-Journal/pull/42"]
        );
    }

    #[test]
    fn extracts_linked_issues() {
        let found = extract("FIN-868 references JIRA-12345 and INC-7");
        assert_eq!(found.linked_issues, ["FIN-868", "JIRA-12345", "INC-7"]);
    }

    #[test]
    fn extracts_file_paths() {
        let found = extract("edited crates/tj-core/src/db.rs and ./README.md");
        assert!(found.files.iter().any(|p| p == "crates/tj-core/src/db.rs"));
        assert!(found.files.iter().any(|p| p == "./README.md"));
    }

    #[test]
    fn extracts_dot_prefixed_dirs() {
        // Directories with a leading dot (.docs/specs, .github/workflows)
        // hold specs and config; they should surface as artifacts so a
        // decision can be traced back to the document behind it.
        let found = extract("see .docs/specs/auth.md and .github/workflows/ci.yml");
        assert!(found.files.iter().any(|p| p == ".docs/specs/auth.md"));
        assert!(found.files.iter().any(|p| p == ".github/workflows/ci.yml"));
    }

    #[test]
    fn extracts_branch_names() {
        // Since v0.6.1 only explicit `git ...` commands count, so a
        // prose list header such as "branches: ..." no longer turns the
        // following word into a branch. A bare `switch -c` with no
        // `git ` in front is ignored too — the pattern stays conservative.
        let found =
            extract("git checkout -b FIN-868-fix-paygate-fee then git switch -c hotfix/abc");
        assert_eq!(found.branch_names, ["FIN-868-fix-paygate-fee", "hotfix/abc"]);
    }

    #[test]
    fn does_not_capture_branch_from_prose() {
        // Journal events use `branches` as a list header. Before v0.6.1
        // the word after it ("names") was captured as a branch; the
        // tightened pattern needs an explicit `git ` prefix.
        let found =
            extract("Artifacts groups: commits, PRs, issues, files, branches names listed below");
        assert!(
            found.branch_names.is_empty(),
            "regex must not pick up branches from prose, got: {:?}",
            found.branch_names
        );
    }

    #[test]
    fn merge_dedupes() {
        let mut target = Artifacts {
            commit_hashes: vec!["abc1234".into()],
            ..Default::default()
        };
        let incoming = Artifacts {
            commit_hashes: vec!["abc1234".into(), "def5678".into()],
            ..Default::default()
        };
        target.merge(incoming);
        assert_eq!(target.commit_hashes, ["abc1234", "def5678"]);
    }

    #[test]
    fn empty_text_yields_empty_artifacts() {
        assert!(extract("").is_empty());
    }

    #[test]
    fn json_round_trip() {
        let original = Artifacts {
            commit_hashes: vec!["abc1234".into()],
            linked_issues: vec!["FIN-868".into()],
            ..Default::default()
        };
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: Artifacts = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }
}