use std::cmp::Reverse;
use std::collections::HashSet;
use rusqlite::params;
use tga::collect::git::diff::diff_for_commit;
use tga::core::db::Database;
use tracing::{debug, warn};
use super::config::{DiffSamplerConfig, MAX_DIFF_CHARS};
use crate::profile::error::Result;
use crate::profile::types::period::{PeriodBatch, SampledDiff};
/// One commit row as loaded by `query_commits_in_period`, plus a numeric
/// effort rank derived from the effort label.
#[derive(Debug, Clone)]
pub(super) struct CommitRecord {
/// Commit SHA (`commits.sha`).
pub sha: String,
/// Repository identifier (`commits.repository`); used to look up a local
/// repository path via the sampler config.
pub repository: String,
/// Commit message (`commits.message`).
pub message: String,
/// Classification category (`classifications.category`); `None` when the
/// LEFT JOIN finds no classification or the column is NULL.
pub category: Option<String>,
/// Effort size label (`fact_commit_effort.size`); `None` when the
/// LEFT JOIN finds no effort row or the column is NULL.
pub effort: Option<String>,
/// Numeric rank computed from `effort` by `effort_to_rank`
/// (0 for a missing or unrecognised label).
pub effort_rank: u8,
}
pub fn sample_diffs_for_batches(
batches: &mut [PeriodBatch],
db: &Database,
canonical_email: &str,
config: &DiffSamplerConfig,
) -> Result<()> {
for batch in batches.iter_mut() {
let since = batch.stats.since.as_str();
let until = batch.stats.until.as_str();
let commits = query_commits_in_period(db, canonical_email, since, until)?;
let selected = stratify_and_select(&commits, config.max_diffs);
for commit in selected {
let Some(repo_path) = config.repo_path(&commit.repository) else {
warn!(
sha = %commit.sha,
repository = %commit.repository,
"diff sampler: repository not configured locally — skipping"
);
continue;
};
if !repo_path.exists() {
warn!(
sha = %commit.sha,
repository = %commit.repository,
path = %repo_path.display(),
"diff sampler: repository path does not exist — skipping"
);
continue;
}
match diff_for_commit(&repo_path, &commit.sha) {
Ok(diff_text) => {
let truncated = truncate_diff(&diff_text);
debug!(
sha = %commit.sha,
diff_len = diff_text.len(),
truncated_len = truncated.len(),
"sampled diff"
);
batch.sampled_diffs.push(SampledDiff {
sha: commit.sha.clone(),
repository: commit.repository.clone(),
message: commit.message.clone(),
diff_text: truncated,
category: commit.category.clone(),
effort: commit.effort.clone(),
});
}
Err(e) => {
warn!(
sha = %commit.sha,
repository = %commit.repository,
error = %e,
"diff sampler: diff_for_commit failed — skipping"
);
}
}
}
}
Ok(())
}
fn query_commits_in_period(
db: &Database,
email: &str,
since: &str,
until: &str,
) -> Result<Vec<CommitRecord>> {
let conn = db.connection();
let mut stmt = conn
.prepare(
"SELECT c.sha, c.repository, c.message, \
cl.category, fce.size \
FROM commits c \
JOIN authors a ON a.id = c.author_id \
LEFT JOIN classifications cl ON cl.id = c.classification_id \
LEFT JOIN fact_commit_effort fce ON fce.sha = c.sha \
WHERE LOWER(a.canonical_email) = LOWER(?1) \
AND c.timestamp >= ?2 \
AND c.timestamp <= ?3 || 'T23:59:59Z' \
ORDER BY c.timestamp DESC",
)
.map_err(|e| crate::profile::error::ProfileError::Db(tga::core::TgaError::from(e)))?;
let rows = stmt
.query_map(params![email, since, until], |row| {
let sha: String = row.get(0)?;
let repository: String = row.get(1)?;
let message: String = row.get(2)?;
let category: Option<String> = row.get(3)?;
let effort: Option<String> = row.get(4)?;
Ok((sha, repository, message, category, effort))
})
.map_err(|e| crate::profile::error::ProfileError::Db(tga::core::TgaError::from(e)))?;
let mut commits = Vec::new();
for r in rows {
let (sha, repository, message, category, effort) =
r.map_err(|e| crate::profile::error::ProfileError::Db(tga::core::TgaError::from(e)))?;
let effort_rank = effort_to_rank(effort.as_deref());
commits.push(CommitRecord {
sha,
repository,
message,
category,
effort,
effort_rank,
});
}
Ok(commits)
}
/// Converts an effort size label into a sortable rank.
///
/// `"XS"` → 1, `"S"` → 2, `"M"` → 3, `"L"` → 4, `"XL"` → 5. A missing label
/// or any other string (matching is case-sensitive) yields 0.
fn effort_to_rank(size: Option<&str>) -> u8 {
    const RANKS: &[(&str, u8)] = &[("XS", 1), ("S", 2), ("M", 3), ("L", 4), ("XL", 5)];

    size.and_then(|label| RANKS.iter().find(|(name, _)| *name == label))
        .map_or(0, |&(_, rank)| rank)
}
/// Commit categories that `stratify_and_select` tries to represent first,
/// one commit per category, in this order.
const PRIORITY_CATEGORIES: &[&str] = &["bugfix", "feature", "refactor"];
/// Picks at most `max_diffs` commits, favouring category coverage first and
/// effort second.
///
/// 1. For every entry of [`PRIORITY_CATEGORIES`] (in order) the first commit
///    in `commits` with that category is taken.
/// 2. Remaining slots are filled with the not-yet-selected commits ordered by
///    descending `effort_rank`; commits with equal rank keep their input
///    order because the sort is stable.
///
/// Returns an empty vector when `max_diffs` is 0 or `commits` is empty. The
/// result never holds more than `commits.len()` entries, so a very large
/// `max_diffs` (e.g. `usize::MAX` used as "no limit") is safe.
pub(super) fn stratify_and_select(
    commits: &[CommitRecord],
    max_diffs: usize,
) -> Vec<&CommitRecord> {
    if max_diffs == 0 || commits.is_empty() {
        return Vec::new();
    }

    // At most `commits.len()` items can ever be selected. Bounding the limit
    // keeps `Vec::with_capacity` from panicking with a capacity overflow (or
    // reserving a huge buffer) when `max_diffs` is far larger than the input.
    let limit = max_diffs.min(commits.len());
    let mut selected: Vec<&CommitRecord> = Vec::with_capacity(limit);
    let mut used_indices: HashSet<usize> = HashSet::new();

    // Pass 1: one representative per priority category.
    for &category in PRIORITY_CATEGORIES {
        if selected.len() >= limit {
            break;
        }
        let hit = commits.iter().enumerate().find(|(idx, commit)| {
            !used_indices.contains(idx) && commit.category.as_deref() == Some(category)
        });
        if let Some((idx, commit)) = hit {
            selected.push(commit);
            used_indices.insert(idx);
        }
    }

    // Pass 2: fill what is left with the highest-effort commits.
    if selected.len() < limit {
        let mut remaining: Vec<&CommitRecord> = commits
            .iter()
            .enumerate()
            .filter(|(idx, _)| !used_indices.contains(idx))
            .map(|(_, commit)| commit)
            .collect();
        // Stable sort: equal ranks stay in input order.
        remaining.sort_by_key(|commit| Reverse(commit.effort_rank));

        let open_slots = limit - selected.len();
        selected.extend(remaining.into_iter().take(open_slots));
    }

    selected
}
/// Limits a diff to its first `MAX_DIFF_CHARS` characters.
///
/// Text of at most `MAX_DIFF_CHARS` characters is returned unchanged.
/// Longer text is cut on a character boundary after `MAX_DIFF_CHARS`
/// characters and a `[... diff truncated at N chars ...]` marker is appended
/// on a new line, so the returned string is longer than the limit by the
/// length of that marker.
pub(super) fn truncate_diff(diff_text: &str) -> String {
    // The character at index MAX_DIFF_CHARS exists only when the text is
    // longer than the limit; its byte offset is where the cut goes.
    match diff_text.char_indices().nth(MAX_DIFF_CHARS) {
        None => diff_text.to_string(),
        Some((cut, _)) => format!(
            "{}\n[... diff truncated at {} chars ...]",
            &diff_text[..cut],
            MAX_DIFF_CHARS
        ),
    }
}