use anyhow::Result;
use chrono::{TimeZone, Utc};
use rusqlite::types::Value;
use std::collections::{HashMap, HashSet};
use crate::domain::path::{canonicalize_scopes, path_strip_prefix};
use crate::domain::scope::ScopeMatch;
use crate::domain::source::Source;
use crate::expr::filter::{self, Filter};
use crate::repo::source::BATCH_SIZE;
use crate::repo::{self, Connection, Db};
/// `ls`-style listing of tracked sources.
///
/// Prints matching sources to stdout (optionally with size/date columns when
/// `long_format` is set) and summary/diagnostic lines to stderr. When
/// `archived_mode` is `Some(_)` only sources whose object is archived are
/// listed; the special value `Some("show")` additionally prints each
/// archive-side path tab-separated after the source path (one output line
/// per archive copy).
///
/// # Errors
/// Fails on an invalid `sort_by` value, an unparsable filter string, scope
/// canonicalization failure, or any database error from the repo layer.
pub fn run(
    db: &mut Db,
    scope_paths: &[std::path::PathBuf],
    filter_strs: &[String],
    archived_mode: Option<&str>,
    unarchived_only: bool,
    unhashed_only: bool,
    include_archived: bool,
    include_excluded: bool,
    use_relative_paths: bool,
    long_format: bool,
    sort_by: &str,
    reverse: bool,
    null_delim: bool,
) -> Result<()> {
    // Any archived mode implies "archived only" filtering.
    let archived_only = archived_mode.is_some();
    let show_archive_paths = archived_mode == Some("show");
    let conn = db.conn_mut();
    // Reject bad sort keys up front; the sort `match` below relies on this.
    if !matches!(sort_by, "path" | "size" | "mtime" | "name") {
        anyhow::bail!(
            "Invalid sort option '{sort_by}'. Valid options: path, size, mtime, name"
        );
    }
    let filters: Vec<Filter> = filter_strs
        .iter()
        .map(|f| Filter::parse(f))
        .collect::<Result<Vec<_>>>()?;
    let scope_prefixes = canonicalize_scopes(scope_paths)?;
    let scopes = ScopeMatch::classify_all(&scope_prefixes);
    // Canonicalized cwd used to relativize displayed paths; stays None when
    // the cwd cannot be resolved or is not valid UTF-8, which silently falls
    // back to absolute paths.
    let cwd = if use_relative_paths {
        std::env::current_dir()
            .ok()
            .and_then(|p| std::fs::canonicalize(p).ok())
            .and_then(|p| p.to_str().map(String::from))
    } else {
        None
    };
    let (sources, excluded_count) =
        get_matching_sources(conn, &scopes, &filters, include_archived, include_excluded)?;
    if sources.is_empty() {
        eprintln!("No sources match the given filters.");
        if !include_excluded && excluded_count > 0 {
            eprintln!(
                "({excluded_count} excluded sources hidden, use --include-excluded to show)"
            );
        }
        return Ok(());
    }
    // Archive membership is only needed for the --archived/--unarchived
    // modes, and archive paths only for `--archived show`; skip the batch
    // queries otherwise.
    let object_ids: Vec<i64> = sources.iter().filter_map(|s| s.object_id).collect();
    let archived_set: HashSet<i64> = if archived_only || unarchived_only {
        repo::object::batch_check_archived(conn, &object_ids, None)?
    } else {
        HashSet::new()
    };
    let archive_paths_map: HashMap<i64, Vec<String>> = if show_archive_paths {
        repo::object::batch_find_archive_paths(conn, &object_ids)?
    } else {
        HashMap::new()
    };
    // One tuple per printed line: (source path, optional archive path, size, mtime).
    let mut output_lines: Vec<(String, Option<String>, i64, i64)> = Vec::new();
    // Sources without an object_id cannot be classified as (un)archived;
    // they are skipped and reported in the footer.
    let mut unhashed_count = 0usize;
    for source in &sources {
        let formatted_source = format_path(&source.path(), cwd.as_deref());
        let object_id = source.object_id;
        let size = source.size;
        let mtime = source.mtime;
        if archived_only {
            match object_id {
                None => {
                    unhashed_count += 1;
                }
                Some(obj_id) => {
                    if show_archive_paths {
                        // Emit one line per archive copy of the object.
                        if let Some(paths) = archive_paths_map.get(&obj_id) {
                            for archive_path in paths {
                                output_lines.push((
                                    formatted_source.clone(),
                                    Some(archive_path.clone()),
                                    size,
                                    mtime,
                                ));
                            }
                        }
                    } else if archived_set.contains(&obj_id) {
                        output_lines.push((formatted_source, None, size, mtime));
                    }
                }
            }
        } else if unarchived_only {
            match object_id {
                None => {
                    unhashed_count += 1;
                }
                Some(obj_id) => {
                    if !archived_set.contains(&obj_id) {
                        output_lines.push((formatted_source, None, size, mtime));
                    }
                }
            }
        } else if unhashed_only {
            if object_id.is_none() {
                output_lines.push((formatted_source, None, size, mtime));
            }
        } else {
            output_lines.push((formatted_source, None, size, mtime));
        }
    }
    // `sort_by` was validated above, so the catch-all arm is unreachable.
    match sort_by {
        "path" => output_lines.sort_by(|a, b| a.0.cmp(&b.0)),
        "size" => output_lines.sort_by(|a, b| a.2.cmp(&b.2)),
        "mtime" => output_lines.sort_by(|a, b| a.3.cmp(&b.3)),
        "name" => output_lines.sort_by(|a, b| {
            // Compare by final path component (basename).
            let name_a = a.0.rsplit('/').next().unwrap_or(&a.0);
            let name_b = b.0.rsplit('/').next().unwrap_or(&b.0);
            name_a.cmp(name_b)
        }),
        _ => {}
    }
    if reverse {
        output_lines.reverse();
    }
    // NUL delimiting supports piping paths to tools like `xargs -0`.
    let line_end = if null_delim { "\0" } else { "\n" };
    for (source_path, archive_path, size, mtime) in &output_lines {
        if long_format {
            let size_str = format_size(*size);
            let date_str = format_date(*mtime);
            if let Some(ap) = archive_path {
                print!(
                    "{size_str:>8} {date_str} {source_path}\t{ap}{line_end}"
                );
            } else {
                print!("{size_str:>8} {date_str} {source_path}{line_end}");
            }
        } else if let Some(ap) = archive_path {
            print!("{source_path}\t{ap}{line_end}");
        } else {
            print!("{source_path}{line_end}");
        }
    }
    // In `--archived show` mode a single source may emit several lines (one
    // per archive copy), so the footer counts distinct source paths instead
    // of output lines.
    let source_count = if show_archive_paths {
        output_lines
            .iter()
            .map(|(s, _, _, _)| s)
            .collect::<std::collections::HashSet<_>>()
            .len()
    } else {
        output_lines.len()
    };
    // Footer goes to stderr so it never pollutes piped stdout output.
    let mut footer_parts = vec![format!("{} sources", source_count)];
    if !include_excluded && excluded_count > 0 {
        footer_parts.push(format!("{excluded_count} excluded hidden"));
    }
    if (archived_only || unarchived_only) && unhashed_count > 0 {
        footer_parts.push(format!(
            "{unhashed_count} unhashed skipped, use --unhashed to see"
        ));
    }
    if footer_parts.len() > 1 {
        eprintln!("{} ({})", footer_parts[0], footer_parts[1..].join(", "));
    } else {
        eprintln!("{}", footer_parts[0]);
    }
    Ok(())
}
fn get_matching_sources(
conn: &mut Connection,
scopes: &[ScopeMatch],
filters: &[Filter],
include_archived: bool,
include_excluded: bool,
) -> Result<(Vec<Source>, usize)> {
let root_ids: Vec<i64> = conn
.prepare("SELECT id FROM roots")?
.query_map([], |row| row.get(0))?
.collect::<Result<Vec<_>, _>>()?;
let all_sources = repo::source::batch_fetch_by_roots(conn, &root_ids)?;
let mut excluded_count = 0usize;
let filtered: Vec<Source> = all_sources
.into_iter()
.filter(|s| s.is_active())
.filter(|s| include_archived || s.is_from_role("source"))
.filter(|s| s.matches_scope(scopes))
.filter(|s| {
if s.is_excluded()
&& !include_excluded {
excluded_count += 1;
return false;
}
true
})
.collect();
if filters.is_empty() {
return Ok((filtered, excluded_count));
}
let source_ids: Vec<i64> = filtered.iter().map(|s| s.id).collect();
let filtered_ids = filter::apply_filters(conn, &source_ids, filters)?;
let filtered_id_set: std::collections::HashSet<i64> = filtered_ids.into_iter().collect();
let result: Vec<Source> = filtered
.into_iter()
.filter(|s| filtered_id_set.contains(&s.id))
.collect();
Ok((result, excluded_count))
}
/// Render `full_path` for display, relativizing it against `cwd` when one is
/// supplied: the cwd itself becomes ".", paths under it become relative, and
/// anything else (or `cwd == None`) is returned unchanged.
fn format_path(full_path: &str, cwd: Option<&str>) -> String {
    match cwd {
        None => full_path.to_string(),
        Some(base) if full_path == base => ".".to_string(),
        Some(base) => path_strip_prefix(full_path, base)
            .map(|rel| rel.to_string())
            .unwrap_or_else(|| full_path.to_string()),
    }
}
/// Human-readable byte count using binary (1024-based) units with one
/// decimal place, e.g. 1536 -> "1.5 KB"; values under 1 KiB print as "N B".
fn format_size(bytes: i64) -> String {
    // Largest unit first so the first matching threshold wins.
    const UNITS: [(i64, &str); 3] = [
        (1024 * 1024 * 1024, "GB"),
        (1024 * 1024, "MB"),
        (1024, "KB"),
    ];
    for (threshold, label) in UNITS {
        if bytes >= threshold {
            return format!("{:.1} {label}", bytes as f64 / threshold as f64);
        }
    }
    format!("{bytes} B")
}
/// Format a unix timestamp (seconds, UTC) as e.g. "Jan  2 2024"; returns
/// "???" when the timestamp is out of chrono's representable range.
fn format_date(unix_timestamp: i64) -> String {
    match Utc.timestamp_opt(unix_timestamp, 0).single() {
        Some(dt) => dt.format("%b %e %Y").to_string(),
        None => "???".to_string(),
    }
}
/// Print groups of sources whose content hashes collide (duplicates).
///
/// Each group is printed to stdout as a truncated-hash header followed by
/// the member paths (relativized against the cwd when `use_relative_paths`
/// is set); a total is printed at the end. Hints about hidden excluded
/// sources go to stderr.
///
/// # Errors
/// Fails on unparsable filter strings, scope canonicalization failure, or
/// database errors.
pub fn show_duplicates(
    db: &mut Db,
    scope_paths: &[std::path::PathBuf],
    filter_strs: &[String],
    include_archived: bool,
    include_excluded: bool,
    use_relative_paths: bool,
) -> Result<()> {
    let conn = db.conn_mut();
    let filters: Vec<Filter> = filter_strs
        .iter()
        .map(|f| Filter::parse(f))
        .collect::<Result<Vec<_>>>()?;
    let scope_prefixes = canonicalize_scopes(scope_paths)?;
    let scopes = ScopeMatch::classify_all(&scope_prefixes);
    // Canonicalized cwd for relative display; None disables relativization
    // (also when the cwd is unreadable or not valid UTF-8).
    let cwd = if use_relative_paths {
        std::env::current_dir()
            .ok()
            .and_then(|dir| std::fs::canonicalize(dir).ok())
            .and_then(|dir| dir.to_str().map(String::from))
    } else {
        None
    };
    let (sources, excluded_count) =
        get_matching_sources(conn, &scopes, &filters, include_archived, include_excluded)?;
    // Shared stderr hint, emitted after each early exit and at the end.
    let print_hidden_note = || {
        if !include_excluded && excluded_count > 0 {
            eprintln!(
                "({excluded_count} excluded sources hidden, use --include-excluded to show)"
            );
        }
    };
    if sources.is_empty() {
        eprintln!("No sources match the given filters.");
        print_hidden_note();
        return Ok(());
    }
    let matching_ids: Vec<i64> = sources.iter().map(|s| s.id).collect();
    let groups = find_duplicate_groups(conn, &matching_ids)?;
    if groups.is_empty() {
        println!("No duplicates found.");
        print_hidden_note();
        return Ok(());
    }
    let mut total_sources = 0usize;
    for (hash, size, members) in &groups {
        // Show at most the first 12 characters of the hash.
        let short_hash = if hash.len() > 12 { &hash[..12] } else { hash.as_str() };
        println!(
            "[{}...] {} sources, {}:",
            short_hash,
            members.len(),
            format_size(*size)
        );
        for (path, source_id) in members {
            let display_path = format_path(path, cwd.as_deref());
            println!(" {display_path} (id: {source_id})");
        }
        println!();
        total_sources += members.len();
    }
    println!(
        "Found {} duplicate groups ({} sources)",
        groups.len(),
        total_sources
    );
    print_hidden_note();
    Ok(())
}
/// Group the given sources by their content object and return only the
/// groups with more than one member (true duplicates).
///
/// Returns tuples of `(hash_value, size, [(full_path, source_id), ...])`;
/// member paths are sorted within each group and groups are ordered by
/// their first (lexicographically smallest) path. Sources without an
/// `object_id` are filtered out by the SQL predicate.
fn find_duplicate_groups(
    conn: &Connection,
    source_ids: &[i64],
) -> Result<Vec<(String, i64, Vec<(String, i64)>)>> {
    if source_ids.is_empty() {
        return Ok(Vec::new());
    }
    // object_id -> (hash, size, members). hash/size come from the first row
    // seen for the object; later rows only append members.
    let mut object_map: HashMap<i64, (String, i64, Vec<(String, i64)>)> = HashMap::new();
    // Chunk the IN (...) list to stay under SQLite's bound-parameter limit.
    for chunk in source_ids.chunks(BATCH_SIZE) {
        let placeholders: Vec<&str> = chunk.iter().map(|_| "?").collect();
        let sql = format!(
            "SELECT s.id, s.object_id, o.hash_value, s.size, r.path, s.rel_path
FROM sources s
JOIN roots r ON s.root_id = r.id
JOIN objects o ON s.object_id = o.id
WHERE s.id IN ({}) AND s.object_id IS NOT NULL",
            placeholders.join(",")
        );
        let params: Vec<Value> = chunk.iter().map(|&id| Value::from(id)).collect();
        let mut stmt = conn.prepare(&sql)?;
        let rows = stmt.query_map(rusqlite::params_from_iter(params), |row| {
            Ok((
                row.get::<_, i64>(0)?,
                row.get::<_, i64>(1)?,
                row.get::<_, String>(2)?,
                row.get::<_, i64>(3)?,
                row.get::<_, String>(4)?,
                row.get::<_, String>(5)?,
            ))
        })?;
        for row in rows {
            let (source_id, object_id, hash, size, root_path, rel_path) = row?;
            // An empty rel_path means the source IS the root itself.
            let full_path = if rel_path.is_empty() {
                root_path
            } else {
                format!("{root_path}/{rel_path}")
            };
            object_map
                .entry(object_id)
                .or_insert_with(|| (hash, size, Vec::new()))
                .2
                .push((full_path, source_id));
        }
    }
    // Keep only objects referenced by two or more sources.
    let mut groups: Vec<(String, i64, Vec<(String, i64)>)> = object_map
        .into_values()
        .filter(|(_, _, sources)| sources.len() > 1)
        .collect();
    for (_, _, sources) in &mut groups {
        sources.sort_by(|a, b| a.0.cmp(&b.0));
    }
    // Deterministic group ordering: compare by each group's first path
    // (HashMap iteration order is otherwise unspecified).
    groups.sort_by(|a, b| {
        a.2.first()
            .map(|(p, _)| p.as_str())
            .cmp(&b.2.first().map(|(p, _)| p.as_str()))
    });
    Ok(groups)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::repo::open_in_memory_for_test;
    use rusqlite::Connection as RusqliteConnection;

    /// Fresh in-memory database with the project schema applied.
    fn setup_test_db() -> RusqliteConnection {
        open_in_memory_for_test()
    }

    /// Insert a root row and return its id.
    fn insert_root(conn: &RusqliteConnection, path: &str, role: &str, suspended: bool) -> i64 {
        conn.execute(
            "INSERT INTO roots (path, role, suspended) VALUES (?, ?, ?)",
            rusqlite::params![path, role, suspended as i64],
        )
        .unwrap();
        conn.last_insert_rowid()
    }

    /// Insert an object (content-hash) row and return its id.
    fn insert_object(conn: &RusqliteConnection, hash: &str, excluded: bool) -> i64 {
        conn.execute(
            "INSERT INTO objects (hash_type, hash_value, excluded) VALUES ('sha256', ?, ?)",
            rusqlite::params![hash, excluded as i64],
        )
        .unwrap();
        conn.last_insert_rowid()
    }

    /// Insert a source row under `root_id`, optionally linked to an object,
    /// with fixed size/mtime values, and return its id.
    fn insert_source(
        conn: &RusqliteConnection,
        root_id: i64,
        rel_path: &str,
        object_id: Option<i64>,
    ) -> i64 {
        conn.execute(
            "INSERT INTO sources (root_id, rel_path, object_id, size, mtime, partial_hash, scanned_at, last_seen_at, device, inode)
VALUES (?, ?, ?, 1000, 1704067200, '', 0, 0, 0, 0)",
            rusqlite::params![root_id, rel_path, object_id],
        )
        .unwrap();
        conn.last_insert_rowid()
    }

    #[test]
    fn test_ls_archived_flag_counts_sources_not_objects() {
        let conn = setup_test_db();
        let source_root = insert_root(&conn, "/photos", "source", false);
        let archive_root = insert_root(&conn, "/archive", "archive", false);
        // Three distinct sources all pointing at the same archived object.
        let archived_obj = insert_object(&conn, "abc123archived", false);
        let source1 = insert_source(&conn, source_root, "photo1.jpg", Some(archived_obj));
        let source2 = insert_source(&conn, source_root, "photo2.jpg", Some(archived_obj));
        let source3 = insert_source(&conn, source_root, "photo3.jpg", Some(archived_obj));
        // A fourth source whose object has no archive copy.
        let unarchived_obj = insert_object(&conn, "def456unarchived", false);
        let source4 = insert_source(&conn, source_root, "photo4.jpg", Some(unarchived_obj));
        // The archive-side copy that makes `archived_obj` count as archived.
        insert_source(&conn, archive_root, "photo_backup.jpg", Some(archived_obj));

        let object_ids = vec![archived_obj, unarchived_obj];
        let archived_set = repo::object::batch_check_archived(&conn, &object_ids, None).unwrap();
        assert!(archived_set.contains(&archived_obj));
        assert!(!archived_set.contains(&unarchived_obj));
        assert_eq!(
            archived_set.len(),
            1,
            "Only 1 unique object should be archived"
        );

        // Count per SOURCE using each source's own object id. (The previous
        // version of this test filtered on a loop-invariant condition that
        // ignored the element, so it could not distinguish sources.)
        let source_objects = [
            (source1, archived_obj),
            (source2, archived_obj),
            (source3, archived_obj),
            (source4, unarchived_obj),
        ];
        let archived_source_count = source_objects
            .iter()
            .filter(|(_, obj_id)| archived_set.contains(obj_id))
            .count();
        assert_eq!(
            archived_source_count, 3,
            "Should count 3 SOURCES with archived objects, not 1 unique object"
        );
    }
}