#![warn(clippy::all)]
#![allow(clippy::range_plus_one, clippy::many_single_char_names, clippy::too_many_arguments, clippy::cast_lossless, unused_variables)]
use grafix_toolbox::*;
use glsl::*;
use uses::{asyn::*, math::*, sync::fs, GL, *};
use GL::{mesh::*, offhand::*, window::*, *};
/// Program entry point.
///
/// Exports the `SMOL_THREADS` environment variable, initialises logging, builds the command line
/// interface and dispatches to the handler of the chosen subcommand (`find`, `filter`, `dirs` or
/// `names`). Nothing is executed when no subcommand is supplied.
fn main() {
    // Twice the number of physical cores plus two; presumably read by the task executor to size
    // its thread pool - TODO confirm against the executor in use.
    std::env::set_var("SMOL_THREADS", (num_cpus::get_physical() * 2 + 2).to_string());
    LOGGER!(logging::Null, DEBUG);
    use clap::{App, SubCommand};
    let args = App::new("Finds duplicate images from blockhash list")
        .about(
            r#"
Install blockhash first: https://github.com/commonsmachinery/blockhash/
To prepare a list of blockhashes run:
find . -regextype posix-egrep -regex ".*\.(png|jpe?g)$" -type f -printf '"%p"\n' | xargs -IF -P8 blockhash F > hashes
Note that paths are relative and you must run dedup from the same directory as find"#,
        )
        .subcommand(
            SubCommand::with_name("find")
                .about("Prints all files with duplicates within a list of blockhashes. Prints filenames to stdout and progress to stderr, so redirect output to a file 'dedup_adv find hashes > dupes'")
                .args_from_usage(
                    "<PATH> 'Path to the list of blockhashes'
[PATH2] 'If present duplicates will be considered within PATH -> PATH2'
-s, --similarity=[SIMILARITY] 'Integer [1..100], cutoff similarity percentage'
-U, --ultra 'Ultra precision, performs pixelwise feature comparisons'
-D, --display 'Displays ultra precision diffs'",
                ),
        )
        .subcommand(
            SubCommand::with_name("filter")
                .about("Filters duplicates from a list generated by dedup. Duplicates are judged on smaller resolution, otherwise alphabetically and if located deeper within filesystem")
                .args_from_usage(
                    "<PATH> 'Path to the list of duplicates'
-D, --depth-first 'Force depth first. Will judge files based on directory names, filesystem depth, then on resolution'
-A, --append 'Append duplicates filenames to main file. WILL MANIPULATE FILESYSTEM!'",
                ),
        )
        .subcommand(
            SubCommand::with_name("dirs")
                .about("Prints directories that share N duplicates")
                .args_from_usage(
                    "<PATH> 'Path to the list of duplicates'
-m, --matches=[MATCHES] 'Print directories that share at least N matches'",
                ),
        )
        .subcommand(
            SubCommand::with_name("names")
                .about("Prints all files with duplicates within a list of filenames. Uses regexp to find matching ids in filenames")
                .args_from_usage(
                    "<PATH> 'Path to the list of filenames'
[PATH2] 'If present duplicates will be considered within PATH -> PATH2'
-r, --regex=[REGEX] 'Regexp for the id. Defaults to \"^[0-9]*\"'
-m, --match=[MATCH] 'Regexp to match id against. \"__ARG__\" will be substituted with id. If not supplied will check for id'",
                ),
        )
        .get_matches();
    // Exactly one subcommand can be active; run its handler.
    if let Some(args) = args.subcommand_matches("find") {
        run_search(args);
    } else if let Some(args) = args.subcommand_matches("filter") {
        run_filter(args);
    } else if let Some(args) = args.subcommand_matches("dirs") {
        run_dirfilter(args);
    } else if let Some(args) = args.subcommand_matches("names") {
        run_namesearch(args);
    }
}
/// Runs the `filter` subcommand: picks the file to keep out of every duplicate group.
///
/// The file at `PATH` is read as groups of file names separated by empty lines. Every group is
/// sorted on its own spawned task; the last entry after sorting is treated as the base file and
/// all other entries are printed to stdout, one per line. With `--append` the base file is
/// additionally renamed so that its stem carries the stems of its duplicates.
///
/// Sorting uses two criteria, their priority being selected by `--depth-first`:
/// * location - ancestor directories compared from the outermost one, then the number of ancestors;
/// * size - pixel count (`width * height`) as reported by `imagesize::size`, `0` when unreadable.
///
/// # Panics
/// Panics if the results file cannot be loaded.
fn run_filter(args: &clap::ArgMatches) {
    let (depth_first, append, paths) = (args.is_present("depth-first"), args.is_present("append"), args.value_of("PATH").unwrap());
    let paths = FS::Load::Text(paths).expect(&format!("Couldn't open results file {}", paths));
    // Empty lines delimit groups; consecutive empty lines produce empty chunks which are dropped.
    let groups = paths
        .lines()
        .map(|p| p.to_string())
        .collect::<Vec<_>>()
        .split(|l| l.is_empty())
        .filter(|c| !c.is_empty())
        .map(|c| c.to_vec())
        .collect::<Vec<Vec<String>>>();
    let tasks: Vec<_> = groups
        .into_iter()
        .map(|mut path| {
            task::spawn(async move {
                // Pixel counts keyed by file name, so that every image header is read at most once
                // per group no matter how many comparisons the sort performs.
                let mut sizes: HashMap<String, _> = HashMap::with_capacity(path.len());
                path.sort_unstable_by(move |l, r| {
                    let dir_depth = |l: &str| Path::new(l).ancestors().count();
                    let dir_names = |l: &str| Path::new(l).ancestors().skip(1).map(|p| p.to_str().unwrap().to_string()).collect::<Vec<String>>();
                    let mut size = |p: &str| {
                        if let Some(known) = sizes.get(p).copied() {
                            return known;
                        }
                        let measured = imagesize::size(p).ok().map_or_else(
                            || {
                                eprintln!("Failed to determine size of {:?}", p);
                                0
                            },
                            |size| size.width * size.height,
                        );
                        sizes.insert(p.to_string(), measured);
                        measured
                    };
                    // First differing ancestor (outermost first) decides; otherwise the shallower path orders first.
                    let by_location = || {
                        dir_names(l)
                            .iter()
                            .rev()
                            .zip(dir_names(r).iter().rev())
                            .map(|(l, r)| l.cmp(r))
                            .find(|c| *c != ord::Equal)
                            .unwrap_or_else(|| dir_depth(l).cmp(&dir_depth(r)))
                    };
                    // Operands are swapped on purpose: a bigger image orders before a smaller one.
                    let mut by_size = || size(r).cmp(&size(l));
                    // The secondary criterion is only evaluated on a tie, which avoids needless file reads.
                    if depth_first {
                        by_location().then_with(by_size).reverse()
                    } else {
                        by_size().then_with(by_location).reverse()
                    }
                });
                // Groups are never empty (see the filter above), so there is always a last element.
                let (base, dupes) = path.split_last().unwrap();
                if append && rename_with_dupe_stems(base, dupes).is_none() {
                    eprintln!("Could not derive a new name for base file {:?}", base);
                }
                for d in dupes {
                    println!("{}", d);
                }
            })
        })
        .collect();
    task::block_on(async move {
        for t in tasks {
            t.await
        }
    });
}

/// Renames `base` to `<stem>_<stems of dupes>.<ext>` inside its own directory.
///
/// Returns `None` when `base` lacks a parent, a stem or an extension, or when a piece of a path
/// is not valid UTF-8. A failing rename is reported on stderr and still yields `Some(())`.
fn rename_with_dupe_stems(base: &str, dupes: &[String]) -> Option<()> {
    let original = Path::new(base);
    let (dir, ext) = (original.parent()?, original.extension()?.to_str()?);
    let mut stem = original.file_stem()?.to_str()?.to_string();
    stem.push('_');
    for dupe in dupes {
        stem.push_str(Path::new(dupe).file_stem()?.to_str()?);
    }
    // Presumably keeps the resulting file name below the usual 255 byte limit - TODO confirm.
    let max_len = 247usize.saturating_sub(ext.len());
    // Cut on character boundaries so that the result stays valid UTF-8.
    let stem: String = stem.char_indices().take_while(|(i, _)| *i < max_len).map(|(_, c)| c).collect();
    let renamed = dir.join(format!("{}.{}", stem, ext));
    if let Err(e) = fs::rename(original, &renamed) {
        eprintln!("Could not rename base file {:?} to {:?}, err {}", original, renamed, e);
    }
    Some(())
}
/// Runs the `dirs` subcommand: prints every directory that takes part in at least
/// `--matches` duplicate groups (4 when the option is absent or unparsable, never less than 1).
///
/// The file at `PATH` is read as groups of file names separated by empty lines. A directory is
/// counted once per group regardless of how many of its files appear in that group.
///
/// # Panics
/// Panics if the results file cannot be loaded or a listed path has no parent directory.
fn run_dirfilter(args: &clap::ArgMatches) {
    let (min_matches, list_file) = (args.value_of("matches").and_then(|a| a.parse().ok()).unwrap_or(4).max(1), args.value_of("PATH").unwrap());
    let listing = FS::Load::Text(list_file).expect(&format!("Couldn't open results file {}", list_file));
    let lines = listing.lines().collect::<Vec<_>>();

    // First pass: reduce every group to the sorted set of directories it touches.
    let mut groups: Vec<Vec<String>> = vec![];
    for group in lines.split(|l| l.is_empty()) {
        if group.is_empty() {
            continue;
        }
        let mut dirs = group.iter().map(|l| Path::new(l).parent().unwrap().to_str().unwrap().to_string()).collect::<Vec<String>>();
        dirs.sort_unstable();
        dirs.dedup();
        groups.push(dirs);
    }

    // Second pass: count in how many groups each directory occurs.
    let mut occurrences = HashMap::new();
    for dirs in groups {
        for dir in dirs {
            *occurrences.entry(dir).or_insert(0) += 1;
        }
    }

    for (dir, count) in occurrences {
        if count >= min_matches {
            println!("{}", dir);
        }
    }
}
/// Runs the `names` subcommand: finds files whose names share an id.
///
/// Every line of the file at `PATH` is matched against `--regex` (default `^[0-9]*`); the matched
/// text is the id of that line and lines without a match are ignored. For every such line one task
/// looks for the first line of `PATH2` (or, without `PATH2`, the first different line of `PATH`
/// that has an id) which matches the id, and prints the pair followed by an empty line.
///
/// "Matches the id" means: contains the id as a substring, or, when `--match` is given, matches
/// that expression after every `__ARG__` in it was replaced with the id.
///
/// # Panics
/// Panics if a list file cannot be loaded or one of the regular expressions is invalid.
fn run_namesearch(args: &clap::ArgMatches) {
    // SAFETY(review): the lifetime is extended so that spawned tasks can use strings borrowed from
    // `args`. Every task is awaited by `block_on` at the end of this function; a panic before that
    // point could leave tasks with dangling references - confirm this is acceptable.
    let args = unsafe { mem::transmute::<_, &'static clap::ArgMatches>(args) };
    let (regex, regex2, paths, paths2) = (
        args.value_of("regex").unwrap_or("^[0-9]*"),
        args.value_of("match"),
        args.value_of("PATH").unwrap(),
        args.value_of("PATH2"),
    );
    let load = |p| FS::Load::Text(p).expect(&format!("Couldn't open paths file {}", p));
    let paths = load(paths);
    let paths2 = paths2.map(load);
    let regex = regex::Regex::new(regex).expect("Bad regex");
    // Maps a line to the id found in it.
    // SAFETY(review): both strings borrow from the local `paths` text; the transmute erases that
    // lifetime for the same reason as above and relies on the same "tasks finish first" argument.
    let paths = paths
        .lines()
        .filter_map(|path| Some(unsafe { mem::transmute((path, regex.find(path)?.as_str())) }))
        .collect::<HashMap<&str, &str>>();
    let (paths, paths2) = StaticPtr!(&paths, &paths2);
    let tasks: Vec<_> = paths
        .get()
        .iter()
        .map(|(name, idx)| {
            task::spawn(async move {
                let is_match: Box<dyn Fn(_) -> _> = if let Some(regex2) = regex2 {
                    let regex2 = regex2.replace("__ARG__", idx);
                    let regex2 = regex::Regex::new(&regex2).expect("Bad match regex");
                    Box::new(move |l| regex2.find(l).is_some())
                } else {
                    Box::new(|l| l.contains(idx))
                };
                let found = if let Some(paths2) = paths2.get() {
                    paths2.lines().find(|l| is_match(l))
                } else {
                    // Searching the list against itself: a line must not be reported as its own duplicate.
                    paths.get().iter().find(|(l, _)| is_match(l) && *l != name).map(|(l, _)| *l)
                };
                if let Some(m) = found {
                    println!("{}\n{}\n", name, m);
                }
            })
        })
        .collect();
    task::block_on(async move {
        for t in tasks {
            t.await
        }
    });
}
/// Runs the `find` subcommand: compares blockhashes and prints the names of files with close hashes.
///
/// Every line of the list at `PATH` (and of the optional `PATH2`) holds 64 hex digits of hash,
/// whitespace and a file name. For each entry of the first list the entries of the second list
/// (or of the same list when `PATH2` is absent) are scanned on spawned tasks. A candidate is an
/// entry whose `hamming::distance_fast` to the base hash is below a threshold derived from
/// `--similarity` (default 88, capped at 100).
///
/// Without `--ultra` candidates are printed to stdout right away. With `--ultra` (only honoured
/// for a similarity above 89) the candidate images are loaded, rendered against the base image
/// with the `substract_ps` shader and printed only if the rendered difference is small enough.
/// After the duplicates of a base entry its own name and an empty line are printed. Progress is
/// written to stderr.
///
/// # Panics
/// Panics if a list cannot be loaded or is malformed, or if the GL window or shaders cannot be created.
fn run_search(args: &clap::ArgMatches) {
let (num_cpus, ultra_precision, show_differences, precision, paths, paths2) = (
num_cpus::get_physical(),
args.is_present("ultra"),
args.is_present("display"),
args.value_of("similarity").and_then(|a| a.parse().ok()).unwrap_or(88).min(100),
args.value_of("PATH").unwrap(),
args.value_of("PATH2"),
);
// Maps the byte offset of a hash inside the flat hash buffer to the file name of that entry.
type Names = HashMap<usize, &'static str>;
// Parses a blockhash list into a flat buffer holding 32 bytes per line and an offset -> name map.
fn parse_paths(paths: &str) -> (Vec<u8>, Names) {
let mut hashes = vec![];
let names = paths
.lines()
.enumerate()
.map(|(n, l)| {
// Offset of this line's hash inside `hashes`.
let n = n * 32;
let (hash, name) = l.split_once(char::is_whitespace).expect(&format!("Invalid input {:?}", (n, l)));
let (hash, name) = (hash.trim_end(), name.trim_start());
// Decode 32 bytes from consecutive pairs of hex digits.
hashes.extend((0..32).map(|i| {
let i = 2 * i;
u8::from_str_radix(&hash[i..i + 2], 16).expect("Invalid blockhash list formatting")
}));
// NOTE(review): presumably forces the first byte to be non-zero, since a zero first byte is
// used further down as the "already reported" marker - confirm the semantics of `or_val`.
let h = &mut hashes[n];
*h = h.or_val(*h != 0, 1);
// SAFETY(review): extends the borrow of `paths` to 'static; relies on the loaded text
// outliving every use of the returned map - confirm.
(n, unsafe { mem::transmute(name) })
})
.collect();
(hashes, names)
}
let paths = FS::Load::Text(paths).expect(&format!("Couldn't open blockhash list file {}", paths));
let (h1, names1) = parse_paths(&paths);
// NOTE(review): `hashes1` is called like a function below; presumably the macro wraps `h1` in a
// static accessor - confirm against the macro definition.
FnStatic!(hashes1, Vec<u8>, { h1 });
let paths2 = paths2.map(|p| FS::Load::Text(p).expect(&format!("Couldn't open blockhash list file {}", p)));
let (mut hashes2, names2) = if let Some((h, n)) = paths2.as_ref().map(|p| parse_paths(p)) { (h, n) } else { Def() };
let two_sets = !hashes2.is_empty();
// Pixelwise verification is only performed for a similarity cutoff above 89.
let ultra_precision = ultra_precision && precision > 89;
let show_differences = ultra_precision && show_differences;
// Distance limit out of the 256 bits of a hash.
let thresh = u64::to(256. * (1. - precision as f32 / 100.));
// With a single list the second set is the first one.
let (hashes2, names2) = if two_sets { (&mut hashes2, names2) } else { (hashes1(), names1.clone()) };
let (total1, total2) = (names1.len() * 32, names2.len() * 32);
let (names2, mut hashes2) = StaticPtr!(&names2, hashes2);
// GL state used by the ultra precision path.
let mut window = Window::get((50, 50, 64, 64), "Engine").expect("Can't start GL");
GLDisable!(DEPTH_TEST, BLEND, MULTISAMPLE, CULL_FACE, DEPTH_WRITEMASK);
let mut subs = Shader::new((mesh__2d_screen_vs, substract_ps)).expect("Can't create shader");
let mut render = Shader::new((mesh__2d_screen_vs, mesh__2d_screen_ps)).expect("Can't create shader");
let linear = &Sampler::linear();
let imgify = |data: uImage<_>| Tex2d::<RGBA, u8>::from(data);
// The closure turns received image data into a texture and forwards `None` unchanged.
// NOTE(review): presumably executed on a secondary GL context owned by `Offhand` - confirm.
let (offhand_sn, offhand_rx) = Offhand::new(&mut window, 64, move |p: Option<_>| p.map(|(n, name, data)| (n, name, imgify(data))));
// `i1` is the offset of the base hash.
for i1 in (0..total1).step_by(32) {
// In single list mode entries already reported as duplicates (first byte zeroed) are skipped.
if !two_sets && *unsafe { hashes1().get_unchecked(i1) } == 0 {
continue;
}
// Set as soon as any duplicate of this base entry is found.
let dupe = AtomicBool::default();
let base_name = names1.get(&i1).unwrap();
{
let dupe = StaticPtr!(&dupe);
let sn = offhand_sn.clone();
let base = unsafe { hashes1().get_unchecked(i1..i1 + 32) };
let mut tasks = vec![];
tasks.push(task::spawn(async move {
let mut workers = vec![];
// NOTE(review): presumably `i1` for a single list and 0 otherwise - confirm the semantics of `or_def`.
let start = i1.or_def(!two_sets);
// Chunk length in bytes: a multiple of 32 of at least one hash.
let step = 32 * 1.max((total2 - start) / num_cpus / 4 / 32);
for i in (start..total2).step_by(step) {
let snl = sn.clone();
let range = i..total2.min(i + step);
let w = task::spawn(async move {
let dupe = dupe.get();
let (names2, hashes2) = (names2.get(), hashes2.get_mut());
for i2 in range.step_by(32) {
// Skip entries that were already reported and, in single list mode, the base itself.
if *unsafe { hashes2.get_unchecked(i2) } == 0 || (!two_sets && i1 == i2) {
continue;
}
let diff = { hamming::distance_fast(base, unsafe { hashes2.get_unchecked(i2..i2 + 32) }).unwrap() };
if diff < thresh {
let name = names2.get(&i2).unwrap();
if !ultra_precision {
dupe.store(true, Ordering::Release);
// Zero the first byte so that this entry is not reported again.
// NOTE(review): written through a shared pointer from several tasks without synchronisation - confirm this is intended.
*unsafe { hashes2.get_unchecked_mut(i2) } = 0;
println!("{}", name);
} else if let Ok(f) = FS::Load::File(name).and_then(uImage::new) {
// Hand the decoded candidate over for the pixelwise comparison below.
let _ = snl.send_async(Some((i2, name, f))).await.unwrap();
} else {
eprintln!("could not open copy {}", name);
}
}
}
});
workers.push(w);
}
for w in workers {
w.await
}
// `None` tells the receiving loop below that all candidates of this base entry were sent.
if ultra_precision {
let _ = sn.send(None).unwrap();
}
}));
if ultra_precision {
let mut base_file = Some(FS::Preload::File(base_name));
let mut base = None;
// Runs until the `None` sentinel arrives.
while let Some((i2, name, recv)) = offhand_rx.recv().wait().unwrap() {
// The base image is only decoded once the first candidate arrives.
if let Some(f) = base_file.take() {
if let Ok(b) = uImage::new(f.take()) {
base = Some(imgify(b));
} else {
eprintln!("could not open base {}", base_name);
}
}
// Without a usable base image every candidate is dropped.
let base = if let Some(base) = base.as_ref() {
base
} else {
continue;
};
let s1 = base.Bind(linear);
let s2 = recv.Bind(linear);
// The difference is rendered at half the dimensions of the base texture.
let size = {
let s = base.param;
(s.w, s.h).div(2)
};
let data = {
let out = {
let _ = Uniforms!(subs, ("tex1", &s1), ("tex2", &s2), ("size", ((1., 1.).div(size))));
let mut out = Fbo::<RGB, u8>::new(size);
out.bind();
Screen::Draw();
// Optionally show the rendered difference in the window.
if show_differences {
window.draw_to_screen();
GL::ClearScreen((0., 1.));
let b = out.tex.Bind(linear);
let _ = Uniforms!(render, ("tex", &b));
Screen::Draw();
let _ = window.poll_events();
window.swap();
}
out.tex
};
// Redraw the difference into a 256x256 target.
let b = out.Bind(linear);
let _ = Uniforms!(render, ("tex", &b));
let mut out = Fbo::<RGB, u8>::new((256, 256));
out.bind();
Screen::Draw();
// NOTE(review): presumably reads the red channel of level 0 back to the CPU - confirm.
out.tex.Save::<RED, u8>(0)
};
tasks.push(task::spawn(async move {
let (dupe, hashes2) = (dupe.get(), hashes2.get_mut());
// Mean of the read back values.
let base = data.iter().fold(0., |acc, v| acc + (*v as f32)) / data.len() as f32;
// Square root of the summed squared deviations from the mean.
let diff = data
.iter()
.fold(0., |acc, v| {
let v = *v as f32;
let d = (base - v).powi(2);
acc + d
})
.sqrt();
// The allowed spread grows as the requested similarity drops.
if diff < 1. + 10. * (100 - precision) as f32 {
dupe.store(true, Ordering::Release);
*unsafe { hashes2.get_unchecked_mut(i2) } = 0;
println!("{}", name);
}
}));
}
}
// Wait for the scan and for every verification task of this base entry.
task::block_on(async move {
for t in tasks {
t.await
}
});
}
// The base name closes its group of duplicates, followed by an empty line.
if dupe.load(Ordering::Acquire) {
println!("{}\n", base_name);
}
// `i1` is a byte offset, so the report is not emitted for every hundredth file.
if i1 % 100 == 0 {
eprintln!("processed {}% files", i1 as f32 * 100. / total1 as f32);
}
}
drop(offhand_sn);
}
// Fragment shader `substract_ps`, used by `run_search` to compare two textures.
// For the current texel it computes, for `tex1` and `tex2`:
//  * `pix` - the average of the rgb channels;
//  * `edg` - the absolute response of the 3x3 kernel `edge2` (8 in the centre, -1 elsewhere)
//    over quantised channel averages of the neighbourhood, sampled `size` apart;
//  * `mag` - the maximum of that quantised neighbourhood.
// The colour difference `c` is kept only above 0.1 and the edge difference `d` only above 0.2;
// `d` is then reduced by the smaller of the two `mag` values and clamped at 0.
// The red channel of the output is `d * d * d + c`.
SHADER!(
substract_ps,
r"#version 330 core
in vec2 glTexCoord;
layout(location = 0)out vec4 glFragColor;
uniform sampler2D tex1, tex2;
uniform vec2 size;
#define edge2 mat3(-1, -1, -1, -1, 8, -1, -1, -1, -1)
float pix(sampler2D t) {
vec3 p = texture(t, glTexCoord).rgb;
return (p.r + p.g + p.b) * 0.333;
}
float val(float x, float y, sampler2D t) {
vec3 c = texture(t, glTexCoord + vec2(x, y) * size).rgb;
return round(10 * (c.r + c.g + c.b)) * 0.0333;
}
mat3 reg(sampler2D t) {
return mat3(val(-1, -1, t), val(0, -1, t), val(1, -1, t),
val(-1, 0, t), val(0, 0, t), val(1, 0, t),
val(-1, 1, t), val(0, 1, t), val(1, 1, t));
}
float edg(sampler2D t) {
mat3 r = matrixCompMult(edge2, reg(t));
return abs(r[0][0] + r[1][0] + r[2][0]
+ r[0][1] + r[1][1] + r[2][1]
+ r[0][2] + r[1][2] + r[2][2]);
}
float mag(sampler2D t) {
mat3 r = reg(t);
return max(max(max(r[0][0], r[1][0]), max(r[2][0], r[0][1])),
max(max(max(r[1][1], r[2][1]), max(r[0][2], r[1][2])), r[2][2]));
}
void main()
{
float p1 = edg(tex1);
float p2 = edg(tex2);
float c1 = pix(tex1);
float c2 = pix(tex2);
float c = abs(c1 - c2);
c = c * float(c > 0.1);
float max_p1 = mag(tex1);
float max_p2 = mag(tex2);
float d = abs(p1 - p2);
d = d * float(d > 0.2);
d = max(0, max(d - max_p1, d - max_p2));
glFragColor = vec4(d * d * d + c, 0, 0, 1);
}"
);