use crate::config::Config;
use anyhow::{Context, Result};
use clap::Args as ClapArgs;
use geojson::{Feature, FeatureCollection, GeoJson, Geometry, Value};
use std::collections::HashSet;
use std::path::PathBuf;
// Command-line arguments for the GeoJSON clean command (consumed by `run`).
// NOTE(review): plain `//` comments are used on purpose — presumably `///` doc comments
// would be picked up by clap as help text and change the CLI output; confirm before converting.
#[derive(Debug, ClapArgs)]
pub struct Args {
// Path of the GeoJSON file to clean; `run` reads it with `std::fs::read_to_string`.
// No `#[arg]` attribute is attached — presumably clap treats it as a positional argument.
pub input: PathBuf,
// Optional destination file (`-o` / `--output`). When absent, `run` prints the cleaned
// GeoJSON to stdout instead.
#[arg(short, long)]
pub output: Option<PathBuf>,
// Line parts whose `linestring_length` is below this threshold are dropped by `run`.
// The length comes from `haversine`, which uses an Earth radius of 6_371_000, so the
// unit is metres. Defaults to "1.0".
#[arg(long, default_value = "1.0")]
pub min_length: f64,
// When set, `run` prints a "Clean Statistics" summary after writing the output.
#[arg(long)]
pub stats: bool,
}
/// Great-circle distance between two points using the haversine formula.
///
/// All four arguments are in degrees (they are converted with `to_radians`). The result
/// is in metres, on a sphere of radius 6 371 000 m.
///
/// The intermediate haversine term is clamped to `[0, 1]`: for (near-)antipodal points
/// floating-point rounding can push it marginally above 1, which would otherwise make
/// `(1 - a).sqrt()` — and therefore the returned distance — NaN.
fn haversine(lat1: f64, lon1: f64, lat2: f64, lon2: f64) -> f64 {
    const R: f64 = 6_371_000.0;
    let dlat = (lat2 - lat1).to_radians();
    let dlon = (lon2 - lon1).to_radians();
    let a = (dlat / 2.0).sin().powi(2)
        + lat1.to_radians().cos() * lat2.to_radians().cos() * (dlon / 2.0).sin().powi(2);
    // Mathematically 0 <= a <= 1; the clamp only absorbs rounding error. A NaN input
    // still propagates as NaN.
    let a = a.clamp(0.0, 1.0);
    let c = 2.0 * a.sqrt().atan2((1.0 - a).sqrt());
    R * c
}
/// Sums the `haversine` distance of every consecutive pair of positions in `coords`.
///
/// Positions are read as `[lon, lat, ..]`. A pair in which either position has fewer
/// than two ordinates contributes nothing. Fewer than two positions yields `0.0`.
fn linestring_length(coords: &[Vec<f64>]) -> f64 {
    let mut meters = 0.0;
    for pair in coords.windows(2) {
        // `windows(2)` always yields exactly two elements, so both indices are valid.
        if let ([lon_a, lat_a, ..], [lon_b, lat_b, ..]) = (pair[0].as_slice(), pair[1].as_slice())
        {
            meters += haversine(*lat_a, *lon_a, *lat_b, *lon_b);
        }
    }
    meters
}
/// Reports whether a line starts and ends at (nearly) the same position.
///
/// Returns `true` only when `coords` holds at least two positions, both the first and
/// the last position have at least two ordinates, and their first two ordinates each
/// differ by less than `1e-9`. Any further ordinates are ignored.
fn is_self_loop(coords: &[Vec<f64>]) -> bool {
    const TOLERANCE: f64 = 1e-9;

    // The slice pattern only matches when there are two or more positions.
    let (start, end) = match coords {
        [start, .., end] => (start, end),
        _ => return false,
    };

    match (start.as_slice(), end.as_slice()) {
        ([x0, y0, ..], [x1, y1, ..]) => {
            (x0 - x1).abs() < TOLERANCE && (y0 - y1).abs() < TOLERANCE
        }
        _ => false,
    }
}
/// Builds an order-independent key from the first and last positions of `line`.
///
/// Each endpoint is rendered as `"lat,lon"` with seven decimals; the two strings are
/// sorted and joined with `|`, so a line and its reverse produce the same key.
/// Returns `None` when `line` is empty or an endpoint has fewer than two ordinates.
fn endpoint_key(line: &[Vec<f64>]) -> Option<String> {
    fn format_position(position: &[f64]) -> Option<String> {
        match position {
            // Positions are stored `[lon, lat, ..]`; the key is written lat-first.
            [lon, lat, ..] => Some(format!("{:.7},{:.7}", lat, lon)),
            _ => None,
        }
    }

    let mut parts = [
        format_position(line.first()?)?,
        format_position(line.last()?)?,
    ];
    parts.sort();
    Some(parts.join("|"))
}

/// Computes the key used to detect duplicate line features.
///
/// * `LineString` — key of the line's two endpoints.
/// * `MultiLineString` — key of the endpoints of its *first* line only.
/// * Any other geometry, or a feature without geometry — `None`.
///
/// Only endpoints take part in the key: interior vertices are ignored, and direction
/// does not matter. `None` is also returned for an empty line or, instead of panicking
/// on an out-of-bounds index, for an endpoint with fewer than two ordinates.
fn feature_dedup_key(feature: &Feature) -> Option<String> {
    match &feature.geometry.as_ref()?.value {
        Value::LineString(coords) => endpoint_key(coords),
        Value::MultiLineString(multi) => endpoint_key(multi.first()?),
        _ => None,
    }
}
pub async fn run(args: Args) -> Result<()> {
let config = Config::load().unwrap_or_default();
config.init_logging();
tracing::info!("Cleaning GeoJSON: {}", args.input.display());
let geojson_str = std::fs::read_to_string(&args.input)
.with_context(|| format!("Failed to read {}", args.input.display()))?;
let geojson: GeoJson = geojson_str.parse().context("Failed to parse GeoJSON")?;
let fc = match geojson {
GeoJson::FeatureCollection(fc) => fc,
GeoJson::Feature(f) => FeatureCollection {
features: vec![f],
bbox: None,
foreign_members: None,
},
GeoJson::Geometry(g) => {
anyhow::bail!("Expected FeatureCollection, got bare Geometry");
}
};
let input_count = fc.features.len();
let mut stats = CleanStats::default();
let mut cleaned: Vec<Feature> = Vec::new();
let mut seen_keys: HashSet<String> = HashSet::new();
for feature in fc.features {
let geom = match feature.geometry.as_ref() {
Some(g) => g,
None => {
stats.skipped_no_geometry += 1;
continue;
}
};
match &geom.value {
Value::LineString(coords) => {
if is_self_loop(coords) {
stats.self_loops_removed += 1;
continue;
}
let length = linestring_length(coords);
if length < args.min_length {
stats.short_segments_removed += 1;
continue;
}
if let Some(key) = feature_dedup_key(&feature) {
if seen_keys.contains(&key) {
stats.duplicates_removed += 1;
continue;
}
seen_keys.insert(key);
}
cleaned.push(feature);
}
Value::MultiLineString(multi) => {
let mut kept_lines: Vec<Vec<Vec<f64>>> = Vec::new();
for line in multi {
if is_self_loop(&line) {
stats.self_loops_removed += 1;
continue;
}
let length = linestring_length(&line);
if length < args.min_length {
stats.short_segments_removed += 1;
continue;
}
kept_lines.push(line.clone());
}
if kept_lines.is_empty() {
continue;
}
if kept_lines.len() == 1 {
let mut f = feature.clone();
f.geometry = Some(Geometry::new(Value::LineString(kept_lines.into_iter().next().unwrap())));
if let Some(key) = feature_dedup_key(&f) {
if seen_keys.contains(&key) {
stats.duplicates_removed += 1;
continue;
}
seen_keys.insert(key);
}
cleaned.push(f);
} else {
let mut f = feature.clone();
f.geometry = Some(Geometry::new(Value::MultiLineString(kept_lines)));
cleaned.push(f);
}
}
Value::Point(_) | Value::MultiPoint(_) => {
cleaned.push(feature);
}
Value::Polygon(_) | Value::MultiPolygon(_) => {
cleaned.push(feature);
}
_ => {
stats.skipped_unknown_type += 1;
}
}
}
let output_count = cleaned.len();
let result_fc = FeatureCollection {
features: cleaned,
bbox: None,
foreign_members: None,
};
let output_geojson = GeoJson::from(result_fc);
let json = serde_json::to_string_pretty(&output_geojson)
.context("Failed to serialize cleaned GeoJSON")?;
match &args.output {
Some(path) => {
std::fs::write(path, &json)
.with_context(|| format!("Failed to write to {}", path.display()))?;
tracing::info!("Cleaned GeoJSON written to {}", path.display());
}
None => println!("{}", json),
}
if args.stats {
println!("Clean Statistics:");
println!(" Input features: {}", input_count);
println!(" Output features: {}", output_count);
println!(" Self-loops removed: {}", stats.self_loops_removed);
println!(" Duplicates removed: {}", stats.duplicates_removed);
println!(" Short segments removed: {}", stats.short_segments_removed);
println!(" Skipped (no geometry): {}", stats.skipped_no_geometry);
println!(" Skipped (unknown type): {}", stats.skipped_unknown_type);
}
tracing::info!(
"Cleaned: {} → {} features (removed {} self-loops, {} duplicates, {} short segments)",
input_count,
output_count,
stats.self_loops_removed,
stats.duplicates_removed,
stats.short_segments_removed
);
Ok(())
}
/// Counters accumulated by `run` while filtering features; every field starts at zero
/// through the derived `Default`.
#[derive(Default)]
struct CleanStats {
/// Lines (standalone `LineString`s or members of a `MultiLineString`) dropped because
/// `is_self_loop` returned `true`.
self_loops_removed: usize,
/// Features dropped because their `feature_dedup_key` had already been seen.
duplicates_removed: usize,
/// Lines (standalone or `MultiLineString` members) dropped because their
/// `linestring_length` was below `Args::min_length`.
short_segments_removed: usize,
/// Features dropped because they carried no geometry.
skipped_no_geometry: usize,
/// Features dropped because their geometry hit the catch-all arm in `run`, i.e. it was
/// not a LineString, MultiLineString, Point, MultiPoint, Polygon or MultiPolygon.
skipped_unknown_type: usize,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a feature whose only populated member is a `LineString` geometry.
    fn line_feature(coords: Vec<Vec<f64>>) -> Feature {
        Feature {
            geometry: Some(Geometry::new(Value::LineString(coords))),
            properties: None,
            ..Default::default()
        }
    }

    #[test]
    fn test_clean_args() {
        let args = Args {
            input: PathBuf::from("test.geojson"),
            output: None,
            min_length: 2.0,
            stats: true,
        };
        assert_eq!(args.input, PathBuf::from("test.geojson"));
        assert_eq!(args.min_length, 2.0);
        assert!(args.stats);
    }

    #[test]
    fn test_haversine() {
        let d = haversine(45.5017, -73.5673, 45.5088, -73.5542);
        assert!(d > 800.0 && d < 2000.0, "d={}", d);

        // Identical points are zero metres apart.
        assert_eq!(haversine(45.5017, -73.5673, 45.5017, -73.5673), 0.0);

        // Swapping the endpoints must not change the distance.
        let back = haversine(45.5088, -73.5542, 45.5017, -73.5673);
        assert!((d - back).abs() < 1e-6, "d={} back={}", d, back);
    }

    #[test]
    fn test_self_loop_detection() {
        let coords = vec![vec![-73.6, 45.5], vec![-73.61, 45.51], vec![-73.6, 45.5]];
        assert!(is_self_loop(&coords));
        let open = vec![vec![-73.6, 45.5], vec![-73.61, 45.51], vec![-73.62, 45.52]];
        assert!(!is_self_loop(&open));

        // Fewer than two positions can never form a loop.
        let empty: Vec<Vec<f64>> = Vec::new();
        assert!(!is_self_loop(&empty));
        assert!(!is_self_loop(&[vec![-73.6, 45.5]]));

        // Positions with fewer than two ordinates are never treated as a loop.
        assert!(!is_self_loop(&[vec![-73.6], vec![-73.6]]));
    }

    #[test]
    fn test_linestring_length() {
        let coords = vec![vec![-73.6, 45.5], vec![-73.6, 45.5]];
        assert_eq!(linestring_length(&coords), 0.0);
        let coords = vec![vec![0.0, 0.0], vec![0.0, 1.0]];
        let len = linestring_length(&coords);
        assert!(len > 110_000.0 && len < 112_000.0, "len={}", len);

        // Degenerate inputs have zero length.
        let empty: Vec<Vec<f64>> = Vec::new();
        assert_eq!(linestring_length(&empty), 0.0);
        assert_eq!(linestring_length(&[vec![0.0, 0.0]]), 0.0);

        // Segments involving a position with fewer than two ordinates are skipped.
        assert_eq!(linestring_length(&[vec![0.0], vec![0.0, 1.0]]), 0.0);

        // Segment lengths add up along the line.
        let two_segments = vec![vec![0.0, 0.0], vec![0.0, 1.0], vec![0.0, 2.0]];
        let total = linestring_length(&two_segments);
        assert!(total > 220_000.0 && total < 224_000.0, "total={}", total);
    }

    #[test]
    fn test_feature_dedup_key() {
        let forward = line_feature(vec![vec![-73.6, 45.5], vec![-73.61, 45.51]]);
        let key = feature_dedup_key(&forward);
        assert_eq!(
            key.as_deref(),
            Some("45.5000000,-73.6000000|45.5100000,-73.6100000")
        );

        // The key ignores direction...
        let reversed = line_feature(vec![vec![-73.61, 45.51], vec![-73.6, 45.5]]);
        assert_eq!(feature_dedup_key(&reversed), key);

        // ...and interior vertices: only the endpoints matter.
        let detour = line_feature(vec![
            vec![-73.6, 45.5],
            vec![-73.7, 45.6],
            vec![-73.61, 45.51],
        ]);
        assert_eq!(feature_dedup_key(&detour), key);
    }

    #[test]
    fn test_feature_dedup_key_none_cases() {
        let no_geometry = Feature {
            geometry: None,
            properties: None,
            ..Default::default()
        };
        assert!(feature_dedup_key(&no_geometry).is_none());

        assert!(feature_dedup_key(&line_feature(Vec::new())).is_none());

        let point = Feature {
            geometry: Some(Geometry::new(Value::Point(vec![-73.6, 45.5]))),
            properties: None,
            ..Default::default()
        };
        assert!(feature_dedup_key(&point).is_none());
    }
}