mod scrape;
#[cfg(test)]
mod tests;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::io::BufReader;
use std::path::Path;
use anyhow::{bail, Result};
use fs_err::File;
use geo::algorithm::bounding_rect::BoundingRect;
use geo::algorithm::contains::Contains;
use geo::algorithm::haversine_distance::HaversineDistance;
use geo_types::{LineString, MultiPolygon, Point, Rect};
use geojson::{Feature, FeatureReader};
use ordered_float::NotNan;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::Rng;
use rstar::{RTree, RTreeObject, AABB};
use serde_json::{Map, Value};
pub use self::scrape::scrape_points;
/// Configuration shared by `jitter` and `disaggregate`.
pub struct Options {
    /// How to choose concrete origin points within an origin zone.
    pub subsample_origin: Subsample,
    /// How to choose concrete destination points within a destination zone.
    pub subsample_destination: Subsample,
    /// Name of the CSV column holding the origin zone ID.
    pub origin_key: String,
    /// Name of the CSV column holding the destination zone ID.
    pub destination_key: String,
    /// Resample until each output line is at least this long (haversine distance, meters).
    pub min_distance_meters: f64,
    /// If true, never emit the exact same (origin, destination) point pair twice.
    pub deduplicate_pairs: bool,
}
/// Strategy for picking a concrete point inside a zone polygon.
pub enum Subsample {
    /// Uniformly sample random points anywhere inside the zone.
    RandomPoints,
    /// Sample only from this fixed set of weighted points (bucketed per zone before use).
    WeightedPoints(Vec<WeightedPoint>),
}
/// A candidate endpoint with a relative sampling weight.
#[derive(Clone)]
pub struct WeightedPoint {
    /// Position of the point — presumably lon/lat degrees, since haversine distances are
    /// computed on it; TODO confirm against the input data's CRS.
    pub point: Point<f64>,
    /// Relative weight; higher values are chosen proportionally more often.
    pub weight: f64,
}
// Index WeightedPoints in an R-tree by position, so points_per_polygon can query
// candidates by bounding box before the exact polygon-containment test.
impl RTreeObject for WeightedPoint {
    type Envelope = AABB<[f64; 2]>;
    // A point's envelope is the degenerate box at its own coordinates.
    fn envelope(&self) -> Self::Envelope {
        AABB::from_point([self.point.x(), self.point.y()])
    }
}
/// Reads OD rows from `csv_path` and passes jittered GeoJSON `LineString` features to `output`.
///
/// Each row's `disaggregation_key` value is split into `ceil(count / disaggregation_threshold)`
/// output lines (a count of 0 still yields one line). Every other numeric column is divided by
/// that factor so per-row totals are preserved across the split; the zone ID columns are copied
/// through as strings. Endpoints are drawn inside the matching zones per `options`, resampling
/// until each line is at least `min_distance_meters` long and (optionally) the exact point pair
/// has not been emitted before.
///
/// Errors if the key columns are missing or non-numeric, a zone ID is unknown, a zone has no
/// subpoints (in `WeightedPoints` mode), or requested deduplication is impossible for a row.
pub fn jitter<P: AsRef<Path>, F: FnMut(Feature) -> Result<()>>(
    csv_path: P,
    zones: &HashMap<String, MultiPolygon<f64>>,
    disaggregation_threshold: usize,
    disaggregation_key: String,
    rng: &mut StdRng,
    options: Options,
    mut output: F,
) -> Result<()> {
    let csv_path = csv_path.as_ref();
    // In WeightedPoints mode, bucket the candidate subpoints by zone once up front.
    let points_per_origin_zone: Option<BTreeMap<String, Vec<WeightedPoint>>> =
        if let Subsample::WeightedPoints(points) = options.subsample_origin {
            Some(points_per_polygon(points, zones))
        } else {
            None
        };
    let points_per_destination_zone: Option<BTreeMap<String, Vec<WeightedPoint>>> =
        if let Subsample::WeightedPoints(points) = options.subsample_destination {
            Some(points_per_polygon(points, zones))
        } else {
            None
        };
    // Note: deduplication is global here, across every row of the CSV.
    let mut seen_pairs: HashSet<ODPair> = HashSet::new();
    println!("Disaggregating OD data");
    for rec in csv::Reader::from_reader(File::open(csv_path)?).deserialize() {
        let string_map: HashMap<String, String> = rec?;
        // How many output lines this row becomes. A count of 0 still produces one line.
        let repeat = if let Some(count) = string_map
            .get(&disaggregation_key)
            .and_then(|count| count.parse::<f64>().ok())
        {
            if count == 0.0 {
                1.0
            } else {
                (count / disaggregation_threshold as f64).ceil()
            }
        } else {
            bail!(
                "{} doesn't have a {} column or the value isn't numeric; set disaggregation_key properly",
                csv_path.display(),
                disaggregation_key
            );
        };
        // Copy properties over, scaling numeric values (except the zone ID columns) down by
        // the number of output lines so the totals stay constant after the split.
        let mut json_map: Map<String, Value> = Map::new();
        for (key, value) in string_map {
            let json_value = if key == options.origin_key || key == options.destination_key {
                Value::String(value)
            } else if let Ok(x) = value.parse::<f64>() {
                Value::Number(serde_json::Number::from_f64(x / repeat).unwrap())
            } else {
                Value::String(value)
            };
            json_map.insert(key, json_value);
        }
        let origin_id = if let Some(Value::String(id)) = json_map.get(&options.origin_key) {
            id
        } else {
            bail!(
                "{} doesn't have a {} column; set origin_key properly",
                csv_path.display(),
                options.origin_key
            );
        };
        let destination_id = if let Some(Value::String(id)) = json_map.get(&options.destination_key)
        {
            id
        } else {
            bail!(
                "{} doesn't have a {} column; set destination_key properly",
                csv_path.display(),
                options.destination_key
            );
        };
        let origin_zone = if let Some(zone) = zones.get(origin_id) {
            zone
        } else {
            bail!("Unknown origin zone {origin_id}");
        };
        let destination_zone = if let Some(zone) = zones.get(destination_id) {
            zone
        } else {
            bail!("Unknown destination zone {destination_id}");
        };
        let origin_sampler = Subsampler::new(&points_per_origin_zone, origin_zone, origin_id)?;
        let destination_sampler = Subsampler::new(
            &points_per_destination_zone,
            destination_zone,
            destination_id,
        )?;
        if options.deduplicate_pairs {
            // Fail fast when more unique pairs are requested than subpoint combinations exist.
            // NOTE(review): this check ignores the min_distance_meters filter and pairs already
            // consumed by earlier rows, so a "possible" case can still resample for a long time.
            if let (Some(num_origin), Some(num_destination)) = (
                origin_sampler.num_points(),
                destination_sampler.num_points(),
            ) {
                if repeat as usize > num_origin * num_destination {
                    bail!("{repeat} unique pairs requested from {origin_id} ({num_origin} subpoints) to {destination_id} ({num_destination} subpoints), but this is impossible");
                }
            }
        }
        for _ in 0..repeat as usize {
            // Rejection-sample until the line is long enough and (if deduplicating) unseen.
            loop {
                let o = origin_sampler.sample(rng);
                let d = destination_sampler.sample(rng);
                if o.haversine_distance(&d) >= options.min_distance_meters {
                    if options.deduplicate_pairs {
                        let pair = hashify(o, d);
                        if seen_pairs.contains(&pair) {
                            continue;
                        } else {
                            seen_pairs.insert(pair);
                        }
                    }
                    output(to_geojson(o, d, json_map.clone()))?;
                    break;
                }
            }
        }
    }
    Ok(())
}
/// Reads OD rows from `csv_path`, treating every non-key numeric column as a per-mode trip
/// count, and passes one GeoJSON `LineString` per trip to `output`.
///
/// Unlike `jitter`, no row properties are copied through: each output feature carries only a
/// `mode` property naming the column the trip came from. Non-numeric columns are skipped, and
/// fractional counts are truncated.
///
/// Errors if the zone ID columns are missing, a zone ID is unknown, a zone has no subpoints (in
/// `WeightedPoints` mode), or requested deduplication is impossible for one mode of a row.
pub fn disaggregate<P: AsRef<Path>, F: FnMut(Feature) -> Result<()>>(
    csv_path: P,
    zones: &HashMap<String, MultiPolygon<f64>>,
    rng: &mut StdRng,
    options: Options,
    mut output: F,
) -> Result<()> {
    let csv_path = csv_path.as_ref();
    // In WeightedPoints mode, bucket the candidate subpoints by zone once up front.
    let points_per_origin_zone: Option<BTreeMap<String, Vec<WeightedPoint>>> =
        if let Subsample::WeightedPoints(points) = options.subsample_origin {
            Some(points_per_polygon(points, zones))
        } else {
            None
        };
    let points_per_destination_zone: Option<BTreeMap<String, Vec<WeightedPoint>>> =
        if let Subsample::WeightedPoints(points) = options.subsample_destination {
            Some(points_per_polygon(points, zones))
        } else {
            None
        };
    println!("Disaggregating OD data");
    for rec in csv::Reader::from_reader(File::open(csv_path)?).deserialize() {
        let mut string_map: HashMap<String, String> = rec?;
        // Remove the zone ID columns so everything left in string_map is treated as a mode.
        let origin_id = if let Some(id) = string_map.remove(&options.origin_key) {
            id
        } else {
            bail!(
                "{} doesn't have a {} column; set origin_key properly",
                csv_path.display(),
                options.origin_key
            );
        };
        let destination_id = if let Some(id) = string_map.remove(&options.destination_key) {
            id
        } else {
            bail!(
                "{} doesn't have a {} column; set destination_key properly",
                csv_path.display(),
                options.destination_key
            );
        };
        let origin_zone = if let Some(zone) = zones.get(&origin_id) {
            zone
        } else {
            bail!("Unknown origin zone {origin_id}");
        };
        let destination_zone = if let Some(zone) = zones.get(&destination_id) {
            zone
        } else {
            bail!("Unknown destination zone {destination_id}");
        };
        let origin_sampler = Subsampler::new(&points_per_origin_zone, origin_zone, &origin_id)?;
        let destination_sampler = Subsampler::new(
            &points_per_destination_zone,
            destination_zone,
            &destination_id,
        )?;
        // Note: deduplication is scoped to a single CSV row, shared across all of its modes.
        let mut seen_pairs: HashSet<ODPair> = HashSet::new();
        for (mode, value) in string_map {
            // Columns whose value isn't numeric are silently ignored.
            if let Ok(count) = value.parse::<f64>() {
                let count = count as usize;
                if options.deduplicate_pairs {
                    // Fail fast when more unique pairs are requested than combinations exist.
                    // NOTE(review): capacity is checked per mode, but seen_pairs is shared across
                    // modes of this row, so combined demand can still exceed capacity and spin.
                    if let (Some(num_origin), Some(num_destination)) = (
                        origin_sampler.num_points(),
                        destination_sampler.num_points(),
                    ) {
                        if count > num_origin * num_destination {
                            bail!("{count} unique pairs requested for {mode} from {origin_id} ({num_origin} subpoints) to {destination_id} ({num_destination} subpoints), but this is impossible");
                        }
                    }
                }
                for _ in 0..count {
                    // Rejection-sample until long enough and (if deduplicating) unseen.
                    loop {
                        let o = origin_sampler.sample(rng);
                        let d = destination_sampler.sample(rng);
                        if o.haversine_distance(&d) >= options.min_distance_meters {
                            if options.deduplicate_pairs {
                                let pair = hashify(o, d);
                                if seen_pairs.contains(&pair) {
                                    continue;
                                } else {
                                    seen_pairs.insert(pair);
                                }
                            }
                            let mut json_map: Map<String, Value> = Map::new();
                            json_map.insert("mode".to_string(), Value::String(mode.clone()));
                            output(to_geojson(o, d, json_map))?;
                            break;
                        }
                    }
                }
            }
        }
    }
    Ok(())
}
/// Reads zone polygons from a GeoJSON file, keyed by the string property `name_key`.
///
/// `Polygon` features are promoted to single-member `MultiPolygon`s; features with any other
/// geometry type are skipped. If two features share a zone name, the later one wins.
///
/// Errors if the file can't be opened or parsed, if a feature lacks a string-valued `name_key`
/// property, has no geometry, or has geometry that can't be converted to geo types.
pub fn load_zones(
    geojson_path: &str,
    name_key: &str,
) -> Result<HashMap<String, MultiPolygon<f64>>> {
    let reader = FeatureReader::from_reader(BufReader::new(File::open(geojson_path)?));
    let mut zones: HashMap<String, MultiPolygon<f64>> = HashMap::new();
    for feature in reader.features() {
        let feature = feature?;
        let zone_name = if let Some(name) = feature
            .property(name_key)
            .and_then(|x| x.as_str())
            .map(|x| x.to_string())
        {
            name
        } else {
            bail!(
                "Feature doesn't have a string zone name {}: {:?}",
                name_key,
                feature
            );
        };
        // These two steps used to unwrap(), panicking on malformed input; return errors instead.
        let gj_geom: geojson::Geometry = if let Some(geom) = feature.geometry {
            geom
        } else {
            bail!("Feature for zone {} has no geometry", zone_name);
        };
        let geo_geometry: geo_types::Geometry<f64> = match gj_geom.try_into() {
            Ok(geom) => geom,
            Err(err) => bail!("Couldn't convert geometry for zone {}: {}", zone_name, err),
        };
        match geo_geometry {
            geo_types::Geometry::MultiPolygon(mp) => {
                zones.insert(zone_name, mp);
            }
            geo_types::Geometry::Polygon(p) => {
                zones.insert(zone_name, p.into());
            }
            // Non-polygonal geometry is deliberately ignored, matching previous behavior.
            _ => {}
        }
    }
    Ok(zones)
}
/// Buckets each weighted point into every named polygon that contains it. An R-tree prunes
/// candidates by bounding box before the exact (and more expensive) containment test.
fn points_per_polygon(
    points: Vec<WeightedPoint>,
    polygons: &HashMap<String, MultiPolygon<f64>>,
) -> BTreeMap<String, Vec<WeightedPoint>> {
    let index = RTree::bulk_load(points);
    polygons
        .iter()
        .map(|(name, polygon)| {
            // Polygons always have at least one point, so the bounding rect exists.
            let bbox = polygon.bounding_rect().unwrap();
            let (lo, hi) = (bbox.min(), bbox.max());
            let query: AABB<[f64; 2]> = AABB::from_corners([lo.x, lo.y], [hi.x, hi.y]);
            let inside: Vec<WeightedPoint> = index
                .locate_in_envelope(&query)
                .filter(|candidate| polygon.contains(&candidate.point))
                .cloned()
                .collect();
            (name.clone(), inside)
        })
        .collect()
}
fn to_geojson(pt1: Point<f64>, pt2: Point<f64>, properties: Map<String, Value>) -> Feature {
let line_string: LineString<f64> = vec![pt1, pt2].into();
Feature {
geometry: Some(geojson::Geometry {
value: geojson::Value::from(&line_string),
bbox: None,
foreign_members: None,
}),
properties: Some(properties),
bbox: None,
id: None,
foreign_members: None,
}
}
/// Per-zone sampling strategy: either rejection-sample random points from the zone polygon
/// (with its precomputed bounding box), or draw from the zone's weighted subpoints.
enum Subsampler<'a> {
    RandomPoints(&'a MultiPolygon<f64>, Rect<f64>),
    WeightedPoints(&'a Vec<WeightedPoint>),
}
impl<'a> Subsampler<'a> {
    /// Builds the sampler for one zone: weighted subpoints when a non-empty set exists for
    /// `zone_id`, otherwise uniform rejection sampling over the zone's bounding box.
    fn new(
        points_per_zone: &'a Option<BTreeMap<String, Vec<WeightedPoint>>>,
        zone_polygon: &'a MultiPolygon<f64>,
        zone_id: &str,
    ) -> Result<Subsampler<'a>> {
        match points_per_zone {
            Some(per_zone) => match per_zone.get(zone_id) {
                Some(points) if !points.is_empty() => Ok(Subsampler::WeightedPoints(points)),
                _ => bail!("No subpoints for zone {}", zone_id),
            },
            None => match zone_polygon.bounding_rect() {
                Some(bbox) => Ok(Subsampler::RandomPoints(zone_polygon, bbox)),
                None => bail!("can't calculate bounding box for zone {}", zone_id),
            },
        }
    }

    /// Draws one point. Rejection sampling keeps drawing bounding-box points until one lands
    /// inside the polygon; weighted sampling picks a subpoint proportionally to its weight.
    fn sample(&self, rng: &mut StdRng) -> Point<f64> {
        match self {
            Subsampler::RandomPoints(polygon, bbox) => loop {
                // Arguments evaluate left-to-right, so x is drawn before y — keeping results
                // byte-identical per seed to the previous two-statement form.
                let candidate = Point::new(
                    rng.gen_range(bbox.min().x..=bbox.max().x),
                    rng.gen_range(bbox.min().y..=bbox.max().y),
                );
                if polygon.contains(&candidate) {
                    return candidate;
                }
            },
            Subsampler::WeightedPoints(points) => {
                points.choose_weighted(rng, |p| p.weight).unwrap().point
            }
        }
    }

    /// How many candidate subpoints exist, or `None` for the unbounded random strategy.
    fn num_points(&self) -> Option<usize> {
        match self {
            Subsampler::WeightedPoints(points) => Some(points.len()),
            Subsampler::RandomPoints(_, _) => None,
        }
    }
}
// A jittered OD pair encoded as hashable coordinates: [origin x, origin y, dest x, dest y].
type ODPair = [NotNan<f64>; 4];

/// Converts an origin/destination point pair into a hashable deduplication key.
/// Panics if any coordinate is NaN, which sampled points never are.
fn hashify(o: Point<f64>, d: Point<f64>) -> ODPair {
    [o.x(), o.y(), d.x(), d.y()].map(|coord| NotNan::new(coord).unwrap())
}