use csv::StringRecord;
use failure::{format_err, ResultExt};
use serde::{Deserialize, Serialize};
use std::{
borrow::Cow,
collections::{HashMap, HashSet},
fs::File,
path::Path,
};
use crate::structure::Structure;
use crate::Result;
/// An address as produced by `AddressColumnKeys::extract_address_from_record`.
///
/// Only `street` is mandatory; the remaining parts are `None` when the
/// corresponding column is not configured.
#[derive(Clone, Debug, Eq, PartialEq, Serialize)]
pub struct Address {
/// Street text, taken from a single column or from several columns joined
/// with single spaces.
pub street: String,
/// City, if a city column was configured.
pub city: Option<String>,
/// State, if a state column was configured.
pub state: Option<String>,
/// Zip code, if a zipcode column was configured.
pub zipcode: Option<String>,
}
/// Either a single column key or an ordered list of column keys.
///
/// Deserialization is untagged, so the variant is picked from the shape of
/// the input: in the JSON specs used by this file's tests, a bare string
/// becomes `Key` and an array of strings becomes `Keys`.
#[derive(Debug, Deserialize, Eq, PartialEq)]
#[serde(untagged, deny_unknown_fields)]
pub enum ColumnKeyOrKeys<K: Eq> {
/// The value comes from exactly one column.
Key(K),
/// The value is assembled from several columns, in the listed order.
Keys(Vec<K>),
}
impl ColumnKeyOrKeys<usize> {
pub fn extract_from_record<'a>(
&self,
record: &'a StringRecord,
) -> Result<Cow<'a, str>> {
match self {
ColumnKeyOrKeys::Key(key) => Ok(Cow::Borrowed(&record[*key])),
ColumnKeyOrKeys::Keys(keys) => {
let mut extracted = String::with_capacity(40);
for key in keys {
let s = &record[*key];
if extracted.is_empty() {
extracted.push_str(s);
} else if extracted.ends_with(s) {
} else {
extracted.push(' ');
extracted.push_str(s);
}
}
Ok(Cow::Owned(extracted))
}
}
}
}
/// Joining several columns must not repeat a value that the text collected
/// so far already ends with.
#[test]
fn extract_collapses_duplicate_suffixes() {
    use std::iter::FromIterator;
    let fields = ["100", "Main Street #302", "#302"];
    let row = StringRecord::from_iter(&fields);
    let street_columns = ColumnKeyOrKeys::Keys(vec![0, 1, 2]);
    let joined = street_columns.extract_from_record(&row).unwrap();
    assert_eq!(joined, "100 Main Street #302");
}
/// The columns that make up one address, identified by keys of type `K`.
///
/// In this file `K` is `String` (header names, as deserialized) or `usize`
/// (column indices, after conversion). Unknown fields are rejected during
/// deserialization.
#[derive(Debug, Deserialize, Eq, PartialEq)]
#[serde(deny_unknown_fields)]
pub struct AddressColumnKeys<K: Default + Eq> {
/// Column or columns holding the street text. Also accepted under the
/// names `house_number_and_street`, `address` and `glob` when deserializing.
#[serde(alias = "house_number_and_street", alias = "address", alias = "glob")]
pub street: ColumnKeyOrKeys<K>,
/// Optional city column; `None` when omitted.
#[serde(default)]
pub city: Option<K>,
/// Optional state column; `None` when omitted.
#[serde(default)]
pub state: Option<K>,
/// Optional zip code column; `None` when omitted. Also accepted as
/// `postcode` when deserializing.
#[serde(default, alias = "postcode")]
pub zipcode: Option<K>,
}
impl AddressColumnKeys<usize> {
pub fn extract_address_from_record(
&self,
record: &'_ StringRecord,
) -> Result<Address> {
Ok(Address {
street: self.street.extract_from_record(record)?.into_owned(),
city: self.city.map(|c| record[c].to_owned()),
state: self.state.map(|s| record[s].to_owned()),
zipcode: self.zipcode.map(|z| record[z].to_owned()),
})
}
}
/// With a single street column and no optional columns, the street is the
/// column's text and every other part is `None`.
#[test]
fn extract_simple_address_from_record() {
    use std::iter::FromIterator;
    let row = StringRecord::from_iter(&[
        "1600 Pennsylvania Avenue NW, Washington DC, 20500",
    ]);
    let columns = AddressColumnKeys {
        street: ColumnKeyOrKeys::Key(0),
        city: None,
        state: None,
        zipcode: None,
    };
    let expected = Address {
        street: "1600 Pennsylvania Avenue NW, Washington DC, 20500".to_owned(),
        city: None,
        state: None,
        zipcode: None,
    };
    let actual = columns.extract_address_from_record(&row).unwrap();
    assert_eq!(actual, expected);
}
/// With a multi-column street and all optional columns configured, each part
/// of the address is read from its own column.
#[test]
fn extract_complex_address_from_record() {
    use std::iter::FromIterator;
    let fields = [
        "1600",
        "Pennsylvania Avenue NW",
        "Washington",
        "DC",
        "20500",
    ];
    let row = StringRecord::from_iter(&fields);
    let columns = AddressColumnKeys {
        street: ColumnKeyOrKeys::Keys(vec![0, 1]),
        city: Some(2),
        state: Some(3),
        zipcode: Some(4),
    };
    let expected = Address {
        street: "1600 Pennsylvania Avenue NW".to_owned(),
        city: Some("Washington".to_owned()),
        state: Some("DC".to_owned()),
        zipcode: Some("20500".to_owned()),
    };
    let actual = columns.extract_address_from_record(&row).unwrap();
    assert_eq!(actual, expected);
}
/// A set of address column definitions, keyed by a prefix string.
///
/// The map is flattened for deserialization, so each top-level key of the
/// serialized spec (`"home"` and `"work"` in this file's tests) is a prefix.
#[derive(Debug, Deserialize, Eq, PartialEq)]
pub struct AddressColumnSpec<Key: Default + Eq> {
/// Address columns for each prefix.
#[serde(flatten)]
address_columns_by_prefix: HashMap<String, AddressColumnKeys<Key>>,
}
impl<Key: Default + Eq> AddressColumnSpec<Key> {
    /// Number of prefixes defined in this spec.
    pub fn prefix_count(&self) -> usize {
        self.address_columns_by_prefix.len()
    }

    /// All prefixes defined in this spec, in sorted order.
    pub fn prefixes(&self) -> Vec<&str> {
        let mut sorted: Vec<&str> = self
            .address_columns_by_prefix
            .keys()
            .map(String::as_str)
            .collect();
        sorted.sort_unstable();
        sorted
    }

    /// The address columns registered under `prefix`, if there are any.
    pub fn get(&self, prefix: &str) -> Option<&AddressColumnKeys<Key>> {
        self.address_columns_by_prefix.get(prefix)
    }

    /// Find the columns of `header` whose names match an output column that
    /// `structure` reports for any prefix of this spec.
    ///
    /// Returns `(column name, column index)` pairs in header order.
    ///
    /// # Errors
    ///
    /// Fails when `structure.output_column_names` fails, or when the same
    /// output column name is produced more than once across all prefixes.
    pub fn duplicate_columns<'header>(
        &self,
        structure: &Structure,
        header: &'header StringRecord,
    ) -> Result<Vec<(&'header str, usize)>> {
        // Collect every output column name, insisting that each is unique.
        let mut seen = HashSet::new();
        for prefix in self.prefixes() {
            for name in structure.output_column_names(prefix)? {
                if seen.insert(name.clone()) {
                    continue;
                }
                return Err(format_err!("duplicate column name {:?}", name));
            }
        }

        // Report the header columns that share a name with an output column.
        let clashes: Vec<(&'header str, usize)> = header
            .iter()
            .enumerate()
            .filter(|&(_, col)| seen.contains(col))
            .map(|(idx, col)| (col, idx))
            .collect();
        Ok(clashes)
    }
}
/// Header columns whose names match generated output columns are reported
/// together with their positions.
#[test]
fn find_columns_to_remove() {
    use std::iter::FromIterator;
    let spec_json = r#"{
"home": {
"house_number_and_street": ["home_number", "home_street"],
"city": "home_city",
"state": "home_state",
"postcode": "home_zip"
},
"work": {
"address": "work_address"
}
}"#;
    let parsed = serde_json::from_str::<AddressColumnSpec<String>>(spec_json).unwrap();
    let full_structure = Structure::complete().unwrap();
    let header_row =
        StringRecord::from_iter(&["existing", "home_addressee", "work_addressee"]);
    let found = parsed
        .duplicate_columns(&full_structure, &header_row)
        .unwrap();
    let expected: Vec<(&str, usize)> =
        vec![("home_addressee", 1), ("work_addressee", 2)];
    assert_eq!(found, expected);
}
impl AddressColumnSpec<String> {
    /// Load an address column spec from the JSON file at `path`.
    ///
    /// # Errors
    ///
    /// Fails with context `cannot open <path>` when the file cannot be
    /// opened, and with context `error parsing <path>` when its contents
    /// cannot be deserialized.
    pub fn from_path(path: &Path) -> Result<Self> {
        let f = File::open(path)
            .with_context(|_| format_err!("cannot open {}", path.display()))?;
        // `serde_json::from_reader` does not buffer its input, so reading
        // straight from a `File` is slow; add buffering here.
        let reader = std::io::BufReader::new(f);
        Ok(serde_json::from_reader(reader)
            .with_context(|_| format_err!("error parsing {}", path.display()))?)
    }

    /// Convert a spec keyed by header names into one keyed by column indices,
    /// using `headers` to map each name to its zero-based position.
    ///
    /// # Errors
    ///
    /// Fails when `headers` contains the same column name twice, or when the
    /// conversion itself fails (for example, a referenced column is missing
    /// from the header).
    pub fn convert_to_indices_using_headers(
        &self,
        headers: &StringRecord,
    ) -> Result<AddressColumnSpec<usize>> {
        let mut header_columns = HashMap::with_capacity(headers.len());
        for (idx, header) in headers.iter().enumerate() {
            // `insert` returns the previous value, so `Some` means a repeat.
            if header_columns.insert(header, idx).is_some() {
                return Err(format_err!("duplicate header column `{}`", header));
            }
        }
        self.convert_to_indices(&header_columns)
    }
}
/// A spec written in terms of header names converts to the matching
/// zero-based column indices.
#[test]
fn convert_address_column_spec_to_indices() {
    use std::iter::FromIterator;
    let header_row = StringRecord::from_iter(&[
        "home_number",
        "home_street",
        "home_city",
        "home_state",
        "home_zip",
        "work_address",
    ]);
    let spec_json = r#"{
"home": {
"house_number_and_street": ["home_number", "home_street"],
"city": "home_city",
"state": "home_state",
"postcode": "home_zip"
},
"work": {
"address": "work_address"
}
}"#;
    let named_spec =
        serde_json::from_str::<AddressColumnSpec<String>>(spec_json).unwrap();

    let home = AddressColumnKeys {
        street: ColumnKeyOrKeys::Keys(vec![0, 1]),
        city: Some(2),
        state: Some(3),
        zipcode: Some(4),
    };
    let work = AddressColumnKeys {
        street: ColumnKeyOrKeys::Key(5),
        city: None,
        state: None,
        zipcode: None,
    };
    let expected = AddressColumnSpec::<usize> {
        address_columns_by_prefix: vec![
            ("home".to_owned(), home),
            ("work".to_owned(), work),
        ]
        .into_iter()
        .collect(),
    };

    let converted = named_spec
        .convert_to_indices_using_headers(&header_row)
        .unwrap();
    assert_eq!(converted, expected);
}
/// Conversion of a value expressed in header names into the equivalent value
/// expressed in column indices.
trait ConvertToIndices {
/// The index-based counterpart of the implementing type.
type Output;
/// Perform the conversion, looking names up in `header_columns`, which maps
/// each header name to its column index. Implementations in this file fail
/// when a referenced name is absent from the map.
fn convert_to_indices(
&self,
header_columns: &HashMap<&str, usize>,
) -> Result<Self::Output>;
}
impl ConvertToIndices for String {
    type Output = usize;

    /// Look this header name up in `header_columns` and return its index,
    /// or an error naming the column when it is not present.
    fn convert_to_indices(
        &self,
        header_columns: &HashMap<&str, usize>,
    ) -> Result<Self::Output> {
        match header_columns.get(self.as_str()) {
            Some(&index) => Ok(index),
            None => Err(format_err!("could not find column `{}` in header", self)),
        }
    }
}
impl ConvertToIndices for ColumnKeyOrKeys<String> {
    type Output = ColumnKeyOrKeys<usize>;

    /// Convert the single name, or every name in order, to a column index.
    /// Conversion stops at the first name that cannot be resolved.
    fn convert_to_indices(
        &self,
        header_columns: &HashMap<&str, usize>,
    ) -> Result<Self::Output> {
        let converted = match self {
            ColumnKeyOrKeys::Key(name) => {
                ColumnKeyOrKeys::Key(name.convert_to_indices(header_columns)?)
            }
            ColumnKeyOrKeys::Keys(names) => {
                let mut indices = Vec::with_capacity(names.len());
                for name in names {
                    indices.push(name.convert_to_indices(header_columns)?);
                }
                ColumnKeyOrKeys::Keys(indices)
            }
        };
        Ok(converted)
    }
}
impl ConvertToIndices for AddressColumnKeys<String> {
    type Output = AddressColumnKeys<usize>;

    /// Convert the street key(s) and each configured optional column from a
    /// header name to a column index. Columns that are `None` stay `None`.
    fn convert_to_indices(
        &self,
        header_columns: &HashMap<&str, usize>,
    ) -> Result<Self::Output> {
        // Resolve an optional column name, leaving unset columns unset.
        let optional_index = |column: &Option<String>| -> Result<Option<usize>> {
            match column {
                Some(name) => name.convert_to_indices(header_columns).map(Some),
                None => Ok(None),
            }
        };

        let street = self.street.convert_to_indices(header_columns)?;
        let city = optional_index(&self.city)?;
        let state = optional_index(&self.state)?;
        let zipcode = optional_index(&self.zipcode)?;
        Ok(AddressColumnKeys {
            street,
            city,
            state,
            zipcode,
        })
    }
}
impl ConvertToIndices for AddressColumnSpec<String> {
    type Output = AddressColumnSpec<usize>;

    /// Convert the address columns of every prefix from header names to
    /// column indices, keeping the prefixes unchanged. Fails as soon as one
    /// prefix's columns cannot be converted.
    fn convert_to_indices(
        &self,
        header_columns: &HashMap<&str, usize>,
    ) -> Result<Self::Output> {
        let address_columns_by_prefix = self
            .address_columns_by_prefix
            .iter()
            .map(|(prefix, named_columns)| {
                named_columns
                    .convert_to_indices(header_columns)
                    .map(|indexed_columns| (prefix.to_owned(), indexed_columns))
            })
            .collect::<Result<HashMap<_, _>>>()?;
        Ok(AddressColumnSpec {
            address_columns_by_prefix,
        })
    }
}