use std::{
fs,
path::{Path, PathBuf},
};
use chrono::DateTime;
use regex::Regex;
use crate::{
helper::country_code::CountryRegionCode,
mdd::{
ReleasedMddData, country::CountryStats, species::SpeciesData, synonyms::SynonymData,
usa::UsaStats,
},
};
/// File stem of the main data output when the parser has no `prefix` set.
const DEFAULT_OUTPUT_FNAME: &str = "data";
/// File stem of the country statistics JSON output.
const DEFAULT_COUNTRY_STATS_FNAME: &str = "country_stats";
/// File stem of the country/region code JSON output.
const DEFAULT_COUNTRY_REGION_FNAME: &str = "country_region_code";
/// File stem of the USA state statistics JSON output.
const DEFAULT_USA_STATE_DATA_FNAME: &str = "usa_states";
/// Extension used for plain-text JSON outputs.
const JSON_EXT: &str = "json";
/// Extension used for the gzip-compressed main data output.
const GZIP_EXT: &str = "json.gz";
/// Output file stem assigned by `JsonParser::from_path`.
const DEFAULT_PREFIX: &str = "mdd";
/// Converts an MDD species CSV and a synonym CSV into JSON output files.
///
/// Construct it with `JsonParser::from_path`, optionally adjust the public
/// fields, then call `parse_to_json` to write all outputs into `output_path`.
pub struct JsonParser<'a> {
    /// Path of the MDD species CSV file that is read and parsed.
    pub input_path: &'a Path,
    /// Path of the synonym CSV file that is read and parsed.
    pub synonym_path: &'a Path,
    /// Directory that receives every output file; created if it is missing.
    pub output_path: &'a Path,
    /// When `true`, an uncompressed JSON file is written in addition to the
    /// gzip-compressed one.
    pub plain_text: bool,
    /// Explicit MDD version. When `None`, the version is extracted from the
    /// input file name (pattern `MDD_v<major>.<minor>`), or `"unknown"`.
    pub mdd_version: Option<String>,
    /// Explicit release date. When `None`, the date is derived from the input
    /// file's metadata.
    pub release_date: Option<String>,
    /// Maximum number of species records and of synonym records to keep;
    /// `None` keeps everything.
    pub limit: Option<usize>,
    /// File stem of the main data output. When `None`, `DEFAULT_OUTPUT_FNAME`
    /// is used.
    pub prefix: Option<&'a str>,
}
impl<'a> JsonParser<'a> {
pub fn from_path(input_path: &'a Path, synonym_path: &'a Path, output_path: &'a Path) -> Self {
Self {
input_path,
synonym_path,
output_path,
plain_text: true,
mdd_version: None,
release_date: None,
limit: None,
prefix: Some(DEFAULT_PREFIX),
}
}
pub fn update_release_data(&mut self, date: &str, version: &str) {
self.release_date = Some(date.to_string());
self.mdd_version = Some(version.to_string());
}
pub fn parse_to_json(&self) {
let mut mdd_data = self.parse_mdd_data();
let mut synonym_data = self.parse_synonym_data();
if synonym_data.is_empty() {
println!("No synonym data found");
}
let country_stats = self.parse_country_stats(&mdd_data);
let usa_data = self.parse_usa_data(&mdd_data, &country_stats);
if let Some(limit) = self.limit {
self.limit_mdd_data(&mut mdd_data, limit);
self.limit_synonym_data(&mut synonym_data, limit);
}
let mdd_version = self.get_version();
let release_date = self.get_release_date();
println!(
"\nUsing MDD version: {}, release date: {}\n",
mdd_version, release_date
);
let released_json = self.merge_data(mdd_data, synonym_data);
self.create_dir_if_not_exist();
if self.plain_text {
self.write_plain_text(&released_json);
self.write_gzip(&released_json);
println!("Output written to: {:?}", self.get_output_path(false));
} else {
self.write_gzip(&released_json);
}
self.write_country_stats(&country_stats);
self.write_country_code();
self.write_usa_stats(&usa_data);
}
fn parse_mdd_data(&self) -> Vec<SpeciesData> {
let mdd_data = std::fs::read_to_string(self.input_path).expect("Failed to read MDD file");
println!("Parsing MDD data from: {:?}", self.input_path);
let parser = SpeciesData::new();
let mdd_data = parser.from_csv(&mdd_data);
println!("Found MDD data records: {}", mdd_data.len());
mdd_data
}
fn parse_synonym_data(&self) -> Vec<SynonymData> {
let syn_data =
std::fs::read_to_string(self.synonym_path).expect("Failed to read synonym file");
println!("Parsing synonym data from: {:?}", self.synonym_path);
let synonyms = SynonymData::new();
let synonym_data = synonyms.from_csv(&syn_data);
println!("Found synonym data records: {}", synonym_data.len());
synonym_data
}
fn parse_country_stats(&self, mdd_data: &[SpeciesData]) -> CountryStats {
println!("Creating country mammal diversity statistics from MDD records");
let mut country_stats = CountryStats::new();
country_stats.parse_country_data(mdd_data);
println!(
"Total countries and regions: {}, Total domesticated species: {}, Total widespread species: {}",
country_stats.total_countries,
country_stats.domesticated.len(),
country_stats.widespread.len()
);
country_stats
}
fn parse_usa_data(&self, mdd_data: &[SpeciesData], country_stats: &CountryStats) -> String {
let species_list = country_stats.get_species_list_by_country("US");
let usa_data: Vec<&SpeciesData> = mdd_data
.iter()
.filter(|species| species_list.contains(&species.id.to_string()))
.collect();
let mut usa_stats = UsaStats::new();
usa_stats.from_country_data(&usa_data);
println!("USA data parsed successfully");
println!(
"Total USA state/territory records: {}",
usa_stats.total_states
);
usa_stats.to_json()
}
fn merge_data(&self, mdd_data: Vec<SpeciesData>, synonym_data: Vec<SynonymData>) -> String {
let all_data = ReleasedMddData::from_parser(
mdd_data,
synonym_data,
&self.get_version(),
&self.get_release_date(),
);
println!("MDD {} data parsed successfully", self.get_version());
println!("Total MDD records: {}", all_data.data.len());
println!(
"Total synonym only records: {}",
all_data.synonym_only.len()
);
all_data.to_json()
}
fn get_version(&self) -> String {
match &self.mdd_version {
Some(version) => version.clone(),
None => {
let file_stem = self
.input_path
.file_stem()
.expect("Invalid file name")
.to_str()
.expect("Failed to convert OsStr to str");
let re =
Regex::new(r"MDD_v(\d+\.\d+)").expect("Failed to compile MDD version regex");
if let Some(caps) = re.captures(file_stem) {
caps.get(1)
.map_or("unknown".to_string(), |m| m.as_str().to_string())
} else {
"unknown".to_string()
}
}
}
}
fn get_release_date(&self) -> String {
match &self.release_date {
Some(date) => date.clone(),
None => {
let file_meta =
fs::metadata(self.input_path).expect("Failed to read file metadata");
let modified_time = file_meta
.created()
.expect("Failed to get file modified time");
let date = DateTime::<chrono::Local>::from(modified_time);
date.format("%B %e, %Y").to_string()
}
}
}
fn limit_mdd_data(&self, data: &mut Vec<SpeciesData>, limit: usize) {
data.truncate(limit);
}
fn limit_synonym_data(&self, data: &mut Vec<SynonymData>, limit: usize) {
data.truncate(limit);
}
fn write_plain_text(&self, data: &str) {
let output = self.get_output_path(false);
std::fs::write(output, data).expect("Unable to write file");
}
fn write_country_stats(&self, country_stats: &CountryStats) {
country_stats.write_to_json_file(
&self
.output_path
.join(DEFAULT_COUNTRY_STATS_FNAME)
.with_extension(JSON_EXT),
);
}
fn write_country_code(&self) {
let country_region_code = CountryRegionCode::new();
country_region_code.write_to_file(
self.output_path
.join(DEFAULT_COUNTRY_REGION_FNAME)
.with_extension(JSON_EXT),
);
}
fn write_usa_stats(&self, usa_data: &str) {
let output = self
.output_path
.join(DEFAULT_USA_STATE_DATA_FNAME)
.with_extension(JSON_EXT);
std::fs::write(output, usa_data).expect("Unable to write file");
}
fn write_gzip(&self, data: &str) {
let output = self.get_output_path(true);
let file = std::fs::File::create(output).expect("Unable to create file");
let mut encoder = flate2::write::GzEncoder::new(file, flate2::Compression::default());
std::io::Write::write_all(&mut encoder, data.as_bytes()).expect("Unable to write file");
}
fn get_output_path(&self, is_gunzip: bool) -> PathBuf {
let fname = match self.prefix {
Some(prefix) => prefix,
None => DEFAULT_OUTPUT_FNAME,
};
let output = self.output_path.join(fname);
if is_gunzip {
output.with_extension(GZIP_EXT)
} else {
output.with_extension(JSON_EXT)
}
}
fn create_dir_if_not_exist(&self) {
fs::create_dir_all(self.output_path).unwrap_or_else(|_| {
panic!("Failed to create output directory: {:?}", self.output_path)
});
}
}