mod error;
pub use error::DatasetError;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use std::collections::HashMap;
use std::fs;
use crate::annotation::ChangeAnnotation;
use crate::congress::{BillDownload, CosponsorRecord, Member, SponsorInfo};
use crate::diff::TreeDiff;
use crate::uslm::bill_parser::Bill;
use crate::uslm::parser::ParseError;
use crate::uslm::{BillDiff, USLMElement};
use crate::utils::{load_uslm_folder, parse_uslm_xml};
/// Descriptive metadata stored in a [`Dataset`]'s `metadata` field and serialized with it.
///
/// Every field is a free-form string (or list of strings); none of the visible code
/// validates or interprets their contents.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetMetadata {
/// Name of the dataset.
pub name: String,
/// Free-text description of the dataset.
pub description: String,
/// Author of the dataset.
pub author: String,
/// Source URLs — presumably where the underlying data was obtained; TODO confirm.
pub source_urls: Vec<String>,
/// License of the dataset — format (name, SPDX id, URL, ...) is not enforced here.
pub license: String,
/// Version string of the dataset — format is not enforced here.
pub version: String,
}
/// One version of the element tree, identified by a date string.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VersionSnapshot {
/// Date identifying this snapshot. `Dataset::add_version` orders snapshots by comparing
/// this field as a plain string, and `Dataset::get_version` looks snapshots up by it.
/// NOTE(review): presumably an ISO-8601 date so that string order equals date order — confirm.
pub date: String,
/// Optional label, used for lookup by `Dataset::get_version_by_label`.
pub label: Option<String>,
/// Root element of this snapshot's tree.
pub element: USLMElement,
}
/// A single hit produced by `Dataset::search_text`.
#[derive(Debug, Clone)]
pub struct SearchResult {
/// Date of the version snapshot in which the hit was found.
pub date: String,
/// Path of the element whose text matched (copied from the element's `data.path`).
pub path: String,
/// Name of the text field that matched: one of `heading`, `chapeau`, `content`,
/// `proviso` or `continuation`.
pub field: String,
/// The complete text of the matching field (it is not truncated around the match).
pub snippet: String,
}
/// Key of `Dataset::diff_annotations`: the `(from, to)` pair of strings that the annotation
/// accessors build from their `from` and `to` arguments.
pub type VersionPair = (String, String);
/// Serializable container for version snapshots, bills, change annotations, members and
/// sponsor information.
///
/// The three maps are serialized as sequences of `(key, value)` pairs rather than as maps
/// (see the `serde_as` attributes); `diff_annotations` in particular has a tuple key, which
/// cannot be used as a JSON object key.
#[serde_as]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dataset {
/// Descriptive metadata for the dataset.
pub metadata: DatasetMetadata,
/// Version snapshots. `add_version` inserts by date using a binary search, which assumes
/// the list is already ordered by `date`; the field is public, so direct mutation can
/// break that ordering.
pub versions: Vec<VersionSnapshot>,
/// Bills, in the order they were added via `add_bill`.
pub bills: Vec<Bill>,
/// Change annotations keyed by a `(from, to)` pair of strings.
#[serde_as(as = "Vec<(_, _)>")]
pub diff_annotations: HashMap<VersionPair, Vec<ChangeAnnotation>>,
/// Members keyed by their `bioguide_id` (see `add_member`). Defaults to an empty map
/// when the field is absent from the serialized input.
#[serde_as(as = "Vec<(_, _)>")]
#[serde(default)]
pub members: HashMap<String, Member>,
/// Sponsor information keyed by bill id (see `add_sponsor_info`). Defaults to an empty
/// map when the field is absent from the serialized input.
#[serde_as(as = "Vec<(_, _)>")]
#[serde(default)]
pub sponsors: HashMap<String, SponsorInfo>,
}
impl Dataset {
pub fn new(metadata: DatasetMetadata) -> Self {
Dataset {
metadata,
versions: Vec::new(),
bills: Vec::new(),
diff_annotations: HashMap::new(),
members: HashMap::new(),
sponsors: HashMap::new(),
}
}
/// Inserts `snapshot` into `versions` at the position given by a binary search on `date`.
///
/// Dates are compared as plain strings. The search assumes `versions` is already ordered
/// by date. A snapshot whose date already exists is not replaced: the new snapshot is
/// inserted at the index of the match the search found, so both are kept.
pub fn add_version(&mut self, snapshot: VersionSnapshot) {
    // Both outcomes of the search carry the index we want: the matching slot or the gap.
    let (Ok(index) | Err(index)) = self
        .versions
        .binary_search_by(|existing| existing.date.cmp(&snapshot.date));
    self.versions.insert(index, snapshot);
}
/// Returns the first snapshot whose `date` equals `date`, or `None` if there is none.
pub fn get_version(&self, date: &str) -> Option<&VersionSnapshot> {
    for snapshot in &self.versions {
        if snapshot.date == date {
            return Some(snapshot);
        }
    }
    None
}
/// Returns the first snapshot whose label equals `label`.
///
/// Snapshots without a label never match.
pub fn get_version_by_label(&self, label: &str) -> Option<&VersionSnapshot> {
    self.versions.iter().find(|snapshot| {
        snapshot
            .label
            .as_deref()
            .is_some_and(|candidate| candidate == label)
    })
}
/// Returns the snapshot stored immediately after the first snapshot dated `date`.
///
/// Yields `None` when no snapshot has that date or when it is the last one.
pub fn next_version(&self, date: &str) -> Option<&VersionSnapshot> {
    self.versions
        .iter()
        // Drop everything before the first match; the match itself is then item 0.
        .skip_while(|snapshot| snapshot.date != date)
        .nth(1)
}
/// Returns the snapshot stored immediately before the first snapshot dated `date`.
///
/// Yields `None` when no snapshot has that date or when it is the first one.
pub fn prev_version(&self, date: &str) -> Option<&VersionSnapshot> {
    let index = self
        .versions
        .iter()
        .position(|snapshot| snapshot.date == date)?;
    // `index` comes from `position`, so the slice bound is always in range; the slice is
    // empty exactly when the match is the first snapshot.
    self.versions[..index].last()
}
/// Serializes the dataset as pretty-printed JSON and writes it to `path`.
///
/// # Errors
///
/// Returns a [`DatasetError`] if serialization fails or the file cannot be written.
pub fn save(&self, path: &str) -> Result<(), DatasetError> {
    let serialized = serde_json::to_string_pretty(self)?;
    Ok(fs::write(path, serialized)?)
}
/// Reads the file at `path` and deserializes a dataset from its JSON contents.
///
/// # Errors
///
/// Returns a [`DatasetError`] if the file cannot be read or the JSON cannot be
/// deserialized into a dataset.
pub fn load(path: &str) -> Result<Self, DatasetError> {
    let contents = fs::read_to_string(path)?;
    Ok(serde_json::from_str(&contents)?)
}
/// Builds a [`TreeDiff`] between the snapshots dated `from_date` and `to_date`.
///
/// # Errors
///
/// Returns [`DatasetError::VersionNotFound`] carrying the missing date when either
/// snapshot does not exist; `from_date` is checked first.
pub fn compute_diff(&self, from_date: &str, to_date: &str) -> Result<TreeDiff, DatasetError> {
    let Some(older) = self.get_version(from_date) else {
        return Err(DatasetError::VersionNotFound(from_date.to_string()));
    };
    let Some(newer) = self.get_version(to_date) else {
        return Err(DatasetError::VersionNotFound(to_date.to_string()));
    };
    Ok(TreeDiff::from_elements(&older.element, &newer.element))
}
pub fn add_bill(&mut self, bill: Bill) {
self.bills.push(bill);
}
/// Returns the first bill whose `bill_id` equals `bill_id`, or `None` if there is none.
pub fn get_bill(&self, bill_id: &str) -> Option<&Bill> {
    for candidate in &self.bills {
        if candidate.bill_id == bill_id {
            return Some(candidate);
        }
    }
    None
}
/// Appends a clone of `bill_diff` to the `changes` of the amendment `amendment_id`.
///
/// Bills are searched in order and only the first bill containing that amendment is
/// updated. When no bill contains it, the call does nothing.
pub fn add_changes_to_amendment(&mut self, amendment_id: &str, bill_diff: &BillDiff) {
    let target = self
        .bills
        .iter_mut()
        .find_map(|bill| bill.amendments.get_mut(amendment_id));
    if let Some(amendment) = target {
        amendment.changes.push(bill_diff.clone());
    }
}
/// Returns the annotations stored under the `(from, to)` key, if that key exists.
pub fn get_annotations(&self, from: &str, to: &str) -> Option<&Vec<ChangeAnnotation>> {
    // The map is keyed by owned strings, so an owned key has to be built for the lookup.
    let key = (from.to_owned(), to.to_owned());
    self.diff_annotations.get(&key)
}
/// Returns a mutable reference to the annotation list stored under the `(from, to)` key.
///
/// An empty list is inserted first when the key is not present yet.
pub fn get_annotations_mut(&mut self, from: &str, to: &str) -> &mut Vec<ChangeAnnotation> {
    let key = (from.to_owned(), to.to_owned());
    self.diff_annotations.entry(key).or_insert_with(Vec::new)
}
/// Appends `annotation` to the list stored under the `(from, to)` key, creating the list
/// when the key is not present yet.
pub fn add_annotation(&mut self, from: &str, to: &str, annotation: ChangeAnnotation) {
    let list = self.get_annotations_mut(from, to);
    list.push(annotation);
}
/// Collects every annotation, across all `(from, to)` keys, whose `paths` contains `path`.
///
/// The map is iterated in its own (unspecified) order, so the order of the result is not
/// stable across keys.
pub fn annotations_for_path(&self, path: &str) -> Vec<&ChangeAnnotation> {
    let mut matches = Vec::new();
    for batch in self.diff_annotations.values() {
        for annotation in batch {
            if annotation.paths.iter().any(|candidate| candidate == path) {
                matches.push(annotation);
            }
        }
    }
    matches
}
/// Collects every annotation, across all `(from, to)` keys, whose source bill id equals
/// `bill_id`.
///
/// The map is iterated in its own (unspecified) order, so the order of the result is not
/// stable across keys.
pub fn annotations_for_bill(&self, bill_id: &str) -> Vec<&ChangeAnnotation> {
    let mut matches = Vec::new();
    for batch in self.diff_annotations.values() {
        matches.extend(
            batch
                .iter()
                .filter(|annotation| annotation.source_bill.bill_id == bill_id),
        );
    }
    matches
}
/// Returns the distinct paths referenced by the annotations stored under `(from, to)`.
///
/// The result contains each path once and is sorted, so repeated calls give the same
/// order. An unknown `(from, to)` key yields an empty vector.
pub fn annotated_paths(&self, from: &str, to: &str) -> Vec<String> {
    let Some(annotations) = self.get_annotations(from, to) else {
        return Vec::new();
    };
    // Deduplicate on borrowed paths first so that only the unique ones are cloned; the
    // ordered set also fixes the output order.
    annotations
        .iter()
        .flat_map(|annotation| annotation.paths.iter())
        .collect::<std::collections::BTreeSet<_>>()
        .into_iter()
        .cloned()
        .collect()
}
/// Lists the paths that changed between `from` and `to` but have no annotation stored
/// under the `(from, to)` key.
///
/// Paths are reported in the order the diff tree is walked (parent before children).
///
/// # Errors
///
/// Propagates the error from `compute_diff` when either version does not exist.
pub fn unannotated_paths(&self, from: &str, to: &str) -> Result<Vec<String>, DatasetError> {
    let diff = self.compute_diff(from, to)?;
    let already_annotated: std::collections::HashSet<String> =
        self.annotated_paths(from, to).into_iter().collect();
    let mut changed = Vec::new();
    Self::collect_paths_with_changes(&diff, &mut changed);
    // Filter in place; `retain` keeps the walk order of the remaining paths.
    changed.retain(|candidate| !already_annotated.contains(candidate));
    Ok(changed)
}
/// Walks `diff` depth-first and appends to `paths` the `root_path` of every node that has
/// at least one entry in `changes`, `added` or `removed`.
///
/// A node is visited before its children.
fn collect_paths_with_changes(diff: &TreeDiff, paths: &mut Vec<String>) {
    let untouched = diff.changes.is_empty() && diff.added.is_empty() && diff.removed.is_empty();
    if !untouched {
        paths.push(diff.root_path.clone());
    }
    for nested in &diff.child_diffs {
        Self::collect_paths_with_changes(nested, paths);
    }
}
/// Looks `path` up in every snapshot and returns `(date, element)` for each snapshot in
/// which the lookup succeeds, in the order the snapshots are stored.
pub fn find_element(&self, path: &str) -> Vec<(&str, &USLMElement)> {
    let mut hits = Vec::new();
    for snapshot in &self.versions {
        if let Some(found) = snapshot.element.find(path) {
            hits.push((snapshot.date.as_str(), found));
        }
    }
    hits
}
/// Parses the XML file at `xml_path` and adds the result as a snapshot dated `date`.
///
/// `date` is passed to the parser and is also used as the snapshot's date; `label` is
/// stored on the snapshot unchanged.
///
/// # Errors
///
/// Returns the [`ParseError`] from `parse_uslm_xml`; nothing is added in that case.
pub fn add_uslm_xml(
    &mut self,
    xml_path: &str,
    date: &str,
    label: Option<String>,
) -> Result<(), ParseError> {
    let element = parse_uslm_xml(xml_path, date)?;
    let snapshot = VersionSnapshot {
        date: date.to_owned(),
        label,
        element,
    };
    self.add_version(snapshot);
    Ok(())
}
/// Loads the folder at `folder_path` and adds the result as a snapshot dated `date`.
///
/// `date` is passed to the loader and is also used as the snapshot's date; `label` is
/// stored on the snapshot unchanged.
///
/// # Errors
///
/// Returns [`DatasetError::FolderLoadFailed`] carrying `folder_path` when
/// `load_uslm_folder` yields nothing; nothing is added in that case.
pub fn add_uslm_folder(
    &mut self,
    folder_path: &str,
    date: &str,
    label: Option<String>,
) -> Result<(), DatasetError> {
    let Some(element) = load_uslm_folder(folder_path, date) else {
        return Err(DatasetError::FolderLoadFailed(folder_path.to_string()));
    };
    let snapshot = VersionSnapshot {
        date: date.to_owned(),
        label,
        element,
    };
    self.add_version(snapshot);
    Ok(())
}
/// Case-insensitively searches the text fields of every element in every snapshot for
/// `query` and returns one [`SearchResult`] per matching field.
///
/// The query is lowercased once here. An empty query matches every field that has text.
pub fn search_text(&self, query: &str) -> Vec<SearchResult> {
    let needle = query.to_lowercase();
    self.versions.iter().fold(Vec::new(), |mut hits, snapshot| {
        Self::search_element(&snapshot.element, &snapshot.date, &needle, &mut hits);
        hits
    })
}
/// Recursive worker for `search_text`: checks the text fields of `element`, then descends
/// into its children.
///
/// `query` must already be lowercase, because only the field text is lowercased before
/// the substring test. Each hit records `date`, the element's path, the field name and
/// the field's full text.
fn search_element(
    element: &USLMElement,
    date: &str,
    query: &str,
    results: &mut Vec<SearchResult>,
) {
    let data = &element.data;
    let candidates = [
        ("heading", &data.heading),
        ("chapeau", &data.chapeau),
        ("content", &data.content),
        ("proviso", &data.proviso),
        ("continuation", &data.continuation),
    ];
    let hits = candidates.into_iter().filter_map(|(field_name, value)| {
        // Fields without text are skipped outright.
        let text = value.as_ref()?;
        text.to_lowercase().contains(query).then(|| SearchResult {
            date: date.to_string(),
            path: data.path.clone(),
            field: field_name.to_string(),
            snippet: text.clone(),
        })
    });
    results.extend(hits);
    for nested in &element.children {
        Self::search_element(nested, date, query, results);
    }
}
/// Stores `member` under its `bioguide_id`, replacing any member already stored under
/// that id.
pub fn add_member(&mut self, member: Member) {
    let key = member.bioguide_id.clone();
    self.members.insert(key, member);
}
pub fn get_member(&self, bioguide_id: &str) -> Option<&Member> {
self.members.get(bioguide_id)
}
/// Stores `info` under its `bill_id`, replacing any sponsor information already stored
/// for that bill.
pub fn add_sponsor_info(&mut self, info: SponsorInfo) {
    let key = info.bill_id.clone();
    self.sponsors.insert(key, info);
}
pub fn get_sponsor_info(&self, bill_id: &str) -> Option<&SponsorInfo> {
self.sponsors.get(bill_id)
}
/// Returns the known members who sponsored or cosponsored any bill cited by an annotation
/// that touches `path`.
///
/// For every annotation whose `paths` contains `path`, the sponsor information of its
/// source bill is looked up; the sponsor comes first, then the cosponsors. Each member
/// appears at most once, in first-seen order, even when they are attached to several
/// bills or several annotations cite the same bill. Ids with no entry in `members`, and
/// bills with no sponsor information, are skipped. Cosponsors are included regardless of
/// their `withdrawn` flag.
pub fn sponsors_for_path(&self, path: &str) -> Vec<&Member> {
    let mut seen_ids: std::collections::HashSet<&str> = std::collections::HashSet::new();
    let mut found = Vec::new();
    for annotation in self.annotations_for_path(path) {
        let Some(info) = self.sponsors.get(&annotation.source_bill.bill_id) else {
            continue;
        };
        let ids = std::iter::once(info.sponsor.as_str()).chain(
            info.cosponsors
                .iter()
                .map(|cosponsor| cosponsor.bioguide_id.as_str()),
        );
        for id in ids {
            // `insert` returns false for an id that was already handled.
            if !seen_ids.insert(id) {
                continue;
            }
            if let Some(member) = self.members.get(id) {
                found.push(member);
            }
        }
    }
    found
}
pub fn load_bill_download(&mut self, download: &BillDownload) -> Result<String, DatasetError> {
use crate::uslm::bill_parser;
use serde_json::Value;
let bill =
bill_parser::parse_bill_amendments_from_str(&download.bill_id, &download.bill_xml)
.map_err(|e| {
DatasetError::Json(serde_json::Error::io(std::io::Error::new(
std::io::ErrorKind::InvalidData,
e.to_string(),
)))
})?;
let bill_id = bill.bill_id.clone();
self.add_bill(bill);
let sponsors_v: Value = serde_json::from_str(&download.sponsors_json)?;
let sponsor_id = sponsors_v["bill"]["sponsors"]
.as_array()
.and_then(|arr| arr.first())
.and_then(|s| s["bioguideId"].as_str())
.unwrap_or("")
.to_string();
let cosponsors_v: Value = serde_json::from_str(&download.cosponsors_json)?;
let mut cosponsors = Vec::new();
if let Some(arr) = cosponsors_v["cosponsors"].as_array() {
for c in arr {
cosponsors.push(CosponsorRecord {
bioguide_id: c["bioguideId"].as_str().unwrap_or("").to_string(),
date: c["sponsorshipDate"].as_str().unwrap_or("").to_string(),
withdrawn: c["sponsorshipWithdrawnDate"].as_str().is_some(),
});
}
}
self.add_sponsor_info(SponsorInfo {
bill_id: bill_id.clone(),
sponsor: sponsor_id,
cosponsors,
});
for json in download.member_jsons.values() {
if let Ok(member) = Member::from_api_response(json) {
self.add_member(member);
}
}
Ok(bill_id)
}
}