// congress_appropriations/approp/loading.rs
use crate::approp::bill_meta::BillMeta;
7use crate::approp::ontology::{BillExtraction, ExtractionMetadata};
8use crate::approp::verification::VerificationReport;
9use anyhow::{Context, Result};
10use std::path::{Path, PathBuf};
11use tracing::debug;
12use walkdir::WalkDir;
13
14#[derive(Debug, Clone)]
16pub struct LoadedBill {
17 pub dir: PathBuf,
19 pub extraction: BillExtraction,
21 pub verification: Option<VerificationReport>,
23 pub metadata: Option<ExtractionMetadata>,
25 pub bill_meta: Option<BillMeta>,
27}
28
29pub fn load_bills(dir: &Path) -> Result<Vec<LoadedBill>> {
32 let extraction_files = find_files(dir, "extraction.json");
33
34 if extraction_files.is_empty() {
35 return Ok(Vec::new());
36 }
37
38 let mut bills = Vec::with_capacity(extraction_files.len());
39
40 for ext_path in &extraction_files {
41 let bill_dir = ext_path.parent().unwrap_or(Path::new(".")).to_path_buf();
42
43 let extraction: BillExtraction = load_json(ext_path)
44 .with_context(|| format!("Failed to load {}", ext_path.display()))?;
45
46 let verification: Option<VerificationReport> =
47 load_json_optional(&bill_dir.join("verification.json"));
48
49 let metadata: Option<ExtractionMetadata> =
50 load_json_optional(&bill_dir.join("metadata.json"));
51
52 let bill_meta: Option<BillMeta> = load_json_optional(&bill_dir.join("bill_meta.json"));
53
54 debug!(
55 bill = extraction.bill.identifier,
56 dir = %bill_dir.display(),
57 "Loaded bill"
58 );
59
60 bills.push(LoadedBill {
61 dir: bill_dir,
62 extraction,
63 verification,
64 metadata,
65 bill_meta,
66 });
67 }
68
69 bills.sort_by(|a, b| {
71 a.extraction
72 .bill
73 .identifier
74 .cmp(&b.extraction.bill.identifier)
75 });
76
77 Ok(bills)
78}
79
80pub fn find_bill_sources(dir: &Path) -> Vec<(String, PathBuf)> {
84 let mut by_dir: std::collections::HashMap<PathBuf, Vec<PathBuf>> =
86 std::collections::HashMap::new();
87
88 for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
89 let path = entry.path();
90 if path.is_file()
91 && path
92 .file_stem()
93 .is_some_and(|n| n.to_string_lossy().starts_with("BILLS-"))
94 && path.extension().is_some_and(|e| e == "xml" || e == "txt")
95 {
96 let parent = path.parent().unwrap_or(Path::new(".")).to_path_buf();
97 by_dir.entry(parent).or_default().push(path.to_path_buf());
98 }
99 }
100
101 let mut results = Vec::new();
102 for (parent, files) in &by_dir {
103 let mut by_stem: std::collections::HashMap<String, PathBuf> =
105 std::collections::HashMap::new();
106 for file in files {
107 let stem = file
108 .file_stem()
109 .unwrap_or_default()
110 .to_string_lossy()
111 .to_string();
112 let ext = file
113 .extension()
114 .unwrap_or_default()
115 .to_string_lossy()
116 .to_string();
117 match by_stem.get(&stem) {
118 Some(existing) => {
119 let existing_ext = existing
121 .extension()
122 .unwrap_or_default()
123 .to_string_lossy()
124 .to_string();
125 if ext == "xml" && existing_ext != "xml" {
126 by_stem.insert(stem, file.clone());
127 }
128 }
129 None => {
130 by_stem.insert(stem, file.clone());
131 }
132 }
133 }
134
135 let has_enrolled = by_stem.keys().any(|s| s.ends_with("enr"));
139
140 let label = parent
141 .file_name()
142 .map(|n| n.to_string_lossy().to_string())
143 .unwrap_or_default();
144
145 for (stem, path) in &by_stem {
146 if has_enrolled && !stem.ends_with("enr") {
147 continue;
148 }
149 results.push((label.clone(), path.clone()));
150 }
151 }
152
153 results.sort_by(|a, b| a.1.cmp(&b.1));
154 results
155}
156
157pub fn find_bill_texts(dir: &Path) -> Vec<(String, PathBuf)> {
160 let mut results = Vec::new();
161 for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
162 let path = entry.path();
163 if path.is_file()
164 && path.extension().is_some_and(|e| e == "txt")
165 && path
166 .file_stem()
167 .is_some_and(|n| n.to_string_lossy().starts_with("BILLS-"))
168 {
169 let label = path
170 .parent()
171 .and_then(|p| p.file_name())
172 .map(|n| n.to_string_lossy().to_string())
173 .unwrap_or_default();
174 results.push((label, path.to_path_buf()));
175 }
176 }
177 results.sort_by(|a, b| a.1.cmp(&b.1));
178 results
179}
180
181fn find_files(dir: &Path, filename: &str) -> Vec<PathBuf> {
185 let mut results = Vec::new();
186 for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
187 let path = entry.path();
188 if path.is_file() && path.file_name().is_some_and(|n| n == filename) {
189 results.push(path.to_path_buf());
190 }
191 }
192 results.sort();
193 results
194}
195
196fn load_json<T: serde::de::DeserializeOwned>(path: &Path) -> Result<T> {
198 let content = std::fs::read_to_string(path)
199 .with_context(|| format!("Failed to read {}", path.display()))?;
200 let value: T = serde_json::from_str(&content)
201 .with_context(|| format!("Failed to parse {}", path.display()))?;
202 Ok(value)
203}
204
205fn load_json_optional<T: serde::de::DeserializeOwned>(path: &Path) -> Option<T> {
208 if !path.exists() {
209 return None;
210 }
211 match load_json(path) {
212 Ok(v) => Some(v),
213 Err(e) => {
214 debug!("Could not load {}: {e}", path.display());
215 None
216 }
217 }
218}
219
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Final component of `path` as UTF-8.
    ///
    /// Assertions compare against this rather than the whole path, because the
    /// random temp-dir prefix could itself contain a substring such as `enr`
    /// and let a substring check pass for the wrong file.
    fn file_name(path: &std::path::Path) -> &str {
        path.file_name()
            .and_then(|name| name.to_str())
            .expect("path should end in a UTF-8 file name")
    }

    /// Smallest JSON document these tests use as a valid `extraction.json`.
    fn make_minimal_extraction() -> String {
        serde_json::json!({
            "bill": {
                "identifier": "H.R. 1",
                "classification": "regular",
                "short_title": null,
                "fiscal_years": [2024],
                "divisions": [],
                "public_law": null
            },
            "provisions": [],
            "summary": {
                "total_provisions": 0,
                "by_division": {},
                "by_type": {},
                "total_budget_authority": 0,
                "total_rescissions": 0,
                "sections_with_no_provisions": [],
                "flagged_issues": []
            }
        })
        .to_string()
    }

    #[test]
    fn load_bills_empty_dir() {
        let dir = TempDir::new().unwrap();
        let bills = load_bills(dir.path()).unwrap();
        assert!(bills.is_empty());
    }

    #[test]
    fn load_bills_finds_extraction() {
        let dir = TempDir::new().unwrap();
        let bill_dir = dir.path().join("hr").join("1");
        fs::create_dir_all(&bill_dir).unwrap();
        fs::write(bill_dir.join("extraction.json"), make_minimal_extraction()).unwrap();

        let bills = load_bills(dir.path()).unwrap();
        assert_eq!(bills.len(), 1);
        assert_eq!(bills[0].extraction.bill.identifier, "H.R. 1");
        assert_eq!(bills[0].dir, bill_dir);
        // No sidecar files were written, so every optional part must be absent.
        assert!(bills[0].verification.is_none());
        assert!(bills[0].metadata.is_none());
        assert!(bills[0].bill_meta.is_none());
    }

    #[test]
    fn find_bill_texts_filters_correctly() {
        let dir = TempDir::new().unwrap();
        let bill_dir = dir.path().join("9468");
        fs::create_dir_all(&bill_dir).unwrap();
        fs::write(bill_dir.join("BILLS-118hr9468enr.txt"), "bill text").unwrap();
        fs::write(bill_dir.join("notes.txt"), "not a bill").unwrap();

        let texts = find_bill_texts(dir.path());
        assert_eq!(texts.len(), 1);
        assert_eq!(texts[0].0, "9468");
        assert_eq!(file_name(&texts[0].1), "BILLS-118hr9468enr.txt");
    }

    #[test]
    fn find_bill_sources_prefers_xml() {
        let dir = TempDir::new().unwrap();
        let bill_dir = dir.path().join("4366");
        fs::create_dir_all(&bill_dir).unwrap();
        fs::write(bill_dir.join("BILLS-118hr4366enr.xml"), "<bill/>").unwrap();
        fs::write(bill_dir.join("BILLS-118hr4366enr.txt"), "text").unwrap();

        let sources = find_bill_sources(dir.path());
        assert_eq!(sources.len(), 1);
        assert_eq!(sources[0].0, "4366");
        assert_eq!(file_name(&sources[0].1), "BILLS-118hr4366enr.xml");
    }

    #[test]
    fn find_bill_sources_falls_back_to_txt() {
        let dir = TempDir::new().unwrap();
        let bill_dir = dir.path().join("9468");
        fs::create_dir_all(&bill_dir).unwrap();
        fs::write(bill_dir.join("BILLS-118hr9468enr.txt"), "text only").unwrap();

        let sources = find_bill_sources(dir.path());
        assert_eq!(sources.len(), 1);
        assert_eq!(sources[0].0, "9468");
        assert_eq!(file_name(&sources[0].1), "BILLS-118hr9468enr.txt");
    }

    #[test]
    fn find_bill_sources_prefers_enrolled_over_other_versions() {
        let dir = TempDir::new().unwrap();
        let bill_dir = dir.path().join("7463");
        fs::create_dir_all(&bill_dir).unwrap();
        fs::write(bill_dir.join("BILLS-118hr7463enr.xml"), "<bill/>").unwrap();
        fs::write(bill_dir.join("BILLS-118hr7463ih.xml"), "<bill/>").unwrap();
        fs::write(bill_dir.join("BILLS-118hr7463eh.xml"), "<bill/>").unwrap();
        fs::write(bill_dir.join("BILLS-118hr7463eas.xml"), "<bill/>").unwrap();

        let sources = find_bill_sources(dir.path());
        assert_eq!(sources.len(), 1, "Should return only the enrolled version");
        assert_eq!(
            file_name(&sources[0].1),
            "BILLS-118hr7463enr.xml",
            "Should be the enrolled version"
        );
    }

    #[test]
    fn find_bill_sources_keeps_all_if_no_enrolled() {
        let dir = TempDir::new().unwrap();
        let bill_dir = dir.path().join("9999");
        fs::create_dir_all(&bill_dir).unwrap();
        fs::write(bill_dir.join("BILLS-118hr9999ih.xml"), "<bill/>").unwrap();
        fs::write(bill_dir.join("BILLS-118hr9999eh.xml"), "<bill/>").unwrap();

        let sources = find_bill_sources(dir.path());
        assert_eq!(
            sources.len(),
            2,
            "Should return all versions when no enrolled exists"
        );
        // Results are sorted by path, so `eh` precedes `ih`.
        assert_eq!(file_name(&sources[0].1), "BILLS-118hr9999eh.xml");
        assert_eq!(file_name(&sources[1].1), "BILLS-118hr9999ih.xml");
    }
}