Skip to main content

congress_appropriations/approp/
loading.rs

1//! Shared bill loading and directory walking utilities.
2//!
3//! Provides functions to discover bill directories and deserialize
4//! extraction/verification artifacts from JSON files on disk.
5
6use crate::approp::bill_meta::BillMeta;
7use crate::approp::ontology::{BillExtraction, ExtractionMetadata};
8use crate::approp::verification::VerificationReport;
9use anyhow::{Context, Result};
10use std::path::{Path, PathBuf};
11use tracing::debug;
12use walkdir::WalkDir;
13
14/// A bill directory with its loaded artifacts.
15#[derive(Debug, Clone)]
16pub struct LoadedBill {
17    /// Path to the bill directory
18    pub dir: PathBuf,
19    /// The extraction output (provisions, summary, bill info)
20    pub extraction: BillExtraction,
21    /// Verification report, if available
22    pub verification: Option<VerificationReport>,
23    /// Extraction metadata (model, prompt version, timestamps), if available
24    pub metadata: Option<ExtractionMetadata>,
25    /// Bill-level metadata (fiscal years, jurisdictions, advance classification), if available
26    pub bill_meta: Option<BillMeta>,
27}
28
29/// Walk a directory tree, find all bill directories (those containing extraction.json),
30/// load and deserialize all artifacts. Returns sorted by bill identifier.
31pub fn load_bills(dir: &Path) -> Result<Vec<LoadedBill>> {
32    let extraction_files = find_files(dir, "extraction.json");
33
34    if extraction_files.is_empty() {
35        return Ok(Vec::new());
36    }
37
38    let mut bills = Vec::with_capacity(extraction_files.len());
39
40    for ext_path in &extraction_files {
41        let bill_dir = ext_path.parent().unwrap_or(Path::new(".")).to_path_buf();
42
43        let extraction: BillExtraction = load_json(ext_path)
44            .with_context(|| format!("Failed to load {}", ext_path.display()))?;
45
46        let verification: Option<VerificationReport> =
47            load_json_optional(&bill_dir.join("verification.json"));
48
49        let metadata: Option<ExtractionMetadata> =
50            load_json_optional(&bill_dir.join("metadata.json"));
51
52        let bill_meta: Option<BillMeta> = load_json_optional(&bill_dir.join("bill_meta.json"));
53
54        debug!(
55            bill = extraction.bill.identifier,
56            dir = %bill_dir.display(),
57            "Loaded bill"
58        );
59
60        bills.push(LoadedBill {
61            dir: bill_dir,
62            extraction,
63            verification,
64            metadata,
65            bill_meta,
66        });
67    }
68
69    // Sort by bill identifier for deterministic ordering
70    bills.sort_by(|a, b| {
71        a.extraction
72            .bill
73            .identifier
74            .cmp(&b.extraction.bill.identifier)
75    });
76
77    Ok(bills)
78}
79
80/// Find all bill source files (BILLS-*.xml or BILLS-*.txt) in a directory tree.
81/// Prefers XML over TXT when both exist for the same bill.
82/// Returns (label, path) pairs where label is the parent directory name.
83pub fn find_bill_sources(dir: &Path) -> Vec<(String, PathBuf)> {
84    // Collect all BILLS-* files grouped by directory
85    let mut by_dir: std::collections::HashMap<PathBuf, Vec<PathBuf>> =
86        std::collections::HashMap::new();
87
88    for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
89        let path = entry.path();
90        if path.is_file()
91            && path
92                .file_stem()
93                .is_some_and(|n| n.to_string_lossy().starts_with("BILLS-"))
94            && path.extension().is_some_and(|e| e == "xml" || e == "txt")
95        {
96            let parent = path.parent().unwrap_or(Path::new(".")).to_path_buf();
97            by_dir.entry(parent).or_default().push(path.to_path_buf());
98        }
99    }
100
101    let mut results = Vec::new();
102    for (parent, files) in &by_dir {
103        // Prefer XML over TXT. Group by stem to handle cases where both exist.
104        let mut by_stem: std::collections::HashMap<String, PathBuf> =
105            std::collections::HashMap::new();
106        for file in files {
107            let stem = file
108                .file_stem()
109                .unwrap_or_default()
110                .to_string_lossy()
111                .to_string();
112            let ext = file
113                .extension()
114                .unwrap_or_default()
115                .to_string_lossy()
116                .to_string();
117            match by_stem.get(&stem) {
118                Some(existing) => {
119                    // XML wins over TXT
120                    let existing_ext = existing
121                        .extension()
122                        .unwrap_or_default()
123                        .to_string_lossy()
124                        .to_string();
125                    if ext == "xml" && existing_ext != "xml" {
126                        by_stem.insert(stem, file.clone());
127                    }
128                }
129                None => {
130                    by_stem.insert(stem, file.clone());
131                }
132            }
133        }
134
135        // Prefer enrolled versions: if any file stem ends with "enr",
136        // keep only enrolled files and discard other versions (ih, eh, eas, etc.)
137        // to avoid processing draft versions that may have different XML structures.
138        let has_enrolled = by_stem.keys().any(|s| s.ends_with("enr"));
139
140        let label = parent
141            .file_name()
142            .map(|n| n.to_string_lossy().to_string())
143            .unwrap_or_default();
144
145        for (stem, path) in &by_stem {
146            if has_enrolled && !stem.ends_with("enr") {
147                continue;
148            }
149            results.push((label.clone(), path.clone()));
150        }
151    }
152
153    results.sort_by(|a, b| a.1.cmp(&b.1));
154    results
155}
156
157/// Find all bill text files (BILLS-*.txt) in a directory tree.
158/// Returns (label, path) pairs where label is the parent directory name.
159pub fn find_bill_texts(dir: &Path) -> Vec<(String, PathBuf)> {
160    let mut results = Vec::new();
161    for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
162        let path = entry.path();
163        if path.is_file()
164            && path.extension().is_some_and(|e| e == "txt")
165            && path
166                .file_stem()
167                .is_some_and(|n| n.to_string_lossy().starts_with("BILLS-"))
168        {
169            let label = path
170                .parent()
171                .and_then(|p| p.file_name())
172                .map(|n| n.to_string_lossy().to_string())
173                .unwrap_or_default();
174            results.push((label, path.to_path_buf()));
175        }
176    }
177    results.sort_by(|a, b| a.1.cmp(&b.1));
178    results
179}
180
181// ─── Helpers ─────────────────────────────────────────────────────────────────
182
183/// Find all files with a specific name in a directory tree.
184fn find_files(dir: &Path, filename: &str) -> Vec<PathBuf> {
185    let mut results = Vec::new();
186    for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
187        let path = entry.path();
188        if path.is_file() && path.file_name().is_some_and(|n| n == filename) {
189            results.push(path.to_path_buf());
190        }
191    }
192    results.sort();
193    results
194}
195
196/// Deserialize a JSON file into a typed value.
197fn load_json<T: serde::de::DeserializeOwned>(path: &Path) -> Result<T> {
198    let content = std::fs::read_to_string(path)
199        .with_context(|| format!("Failed to read {}", path.display()))?;
200    let value: T = serde_json::from_str(&content)
201        .with_context(|| format!("Failed to parse {}", path.display()))?;
202    Ok(value)
203}
204
205/// Try to deserialize a JSON file, returning None if the file doesn't exist
206/// or can't be parsed.
207fn load_json_optional<T: serde::de::DeserializeOwned>(path: &Path) -> Option<T> {
208    if !path.exists() {
209        return None;
210    }
211    match load_json(path) {
212        Ok(v) => Some(v),
213        Err(e) => {
214            debug!("Could not load {}: {e}", path.display());
215            None
216        }
217    }
218}
219
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Create `bill_dir` (including missing ancestors) and write each
    /// `(file name, contents)` pair into it, in the order given.
    fn populate(bill_dir: &Path, files: &[(&str, &str)]) {
        fs::create_dir_all(bill_dir).unwrap();
        for &(name, contents) in files {
            fs::write(bill_dir.join(name), contents).unwrap();
        }
    }

    /// Smallest extraction document used by these tests: one bill, no provisions.
    fn make_minimal_extraction() -> String {
        let document = serde_json::json!({
            "bill": {
                "identifier": "H.R. 1",
                "classification": "regular",
                "short_title": null,
                "fiscal_years": [2024],
                "divisions": [],
                "public_law": null
            },
            "provisions": [],
            "summary": {
                "total_provisions": 0,
                "by_division": {},
                "by_type": {},
                "total_budget_authority": 0,
                "total_rescissions": 0,
                "sections_with_no_provisions": [],
                "flagged_issues": []
            }
        });
        document.to_string()
    }

    // An empty tree yields no bills and no error.
    #[test]
    fn load_bills_empty_dir() {
        let root = TempDir::new().unwrap();

        let bills = load_bills(root.path()).unwrap();

        assert!(bills.is_empty());
    }

    // A nested directory holding only extraction.json loads with no optional artifacts.
    #[test]
    fn load_bills_finds_extraction() {
        let root = TempDir::new().unwrap();
        let json = make_minimal_extraction();
        populate(
            &root.path().join("hr").join("1"),
            &[("extraction.json", json.as_str())],
        );

        let bills = load_bills(root.path()).unwrap();

        assert_eq!(bills.len(), 1);
        let only = &bills[0];
        assert_eq!(only.extraction.bill.identifier, "H.R. 1");
        assert!(only.verification.is_none());
        assert!(only.metadata.is_none());
    }

    // Only BILLS-*.txt counts as bill text; other .txt files are ignored.
    #[test]
    fn find_bill_texts_filters_correctly() {
        let root = TempDir::new().unwrap();
        populate(
            &root.path().join("9468"),
            &[
                ("BILLS-118hr9468enr.txt", "bill text"),
                ("notes.txt", "not a bill"),
            ],
        );

        let texts = find_bill_texts(root.path());

        assert_eq!(texts.len(), 1);
        assert!(texts[0].1.to_string_lossy().contains("BILLS-"));
    }

    // With both formats present for one stem, the XML file is the one returned.
    #[test]
    fn find_bill_sources_prefers_xml() {
        let root = TempDir::new().unwrap();
        populate(
            &root.path().join("4366"),
            &[
                ("BILLS-118hr4366enr.xml", "<bill/>"),
                ("BILLS-118hr4366enr.txt", "text"),
            ],
        );

        let sources = find_bill_sources(root.path());

        assert_eq!(sources.len(), 1);
        assert!(sources[0].1.to_string_lossy().ends_with(".xml"));
    }

    // A lone TXT file is still reported as a source.
    #[test]
    fn find_bill_sources_falls_back_to_txt() {
        let root = TempDir::new().unwrap();
        populate(
            &root.path().join("9468"),
            &[("BILLS-118hr9468enr.txt", "text only")],
        );

        let sources = find_bill_sources(root.path());

        assert_eq!(sources.len(), 1);
        assert!(sources[0].1.to_string_lossy().ends_with(".txt"));
    }

    // When an enrolled version exists, draft versions in the same directory are dropped.
    #[test]
    fn find_bill_sources_prefers_enrolled_over_other_versions() {
        let root = TempDir::new().unwrap();
        populate(
            &root.path().join("7463"),
            &[
                ("BILLS-118hr7463enr.xml", "<bill/>"),
                ("BILLS-118hr7463ih.xml", "<bill/>"),
                ("BILLS-118hr7463eh.xml", "<bill/>"),
                ("BILLS-118hr7463eas.xml", "<bill/>"),
            ],
        );

        let sources = find_bill_sources(root.path());

        assert_eq!(sources.len(), 1, "Should return only the enrolled version");
        assert!(
            sources[0].1.to_string_lossy().contains("enr"),
            "Should be the enrolled version"
        );
    }

    // Without an enrolled version, every draft version is kept.
    #[test]
    fn find_bill_sources_keeps_all_if_no_enrolled() {
        let root = TempDir::new().unwrap();
        populate(
            &root.path().join("9999"),
            &[
                ("BILLS-118hr9999ih.xml", "<bill/>"),
                ("BILLS-118hr9999eh.xml", "<bill/>"),
            ],
        );

        let sources = find_bill_sources(root.path());

        assert_eq!(
            sources.len(),
            2,
            "Should return all versions when no enrolled exists"
        );
    }
}
342}