ben/
utils.rs

1//! This module provides some utility functions for working with assignments
2//! and RLE encoding. It also provides a function to sort a JSON file by a key
3//! so as to make the BEN encoding more efficient.
4
5use super::{log, logln};
6use serde_json::{json, Value};
7use std::collections::HashMap;
8use std::io::{Read, Result, Write};
9use std::result::Result as StdResult;
10
11/// Convert a vector of assignments to a run-length encoded (RLE) vector.
12///
13/// # Arguments
14///
15/// * `assign_vec` - A vector of assignments to convert to RLE.
16///
17/// # Returns
18///
19/// A vector of tuples where the first element is the value and the second element is
20/// the length of the run.
21pub fn assign_to_rle(assign_vec: Vec<u16>) -> Vec<(u16, u16)> {
22    let mut prev_assign: u16 = 0;
23    let mut count: u16 = 0;
24    let mut first = true;
25    let mut rle_vec: Vec<(u16, u16)> = Vec::new();
26
27    for assign in assign_vec {
28        if first {
29            prev_assign = assign;
30            count = 1;
31            first = false;
32            continue;
33        }
34        if assign == prev_assign {
35            count += 1;
36        } else {
37            rle_vec.push((prev_assign, count));
38            // Reset for next run
39            prev_assign = assign;
40            count = 1;
41        }
42    }
43
44    // Handle the last run
45    if count > 0 {
46        rle_vec.push((prev_assign, count));
47    }
48    rle_vec
49}
50
51/// Convert a run-length encoded (RLE) vector to a vector of assignments.
52///
53/// # Arguments
54///
55/// * `rle_vec` - A vector of tuples where the first element is the value and the second element is
56/// the length of the run.
57///
58/// # Returns
59///
60/// A vector of assignments.
61pub fn rle_to_vec(rle_vec: Vec<(u16, u16)>) -> Vec<u16> {
62    let mut output_vec: Vec<u16> = Vec::new();
63    for (val, len) in rle_vec {
64        for _ in 0..len {
65            output_vec.push(val);
66        }
67    }
68    output_vec
69}
70
71/// Sorts a JSON-formatted NetworkX graph file by a key.
72/// This function will sort the nodes in the graph by the key provided and
73/// then relabel the nodes in the graph from 0 to n-1 where n is the number
74/// of nodes in the graph. It will also relabel the edges in the graph to
75/// match the new node labels.
76///
77/// # Arguments
78///
79/// * `reader` - A reader for the JSON file to sort.
80/// * `writer` - A writer for the new JSON file.
81/// * `key` - The key to sort the nodes by.
82///
83/// # Returns
84///
85/// A Result containing a HashMap from the old node labels to the new node labels.
86pub fn sort_json_file_by_key<R: Read, W: Write>(
87    reader: R,
88    mut writer: W,
89    key: &str,
90) -> Result<HashMap<usize, usize>> {
91    logln!("Loading JSON file...");
92    let mut data: Value = serde_json::from_reader(reader).unwrap();
93
94    logln!("Sorting JSON file by key: {}", key);
95    if let Some(nodes) = data["nodes"].as_array_mut() {
96        nodes.sort_by(|a, b| {
97            let extract_value = |val: &Value| -> StdResult<u64, String> {
98                match &val[key] {
99                    Value::String(s) => s.parse::<u64>().map_err(|_| s.clone()),
100                    Value::Number(n) => n.as_u64().ok_or_else(|| n.to_string()),
101                    _ => Err(val[key].to_string()),
102                }
103            };
104
105            match (extract_value(a), extract_value(b)) {
106                (Ok(a_num), Ok(b_num)) => a_num.cmp(&b_num),
107                (Err(a_str), Err(b_str)) => a_str.cmp(&b_str),
108                (Err(a_str), Ok(b_num)) => a_str.cmp(&b_num.to_string()),
109                (Ok(a_num), Err(b_str)) => a_num.to_string().cmp(&b_str),
110            }
111        });
112    }
113
114    let mut node_map = HashMap::new();
115    let mut rev_node_map = HashMap::new();
116    if let Some(nodes) = data["nodes"].as_array_mut() {
117        for (i, node) in nodes.iter_mut().enumerate() {
118            log!("Relabeling node: {}\r", i + 1);
119            node_map.insert(node["id"].to_string().parse::<usize>().unwrap(), i);
120            rev_node_map.insert(i, node["id"].to_string().parse::<usize>().unwrap());
121            node["id"] = json!(i);
122        }
123    }
124    logln!();
125
126    let mut edge_array = Vec::new();
127    if let Some(edges) = data["adjacency"].as_array() {
128        for i in 0..edges.len() {
129            log!("Relabeling edge: {}\r", i + 1);
130            let edge_list_location =
131                rev_node_map[&data["nodes"][i]["id"].to_string().parse::<usize>().unwrap()];
132            let mut new_edge_lst = edges[edge_list_location].as_array().unwrap().clone();
133            for link in new_edge_lst.iter_mut() {
134                let new = node_map[&link["id"].to_string().parse::<usize>().unwrap()];
135                link["id"] = json!(new);
136            }
137            edge_array.push(new_edge_lst);
138        }
139    }
140    logln!();
141
142    data["adjacency"] = json!(edge_array);
143
144    logln!("Writing new json to file...");
145    writer.write_all(serde_json::to_string(&data).unwrap().as_bytes())?;
146
147    Ok(node_map)
148}
149
150#[cfg(test)]
151mod test {
152    use super::*;
153
154    #[test]
155    fn test_assign_to_rle() {
156        let assign_vec: Vec<u16> = vec![1, 1, 1, 2, 2, 3];
157
158        let result: Vec<(u16, u16)> = vec![(1, 3), (2, 2), (3, 1)];
159
160        assert_eq!(assign_to_rle(assign_vec), result);
161    }
162
163    #[test]
164    fn test_rle_to_vec() {
165        let rle_vec: Vec<(u16, u16)> = vec![(1, 3), (2, 2), (3, 1)];
166
167        let result: Vec<u16> = vec![1, 1, 1, 2, 2, 3];
168
169        assert_eq!(rle_to_vec(rle_vec), result);
170    }
171
172    #[test]
173    fn test_relabel_small_file() {
174        //
175        // 6 -- 7 -- 8
176        // |    |    |
177        // 3 -- 4 -- 5
178        // |    |    |
179        // 0 -- 1 -- 2
180        //
181        let input = r#"{
182    "adjacency": [
183        [ { "id": 3 }, { "id": 1 } ],
184        [ { "id": 0 }, { "id": 4 }, { "id": 2 } ],
185        [ { "id": 1 }, { "id": 5 } ],
186        [ { "id": 0 }, { "id": 6 }, { "id": 4 } ],
187        [ { "id": 1 }, { "id": 3 }, { "id": 7 }, { "id": 5 } ],
188        [ { "id": 2 }, { "id": 4 }, { "id": 8 } ],
189        [ { "id": 3 }, { "id": 7 } ],
190        [ { "id": 4 }, { "id": 6 }, { "id": 8 } ],
191        [ { "id": 5 }, { "id": 7 } ]
192    ],
193    "directed": false,
194    "graph": [],
195    "multigraph": false,
196    "nodes": [
197        {
198            "TOTPOP": 1,
199            "boundary_nodes": true,
200            "boundary_perim": 1,
201            "GEOID20": "20258288005",
202            "id": 0
203        },
204        {
205            "TOTPOP": 1,
206            "boundary_nodes": true,
207            "boundary_perim": 1,
208            "GEOID20": "20258288004",
209            "id": 1
210        },
211        {
212            "TOTPOP": 1,
213            "boundary_nodes": true,
214            "boundary_perim": 1,
215            "GEOID20": "20258288003",
216            "id": 2
217        },
218        {
219            "TOTPOP": 1,
220            "boundary_nodes": true,
221            "boundary_perim": 1,
222            "GEOID20": "20258288006",
223            "id": 3
224        },
225        {
226            "TOTPOP": 1,
227            "boundary_nodes": false,
228            "boundary_perim": 0,
229            "GEOID20": "20258288001",
230            "id": 4
231        },
232        {
233            "TOTPOP": 1,
234            "boundary_nodes": true,
235            "boundary_perim": 1,
236            "GEOID20": "20258288002",
237            "id": 5
238        },
239        {
240            "TOTPOP": 1,
241            "boundary_nodes": true,
242            "boundary_perim": 1,
243            "GEOID20": "20258288007",
244            "id": 6
245        },
246        {
247            "TOTPOP": 1,
248            "boundary_nodes": true,
249            "boundary_perim": 1,
250            "GEOID20": "20258288008",
251            "id": 7
252        },
253        {
254            "TOTPOP": 1,
255            "boundary_nodes": true,
256            "boundary_perim": 1,
257            "GEOID20": "20258288009",
258            "id": 8
259        }
260    ]
261}
262"#;
263
264        let reader = input.as_bytes();
265
266        let mut output = Vec::new();
267        let writer = &mut output;
268
269        let key = "GEOID20";
270
271        let _ = sort_json_file_by_key(reader, writer, key).unwrap();
272
273        //
274        // 6 -- 7 -- 8
275        // |    |    |
276        // 5 -- 0 -- 1
277        // |    |    |
278        // 4 -- 3 -- 2
279        //
280        let expected_output = r#"{
281    "adjacency": [
282        [ { "id": 3 }, { "id": 5 }, { "id": 7 }, { "id": 1 } ],
283        [ { "id": 2 }, { "id": 0 }, { "id": 8 } ], [ { "id": 3 }, { "id": 1 } ],
284        [ { "id": 4 }, { "id": 0 }, { "id": 2 } ],
285        [ { "id": 5 }, { "id": 3 } ],
286        [ { "id": 4 }, { "id": 6 }, { "id": 0 } ],
287        [ { "id": 5 }, { "id": 7 } ],
288        [ { "id": 0 }, { "id": 6 }, { "id": 8 } ],
289        [ { "id": 1 }, { "id": 7 } ]
290    ],
291    "directed": false,
292    "graph": [],
293    "multigraph": false,
294    "nodes": [
295        {
296            "GEOID20": "20258288001",
297            "TOTPOP": 1,
298            "boundary_nodes": false,
299            "boundary_perim": 0,
300            "id": 0
301        },
302        {
303            "GEOID20": "20258288002",
304            "TOTPOP": 1,
305            "boundary_nodes": true,
306            "boundary_perim": 1,
307            "id": 1
308        },
309        {
310            "GEOID20": "20258288003",
311            "TOTPOP": 1,
312            "boundary_nodes": true,
313            "boundary_perim": 1,
314            "id": 2
315        },
316        {
317            "GEOID20": "20258288004",
318            "TOTPOP": 1,
319            "boundary_nodes": true,
320            "boundary_perim": 1,
321            "id": 3
322        },
323        {
324            "GEOID20": "20258288005",
325            "TOTPOP": 1,
326            "boundary_nodes": true,
327            "boundary_perim": 1,
328            "id": 4
329        },
330        {
331            "GEOID20": "20258288006",
332            "TOTPOP": 1,
333            "boundary_nodes": true,
334            "boundary_perim": 1,
335            "id": 5
336        },
337        {
338            "GEOID20": "20258288007",
339            "TOTPOP": 1,
340            "boundary_nodes": true,
341            "boundary_perim": 1,
342            "id": 6
343        },
344        {
345            "GEOID20": "20258288008",
346            "TOTPOP": 1,
347            "boundary_nodes": true,
348            "boundary_perim": 1,
349            "id": 7
350        },
351        {
352            "GEOID20": "20258288009",
353            "TOTPOP": 1,
354            "boundary_nodes": true,
355            "boundary_perim": 1,
356            "id": 8
357        }
358    ]
359}
360"#;
361
362        logln!();
363        let output_json: Value = serde_json::from_slice(&output).unwrap();
364        let expected_output_json: Value = serde_json::from_str(expected_output).unwrap();
365
366        assert_eq!(output_json, expected_output_json);
367    }
368}