Skip to main content

geo_coding/tree/
io.rs

1use super::Node;
2use super::Tree2D;
3
4use alloc::collections::BTreeMap;
5use alloc::string::String;
6use alloc::vec;
7use alloc::vec::Vec;
8
9impl Tree2D<i64, String> {
10    /// Writes the tree into a stream in RGC format.
11    ///
12    /// RGC is an internal format of this crate that uses columnar storage to compress the data.
13    #[cfg_attr(docsrs, doc(cfg(feature = "std")))]
14    pub fn write(&self, mut writer: impl std::io::Write) -> std::io::Result<()> {
15        use super::write::Write;
16        writer.write_u32(self.nodes.len() as u32)?;
17        writer.write_sign_magnitude(self.nodes.iter().map(|Node { location, .. }| location[0]))?;
18        writer.write_sign_magnitude(self.nodes.iter().map(|Node { location, .. }| location[1]))?;
19        writer.write_magnitude_monotonic(
20            self.nodes
21                .iter()
22                .map(|Node { lesser_index, .. }| *lesser_index),
23        )?;
24        writer.write_magnitude_monotonic(
25            self.nodes
26                .iter()
27                .map(|Node { greater_index, .. }| *greater_index),
28        )?;
29        // Value is the number of occurences of a particular word.
30        let mut words: BTreeMap<&str, usize> = BTreeMap::new();
31        let mut word_counts = Vec::with_capacity(self.nodes.len());
32        for Node { value, .. } in self.nodes.iter() {
33            let mut word_count = 0_u32;
34            for word in value.split(' ') {
35                *words.entry(word).or_default() += 1;
36                word_count += 1;
37            }
38            word_counts.push(word_count);
39        }
40        for (i, (_word, index)) in words.iter_mut().enumerate() {
41            *index = i;
42        }
43        writer.write_magnitude(word_counts.iter().copied())?;
44        // Write dictionary.
45        writer.write_u32(words.len() as u32)?;
46        writer.write_magnitude(words.keys().map(|word| word.len() as u32))?;
47        for (word, _index) in words.iter() {
48            writer.write_bytes(word.as_bytes())?;
49        }
50        // Write names.
51        let indices: Vec<_> = self
52            .nodes
53            .iter()
54            .flat_map(|Node { value, .. }| {
55                value
56                    .split(' ')
57                    .map(|word| words.get(word).copied().expect("Must exist") as u32)
58            })
59            .collect();
60        writer.write_u32(indices.len() as u32)?;
61        writer.write_magnitude(indices.into_iter())?;
62        Ok(())
63    }
64
65    /// Reads a tree from the stream in RGC format.
66    ///
67    /// # RGC format
68    ///
69    /// RGC is an internal format of this crate that uses columnar storage to compress the data.
70    /// To obtain a file in such a format you can use CLI utility provided in the Git repo.
71    /// You can use any OSM-PBF file as an input.
72    ///
73    /// In the following script `europe-latest.osm.pbf` (32 GiB) is used as an input.
74    /// CLI utiliy extracts names from each node, compresses them, and produces `other.rgc.zst` and
75    /// `settlements.rgc.zst`. The total size of the resulting files is around 200 MiB.
76    ///
77    /// ```bash
78    /// cargo run --bin geo-coding-cli --release -- convert europe-latest.osm.pbf
79    /// ```
80    #[cfg_attr(docsrs, doc(cfg(feature = "std")))]
81    pub fn read(mut reader: impl std::io::Read) -> std::io::Result<Self> {
82        use super::read::Read;
83        let num_points = reader.read_u32()? as usize;
84        let mut nodes = vec![Node::default(); num_points];
85        let longitudes = reader.read_sign_magnitude(num_points)?;
86        for (node, longitude) in nodes.iter_mut().zip(longitudes.into_iter()) {
87            node.location[0] = longitude;
88        }
89        let latitudes = reader.read_sign_magnitude(num_points)?;
90        for (node, latitude) in nodes.iter_mut().zip(latitudes.into_iter()) {
91            node.location[1] = latitude;
92        }
93        let lesser_indices = reader.read_magnitude_monotonic(num_points)?;
94        for (node, lesser_index) in nodes.iter_mut().zip(lesser_indices.into_iter()) {
95            node.lesser_index = lesser_index;
96        }
97        let greater_indices = reader.read_magnitude_monotonic(num_points)?;
98        for (node, greater_index) in nodes.iter_mut().zip(greater_indices.into_iter()) {
99            node.greater_index = greater_index;
100        }
101        let word_counts = reader.read_magnitude(num_points)?;
102        let num_words = reader.read_u32()? as usize;
103        let word_lens = reader.read_magnitude(num_words)?;
104        let mut words = Vec::with_capacity(num_words);
105        let mut buf = Vec::new();
106        for word_len in word_lens.iter().copied() {
107            buf.resize(word_len as usize, 0_u8);
108            reader.read_bytes(&mut buf[..])?;
109            let word = String::from_utf8(std::mem::take(&mut buf))
110                .map_err(|_| std::io::Error::other("Non-UTF-8 word"))?;
111            words.push(word);
112        }
113        drop(word_lens);
114        drop(buf);
115        let num_indices = reader.read_u32()? as usize;
116        let mut indices = reader.read_magnitude(num_indices)?.into_iter();
117        let mut buf = String::new();
118        for (node, word_count) in nodes.iter_mut().zip(word_counts.into_iter()) {
119            buf.clear();
120            for _ in 0..word_count {
121                let index = indices.next().ok_or(std::io::ErrorKind::InvalidData)?;
122                let word = words[index as usize].as_str();
123                buf.push_str(word);
124                buf.push(' ');
125            }
126            buf.pop();
127            node.value = buf.clone();
128        }
129        // TODO validate
130        Ok(Self { nodes })
131    }
132}
133
134#[cfg(test)]
135mod tests {
136    use super::*;
137    use arbitrary::Arbitrary;
138    use arbitrary::Unstructured;
139    use arbtest::arbtest;
140
141    #[test]
142    fn io_works() {
143        arbtest(|u| {
144            let nodes: Vec<TestNode> = u.arbitrary()?;
145            let nodes: Vec<([i64; 2], String)> = nodes
146                .into_iter()
147                .map(|TestNode(location, name)| (location, name))
148                .collect();
149            let mut buf = Vec::new();
150            let tree = Tree2D::from_nodes(nodes);
151            tree.write(&mut buf).unwrap();
152            let actual = Tree2D::<i64, String>::read(&buf[..])
153                .unwrap_or_else(|e| panic!("Decoding failed: {e}; tree = {tree:?}"));
154            assert_eq!(tree, actual);
155            Ok(())
156        });
157    }
158
159    struct TestNode([i64; 2], String);
160
161    impl<'a> Arbitrary<'a> for TestNode {
162        fn arbitrary(u: &mut Unstructured<'a>) -> arbitrary::Result<Self> {
163            let latitude = u.int_in_range(-90_000_000_000..=90_000_000_000)?;
164            let longitude = u.int_in_range(-180_000_000_000..=180_000_000_000)?;
165            let name = u.arbitrary()?;
166            Ok(Self([longitude, latitude], name))
167        }
168    }
169}