Skip to main content

gtfs_structures/
gtfs_reader.rs

1use serde::Deserialize;
2use sha2::{Digest, Sha256};
3
4use crate::{Error, Gtfs, RawGtfs};
5use std::collections::HashMap;
6use std::convert::TryFrom;
7use std::fs::File;
8use std::io::Read;
9use std::path::Path;
10use web_time::Instant;
11
/// Configuration of the GTFS parser
///
/// Start from [Default], adjust the options with the chainable setters and finish with one of the `read*` methods.
///
/// ```
/// let gtfs = gtfs_structures::GtfsReader::default()
///     .read_stop_times(false) // Won’t read the stop times to save time and memory
///     .read_shapes(false) // Won’t read shapes to save time and memory
///     .unkown_enum_as_default(false) // Won’t convert unknown enumerations into default (e.g. LocationType=42 considered as a stop point)
///     .read("fixtures/zips/gtfs.zip")?;
/// assert_eq!(0, gtfs.trips.get("trip1").unwrap().stop_times.len());
/// # Ok::<(), gtfs_structures::error::Error>(())
/// ```
///
/// A [RawGtfs] can be obtained instead by going through [GtfsReader::raw]
/// ```
/// let gtfs = gtfs_structures::GtfsReader::default()
///     .read_stop_times(false)
///     .raw()
///     .read("fixtures/zips/gtfs.zip")?;
/// assert_eq!(1, gtfs.trips?.len());
/// assert_eq!(0, gtfs.stop_times?.len());
/// # Ok::<(), gtfs_structures::error::Error>(())
/// ```
pub struct GtfsReader {
    /// Should the [crate::objects::StopTime] be read
    ///
    /// They are very large and not always needed, so reading them can be skipped
    pub read_stop_times: bool,
    /// Should the [crate::objects::Shape] be read
    ///
    /// They are very large and not always needed, so reading them can be skipped
    pub read_shapes: bool,
    /// If an enumeration has an unknown value, should the default value be used instead
    pub unkown_enum_as_default: bool,
    /// Should the fields be trimmed
    ///
    /// Trimming is quite time consuming.
    /// If performance is an issue, and if your data is high quality, you can switch it off
    pub trim_fields: bool,
}
47
48impl Default for GtfsReader {
49    fn default() -> Self {
50        GtfsReader {
51            read_stop_times: true,
52            read_shapes: true,
53            unkown_enum_as_default: false,
54            trim_fields: true,
55        }
56    }
57}
58
59impl GtfsReader {
60    /// Configures the reader to read or not the stop times (default: true)
61    ///
62    /// This can be useful to save time and memory with large datasets when the timetable are not needed
63    /// Returns Self and can be chained
64    pub fn read_stop_times(mut self, read_stop_times: bool) -> Self {
65        self.read_stop_times = read_stop_times;
66        self
67    }
68
69    /// This can be useful to save time and memory with large datasets when shapes are not needed
70    /// Returns Self and can be chained
71    pub fn read_shapes(mut self, read_shapes: bool) -> Self {
72        self.read_shapes = read_shapes;
73        self
74    }
75
76    /// If a an enumeration has un unknown value, should we use the default value (default: false)
77    ///
78    /// For instance, if [crate::objects::Stop] has a [crate::objects::LocationType] with a value 42 in the GTFS
79    /// when true, we will parse it as StopPoint
80    /// when false, we will parse it as Unknown(42)
81    /// Returns Self and can be chained
82    pub fn unkown_enum_as_default(mut self, unkown_enum_as_default: bool) -> Self {
83        self.unkown_enum_as_default = unkown_enum_as_default;
84        self
85    }
86
87    /// Should the fields be trimmed (default: true)
88    ///
89    /// It is quite time consuming
90    /// If performance is an issue, and if your data is high quality, you can set it to false
91    pub fn trim_fields(mut self, trim_fields: bool) -> Self {
92        self.trim_fields = trim_fields;
93        self
94    }
95
96    /// Reads from an url (if starts with `"http"`), or a local path (either a directory or zipped file)
97    ///
98    /// To read from an url, build with read-url feature
99    /// See also [Gtfs::from_url] and [Gtfs::from_path] if you don’t want the library to guess
100    #[cfg(not(target_arch = "wasm32"))]
101    pub fn read(self, gtfs: &str) -> Result<Gtfs, Error> {
102        self.raw().read(gtfs).and_then(Gtfs::try_from)
103    }
104
105    /// Reads the raw GTFS from a local zip archive or local directory
106    pub fn read_from_path<P>(self, path: P) -> Result<Gtfs, Error>
107    where
108        P: AsRef<Path>,
109    {
110        self.raw().read_from_path(path).and_then(Gtfs::try_from)
111    }
112
113    /// Reads the GTFS from a remote url
114    ///
115    /// The library must be built with the read-url feature. Not available on WASM targets.
116    #[cfg(all(feature = "read-url", not(target_arch = "wasm32")))]
117    pub fn read_from_url<U: reqwest::IntoUrl>(self, url: U) -> Result<Gtfs, Error> {
118        self.raw().read_from_url(url).and_then(Gtfs::try_from)
119    }
120
121    /// Asynchronously reads the GTFS from a remote url
122    ///
123    /// The library must be built with the read-url feature
124    #[cfg(feature = "read-url")]
125    pub async fn read_from_url_async<U: reqwest::IntoUrl>(self, url: U) -> Result<Gtfs, Error> {
126        self.raw()
127            .read_from_url_async(url)
128            .await
129            .and_then(Gtfs::try_from)
130    }
131
132    /// Read the Gtfs as a [RawGtfs].
133    ///
134    /// ```
135    ///let gtfs = gtfs_structures::GtfsReader::default()
136    ///    .read_stop_times(false)
137    ///    .raw()
138    ///    .read("fixtures/zips/gtfs.zip")?;
139    ///assert_eq!(1, gtfs.trips?.len());
140    ///assert_eq!(0, gtfs.stop_times?.len());
141    /// # Ok::<(), gtfs_structures::error::Error>(())
142    ///```
143    pub fn raw(self) -> RawGtfsReader {
144        RawGtfsReader { reader: self }
145    }
146}
147
148/// This reader generates [RawGtfs]. It must be built using [GtfsReader::raw]
149///
150/// The methods to read a Gtfs are the same as for [GtfsReader]
151pub struct RawGtfsReader {
152    reader: GtfsReader,
153}
154
155impl RawGtfsReader {
156    fn read_from_directory(&self, p: &std::path::Path) -> Result<RawGtfs, Error> {
157        let start_of_read_instant = Instant::now();
158        // Thoses files are not mandatory
159        // We use None if they don’t exist, not an Error
160        let files = std::fs::read_dir(p)?
161            .filter_map(|d| {
162                d.ok().and_then(|e| {
163                    e.path()
164                        .strip_prefix(p)
165                        .ok()
166                        .and_then(|f| f.to_str().map(|s| s.to_owned()))
167                })
168            })
169            .collect();
170
171        let mut result = RawGtfs {
172            trips: self.read_objs_from_path(p.join("trips.txt")),
173            calendar: self.read_objs_from_optional_path(p, "calendar.txt"),
174            calendar_dates: self.read_objs_from_optional_path(p, "calendar_dates.txt"),
175            stops: self.read_objs_from_path(p.join("stops.txt")),
176            routes: self.read_objs_from_path(p.join("routes.txt")),
177            stop_times: if self.reader.read_stop_times {
178                self.read_objs_from_path(p.join("stop_times.txt"))
179            } else {
180                Ok(Vec::new())
181            },
182            agencies: self.read_objs_from_path(p.join("agency.txt")),
183            shapes: self.read_objs_from_optional_path(p, "shapes.txt"),
184            fare_attributes: self.read_objs_from_optional_path(p, "fare_attributes.txt"),
185            fare_rules: self.read_objs_from_optional_path(p, "fare_rules.txt"),
186            fare_products: self.read_objs_from_optional_path(p, "fare_products.txt"),
187            fare_media: self.read_objs_from_optional_path(p, "fare_media.txt"),
188            rider_categories: self.read_objs_from_optional_path(p, "rider_categories.txt"),
189            frequencies: self.read_objs_from_optional_path(p, "frequencies.txt"),
190            transfers: self.read_objs_from_optional_path(p, "transfers.txt"),
191            pathways: self.read_objs_from_optional_path(p, "pathways.txt"),
192            feed_info: self.read_objs_from_optional_path(p, "feed_info.txt"),
193            read_duration: start_of_read_instant.elapsed(),
194            translations: self.read_objs_from_optional_path(p, "translations.txt"),
195            ticketing_deep_links: self.read_objs_from_optional_path(p, "ticketing_deep_links.txt"),
196            ticketing_identifiers: self
197                .read_objs_from_optional_path(p, "ticketing_identifiers.txt"),
198            files,
199            source_format: crate::SourceFormat::Directory,
200            sha256: None,
201        };
202
203        if self.reader.unkown_enum_as_default {
204            result.unknown_to_default();
205        }
206        Ok(result)
207    }
208
209    /// Reads from an url (if starts with `"http"`) if the feature `read-url` is activated,
210    /// or a local path (either a directory or zipped file). Not available on WASM targets.
211    #[cfg(not(target_arch = "wasm32"))]
212    pub fn read(self, gtfs: &str) -> Result<RawGtfs, Error> {
213        #[cfg(feature = "read-url")]
214        if gtfs.starts_with("http") {
215            return self.read_from_url(gtfs);
216        }
217        self.read_from_path(gtfs)
218    }
219
220    /// Reads the GTFS from a remote url. Not available on WASM targets.
221    #[cfg(all(feature = "read-url", not(target_arch = "wasm32")))]
222    pub fn read_from_url<U: reqwest::IntoUrl>(self, url: U) -> Result<RawGtfs, Error> {
223        let mut res = reqwest::blocking::get(url)?;
224        let mut body = Vec::new();
225        res.read_to_end(&mut body)?;
226        let cursor = std::io::Cursor::new(body);
227        self.read_from_reader(cursor)
228    }
229
230    /// Asynchronously reads the GTFS from a remote url
231    #[cfg(feature = "read-url")]
232    pub async fn read_from_url_async<U: reqwest::IntoUrl>(self, url: U) -> Result<RawGtfs, Error> {
233        let res = reqwest::get(url).await?.bytes().await?;
234        let reader = std::io::Cursor::new(res);
235        self.read_from_reader(reader)
236    }
237
238    /// Reads the raw GTFS from a local zip archive or local directory
239    pub fn read_from_path<P>(&self, path: P) -> Result<RawGtfs, Error>
240    where
241        P: AsRef<Path>,
242    {
243        let p = path.as_ref();
244        if p.is_file() {
245            let reader = File::open(p)?;
246            self.read_from_reader(reader)
247        } else if p.is_dir() {
248            self.read_from_directory(p)
249        } else {
250            Err(Error::NotFileNorDirectory(format!("{}", p.display())))
251        }
252    }
253
254    pub fn read_from_reader<T: std::io::Read + std::io::Seek>(
255        &self,
256        reader: T,
257    ) -> Result<RawGtfs, Error> {
258        let start_of_read_instant = Instant::now();
259        let hasher = Sha256::new();
260        let mut buf_reader = std::io::BufReader::new(reader);
261        let mut hash_io = digest_io::IoWrapper(hasher);
262        let _n = std::io::copy(&mut buf_reader, &mut hash_io)?;
263        let digest_io::IoWrapper(hasher) = hash_io;
264        let hash = hasher.finalize();
265        let mut archive = zip::ZipArchive::new(buf_reader)?;
266        let mut file_mapping = HashMap::new();
267        let mut files = Vec::new();
268
269        for i in 0..archive.len() {
270            let archive_file = archive.by_index(i)?;
271            files.push(archive_file.name().to_owned());
272
273            for gtfs_file in &[
274                "agency.txt",
275                "calendar.txt",
276                "calendar_dates.txt",
277                "routes.txt",
278                "stops.txt",
279                "stop_times.txt",
280                "trips.txt",
281                "fare_attributes.txt",
282                "fare_rules.txt",
283                "fare_products.txt",
284                "fare_media.txt",
285                "rider_categories.txt",
286                "frequencies.txt",
287                "transfers.txt",
288                "pathways.txt",
289                "feed_info.txt",
290                "shapes.txt",
291            ] {
292                let path = std::path::Path::new(archive_file.name());
293                if path.file_name() == Some(std::ffi::OsStr::new(gtfs_file)) {
294                    file_mapping.insert(gtfs_file, i);
295                    break;
296                }
297            }
298        }
299
300        let mut result = RawGtfs {
301            agencies: self.read_file(&file_mapping, &mut archive, "agency.txt"),
302            calendar: self.read_optional_file(&file_mapping, &mut archive, "calendar.txt"),
303            calendar_dates: self.read_optional_file(
304                &file_mapping,
305                &mut archive,
306                "calendar_dates.txt",
307            ),
308            routes: self.read_file(&file_mapping, &mut archive, "routes.txt"),
309            stops: self.read_file(&file_mapping, &mut archive, "stops.txt"),
310            stop_times: if self.reader.read_stop_times {
311                self.read_file(&file_mapping, &mut archive, "stop_times.txt")
312            } else {
313                Ok(Vec::new())
314            },
315            trips: self.read_file(&file_mapping, &mut archive, "trips.txt"),
316            fare_attributes: self.read_optional_file(
317                &file_mapping,
318                &mut archive,
319                "fare_attributes.txt",
320            ),
321            fare_rules: self.read_optional_file(&file_mapping, &mut archive, "fare_rules.txt"),
322            fare_products: self.read_optional_file(
323                &file_mapping,
324                &mut archive,
325                "fare_products.txt",
326            ),
327            fare_media: self.read_optional_file(&file_mapping, &mut archive, "fare_media.txt"),
328            rider_categories: self.read_optional_file(
329                &file_mapping,
330                &mut archive,
331                "rider_categories.txt",
332            ),
333            frequencies: self.read_optional_file(&file_mapping, &mut archive, "frequencies.txt"),
334            transfers: self.read_optional_file(&file_mapping, &mut archive, "transfers.txt"),
335            pathways: self.read_optional_file(&file_mapping, &mut archive, "pathways.txt"),
336            feed_info: self.read_optional_file(&file_mapping, &mut archive, "feed_info.txt"),
337            shapes: if self.reader.read_shapes {
338                self.read_optional_file(&file_mapping, &mut archive, "shapes.txt")
339            } else {
340                Some(Ok(Vec::new()))
341            },
342            translations: self.read_optional_file(&file_mapping, &mut archive, "translations.txt"),
343            ticketing_deep_links: self.read_optional_file(
344                &file_mapping,
345                &mut archive,
346                "ticketing_deep_links.txt",
347            ),
348            ticketing_identifiers: self.read_optional_file(
349                &file_mapping,
350                &mut archive,
351                "ticketing_identifiers.txt",
352            ),
353            read_duration: start_of_read_instant.elapsed(),
354            files,
355            source_format: crate::SourceFormat::Zip,
356            sha256: Some(base16ct::lower::encode_string(&hash)),
357        };
358
359        if self.reader.unkown_enum_as_default {
360            result.unknown_to_default();
361        }
362        Ok(result)
363    }
364
365    fn read_objs<T, O>(&self, mut reader: T, file_name: &str) -> Result<Vec<O>, Error>
366    where
367        for<'de> O: Deserialize<'de>,
368        T: std::io::Read,
369    {
370        let mut bom = [0; 3];
371        reader
372            .read_exact(&mut bom)
373            .map_err(|e| Error::NamedFileIO {
374                file_name: file_name.to_owned(),
375                source: Box::new(e),
376            })?;
377
378        let chained = if bom != [0xefu8, 0xbbu8, 0xbfu8] {
379            bom.chain(reader)
380        } else {
381            [].chain(reader)
382        };
383
384        let mut reader = csv::ReaderBuilder::new()
385            .flexible(true)
386            .trim(if self.reader.trim_fields {
387                csv::Trim::Fields
388            } else {
389                csv::Trim::None
390            })
391            .from_reader(chained);
392        // We store the headers to be able to return them in case of errors
393        let headers = reader
394            .headers()
395            .map_err(|e| Error::CSVError {
396                file_name: file_name.to_owned(),
397                source: e,
398                line_in_error: None,
399            })?
400            .clone()
401            .into_iter()
402            .map(|x| x.trim())
403            .collect::<csv::StringRecord>();
404
405        // Pre-allocate a StringRecord for performance reasons
406        let mut rec = csv::StringRecord::new();
407        let mut objs = Vec::new();
408
409        // Read each record into the pre-allocated StringRecord one at a time
410        while reader.read_record(&mut rec).map_err(|e| Error::CSVError {
411            file_name: file_name.to_owned(),
412            source: e,
413            line_in_error: None,
414        })? {
415            let obj = rec
416                .deserialize(Some(&headers))
417                .map_err(|e| Error::CSVError {
418                    file_name: file_name.to_owned(),
419                    source: e,
420                    line_in_error: Some(crate::error::LineError {
421                        headers: headers.into_iter().map(String::from).collect(),
422                        values: rec.into_iter().map(String::from).collect(),
423                    }),
424                })?;
425            objs.push(obj);
426        }
427        Ok(objs)
428    }
429
430    fn read_objs_from_path<O>(&self, path: std::path::PathBuf) -> Result<Vec<O>, Error>
431    where
432        for<'de> O: Deserialize<'de>,
433    {
434        let file_name = path
435            .file_name()
436            .and_then(|f| f.to_str())
437            .unwrap_or("invalid_file_name")
438            .to_string();
439        if path.exists() {
440            File::open(path)
441                .map_err(|e| Error::NamedFileIO {
442                    file_name: file_name.to_owned(),
443                    source: Box::new(e),
444                })
445                .and_then(|r| self.read_objs(r, &file_name))
446        } else {
447            Err(Error::MissingFile(file_name))
448        }
449    }
450
451    fn read_objs_from_optional_path<O>(
452        &self,
453        dir_path: &std::path::Path,
454        file_name: &str,
455    ) -> Option<Result<Vec<O>, Error>>
456    where
457        for<'de> O: Deserialize<'de>,
458    {
459        File::open(dir_path.join(file_name))
460            .ok()
461            .map(|r| self.read_objs(r, file_name))
462    }
463
464    fn read_file<O, T>(
465        &self,
466        file_mapping: &HashMap<&&str, usize>,
467        archive: &mut zip::ZipArchive<T>,
468        file_name: &str,
469    ) -> Result<Vec<O>, Error>
470    where
471        for<'de> O: Deserialize<'de>,
472        T: std::io::Read + std::io::Seek,
473    {
474        self.read_optional_file(file_mapping, archive, file_name)
475            .unwrap_or_else(|| Err(Error::MissingFile(file_name.to_owned())))
476    }
477
478    fn read_optional_file<O, T>(
479        &self,
480        file_mapping: &HashMap<&&str, usize>,
481        archive: &mut zip::ZipArchive<T>,
482        file_name: &str,
483    ) -> Option<Result<Vec<O>, Error>>
484    where
485        for<'de> O: Deserialize<'de>,
486        T: std::io::Read + std::io::Seek,
487    {
488        file_mapping.get(&file_name).map(|i| {
489            self.read_objs(
490                archive.by_index(*i).map_err(|e| Error::NamedFileIO {
491                    file_name: file_name.to_owned(),
492                    source: Box::new(e),
493                })?,
494                file_name,
495            )
496        })
497    }
498}