// gtfs_structures/gtfs_reader.rs

1use serde::Deserialize;
2use sha2::{Digest, Sha256};
3
4use crate::{Error, Gtfs, RawGtfs};
5use std::collections::HashMap;
6use std::convert::TryFrom;
7use std::fs::File;
8use std::io::Read;
9use std::path::Path;
10use web_time::Instant;
11
/// Allows to parameterize how the parsing library behaves
///
/// ```
/// let gtfs = gtfs_structures::GtfsReader::default()
///     .read_stop_times(false) // Won’t read the stop times to save time and memory
///     .read_shapes(false) // Won’t read shapes to save time and memory
///     .unkown_enum_as_default(false) // Won’t convert unknown enumerations into default (e.g. LocationType=42 considered as a stop point)
///     .read("fixtures/zips/gtfs.zip")?;
/// assert_eq!(0, gtfs.trips.get("trip1").unwrap().stop_times.len());
/// # Ok::<(), gtfs_structures::error::Error>(())
/// ```
///
/// You can also get a [RawGtfs] by doing
/// ```
/// let gtfs = gtfs_structures::GtfsReader::default()
///     .read_stop_times(false)
///     .raw()
///     .read("fixtures/zips/gtfs.zip")?;
/// assert_eq!(1, gtfs.trips?.len());
/// assert_eq!(0, gtfs.stop_times?.len());
/// # Ok::<(), gtfs_structures::error::Error>(())
/// ```
#[derive(Debug, Clone)]
pub struct GtfsReader {
    /// [crate::objects::StopTime] are very large and not always needed. This allows to skip reading them
    pub read_stop_times: bool,
    /// [crate::objects::Shape] are very large and not always needed. This allows to skip reading them
    pub read_shapes: bool,
    /// If an enumeration has an unknown value, should we use the default value
    pub unkown_enum_as_default: bool,
    /// Should the fields be trimmed
    ///
    /// Trimming is quite time consuming.
    /// If performance is an issue, and if your data is high quality, you can switch it off
    pub trim_fields: bool,
}
47
48impl Default for GtfsReader {
49    fn default() -> Self {
50        GtfsReader {
51            read_stop_times: true,
52            read_shapes: true,
53            unkown_enum_as_default: false,
54            trim_fields: true,
55        }
56    }
57}
58
59impl GtfsReader {
60    /// Configures the reader to read or not the stop times (default: true)
61    ///
62    /// This can be useful to save time and memory with large datasets when the timetable are not needed
63    /// Returns Self and can be chained
64    pub fn read_stop_times(mut self, read_stop_times: bool) -> Self {
65        self.read_stop_times = read_stop_times;
66        self
67    }
68
69    /// This can be useful to save time and memory with large datasets when shapes are not needed
70    /// Returns Self and can be chained
71    pub fn read_shapes(mut self, read_shapes: bool) -> Self {
72        self.read_shapes = read_shapes;
73        self
74    }
75
76    /// If a an enumeration has un unknown value, should we use the default value (default: false)
77    ///
78    /// For instance, if [crate::objects::Stop] has a [crate::objects::LocationType] with a value 42 in the GTFS
79    /// when true, we will parse it as StopPoint
80    /// when false, we will parse it as Unknown(42)
81    /// Returns Self and can be chained
82    pub fn unkown_enum_as_default(mut self, unkown_enum_as_default: bool) -> Self {
83        self.unkown_enum_as_default = unkown_enum_as_default;
84        self
85    }
86
87    /// Should the fields be trimmed (default: true)
88    ///
89    /// It is quite time consuming
90    /// If performance is an issue, and if your data is high quality, you can set it to false
91    pub fn trim_fields(mut self, trim_fields: bool) -> Self {
92        self.trim_fields = trim_fields;
93        self
94    }
95
96    /// Reads from an url (if starts with `"http"`), or a local path (either a directory or zipped file)
97    ///
98    /// To read from an url, build with read-url feature
99    /// See also [Gtfs::from_url] and [Gtfs::from_path] if you don’t want the library to guess
100    #[cfg(not(target_arch = "wasm32"))]
101    pub fn read(self, gtfs: &str) -> Result<Gtfs, Error> {
102        self.raw().read(gtfs).and_then(Gtfs::try_from)
103    }
104
105    /// Reads the raw GTFS from a local zip archive or local directory
106    pub fn read_from_path<P>(self, path: P) -> Result<Gtfs, Error>
107    where
108        P: AsRef<Path>,
109    {
110        self.raw().read_from_path(path).and_then(Gtfs::try_from)
111    }
112
113    /// Reads the GTFS from a remote url
114    ///
115    /// The library must be built with the read-url feature. Not available on WASM targets.
116    #[cfg(all(feature = "read-url", not(target_arch = "wasm32")))]
117    pub fn read_from_url<U: reqwest::IntoUrl>(self, url: U) -> Result<Gtfs, Error> {
118        self.raw().read_from_url(url).and_then(Gtfs::try_from)
119    }
120
121    /// Asynchronously reads the GTFS from a remote url
122    ///
123    /// The library must be built with the read-url feature
124    #[cfg(feature = "read-url")]
125    pub async fn read_from_url_async<U: reqwest::IntoUrl>(self, url: U) -> Result<Gtfs, Error> {
126        self.raw()
127            .read_from_url_async(url)
128            .await
129            .and_then(Gtfs::try_from)
130    }
131
132    /// Read the Gtfs as a [RawGtfs].
133    ///
134    /// ```
135    ///let gtfs = gtfs_structures::GtfsReader::default()
136    ///    .read_stop_times(false)
137    ///    .raw()
138    ///    .read("fixtures/zips/gtfs.zip")?;
139    ///assert_eq!(1, gtfs.trips?.len());
140    ///assert_eq!(0, gtfs.stop_times?.len());
141    /// # Ok::<(), gtfs_structures::error::Error>(())
142    ///```
143    pub fn raw(self) -> RawGtfsReader {
144        RawGtfsReader { reader: self }
145    }
146}
147
148/// This reader generates [RawGtfs]. It must be built using [GtfsReader::raw]
149///
150/// The methods to read a Gtfs are the same as for [GtfsReader]
151pub struct RawGtfsReader {
152    reader: GtfsReader,
153}
154
155impl RawGtfsReader {
156    fn read_from_directory(&self, p: &std::path::Path) -> Result<RawGtfs, Error> {
157        let start_of_read_instant = Instant::now();
158        // Thoses files are not mandatory
159        // We use None if they don’t exist, not an Error
160        let files = std::fs::read_dir(p)?
161            .filter_map(|d| {
162                d.ok().and_then(|e| {
163                    e.path()
164                        .strip_prefix(p)
165                        .ok()
166                        .and_then(|f| f.to_str().map(|s| s.to_owned()))
167                })
168            })
169            .collect();
170
171        let mut result = RawGtfs {
172            trips: self.read_objs_from_path(p.join("trips.txt")),
173            calendar: self.read_objs_from_optional_path(p, "calendar.txt"),
174            calendar_dates: self.read_objs_from_optional_path(p, "calendar_dates.txt"),
175            stops: self.read_objs_from_path(p.join("stops.txt")),
176            routes: self.read_objs_from_path(p.join("routes.txt")),
177            stop_times: if self.reader.read_stop_times {
178                self.read_objs_from_path(p.join("stop_times.txt"))
179            } else {
180                Ok(Vec::new())
181            },
182            agencies: self.read_objs_from_path(p.join("agency.txt")),
183            shapes: self.read_objs_from_optional_path(p, "shapes.txt"),
184            fare_attributes: self.read_objs_from_optional_path(p, "fare_attributes.txt"),
185            fare_rules: self.read_objs_from_optional_path(p, "fare_rules.txt"),
186            fare_products: self.read_objs_from_optional_path(p, "fare_products.txt"),
187            fare_media: self.read_objs_from_optional_path(p, "fare_media.txt"),
188            rider_categories: self.read_objs_from_optional_path(p, "rider_categories.txt"),
189            frequencies: self.read_objs_from_optional_path(p, "frequencies.txt"),
190            transfers: self.read_objs_from_optional_path(p, "transfers.txt"),
191            pathways: self.read_objs_from_optional_path(p, "pathways.txt"),
192            feed_info: self.read_objs_from_optional_path(p, "feed_info.txt"),
193            read_duration: start_of_read_instant.elapsed(),
194            translations: self.read_objs_from_optional_path(p, "translations.txt"),
195            ticketing_deep_links: self.read_objs_from_optional_path(p, "ticketing_deep_links.txt"),
196            ticketing_identifiers: self
197                .read_objs_from_optional_path(p, "ticketing_identifiers.txt"),
198            files,
199            source_format: crate::SourceFormat::Directory,
200            sha256: None,
201        };
202
203        if self.reader.unkown_enum_as_default {
204            result.unknown_to_default();
205        }
206        Ok(result)
207    }
208
209    /// Reads from an url (if starts with `"http"`) if the feature `read-url` is activated,
210    /// or a local path (either a directory or zipped file). Not available on WASM targets.
211    #[cfg(not(target_arch = "wasm32"))]
212    pub fn read(self, gtfs: &str) -> Result<RawGtfs, Error> {
213        #[cfg(feature = "read-url")]
214        if gtfs.starts_with("http") {
215            return self.read_from_url(gtfs);
216        }
217        self.read_from_path(gtfs)
218    }
219
220    /// Reads the GTFS from a remote url. Not available on WASM targets.
221    #[cfg(all(feature = "read-url", not(target_arch = "wasm32")))]
222    pub fn read_from_url<U: reqwest::IntoUrl>(self, url: U) -> Result<RawGtfs, Error> {
223        let mut res = reqwest::blocking::get(url)?;
224        let mut body = Vec::new();
225        res.read_to_end(&mut body)?;
226        let cursor = std::io::Cursor::new(body);
227        self.read_from_reader(cursor)
228    }
229
230    /// Asynchronously reads the GTFS from a remote url
231    #[cfg(feature = "read-url")]
232    pub async fn read_from_url_async<U: reqwest::IntoUrl>(self, url: U) -> Result<RawGtfs, Error> {
233        let res = reqwest::get(url).await?.bytes().await?;
234        let reader = std::io::Cursor::new(res);
235        self.read_from_reader(reader)
236    }
237
238    /// Reads the raw GTFS from a local zip archive or local directory
239    pub fn read_from_path<P>(&self, path: P) -> Result<RawGtfs, Error>
240    where
241        P: AsRef<Path>,
242    {
243        let p = path.as_ref();
244        if p.is_file() {
245            let reader = File::open(p)?;
246            self.read_from_reader(reader)
247        } else if p.is_dir() {
248            self.read_from_directory(p)
249        } else {
250            Err(Error::NotFileNorDirectory(format!("{}", p.display())))
251        }
252    }
253
254    pub fn read_from_reader<T: std::io::Read + std::io::Seek>(
255        &self,
256        reader: T,
257    ) -> Result<RawGtfs, Error> {
258        let start_of_read_instant = Instant::now();
259        let hasher = Sha256::new();
260        let mut buf_reader = std::io::BufReader::new(reader);
261        let mut hash_io = digest_io::IoWrapper(hasher);
262        let _n = std::io::copy(&mut buf_reader, &mut hash_io)?;
263        let digest_io::IoWrapper(hasher) = hash_io;
264        let hash = hasher.finalize();
265        let mut archive = zip::ZipArchive::new(buf_reader)?;
266        let mut file_mapping = HashMap::new();
267        let mut files = Vec::new();
268
269        for i in 0..archive.len() {
270            let archive_file = archive.by_index(i)?;
271            files.push(archive_file.name().to_owned());
272
273            for gtfs_file in &[
274                "agency.txt",
275                "calendar.txt",
276                "calendar_dates.txt",
277                "routes.txt",
278                "stops.txt",
279                "stop_times.txt",
280                "trips.txt",
281                "fare_attributes.txt",
282                "fare_rules.txt",
283                "fare_products.txt",
284                "fare_media.txt",
285                "rider_categories.txt",
286                "frequencies.txt",
287                "transfers.txt",
288                "pathways.txt",
289                "feed_info.txt",
290                "shapes.txt",
291                "translations.txt",
292                "ticketing_deep_links.txt",
293                "ticketing_identifiers.txt",
294            ] {
295                let path = std::path::Path::new(archive_file.name());
296                if path.file_name() == Some(std::ffi::OsStr::new(gtfs_file)) {
297                    file_mapping.insert(gtfs_file, i);
298                    break;
299                }
300            }
301        }
302
303        let mut result = RawGtfs {
304            agencies: self.read_file(&file_mapping, &mut archive, "agency.txt"),
305            calendar: self.read_optional_file(&file_mapping, &mut archive, "calendar.txt"),
306            calendar_dates: self.read_optional_file(
307                &file_mapping,
308                &mut archive,
309                "calendar_dates.txt",
310            ),
311            routes: self.read_file(&file_mapping, &mut archive, "routes.txt"),
312            stops: self.read_file(&file_mapping, &mut archive, "stops.txt"),
313            stop_times: if self.reader.read_stop_times {
314                self.read_file(&file_mapping, &mut archive, "stop_times.txt")
315            } else {
316                Ok(Vec::new())
317            },
318            trips: self.read_file(&file_mapping, &mut archive, "trips.txt"),
319            fare_attributes: self.read_optional_file(
320                &file_mapping,
321                &mut archive,
322                "fare_attributes.txt",
323            ),
324            fare_rules: self.read_optional_file(&file_mapping, &mut archive, "fare_rules.txt"),
325            fare_products: self.read_optional_file(
326                &file_mapping,
327                &mut archive,
328                "fare_products.txt",
329            ),
330            fare_media: self.read_optional_file(&file_mapping, &mut archive, "fare_media.txt"),
331            rider_categories: self.read_optional_file(
332                &file_mapping,
333                &mut archive,
334                "rider_categories.txt",
335            ),
336            frequencies: self.read_optional_file(&file_mapping, &mut archive, "frequencies.txt"),
337            transfers: self.read_optional_file(&file_mapping, &mut archive, "transfers.txt"),
338            pathways: self.read_optional_file(&file_mapping, &mut archive, "pathways.txt"),
339            feed_info: self.read_optional_file(&file_mapping, &mut archive, "feed_info.txt"),
340            shapes: if self.reader.read_shapes {
341                self.read_optional_file(&file_mapping, &mut archive, "shapes.txt")
342            } else {
343                Some(Ok(Vec::new()))
344            },
345            translations: self.read_optional_file(&file_mapping, &mut archive, "translations.txt"),
346            ticketing_deep_links: self.read_optional_file(
347                &file_mapping,
348                &mut archive,
349                "ticketing_deep_links.txt",
350            ),
351            ticketing_identifiers: self.read_optional_file(
352                &file_mapping,
353                &mut archive,
354                "ticketing_identifiers.txt",
355            ),
356            read_duration: start_of_read_instant.elapsed(),
357            files,
358            source_format: crate::SourceFormat::Zip,
359            sha256: Some(base16ct::lower::encode_string(&hash)),
360        };
361
362        if self.reader.unkown_enum_as_default {
363            result.unknown_to_default();
364        }
365        Ok(result)
366    }
367
368    fn read_objs<T, O>(&self, mut reader: T, file_name: &str) -> Result<Vec<O>, Error>
369    where
370        for<'de> O: Deserialize<'de>,
371        T: std::io::Read,
372    {
373        let mut bom = [0; 3];
374        reader
375            .read_exact(&mut bom)
376            .map_err(|e| Error::NamedFileIO {
377                file_name: file_name.to_owned(),
378                source: Box::new(e),
379            })?;
380
381        let chained = if bom != [0xefu8, 0xbbu8, 0xbfu8] {
382            bom.chain(reader)
383        } else {
384            [].chain(reader)
385        };
386
387        let mut reader = csv::ReaderBuilder::new()
388            .flexible(true)
389            .trim(if self.reader.trim_fields {
390                csv::Trim::Fields
391            } else {
392                csv::Trim::None
393            })
394            .from_reader(chained);
395        // We store the headers to be able to return them in case of errors
396        let headers = reader
397            .headers()
398            .map_err(|e| Error::CSVError {
399                file_name: file_name.to_owned(),
400                source: e,
401                line_in_error: None,
402            })?
403            .clone()
404            .into_iter()
405            .map(|x| x.trim())
406            .collect::<csv::StringRecord>();
407
408        // Pre-allocate a StringRecord for performance reasons
409        let mut rec = csv::StringRecord::new();
410        let mut objs = Vec::new();
411
412        // Read each record into the pre-allocated StringRecord one at a time
413        while reader.read_record(&mut rec).map_err(|e| Error::CSVError {
414            file_name: file_name.to_owned(),
415            source: e,
416            line_in_error: None,
417        })? {
418            let obj = rec
419                .deserialize(Some(&headers))
420                .map_err(|e| Error::CSVError {
421                    file_name: file_name.to_owned(),
422                    source: e,
423                    line_in_error: Some(crate::error::LineError {
424                        headers: headers.into_iter().map(String::from).collect(),
425                        values: rec.into_iter().map(String::from).collect(),
426                    }),
427                })?;
428            objs.push(obj);
429        }
430        Ok(objs)
431    }
432
433    fn read_objs_from_path<O>(&self, path: std::path::PathBuf) -> Result<Vec<O>, Error>
434    where
435        for<'de> O: Deserialize<'de>,
436    {
437        let file_name = path
438            .file_name()
439            .and_then(|f| f.to_str())
440            .unwrap_or("invalid_file_name")
441            .to_string();
442        if path.exists() {
443            File::open(path)
444                .map_err(|e| Error::NamedFileIO {
445                    file_name: file_name.to_owned(),
446                    source: Box::new(e),
447                })
448                .and_then(|r| self.read_objs(r, &file_name))
449        } else {
450            Err(Error::MissingFile(file_name))
451        }
452    }
453
454    fn read_objs_from_optional_path<O>(
455        &self,
456        dir_path: &std::path::Path,
457        file_name: &str,
458    ) -> Option<Result<Vec<O>, Error>>
459    where
460        for<'de> O: Deserialize<'de>,
461    {
462        File::open(dir_path.join(file_name))
463            .ok()
464            .map(|r| self.read_objs(r, file_name))
465    }
466
467    fn read_file<O, T>(
468        &self,
469        file_mapping: &HashMap<&&str, usize>,
470        archive: &mut zip::ZipArchive<T>,
471        file_name: &str,
472    ) -> Result<Vec<O>, Error>
473    where
474        for<'de> O: Deserialize<'de>,
475        T: std::io::Read + std::io::Seek,
476    {
477        self.read_optional_file(file_mapping, archive, file_name)
478            .unwrap_or_else(|| Err(Error::MissingFile(file_name.to_owned())))
479    }
480
481    fn read_optional_file<O, T>(
482        &self,
483        file_mapping: &HashMap<&&str, usize>,
484        archive: &mut zip::ZipArchive<T>,
485        file_name: &str,
486    ) -> Option<Result<Vec<O>, Error>>
487    where
488        for<'de> O: Deserialize<'de>,
489        T: std::io::Read + std::io::Seek,
490    {
491        file_mapping.get(&file_name).map(|i| {
492            self.read_objs(
493                archive.by_index(*i).map_err(|e| Error::NamedFileIO {
494                    file_name: file_name.to_owned(),
495                    source: Box::new(e),
496                })?,
497                file_name,
498            )
499        })
500    }
501}