Skip to main content

gtfs_structures/
gtfs_reader.rs

1use serde::Deserialize;
2use sha2::{Digest, Sha256};
3
4use crate::{Error, Gtfs, RawGtfs};
5use std::collections::HashMap;
6use std::convert::TryFrom;
7use std::fs::File;
8use std::io::Read;
9use std::path::Path;
10use web_time::Instant;
11
12/// Allows to parameterize how the parsing library behaves
13///
14/// ```
15///let gtfs = gtfs_structures::GtfsReader::default()
16///    .read_stop_times(false) // Won’t read the stop times to save time and memory
17///    .read_shapes(false) // Won’t read shapes to save time and memory
18///    .unkown_enum_as_default(false) // Won’t convert unknown enumerations into default (e.g. LocationType=42 considered as a stop point)
19///    .read("fixtures/zips/gtfs.zip")?;
20///assert_eq!(0, gtfs.trips.get("trip1").unwrap().stop_times.len());
21/// # Ok::<(), gtfs_structures::error::Error>(())
22///```
23///
24/// You can also get a [RawGtfs] by doing
25/// ```
26///let gtfs = gtfs_structures::GtfsReader::default()
27///    .read_stop_times(false)
28///    .raw()
29///    .read("fixtures/zips/gtfs.zip")?;
30///assert_eq!(1, gtfs.trips?.len());
31///assert_eq!(0, gtfs.stop_times?.len());
32/// # Ok::<(), gtfs_structures::error::Error>(())
33///```
34#[derive(Derivative)]
35#[derivative(Default)]
36pub struct GtfsReader {
37    /// [crate::objects::StopTime] are very large and not always needed. This allows to skip reading them
38    #[derivative(Default(value = "true"))]
39    pub read_stop_times: bool,
40    /// [crate::objects::Shape] are very large and not always needed. This allows to skip reading them
41    #[derivative(Default(value = "true"))]
42    pub read_shapes: bool,
43    /// If a an enumeration has an unknown value, should we use the default value
44    #[derivative(Default(value = "false"))]
45    pub unkown_enum_as_default: bool,
46    /// Avoid trimming the fields
47    ///
48    /// It is quite time consuming
49    /// If performance is an issue, and if your data is high quality, you can switch it off
50    #[derivative(Default(value = "true"))]
51    pub trim_fields: bool,
52}
53
54impl GtfsReader {
55    /// Configures the reader to read or not the stop times (default: true)
56    ///
57    /// This can be useful to save time and memory with large datasets when the timetable are not needed
58    /// Returns Self and can be chained
59    pub fn read_stop_times(mut self, read_stop_times: bool) -> Self {
60        self.read_stop_times = read_stop_times;
61        self
62    }
63
64    /// This can be useful to save time and memory with large datasets when shapes are not needed
65    /// Returns Self and can be chained
66    pub fn read_shapes(mut self, read_shapes: bool) -> Self {
67        self.read_shapes = read_shapes;
68        self
69    }
70
71    /// If a an enumeration has un unknown value, should we use the default value (default: false)
72    ///
73    /// For instance, if [crate::objects::Stop] has a [crate::objects::LocationType] with a value 42 in the GTFS
74    /// when true, we will parse it as StopPoint
75    /// when false, we will parse it as Unknown(42)
76    /// Returns Self and can be chained
77    pub fn unkown_enum_as_default(mut self, unkown_enum_as_default: bool) -> Self {
78        self.unkown_enum_as_default = unkown_enum_as_default;
79        self
80    }
81
82    /// Should the fields be trimmed (default: true)
83    ///
84    /// It is quite time consuming
85    /// If performance is an issue, and if your data is high quality, you can set it to false
86    pub fn trim_fields(mut self, trim_fields: bool) -> Self {
87        self.trim_fields = trim_fields;
88        self
89    }
90
91    /// Reads from an url (if starts with `"http"`), or a local path (either a directory or zipped file)
92    ///
93    /// To read from an url, build with read-url feature
94    /// See also [Gtfs::from_url] and [Gtfs::from_path] if you don’t want the library to guess
95    #[cfg(not(target_arch = "wasm32"))]
96    pub fn read(self, gtfs: &str) -> Result<Gtfs, Error> {
97        self.raw().read(gtfs).and_then(Gtfs::try_from)
98    }
99
100    /// Reads the raw GTFS from a local zip archive or local directory
101    pub fn read_from_path<P>(self, path: P) -> Result<Gtfs, Error>
102    where
103        P: AsRef<Path>,
104    {
105        self.raw().read_from_path(path).and_then(Gtfs::try_from)
106    }
107
108    /// Reads the GTFS from a remote url
109    ///
110    /// The library must be built with the read-url feature. Not available on WASM targets.
111    #[cfg(all(feature = "read-url", not(target_arch = "wasm32")))]
112    pub fn read_from_url<U: reqwest::IntoUrl>(self, url: U) -> Result<Gtfs, Error> {
113        self.raw().read_from_url(url).and_then(Gtfs::try_from)
114    }
115
116    /// Asynchronously reads the GTFS from a remote url
117    ///
118    /// The library must be built with the read-url feature
119    #[cfg(feature = "read-url")]
120    pub async fn read_from_url_async<U: reqwest::IntoUrl>(self, url: U) -> Result<Gtfs, Error> {
121        self.raw()
122            .read_from_url_async(url)
123            .await
124            .and_then(Gtfs::try_from)
125    }
126
127    /// Read the Gtfs as a [RawGtfs].
128    ///
129    /// ```
130    ///let gtfs = gtfs_structures::GtfsReader::default()
131    ///    .read_stop_times(false)
132    ///    .raw()
133    ///    .read("fixtures/zips/gtfs.zip")?;
134    ///assert_eq!(1, gtfs.trips?.len());
135    ///assert_eq!(0, gtfs.stop_times?.len());
136    /// # Ok::<(), gtfs_structures::error::Error>(())
137    ///```
138    pub fn raw(self) -> RawGtfsReader {
139        RawGtfsReader { reader: self }
140    }
141}
142
143/// This reader generates [RawGtfs]. It must be built using [GtfsReader::raw]
144///
145/// The methods to read a Gtfs are the same as for [GtfsReader]
146pub struct RawGtfsReader {
147    reader: GtfsReader,
148}
149
150impl RawGtfsReader {
151    fn read_from_directory(&self, p: &std::path::Path) -> Result<RawGtfs, Error> {
152        let start_of_read_instant = Instant::now();
153        // Thoses files are not mandatory
154        // We use None if they don’t exist, not an Error
155        let files = std::fs::read_dir(p)?
156            .filter_map(|d| {
157                d.ok().and_then(|e| {
158                    e.path()
159                        .strip_prefix(p)
160                        .ok()
161                        .and_then(|f| f.to_str().map(|s| s.to_owned()))
162                })
163            })
164            .collect();
165
166        let mut result = RawGtfs {
167            trips: self.read_objs_from_path(p.join("trips.txt")),
168            calendar: self.read_objs_from_optional_path(p, "calendar.txt"),
169            calendar_dates: self.read_objs_from_optional_path(p, "calendar_dates.txt"),
170            stops: self.read_objs_from_path(p.join("stops.txt")),
171            routes: self.read_objs_from_path(p.join("routes.txt")),
172            stop_times: if self.reader.read_stop_times {
173                self.read_objs_from_path(p.join("stop_times.txt"))
174            } else {
175                Ok(Vec::new())
176            },
177            agencies: self.read_objs_from_path(p.join("agency.txt")),
178            shapes: self.read_objs_from_optional_path(p, "shapes.txt"),
179            fare_attributes: self.read_objs_from_optional_path(p, "fare_attributes.txt"),
180            fare_rules: self.read_objs_from_optional_path(p, "fare_rules.txt"),
181            fare_products: self.read_objs_from_optional_path(p, "fare_products.txt"),
182            fare_media: self.read_objs_from_optional_path(p, "fare_media.txt"),
183            rider_categories: self.read_objs_from_optional_path(p, "rider_categories.txt"),
184            frequencies: self.read_objs_from_optional_path(p, "frequencies.txt"),
185            transfers: self.read_objs_from_optional_path(p, "transfers.txt"),
186            pathways: self.read_objs_from_optional_path(p, "pathways.txt"),
187            feed_info: self.read_objs_from_optional_path(p, "feed_info.txt"),
188            read_duration: start_of_read_instant.elapsed(),
189            translations: self.read_objs_from_optional_path(p, "translations.txt"),
190            ticketing_deep_links: self.read_objs_from_optional_path(p, "ticketing_deep_links.txt"),
191            ticketing_identifiers: self
192                .read_objs_from_optional_path(p, "ticketing_identifiers.txt"),
193            files,
194            source_format: crate::SourceFormat::Directory,
195            sha256: None,
196        };
197
198        if self.reader.unkown_enum_as_default {
199            result.unknown_to_default();
200        }
201        Ok(result)
202    }
203
204    /// Reads from an url (if starts with `"http"`) if the feature `read-url` is activated,
205    /// or a local path (either a directory or zipped file). Not available on WASM targets.
206    #[cfg(not(target_arch = "wasm32"))]
207    pub fn read(self, gtfs: &str) -> Result<RawGtfs, Error> {
208        #[cfg(feature = "read-url")]
209        if gtfs.starts_with("http") {
210            return self.read_from_url(gtfs);
211        }
212        self.read_from_path(gtfs)
213    }
214
215    /// Reads the GTFS from a remote url. Not available on WASM targets.
216    #[cfg(all(feature = "read-url", not(target_arch = "wasm32")))]
217    pub fn read_from_url<U: reqwest::IntoUrl>(self, url: U) -> Result<RawGtfs, Error> {
218        let mut res = reqwest::blocking::get(url)?;
219        let mut body = Vec::new();
220        res.read_to_end(&mut body)?;
221        let cursor = std::io::Cursor::new(body);
222        self.read_from_reader(cursor)
223    }
224
225    /// Asynchronously reads the GTFS from a remote url
226    #[cfg(feature = "read-url")]
227    pub async fn read_from_url_async<U: reqwest::IntoUrl>(self, url: U) -> Result<RawGtfs, Error> {
228        let res = reqwest::get(url).await?.bytes().await?;
229        let reader = std::io::Cursor::new(res);
230        self.read_from_reader(reader)
231    }
232
233    /// Reads the raw GTFS from a local zip archive or local directory
234    pub fn read_from_path<P>(&self, path: P) -> Result<RawGtfs, Error>
235    where
236        P: AsRef<Path>,
237    {
238        let p = path.as_ref();
239        if p.is_file() {
240            let reader = File::open(p)?;
241            self.read_from_reader(reader)
242        } else if p.is_dir() {
243            self.read_from_directory(p)
244        } else {
245            Err(Error::NotFileNorDirectory(format!("{}", p.display())))
246        }
247    }
248
249    pub fn read_from_reader<T: std::io::Read + std::io::Seek>(
250        &self,
251        reader: T,
252    ) -> Result<RawGtfs, Error> {
253        let start_of_read_instant = Instant::now();
254        let mut hasher = Sha256::new();
255        let mut buf_reader = std::io::BufReader::new(reader);
256        let _n = std::io::copy(&mut buf_reader, &mut hasher)?;
257        let hash = hasher.finalize();
258        let mut archive = zip::ZipArchive::new(buf_reader)?;
259        let mut file_mapping = HashMap::new();
260        let mut files = Vec::new();
261
262        for i in 0..archive.len() {
263            let archive_file = archive.by_index(i)?;
264            files.push(archive_file.name().to_owned());
265
266            for gtfs_file in &[
267                "agency.txt",
268                "calendar.txt",
269                "calendar_dates.txt",
270                "routes.txt",
271                "stops.txt",
272                "stop_times.txt",
273                "trips.txt",
274                "fare_attributes.txt",
275                "fare_rules.txt",
276                "fare_products.txt",
277                "fare_media.txt",
278                "rider_categories.txt",
279                "frequencies.txt",
280                "transfers.txt",
281                "pathways.txt",
282                "feed_info.txt",
283                "shapes.txt",
284            ] {
285                let path = std::path::Path::new(archive_file.name());
286                if path.file_name() == Some(std::ffi::OsStr::new(gtfs_file)) {
287                    file_mapping.insert(gtfs_file, i);
288                    break;
289                }
290            }
291        }
292
293        let mut result = RawGtfs {
294            agencies: self.read_file(&file_mapping, &mut archive, "agency.txt"),
295            calendar: self.read_optional_file(&file_mapping, &mut archive, "calendar.txt"),
296            calendar_dates: self.read_optional_file(
297                &file_mapping,
298                &mut archive,
299                "calendar_dates.txt",
300            ),
301            routes: self.read_file(&file_mapping, &mut archive, "routes.txt"),
302            stops: self.read_file(&file_mapping, &mut archive, "stops.txt"),
303            stop_times: if self.reader.read_stop_times {
304                self.read_file(&file_mapping, &mut archive, "stop_times.txt")
305            } else {
306                Ok(Vec::new())
307            },
308            trips: self.read_file(&file_mapping, &mut archive, "trips.txt"),
309            fare_attributes: self.read_optional_file(
310                &file_mapping,
311                &mut archive,
312                "fare_attributes.txt",
313            ),
314            fare_rules: self.read_optional_file(&file_mapping, &mut archive, "fare_rules.txt"),
315            fare_products: self.read_optional_file(
316                &file_mapping,
317                &mut archive,
318                "fare_products.txt",
319            ),
320            fare_media: self.read_optional_file(&file_mapping, &mut archive, "fare_media.txt"),
321            rider_categories: self.read_optional_file(
322                &file_mapping,
323                &mut archive,
324                "rider_categories.txt",
325            ),
326            frequencies: self.read_optional_file(&file_mapping, &mut archive, "frequencies.txt"),
327            transfers: self.read_optional_file(&file_mapping, &mut archive, "transfers.txt"),
328            pathways: self.read_optional_file(&file_mapping, &mut archive, "pathways.txt"),
329            feed_info: self.read_optional_file(&file_mapping, &mut archive, "feed_info.txt"),
330            shapes: if self.reader.read_shapes {
331                self.read_optional_file(&file_mapping, &mut archive, "shapes.txt")
332            } else {
333                Some(Ok(Vec::new()))
334            },
335            translations: self.read_optional_file(&file_mapping, &mut archive, "translations.txt"),
336            ticketing_deep_links: self.read_optional_file(
337                &file_mapping,
338                &mut archive,
339                "ticketing_deep_links.txt",
340            ),
341            ticketing_identifiers: self.read_optional_file(
342                &file_mapping,
343                &mut archive,
344                "ticketing_identifiers.txt",
345            ),
346            read_duration: start_of_read_instant.elapsed(),
347            files,
348            source_format: crate::SourceFormat::Zip,
349            sha256: Some(format!("{hash:x}")),
350        };
351
352        if self.reader.unkown_enum_as_default {
353            result.unknown_to_default();
354        }
355        Ok(result)
356    }
357
358    fn read_objs<T, O>(&self, mut reader: T, file_name: &str) -> Result<Vec<O>, Error>
359    where
360        for<'de> O: Deserialize<'de>,
361        T: std::io::Read,
362    {
363        let mut bom = [0; 3];
364        reader
365            .read_exact(&mut bom)
366            .map_err(|e| Error::NamedFileIO {
367                file_name: file_name.to_owned(),
368                source: Box::new(e),
369            })?;
370
371        let chained = if bom != [0xefu8, 0xbbu8, 0xbfu8] {
372            bom.chain(reader)
373        } else {
374            [].chain(reader)
375        };
376
377        let mut reader = csv::ReaderBuilder::new()
378            .flexible(true)
379            .trim(if self.reader.trim_fields {
380                csv::Trim::Fields
381            } else {
382                csv::Trim::None
383            })
384            .from_reader(chained);
385        // We store the headers to be able to return them in case of errors
386        let headers = reader
387            .headers()
388            .map_err(|e| Error::CSVError {
389                file_name: file_name.to_owned(),
390                source: e,
391                line_in_error: None,
392            })?
393            .clone()
394            .into_iter()
395            .map(|x| x.trim())
396            .collect::<csv::StringRecord>();
397
398        // Pre-allocate a StringRecord for performance reasons
399        let mut rec = csv::StringRecord::new();
400        let mut objs = Vec::new();
401
402        // Read each record into the pre-allocated StringRecord one at a time
403        while reader.read_record(&mut rec).map_err(|e| Error::CSVError {
404            file_name: file_name.to_owned(),
405            source: e,
406            line_in_error: None,
407        })? {
408            let obj = rec
409                .deserialize(Some(&headers))
410                .map_err(|e| Error::CSVError {
411                    file_name: file_name.to_owned(),
412                    source: e,
413                    line_in_error: Some(crate::error::LineError {
414                        headers: headers.into_iter().map(String::from).collect(),
415                        values: rec.into_iter().map(String::from).collect(),
416                    }),
417                })?;
418            objs.push(obj);
419        }
420        Ok(objs)
421    }
422
423    fn read_objs_from_path<O>(&self, path: std::path::PathBuf) -> Result<Vec<O>, Error>
424    where
425        for<'de> O: Deserialize<'de>,
426    {
427        let file_name = path
428            .file_name()
429            .and_then(|f| f.to_str())
430            .unwrap_or("invalid_file_name")
431            .to_string();
432        if path.exists() {
433            File::open(path)
434                .map_err(|e| Error::NamedFileIO {
435                    file_name: file_name.to_owned(),
436                    source: Box::new(e),
437                })
438                .and_then(|r| self.read_objs(r, &file_name))
439        } else {
440            Err(Error::MissingFile(file_name))
441        }
442    }
443
444    fn read_objs_from_optional_path<O>(
445        &self,
446        dir_path: &std::path::Path,
447        file_name: &str,
448    ) -> Option<Result<Vec<O>, Error>>
449    where
450        for<'de> O: Deserialize<'de>,
451    {
452        File::open(dir_path.join(file_name))
453            .ok()
454            .map(|r| self.read_objs(r, file_name))
455    }
456
457    fn read_file<O, T>(
458        &self,
459        file_mapping: &HashMap<&&str, usize>,
460        archive: &mut zip::ZipArchive<T>,
461        file_name: &str,
462    ) -> Result<Vec<O>, Error>
463    where
464        for<'de> O: Deserialize<'de>,
465        T: std::io::Read + std::io::Seek,
466    {
467        self.read_optional_file(file_mapping, archive, file_name)
468            .unwrap_or_else(|| Err(Error::MissingFile(file_name.to_owned())))
469    }
470
471    fn read_optional_file<O, T>(
472        &self,
473        file_mapping: &HashMap<&&str, usize>,
474        archive: &mut zip::ZipArchive<T>,
475        file_name: &str,
476    ) -> Option<Result<Vec<O>, Error>>
477    where
478        for<'de> O: Deserialize<'de>,
479        T: std::io::Read + std::io::Seek,
480    {
481        file_mapping.get(&file_name).map(|i| {
482            self.read_objs(
483                archive.by_index(*i).map_err(|e| Error::NamedFileIO {
484                    file_name: file_name.to_owned(),
485                    source: Box::new(e),
486                })?,
487                file_name,
488            )
489        })
490    }
491}