ubs_lib/
parser.rs

1//! Low-level access to the schedule parser.
2
3use std::{borrow::Cow, fmt::Display, str::FromStr};
4
5use chrono::{NaiveDate, NaiveTime};
6use regex::Regex;
7use thiserror::Error;
8use tl::{Node, ParserOptions, VDom, VDomGuard};
9
10use crate::{ParseIdError, Semester};
11
// Number of class groups a single catalog page accounts for. `ClassSchedule::group_iter`
// multiplies the page number by this value to find the group indices of a page.
const CLASSES_PER_PAGE: u32 = 50;
// Number of class slots looked up in every group (`ClassGroup::class_iter` yields one `Class`
// per slot).
const CLASSES_PER_GROUP: u32 = 3;
14
// Rust expands macros before it resolves consts, so a const containing `{}` placeholders cannot
// be handed to `format!`. The element-id templates in this file are therefore declarative
// macros (`macro_rules!`) that expand to a string literal.

// Id template of the element holding the session of a class group.
// Placeholder 1: the class group index (the original note gives `(page * 50) - 1`).
macro_rules! SESSION_TAG {
    () => {
        "SSR_DER_CS_GRP_SESSION_CODE$215$${}"
    };
}
// Expected text of the session element; the only capture is the session length in weeks.
const SESSION_FORMAT: &str = r"^University (\d\d?) Week Session$";
// Id template of the element describing one class of a group.
// Placeholder 1: class index in group (1-3).
// Placeholder 2: the `CLASS_ID_TAG_SEQ` entry (294, 295, 296) for that class index in group.
// Placeholder 3: the class group index (the original note gives `(page * 50) - 1`).
macro_rules! CLASS_ID_TAG {
    () => {
        "SSR_CLSRCH_F_WK_SSR_CMPNT_DESCR_{}${}$${}"
    };
}
const CLASS_ID_TAG_SEQ: [u32; 3] = [294, 295, 296];
// Expected text of that element; the captures are, in order, the class number, the section and
// the class type code.
const CLASS_ID_FORMAT: &str = r"^Class Nbr (\d+) - Section ([A-Z](?:\d?)+) ([A-Z]+)$";
// Id template of the element holding the start and end date of a class group.
// Placeholder 1: the class group index (the original note gives `(page * 50) - 1`).
macro_rules! DATES_TAG {
    () => {
        "SSR_CLSRCH_F_WK_SSR_MTG_DT_LONG_1$88$${}"
    };
}
// `chrono` format used to parse each of the two dates found in that element.
const DATES_TIME_FORMAT: &str = "%m/%d/%Y";
// Id template of the element holding the meeting days and times of one class.
// Placeholder 1: class index in group (1-3).
// Placeholder 2: the `DATETIME_TAG_SEQ` entry (134, 135, 154) for that class index in group.
// Placeholder 3: the class group index (the original note gives `(page * 50) - 1`).
macro_rules! DATETIME_TAG {
    () => {
        "SSR_CLSRCH_F_WK_SSR_MTG_SCHED_L_{}${}$${}"
    };
}
const DATETIME_TAG_SEQ: [u32; 3] = [134, 135, 154];
// Expected text of that element; the captures are, in order, the whitespace-separated day
// names, the start time and the end time.
const DATETIME_FORMAT: &str =
    r"^((?:[A-Z][a-z]+\s)+)(\d?\d:\d\d(?:AM|PM)) to (\d?\d:\d\d(?:AM|PM))$";
// `chrono` format used to parse the captured start and end times.
const DATETIME_TIME_FORMAT: &str = "%-I:%M%p";
// Id template of the element holding the room of one class.
// Placeholder 1: class index in group (1-3).
// Placeholder 2: the class group index (the original note gives `(page * 50) - 1`).
macro_rules! ROOM_TAG {
    () => {
        "SSR_CLSRCH_F_WK_SSR_MTG_LOC_LONG_{}${}"
    };
}
// Id template of the element holding the instructor of one class.
// Placeholder 1: class index in group (1-3).
// Placeholder 2: the `INSTRUCTOR_TAG_SEQ` entry (86, 161, 162) for that class index in group.
// Placeholder 3: the class group index (the original note gives `(page * 50) - 1`).
macro_rules! INSTRUCTOR_TAG {
    () => {
        "SSR_CLSRCH_F_WK_SSR_INSTR_LONG_{}${}$${}"
    };
}
const INSTRUCTOR_TAG_SEQ: [u32; 3] = [86, 161, 162];
// Id template of the element holding the seat availability of one class.
// Placeholder 1: class index in group (1-3).
// Placeholder 2: the class group index (the original note gives `(page * 50) - 1`).
macro_rules! SEATS_TAG {
    () => {
        "SSR_CLSRCH_F_WK_SSR_DESCR50_{}${}"
    };
}
// Expected text of that element while seats are listed; the captures are, in order, the open
// seats and the total seats.
const SEATS_FORMAT: &str = r"^Open Seats (\d+) of (\d+)$";
79
80// TODO: I can supply more information, like class description, units, etc.
81/// Parser for raw class schedule data.
82#[derive(Debug)]
83pub struct ClassSchedule {
84    dom: VDomGuard,
85    page: u32,
86}
87
88impl ClassSchedule {
89    /// Construct a new [`ClassSchedule`](ClassSchedule) with the specified bytes at the specified page.
90    pub fn new(bytes: Vec<u8>, page: u32) -> Result<Self, ParseError> {
91        // TODO: consider enabling tracking for perf
92        let dom = unsafe { tl::parse_owned(String::from_utf8(bytes)?, ParserOptions::default())? };
93
94        Ok(Self { dom, page })
95    }
96
97    /// Get the semester for the schedule.
98    pub fn semester(&self) -> Result<Semester, ParseError> {
99        get_text_from_id_without_sub_nodes(self.dom.get_ref(), "TERM_VAL_TBL_DESCR")?
100            .parse::<Semester>()
101            .map_err(|err| err.into())
102    }
103
104    /// Iterator over groups of classes.
105    ///
106    /// In the catalog, classes are grouped in sets of 3 (usually)
107    /// which can only be selected together.
108    pub fn group_iter(&self) -> impl Iterator<Item = ClassGroup<'_>> + '_ {
109        // Every page contains the bytes of the previous pages
110        let first_class_index = self.page.saturating_sub(1) * CLASSES_PER_PAGE;
111        let last_class_index = (self.page * CLASSES_PER_PAGE).saturating_sub(1);
112
113        (first_class_index..last_class_index).map(|group_num| ClassGroup {
114            dom: self.dom.get_ref(),
115            group_num,
116        })
117    }
118}
119
120// TODO: Every lecture is paired with every possible combo of recs/labs, I can simplify this
121/// Parser for raw class group data.
122#[derive(Debug, Clone)]
123pub struct ClassGroup<'a> {
124    dom: &'a VDom<'a>,
125    group_num: u32,
126}
127
128// TODO: return if group is open/closed (not as straightforward as getting id)
129impl<'a> ClassGroup<'a> {
130    /// Iterator over classes in group.
131    pub fn class_iter(&self) -> impl Iterator<Item = Class<'a>> + '_ {
132        (0..CLASSES_PER_GROUP).map(|class_num| Class {
133            dom: self.dom,
134            class_num,
135            group_num: self.group_num,
136        })
137    }
138
139    /// Get the current session of the class group.
140    ///
141    /// For instance, if the session is `University 15 Week Session`,
142    /// this function will return `15`.
143    pub fn session(&self) -> Result<u32, ParseError> {
144        let session =
145            get_text_from_id_without_sub_nodes(self.dom, &format!(SESSION_TAG!(), self.group_num))?;
146        let re = Regex::new(SESSION_FORMAT)
147            .unwrap()
148            .captures(session)
149            .ok_or(ParseError::UnknownElementFormat)?;
150        re.get(1)
151            .ok_or(ParseError::UnknownElementFormat)?
152            .as_str()
153            .parse()
154            .map_err(|_| ParseError::UnknownElementFormat)
155    }
156
157    /// Get the start date of the class group.
158    pub fn start_date(&self) -> Result<NaiveDate, ParseError> {
159        Ok(self.dates()?.0)
160    }
161
162    /// Get the end date of the class group.
163    pub fn end_date(&self) -> Result<NaiveDate, ParseError> {
164        Ok(self.dates()?.1)
165    }
166
167    /// Get the start and end date of the class group.
168    fn dates(&self) -> Result<(NaiveDate, NaiveDate), ParseError> {
169        let dates =
170            get_text_from_id_without_sub_nodes(self.dom, &format!(DATES_TAG!(), self.group_num))?;
171
172        let mut split_dates = dates.split(" - ");
173        // TODO: remove boilerplate, regex?
174        Ok((
175            NaiveDate::parse_from_str(
176                split_dates.next().ok_or(ParseError::UnknownElementFormat)?,
177                DATES_TIME_FORMAT,
178            )
179            .or(Err(ParseError::UnknownElementFormat))?,
180            NaiveDate::parse_from_str(
181                split_dates.next().ok_or(ParseError::UnknownElementFormat)?,
182                DATES_TIME_FORMAT,
183            )
184            .or(Err(ParseError::UnknownElementFormat))?,
185        ))
186    }
187}
188
189// TODO: empty text will equal `&nbsp;`
190/// Parser for raw class data.
191#[derive(Debug, Clone)]
192pub struct Class<'a> {
193    dom: &'a VDom<'a>,
194    class_num: u32,
195    group_num: u32,
196}
197
198impl Class<'_> {
199    /// Get if the class is open or closed.
200    pub fn is_open(&self) -> Result<bool, ParseError> {
201        let seats = get_text_from_id_without_sub_nodes(
202            self.dom,
203            &format!(SEATS_TAG!(), self.class_num + 1, self.group_num),
204        )?;
205
206        if seats == "Closed" {
207            return Ok(false);
208        }
209
210        Ok(true)
211    }
212
213    /// Get the type of class.
214    ///
215    /// For instance, this function will return `Lecture`, `Seminar`,
216    /// `Lab`, `Recitation`.
217    pub fn class_type(&self) -> Result<ClassType, ParseError> {
218        self.class_info()
219            .map(|info| info.2.parse().map_err(|_| ParseError::UnknownElementFormat))?
220    }
221
222    /// Get id of this class.
223    ///
224    /// For instance, if the class says `Class Nbr 23229`, this function
225    /// will return `23229`.
226    pub fn class_id(&self) -> Result<u32, ParseError> {
227        self.class_info()
228            .map(|info| info.0.parse().map_err(|_| ParseError::UnknownElementFormat))?
229    }
230
231    /// Get the section of this class.
232    ///
233    /// For instance, if the class says `Section A5`, this function will
234    /// return `A5`.
235    pub fn section(&self) -> Result<&str, ParseError> {
236        self.class_info().map(|info| info.1)
237    }
238
239    // If the class is asynchronous a datetime doesn't exist.
240    /// Get the days of week this class is in action.
241    pub fn days_of_week(&self) -> Result<Option<Vec<Result<DayOfWeek, ParseError>>>, ParseError> {
242        self.datetime().map(|result| {
243            result.map(|datetime| {
244                datetime
245                    .0
246                    .iter()
247                    .map(|days| days.parse().map_err(|_| ParseError::UnknownElementFormat))
248                    .collect()
249            })
250        })
251    }
252
253    /// Get the start time of this class.
254    pub fn start_time(&self) -> Result<Option<NaiveTime>, ParseError> {
255        self.datetime()
256            .map(|result| {
257                result.map(|datetime| {
258                    NaiveTime::parse_from_str(&datetime.1, DATETIME_TIME_FORMAT)
259                        .map_err(|_| ParseError::UnknownElementFormat)
260                })
261            })?
262            .transpose()
263    }
264
265    /// Get the end time of this class.
266    pub fn end_time(&self) -> Result<Option<NaiveTime>, ParseError> {
267        // TODO: fix boilerplate with above
268        self.datetime()
269            .map(|result| {
270                result.map(|datetime| {
271                    NaiveTime::parse_from_str(&datetime.2, DATETIME_TIME_FORMAT)
272                        .map_err(|_| ParseError::UnknownElementFormat)
273                })
274            })?
275            .transpose()
276    }
277
278    // Sometimes it returns `Arr Arr`
279    /// Get the room and room number of this class.
280    ///
281    /// For instance, if the class says `Nsc 215`, this function will
282    /// return `Nsc 215`.
283    pub fn room(&self) -> Result<&str, ParseError> {
284        // TODO: use regex to validate result
285        get_text_from_id_without_sub_nodes(
286            self.dom,
287            &format!(ROOM_TAG!(), self.class_num + 1, self.group_num),
288        )
289    }
290
291    // TODO: specific error if the class says "To be Announced"
292    /// Get the name of the instructor.
293    ///
294    /// Note that sometimes the instructor doesn't exist and is labeled as
295    /// `To be Announced`. In that case, the function will error.
296    pub fn instructor(&self) -> Result<&str, ParseError> {
297        // Not much I can do in terms of validation. Some people have very unique patterns in their
298        // names.
299        get_text_from_id_without_sub_nodes(
300            self.dom,
301            &format!(
302                INSTRUCTOR_TAG!(),
303                self.class_num + 1,
304                INSTRUCTOR_TAG_SEQ[self.class_num as usize],
305                self.group_num
306            ),
307        )
308    }
309
310    // TODO: specific error for closed class
311    /// Get the open seats for this class.
312    ///
313    /// Note that if the class is closed this function will error.
314    pub fn open_seats(&self) -> Result<Option<u32>, ParseError> {
315        self.seats().map(|seats| seats.map(|seats| seats.0))
316    }
317
318    // TODO: ^
319    /// Get the total seats for this class.
320    ///
321    /// Note that if the class is closed this function will error.
322    pub fn total_seats(&self) -> Result<Option<u32>, ParseError> {
323        self.seats().map(|seats| seats.map(|seats| seats.1))
324    }
325
326    /// Get various bits of information for this class in the form,
327    /// `(class_type, class_id, section)`.
328    fn class_info(&self) -> Result<(&str, &str, &str), ParseError> {
329        let class_info = get_text_from_id_without_sub_nodes(
330            self.dom,
331            &format!(
332                CLASS_ID_TAG!(),
333                self.class_num + 1,
334                CLASS_ID_TAG_SEQ[self.class_num as usize],
335                self.group_num
336            ),
337        )?;
338
339        let re = Regex::new(CLASS_ID_FORMAT)
340            .unwrap()
341            .captures(class_info)
342            .ok_or(ParseError::UnknownElementFormat)?;
343        Ok((
344            re.get(1).ok_or(ParseError::UnknownElementFormat)?.as_str(),
345            re.get(2).ok_or(ParseError::UnknownElementFormat)?.as_str(),
346            re.get(3).ok_or(ParseError::UnknownElementFormat)?.as_str(),
347        ))
348    }
349
350    /// Get various bits of information for this class dates in the form,
351    /// `(days_of_weeek, start_time, end_time)`.
352    fn datetime(&self) -> Result<Option<(Vec<String>, String, String)>, ParseError> {
353        get_node_from_id(
354            self.dom,
355            &format!(
356                DATETIME_TAG!(),
357                self.class_num + 1,
358                DATETIME_TAG_SEQ[self.class_num as usize],
359                self.group_num
360            ),
361        )
362        // If the tag is missing it could mean `Time Conflict` is being displayed. In that
363        // case, skip it and label the datetime as non-existent.
364        // TODO: but it could also mean the format is unknown. Return error with source attached.
365        .map_or_else(
366            |err| match err {
367                ParseError::MissingTag => Ok(None),
368                _ => Err(err),
369            },
370            |node| {
371                match node.inner_text(self.dom.parser()) {
372                    Cow::Borrowed(_) => Err(ParseError::UnknownHtmlFormat),
373                    Cow::Owned(value) => {
374                        let re = Regex::new(DATETIME_FORMAT)
375                            .unwrap()
376                            .captures(&value)
377                            .ok_or(ParseError::UnknownElementFormat)?;
378
379                        Ok(Some((
380                            re.get(1)
381                                .ok_or(ParseError::UnknownElementFormat)?
382                                .as_str()
383                                .split_whitespace()
384                                .map(|string| string.to_owned())
385                                .collect(), // Days of week (e.g. Wednesday)
386                            re.get(2)
387                                .ok_or(ParseError::UnknownElementFormat)?
388                                .as_str()
389                                .to_owned(), // Start time (e.g. 3:00PM)
390                            re.get(3)
391                                .ok_or(ParseError::UnknownElementFormat)?
392                                .as_str()
393                                .to_owned(), // End time (e.g. 4:00PM)
394                        )))
395                    }
396                }
397            },
398        )
399    }
400
401    /// Get various bits of information for this class seats in the form,
402    /// `(days_of_weeek, start_time, end_time)`.
403    fn seats(&self) -> Result<Option<(u32, u32)>, ParseError> {
404        let seats = get_text_from_id_without_sub_nodes(
405            self.dom,
406            &format!(SEATS_TAG!(), self.class_num + 1, self.group_num),
407        )?;
408
409        match seats {
410            "Closed" => Ok(None),
411            _ => {
412                let re = Regex::new(SEATS_FORMAT)
413                    .unwrap()
414                    .captures(seats)
415                    .ok_or(ParseError::UnknownElementFormat)?;
416
417                Ok(Some((
418                    re.get(1)
419                        .ok_or(ParseError::UnknownElementFormat)?
420                        .as_str()
421                        .parse()
422                        .map_err(|_| ParseError::UnknownElementFormat)?, // Open seats
423                    re.get(2)
424                        .ok_or(ParseError::UnknownHtmlFormat)?
425                        .as_str()
426                        .parse()
427                        .map_err(|_| ParseError::UnknownElementFormat)?, // Total seats
428                )))
429            }
430        }
431    }
432}
433
/// Type of class.
///
/// Parsed from the three-letter code shown in the catalog (see the `FromStr` impl) and
/// displayed as the full name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ClassType {
    /// Recitation (`REC`).
    Recitation,
    /// Lab (`LAB`).
    Lab,
    /// Lecture (`LEC`).
    Lecture,
    /// Seminar (`SEM`).
    Seminar,
}
442
443impl FromStr for ClassType {
444    type Err = ParseError;
445
446    fn from_str(s: &str) -> Result<Self, Self::Err> {
447        Ok(match s {
448            "REC" => ClassType::Recitation,
449            "LAB" => ClassType::Lab,
450            "LEC" => ClassType::Lecture,
451            "SEM" => ClassType::Seminar,
452            _ => return Err(ParseError::UnknownElementFormat),
453        })
454    }
455}
456
457impl Display for ClassType {
458    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
459        write!(
460            f,
461            "{}",
462            match self {
463                ClassType::Recitation => "Recitation",
464                ClassType::Lab => "Lab",
465                ClassType::Lecture => "Lecture",
466                ClassType::Seminar => "Seminar",
467            }
468        )
469    }
470}
471
/// Day of week.
///
/// Parsed from and displayed as the full English day name (e.g. `Wednesday`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DayOfWeek {
    /// Sunday.
    Sunday,
    /// Monday.
    Monday,
    /// Tuesday.
    Tuesday,
    /// Wednesday.
    Wednesday,
    /// Thursday.
    Thursday,
    /// Friday.
    Friday,
    /// Saturday.
    Saturday,
}
483
484impl FromStr for DayOfWeek {
485    type Err = ParseError;
486
487    fn from_str(s: &str) -> Result<Self, Self::Err> {
488        Ok(match s {
489            "Sunday" => DayOfWeek::Sunday,
490            "Monday" => DayOfWeek::Monday,
491            "Tuesday" => DayOfWeek::Tuesday,
492            "Wednesday" => DayOfWeek::Wednesday,
493            "Thursday" => DayOfWeek::Thursday,
494            "Friday" => DayOfWeek::Friday,
495            "Saturday" => DayOfWeek::Saturday,
496            _ => return Err(ParseError::UnknownElementFormat),
497        })
498    }
499}
500
501impl Display for DayOfWeek {
502    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
503        write!(
504            f,
505            "{}",
506            match self {
507                DayOfWeek::Sunday => "Sunday",
508                DayOfWeek::Monday => "Monday",
509                DayOfWeek::Tuesday => "Tuesday",
510                DayOfWeek::Wednesday => "Wednesday",
511                DayOfWeek::Thursday => "Thursday",
512                DayOfWeek::Friday => "Friday",
513                DayOfWeek::Saturday => "Saturday",
514            }
515        )
516    }
517}
518
519// TODO: document
520fn get_text_from_id_without_sub_nodes<'a>(dom: &'a VDom, id: &str) -> Result<&'a str, ParseError> {
521    match get_node_from_id(dom, id)?.inner_text(dom.parser()) {
522        Cow::Borrowed(string) => Ok(string),
523        // TODO: this is relying on implementation details, make it more explicit
524        // If it's owned, that means the element had multiple sub-nodes, which shouldn't be the
525        // case
526        Cow::Owned(_) => Err(ParseError::UnknownHtmlFormat),
527    }
528}
529
530// TODO: ^
531fn get_node_from_id<'a>(dom: &'a VDom, id: &str) -> Result<&'a Node<'a>, ParseError> {
532    Ok(dom
533        .get_element_by_id(id)
534        .ok_or(ParseError::MissingTag)?
535        .get(dom.parser())
536        // We know the element exists in the DOM because that's where we got it from
537        .unwrap())
538}
539
540/// Error when parsing schedule data.
541#[derive(Debug, Error)]
542pub enum ParseError {
543    /// Id is in an unknown format.
544    #[error(transparent)]
545    UnknownIdFormat(#[from] ParseIdError),
546    /// HTML is not valid Utf-8.
547    #[error("could not parse HTML due to invalid Utf-8 encoding")]
548    // HtmlInvalidUtf8(#[from] str::Utf8Error),
549    HtmlInvalidUtf8(#[from] std::string::FromUtf8Error),
550    /// HTML is not in a valid format.
551    #[error("could not parse HTML due to invalid format")]
552    InvalidHtmlFormat(#[from] tl::errors::ParseError),
553    /// HTML is empty.
554    #[error("could not find tags in HTML")]
555    EmptyHtml,
556    /// HTML is in an unknown format.
557    #[error("format of HTML could not be parsed because it is unknown")]
558    UnknownHtmlFormat,
559    // TODO: I can provide much more context here
560    /// Content of HTML element is in an unknown format
561    #[error("format of element could not be parsed because it is unknown")]
562    UnknownElementFormat,
563    /// HTML tag for class does not exist
564    #[error("could not find tag for class in HTML")]
565    MissingTag,
566}