tg_bot_api/
extractor.rs

1use crate::util::{ElementRefExt, StrExt};
2use itertools::Itertools;
3use scraper::{ElementRef, Html, Selector};
4
5#[derive(Debug, thiserror::Error)]
6pub enum ExtractorError {
7    #[error("No `Recent changes` found in document")]
8    NoRecentChanges,
9    #[error("No version string found in document")]
10    NoVersion,
11}
12
13pub struct Extractor {
14    doc: Html,
15}
16
17impl Extractor {
18    pub fn from_str(s: &str) -> Self {
19        Self {
20            doc: Html::parse_document(s),
21        }
22    }
23
24    pub fn extract(&self) -> Result<Extracted<'_>, ExtractorError> {
25        let mut recent_changes = None;
26        let mut version = None;
27        let mut objects = Vec::new();
28        let mut methods = Vec::new();
29
30        let h3 = Selector::parse("h3").unwrap();
31        let h4 = Selector::parse("h4").unwrap();
32        let table = Selector::parse("table").unwrap();
33        let td = Selector::parse("td").unwrap();
34        let p = Selector::parse("p").unwrap();
35        let ul = Selector::parse("ul").unwrap();
36        let li = Selector::parse("li").unwrap();
37        let any = Selector::parse("h3, h4, p, table, ul").unwrap();
38
39        let mut state = State::SearchRecentChanges;
40        let mut select_any = self.doc.select(&any).peekable();
41        while let Some(elem) = select_any.next() {
42            let new_state = match state {
43                State::SearchRecentChanges
44                    if h3.matches(&elem) && elem.plain_text() == "Recent changes" =>
45                {
46                    State::GetRecentChange
47                }
48                State::GetRecentChange if h4.matches(&elem) => {
49                    recent_changes = Some(elem.plain_text());
50                    State::GetVersion
51                }
52                State::GetVersion if p.matches(&elem) => {
53                    version = Some(elem);
54                    State::SearchGettingUpdates
55                }
56                State::SearchGettingUpdates
57                    if h3.matches(&elem) && elem.plain_text() == "Getting updates" =>
58                {
59                    State::GetName
60                }
61                State::GetName if h4.matches(&elem) => {
62                    let name = elem.plain_text();
63                    // get rid of elements like `Formatting options`, `Sending files` that are not objects or methods
64                    if name.chars().any(char::is_whitespace) {
65                        State::GetName
66                    } else {
67                        State::GetDescription {
68                            name: elem,
69                            description: RawDescription::default(),
70                        }
71                    }
72                }
73                State::GetDescription {
74                    name,
75                    mut description,
76                } if p.matches(&elem) || ul.matches(&elem) => {
77                    description.push(elem);
78
79                    let has_p = select_any.peek().matches(&p);
80                    let is_method = name.plain_text().is_first_letter_lowercase();
81                    let has_table = select_any.peek().matches(&table);
82                    let has_ul = select_any.peek().matches(&ul);
83
84                    if has_p || (has_ul && is_method) {
85                        State::GetDescription { name, description }
86                    } else {
87                        if has_ul && !is_method {
88                            let ul_elem = select_any.peek().cloned().unwrap();
89                            description.push(ul_elem);
90                        }
91
92                        match (is_method, has_table, has_ul) {
93                            (true, true, false) => State::GetMethodFields { name, description },
94                            (false, true, false) => State::GetObjectFields { name, description },
95                            (false, false, true) => State::GetObjectElements { name, description },
96                            (true, false, false) => {
97                                methods.push(RawMethod {
98                                    name,
99                                    description,
100                                    args: vec![],
101                                });
102                                State::GetName
103                            }
104                            (false, false, false) => {
105                                objects.push(RawObject {
106                                    name,
107                                    description,
108                                    data: RawObjectData::Fields(vec![]),
109                                });
110                                State::GetName
111                            }
112                            _ => unreachable!(),
113                        }
114                    }
115                }
116                State::GetObjectFields { name, description } if table.matches(&elem) => {
117                    objects.push(RawObject {
118                        name,
119                        description,
120                        data: RawObjectData::Fields(extract_fields(&td, elem)),
121                    });
122
123                    State::GetName
124                }
125                State::GetMethodFields { name, description } if table.matches(&elem) => {
126                    methods.push(RawMethod {
127                        name,
128                        description,
129                        args: extract_args(&td, elem),
130                    });
131
132                    State::GetName
133                }
134                State::GetObjectElements { name, description } if ul.matches(&elem) => {
135                    let elements = extract_elements(&li, elem);
136                    objects.push(RawObject {
137                        name,
138                        description,
139                        data: RawObjectData::Elements(elements),
140                    });
141                    State::GetName
142                }
143                x => x,
144            };
145            state = new_state;
146        }
147
148        Ok(Extracted {
149            recent_changes: recent_changes.ok_or(ExtractorError::NoRecentChanges)?,
150            version: version.ok_or(ExtractorError::NoVersion)?,
151            methods,
152            objects,
153        })
154    }
155}
156
157fn extract_fields<'a>(td: &Selector, elem: ElementRef<'a>) -> Vec<RawField<'a>> {
158    elem.select(td)
159        .chunks(3)
160        .into_iter()
161        .filter_map(|mut tds| {
162            let name = tds.next()?.plain_text();
163            let kind = tds.next()?.plain_text();
164            let description = tds.next()?;
165            Some(RawField {
166                name,
167                kind,
168                description,
169            })
170        })
171        .collect()
172}
173
174fn extract_args<'a>(td: &Selector, elem: ElementRef<'a>) -> Vec<RawArgument<'a>> {
175    elem.select(td)
176        .chunks(4)
177        .into_iter()
178        .filter_map(|mut tds| {
179            let name = tds.next()?.plain_text();
180            let kind = tds.next()?.plain_text();
181            let required = tds.next()?.plain_text();
182            let description = tds.next()?;
183
184            Some(RawArgument {
185                name,
186                kind,
187                required,
188                description,
189            })
190        })
191        .collect()
192}
193
194fn extract_elements<'a>(li: &Selector, elem: ElementRef<'a>) -> Vec<ElementRef<'a>> {
195    elem.select(li).collect()
196}
197
198pub struct Extracted<'a> {
199    pub recent_changes: String,
200    pub version: ElementRef<'a>,
201    pub methods: Vec<RawMethod<'a>>,
202    pub objects: Vec<RawObject<'a>>,
203}
204
205#[derive(Debug)]
206enum State<'a> {
207    SearchRecentChanges,
208    GetRecentChange,
209    GetVersion,
210    SearchGettingUpdates,
211    GetName,
212    GetDescription {
213        name: ElementRef<'a>,
214        description: RawDescription<'a>,
215    },
216    GetObjectFields {
217        name: ElementRef<'a>,
218        description: RawDescription<'a>,
219    },
220    GetMethodFields {
221        name: ElementRef<'a>,
222        description: RawDescription<'a>,
223    },
224    GetObjectElements {
225        name: ElementRef<'a>,
226        description: RawDescription<'a>,
227    },
228}
229
230#[derive(Debug, Default)]
231pub struct RawDescription<'a>(pub Vec<ElementRef<'a>>);
232
233impl<'a> RawDescription<'a> {
234    fn push(&mut self, element: ElementRef<'a>) {
235        self.0.push(element);
236    }
237}
238
239pub struct RawMethod<'a> {
240    pub name: ElementRef<'a>,
241    pub description: RawDescription<'a>,
242    pub args: Vec<RawArgument<'a>>,
243}
244
245pub struct RawArgument<'a> {
246    pub name: String,
247    pub kind: String,
248    pub required: String,
249    pub description: ElementRef<'a>,
250}
251
252pub struct RawObject<'a> {
253    pub name: ElementRef<'a>,
254    pub description: RawDescription<'a>,
255    pub data: RawObjectData<'a>,
256}
257
258pub enum RawObjectData<'a> {
259    Fields(Vec<RawField<'a>>),
260    Elements(Vec<ElementRef<'a>>),
261}
262
263pub struct RawField<'a> {
264    pub name: String,
265    pub kind: String,
266    pub description: ElementRef<'a>,
267}
268
269trait OptionExt {
270    fn matches(&self, selector: &Selector) -> bool;
271}
272
273impl OptionExt for Option<&ElementRef<'_>> {
274    fn matches(&self, selector: &Selector) -> bool {
275        self.map(|elem| selector.matches(elem)).unwrap_or(false)
276    }
277}