Skip to main content

vkopt_message_parser/
reader.rs

1use quick_xml::events::{attributes::Attributes, Event};
2use quick_xml::Reader;
3use regex::Regex;
4use std::borrow::Cow;
5use std::path::Path;
6
7lazy_static! {
8    static ref USER_MENTION_RE: Regex = Regex::new(r"\[id\d+\|(?P<name>[^\]]+)\]").unwrap();
9}
10
11#[derive(Debug)]
12pub enum MessageEvent<'a> {
13    Start(u32), // > 0 indicates the nesting level for forwarded messages
14    FullNameExtracted(&'a str),
15    ShortNameExtracted(&'a str),
16    DateExtracted(&'a str),
17    BodyPartExtracted(&'a str),
18    WallPartExtracted(&'a str),
19    RawAttachmentPartExtracted(&'a str),
20    AttachmentExtracted {
21        kind: MessageAttachmentKind,
22        url: &'a str,
23        vk_obj: &'a str,
24        description: &'a str,
25    },
26}
27
/// The type of a message attachment, derived from the class of the
/// attachment's icon element.
///
/// The enum is a plain fieldless value type, so it is `Copy`, totally
/// comparable (`Eq`) and hashable, which lets callers use it as a map or set
/// key (for example to count attachments per kind).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MessageAttachmentKind {
    /// Icon class ending in `_doc`.
    Doc,
    /// Icon class ending in `hoto`.
    Photo,
    /// Icon class ending in `ideo`.
    Video,
    /// Icon class ending in `udio`.
    Audio,
    /// Icon class ending in `cker`.
    Sticker,
    /// Icon class ending in `_geo`.
    Location,
    /// Icon class ending in `wall`.
    Wall,
}
38
/// The reducer's verdict on an event, carrying the updated accumulator.
#[derive(Debug, PartialEq)]
pub enum EventResult<A> {
    /// The event was handled; keep delivering events as usual.
    Consumed(A),
    /// The event was handled, but the rest of the current message is of no
    /// interest: further events (including those of forwarded messages nested
    /// inside it) are suppressed until the next message starts at the same or
    /// a shallower nesting level.
    SkipMessage(A),
}
43
44pub fn fold_html<P, A, F>(path: P, init: A, reducer: F) -> quick_xml::Result<A>
45where
46    P: AsRef<Path>,
47    F: for<'e> FnMut(A, MessageEvent<'e>) -> EventResult<A>,
48{
49    let mut reader = Reader::from_file(path)?;
50    reader.check_end_names(false);
51
52    fold_with_reader(reader, init, reducer)
53}
54
55#[derive(Debug, PartialEq)]
56enum ParseState {
57    Prelude,
58    NoMessage,
59    MessageStart,
60    MessageFullNameStart,
61    MessageFullNameExtracted,
62    MessageShortNameStart,
63    MessageShortNameExtracted,
64    MessageDateStart,
65    MessageDateExtracted,
66    MessageBodyStart,
67    MessageBodyExtracted,
68    MessageAttachmentsPrelude,
69    MessageAttachmentStart,
70    MessageAttachmentHeadStart(MessageAttachmentKind),
71    MessageAttachmentBodyStart(MessageAttachmentKind, String),
72    MessageAttachmentWallBodyStart,
73    MessageAttachmentRawBodyStart,
74    MessageAttachmentEpilogue,
75    MessageForwardedStart,
76    MessageChatActionStart,
77}
78
79struct ParseStateHolder<A, F>
80where
81    F: for<'e> FnMut(A, MessageEvent<'e>) -> EventResult<A>,
82{
83    at: ParseState,
84    msg_level: u32,
85    fwd_closed: bool,
86    skip_level: Option<u32>,
87    acc: A,
88    reducer: F,
89}
90
91impl<A, F> ParseStateHolder<A, F>
92where
93    F: for<'e> FnMut(A, MessageEvent<'e>) -> EventResult<A>,
94{
95    fn advance(&mut self, new_state: ParseState) {
96        self.at = new_state;
97    }
98}
99
// Delivers `$event` to the reducer stored in `$state`, unless the reducer
// previously asked to skip the current message.
//
// While a skip is active (`skip_level` is `Some(level)`), an event is dropped
// when the parser is nested deeper than `level`, or when the parser is not at
// `MessageStart` (i.e. the event is not the start of a new message). A dropped
// event's expression is never evaluated. A delivered event always replaces the
// accumulator; `Consumed` clears the skip, `SkipMessage` (re)arms it at the
// current nesting level.
macro_rules! msg_event {
    ($state: ident, $event: expr) => {{
        let suppressed = match $state.skip_level {
            Some(max_level) => $state.msg_level > max_level || $state.at != MessageStart,
            None => false,
        };
        if !suppressed {
            let (next_acc, next_skip) = match ($state.reducer)($state.acc, $event) {
                EventResult::Consumed(next_acc) => (next_acc, None),
                EventResult::SkipMessage(next_acc) => (next_acc, Some($state.msg_level)),
            };
            $state.acc = next_acc;
            $state.skip_level = next_skip;
        }
    }};
}
118
// Tag query helper.
//
// `q!(tag, b"name")` is true when the tag's name equals `name`.
// `q!(tag, b"name", b"needle")` additionally requires the tag's raw attribute
// bytes to contain `needle`.
macro_rules! q {
    ($node: ident, $name: literal) => {
        $node.name() == $name
    };
    ($node: ident, $name: literal, $needle: literal) => {
        $node.name() == $name && $node.attributes_raw().contains_substring($needle)
    };
}
127
128fn fold_with_reader<B, A, F>(mut reader: Reader<B>, init: A, reducer: F) -> quick_xml::Result<A>
129where
130    B: std::io::BufRead,
131    F: for<'e> FnMut(A, MessageEvent<'e>) -> EventResult<A>,
132{
133    use MessageEvent::*;
134    use ParseState::*;
135
136    let mut buf = Vec::new();
137    let mut state = ParseStateHolder {
138        at: Prelude,
139        msg_level: 0,
140        fwd_closed: false,
141        skip_level: None,
142        acc: init,
143        reducer,
144    };
145
146    loop {
147        match reader.read_event(&mut buf) {
148            Ok(Event::Start(ref e)) => match state.at {
149                // There's an <hr> tag right before the first msg_item
150                Prelude if q!(e, b"hr") => state.advance(NoMessage),
151                NoMessage | MessageBodyExtracted if q!(e, b"div", b"\"msg_item\"") => {
152                    state.advance(MessageStart);
153                    msg_event!(state, Start(state.msg_level));
154                }
155                MessageStart if q!(e, b"b") => {
156                    state.advance(MessageFullNameStart);
157                }
158                MessageFullNameExtracted if q!(e, b"a") => {
159                    state.advance(MessageShortNameStart);
160                }
161                MessageDateExtracted if q!(e, b"div", b"\"msg_body\"") => {
162                    state.advance(MessageBodyStart);
163                }
164                MessageBodyStart if q!(e, b"img", b"\"emoji\"") => {
165                    if let Some(alt) = get_attr(&mut e.attributes(), b"alt") {
166                        msg_event!(state, BodyPartExtracted(reader.decode(&alt)?));
167                    }
168                }
169                MessageDateExtracted if q!(e, b"div") && e.attributes_raw().is_empty() => {
170                    state.advance(MessageChatActionStart);
171                }
172                MessageDateExtracted | MessageBodyExtracted if q!(e, b"div", b"\"attacments\"") => {
173                    state.advance(MessageAttachmentsPrelude)
174                }
175                MessageAttachmentsPrelude | MessageBodyExtracted
176                    if q!(e, b"div", b"\"attacment\"") =>
177                {
178                    state.advance(MessageAttachmentStart)
179                }
180                MessageAttachmentsPrelude | MessageBodyExtracted
181                    if q!(e, b"div", b"\"att_head\"") =>
182                {
183                    state.advance(MessageForwardedStart)
184                }
185                MessageAttachmentStart if q!(e, b"div", b"att_ico") => {
186                    // Matching the last four symbols of the class only -- why? Just for lulz
187                    let attrs = e.attributes_raw();
188                    let kind = match &attrs[attrs.len() - 5..attrs.len() - 1] {
189                        b"_doc" => MessageAttachmentKind::Doc,
190                        b"udio" => MessageAttachmentKind::Audio,
191                        b"ideo" => MessageAttachmentKind::Video,
192                        b"hoto" => MessageAttachmentKind::Photo,
193                        b"cker" => MessageAttachmentKind::Sticker,
194                        b"_geo" => MessageAttachmentKind::Location,
195                        b"wall" => MessageAttachmentKind::Wall,
196                        _ => panic!("Unsupported attachment container: {:?}", e),
197                    };
198                    state.advance(MessageAttachmentHeadStart(kind));
199                }
200                MessageAttachmentStart if q!(e, b"pre") => {
201                    state.advance(MessageAttachmentRawBodyStart);
202                }
203                MessageAttachmentHeadStart(kind) if q!(e, b"a") => {
204                    let mut attrs = e.attributes();
205                    let href = get_attr(&mut attrs, b"href").unwrap_or(Cow::Borrowed(&[]));
206                    let src = reader.decode(&href)?.to_owned();
207                    state.advance(MessageAttachmentBodyStart(kind, src));
208                }
209                MessageAttachmentEpilogue if q!(e, b"div", b"\"att_wall_text\"") => {
210                    state.advance(MessageAttachmentWallBodyStart);
211                }
212                MessageForwardedStart if q!(e, b"div", b"\"fwd\"") => {
213                    state.msg_level += 1;
214                    state.fwd_closed = false;
215                    state.advance(NoMessage);
216                }
217                _ => {}
218            },
219            Ok(Event::Text(e)) => match state.at {
220                MessageFullNameStart => {
221                    state.advance(MessageFullNameExtracted);
222                    msg_event!(state, FullNameExtracted(reader.decode(e.escaped())?));
223                }
224                MessageShortNameStart => {
225                    state.advance(MessageShortNameExtracted);
226                    msg_event!(
227                        state,
228                        ShortNameExtracted(&reader.decode(e.escaped())?[1..]) // skip the leading @
229                    );
230                }
231                MessageDateStart => {
232                    let maybe_date = e.escaped().trim();
233                    if !maybe_date.is_empty() {
234                        state.advance(MessageDateExtracted);
235                        msg_event!(state, DateExtracted(reader.decode(maybe_date)?));
236                    }
237                }
238                MessageBodyStart => {
239                    let unescaped = &e.unescaped().unwrap_or(Cow::from(e.escaped()));
240                    let text = reader.decode(&unescaped)?;
241                    if text.contains('[') {
242                        let re_text = USER_MENTION_RE.replace_all(text, "$name");
243                        msg_event!(state, BodyPartExtracted(&re_text));
244                    } else if !text.is_empty() {
245                        msg_event!(state, BodyPartExtracted(&text));
246                    }
247                }
248                MessageAttachmentBodyStart(kind, ref url) => {
249                    let unescaped = &e.unescaped().unwrap_or(Cow::from(e.escaped()));
250                    let info = reader.decode(&unescaped)?.trim();
251                    let (vk_obj, description) = if info.starts_with('[') {
252                        let mut info_split = info[1..].splitn(2, ']');
253                        let vk_obj = info_split.next().unwrap_or("");
254                        let description = info_split.next().unwrap_or("").trim();
255                        (vk_obj, description)
256                    } else {
257                        ("", info)
258                    };
259                    msg_event!(
260                        state,
261                        AttachmentExtracted {
262                            kind,
263                            url,
264                            vk_obj,
265                            description
266                        }
267                    );
268                    state.advance(MessageAttachmentEpilogue);
269                }
270                MessageAttachmentRawBodyStart => {
271                    let unescaped = &e.unescaped().unwrap_or(Cow::from(e.escaped()));
272                    let data = reader.decode(&unescaped)?;
273                    msg_event!(state, RawAttachmentPartExtracted(&data));
274                }
275                MessageAttachmentWallBodyStart => {
276                    let unescaped = &e.unescaped().unwrap_or(Cow::from(e.escaped()));
277                    let text = reader.decode(&unescaped)?;
278                    msg_event!(state, WallPartExtracted(&text));
279                }
280                _ => (),
281            },
282            Ok(Event::Empty(ref e)) => match state.at {
283                MessageBodyStart if q!(e, b"br") => {
284                    msg_event!(state, BodyPartExtracted("\n"));
285                }
286                _ => (),
287            },
288            Ok(Event::End(ref e)) => match state.at {
289                MessageShortNameExtracted => state.advance(MessageDateStart),
290                MessageBodyStart
291                | MessageAttachmentWallBodyStart
292                | MessageChatActionStart
293                | MessageAttachmentEpilogue
294                    if q!(e, b"div") =>
295                {
296                    state.advance(MessageBodyExtracted);
297                }
298                MessageAttachmentRawBodyStart if q!(e, b"pre") => {
299                    state.advance(MessageBodyExtracted)
300                }
301                MessageBodyExtracted if q!(e, b"div") => {
302                    state.advance(NoMessage);
303                }
304                NoMessage if q!(e, b"div") => {
305                    if state.msg_level > 0 {
306                        if !state.fwd_closed {
307                            state.fwd_closed = true;
308                        } else {
309                            state.msg_level -= 1;
310                            state.fwd_closed = false;
311                        }
312                    }
313                }
314                _ => {}
315            },
316            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
317            Ok(Event::Eof) => break,
318            _ => (),
319        }
320        buf.clear();
321    }
322    Ok(state.acc)
323}
324
325fn get_attr<'a>(attrs: &'a mut Attributes, key: &[u8]) -> Option<Cow<'a, [u8]>> {
326    attrs.with_checks(false).find_map(|ar| match ar {
327        Ok(a) if a.key == key => Some(a.value),
328        _ => None,
329    })
330}
331
// Based on https://stackoverflow.com/a/31102496/1726690
/// Small text-like helpers for raw byte slices.
trait RawText {
    /// Returns the sub-slice left after removing leading and trailing spaces,
    /// carriage returns and line feeds. Other bytes (tabs included) are kept.
    fn trim(&self) -> &Self;

    /// Returns `true` when `sub` occurs in `self` as a contiguous run of
    /// bytes. An empty `sub` is contained in every slice.
    fn contains_substring(&self, sub: &[u8]) -> bool;
}

impl RawText for [u8] {
    fn trim(&self) -> &[u8] {
        fn is_not_whitespace(c: &u8) -> bool {
            *c != b' ' && *c != b'\r' && *c != b'\n'
        }

        match self.iter().position(is_not_whitespace) {
            Some(first) => {
                // A non-whitespace byte exists, so the reverse search finds
                // one too; `first` is only a formal fallback.
                let last = self
                    .iter()
                    .rposition(is_not_whitespace)
                    .unwrap_or(first);
                &self[first..=last]
            }
            None => &[],
        }
    }

    fn contains_substring(&self, sub: &[u8]) -> bool {
        // `windows(0)` panics, so the empty needle is answered up front.
        if sub.is_empty() {
            return true;
        }
        // `windows` yields nothing when the needle is longer than the
        // haystack, and always terminates.
        self.windows(sub.len()).any(|window| window == sub)
    }
}