hrx_parser/
lib.rs

/// Errors produced while scanning or parsing HRX source text.
///
/// All `start`/`end` fields are byte offsets into the source being parsed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Error {
    /// Something was found at the cursor, but not what was required.
    UnexpectedToken {
        /// Description of what was required.
        expected: String,
        /// Text that was found instead.
        actual: String,
        /// Offset at which the offending text starts.
        start: usize,
        /// Offset one past the offending text.
        end: usize,
    },
    /// The source ended while more input was required.
    UnexpectedEof {
        /// Description of what was required.
        expected: String,
        /// Offset at which the end of input was hit.
        start: usize,
    },
    /// The input was well-formed locally but violates a rule of the format.
    Invalid {
        /// Human-readable description of the violated rule.
        message: String,
        /// Offset at which the invalid span starts.
        start: usize,
        /// Offset one past the invalid span.
        end: usize,
    },
}

impl std::fmt::Display for Error {
    /// Writes a single-line message.
    ///
    /// No trailing newline is emitted, so the output composes with `println!`,
    /// `format!` and error-reporting wrappers without producing blank lines.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::UnexpectedToken {
                actual, expected, ..
            } => write!(f, "Expected {}, found {}", expected, actual),
            Error::UnexpectedEof { expected, .. } => {
                write!(f, "Expected {}, found Eof", expected)
            }
            Error::Invalid { message, .. } => f.write_str(message),
        }
    }
}

// Lets callers use `?` into `Box<dyn std::error::Error>` and similar wrappers.
impl std::error::Error for Error {}
33
/// Byte-oriented cursor over HRX source text.
#[derive(Debug, Clone)]
pub struct Parser<'a> {
    /// Text being parsed.
    source: &'a str,
    /// Number of `=` characters in the archive's boundary; `0` until the
    /// first boundary has been scanned.
    boundary_length: usize,
    /// Current byte offset into `source`.
    pos: usize,
}
39
40impl<'a> Parser<'a> {
41    pub fn new(source: &'a str) -> Self {
42        Self {
43            source,
44            boundary_length: 0,
45            pos: 0,
46        }
47    }
48
49    pub fn cur_byte(&self) -> Option<u8> {
50        self.source.as_bytes().get(self.pos).map(|item| *item)
51    }
52
53    pub fn eof(&self) -> bool {
54        self.pos == self.source.len()
55    }
56
57    /// ```ignore
58    /// newline ::= U+000A LINE FEED
59    /// ```
60    pub fn is_line_feed(&self) -> bool {
61        self.cur_byte() == Some(b'\n')
62    }
63
64    pub fn pos(&self) -> usize {
65        self.pos
66    }
67
68    pub fn scan_boundary(&mut self) -> Result<(usize, usize), Error> {
69        let start = self.pos;
70        let mut eq_len = 1;
71        self.expect_byte(b'<')?;
72        self.expect_byte(b'=')?;
73        while self.cur_byte() == Some(b'=') {
74            eq_len += 1;
75            self.pos += 1;
76        }
77        self.expect_byte(b'>')?;
78
79        let end = self.pos;
80        if self.boundary_length == 0 {
81            self.boundary_length = eq_len;
82        } else if self.boundary_length != eq_len {
83            return Err(Error::Invalid {
84                message: "Un matched boundary length".to_string(),
85                start,
86                end,
87            });
88        }
89        Ok((start, end))
90    }
91
92    /// ```ignore
93    /// contents ::= any sequence of characters that neither begins with boundary nor
94    /// includes U+000A LINE FEED followed immediately by boundary
95    /// ```
96    pub fn scan_contents(&mut self) -> Result<(usize, usize), Error> {
97        let start = self.pos;
98        if self.cur_byte() == Some(b'<') {
99            match self.scan_boundary() {
100                // rewind
101                Ok(_) => {
102                    self.pos = start;
103                    return Ok((start, start));
104                }
105                Err(_) => {}
106            }
107        }
108        loop {
109            if self.eof() {
110                break;
111            }
112            if self.cur_byte() == Some(b'\n') {
113                self.pos += 1;
114                if self.cur_byte() == Some(b'<') {
115                    let checkpoint = self.pos;
116                    match self.scan_boundary() {
117                        // rewind
118                        Ok(_) => {
119                            self.pos = checkpoint;
120                            break;
121                        }
122                        Err(_) => {}
123                    }
124                }
125            } else {
126                self.pos += 1;
127            }
128        }
129        let end = self.pos;
130        Ok((start, end))
131    }
132
133    pub fn expect_byte(&mut self, byte: u8) -> Result<(), Error> {
134        if self.cur_byte() != Some(byte) {
135            if self.eof() {
136                return Err(Error::UnexpectedEof {
137                    expected: (byte as char).to_string(),
138                    // actual: (self.cur_byte() as char).to_string(),
139                    start: self.pos,
140                });
141            }
142            return Err(Error::UnexpectedToken {
143                expected: (byte as char).to_string(),
144                // SAFETY: we know that we don't reach the eof yet
145                actual: (self.cur_byte().unwrap() as char).to_string(),
146                start: self.pos,
147                end: self.pos + 1,
148            });
149        }
150        self.pos += 1;
151        Ok(())
152    }
153
154    pub fn scan_path_component(&mut self) -> Result<(usize, usize, &'a str), Error> {
155        let start = self.pos;
156        // SAFETY: we know that we don't reach eof yet
157        while !self.eof() && self.is_path_component(self.cur_byte().unwrap()) {
158            self.pos += 1;
159        }
160        let end = self.pos;
161        let source = &self.source[start..end];
162        if source.len() == 0 || source == "." || source == ".." {
163            if self.eof() {
164                return Err(Error::UnexpectedEof {
165                    expected: "path".to_string(),
166                    start: self.pos,
167                });
168            } else {
169                return Err(Error::UnexpectedToken {
170                    expected: "path".to_string(),
171                    actual: source.to_string(),
172                    start,
173                    end,
174                });
175            }
176        }
177        Ok((start, end, source))
178    }
179
180    /// ```ignore
181    /// path-character ::= any character other than U+0000 through U+001F, U+007F DELETE, U+002F
182    /// SOLIDUS, U+003A COLON, or U+005C REVERSE SOLIDUS
183    /// ```
184    #[inline]
185    fn is_path_component(&self, byte: u8) -> bool {
186        !matches!(byte, 0..=0x1f | 0x7f | 0x2f | 0x3a | 0x5c)
187    }
188}
189
/// Payload of an [`Entry`]: either a file or a directory.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FileOrDirectory<'a> {
    File(File<'a>),
    Directory(Directory<'a>),
}

/// A file entry.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct File<'a> {
    /// Byte offset at which the span starts.
    pub start: usize,
    /// Byte offset one past the span.
    pub end: usize,
    /// The file's body, if one was parsed.
    pub body: Option<Body<'a>>,
    /// The file's path.
    pub path: Path<'a>,
}

/// A path as written in the archive.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Path<'a> {
    /// Byte offset at which the path starts.
    pub start: usize,
    /// Byte offset one past the path.
    pub end: usize,
    /// The path text.
    pub source: &'a str,
}

/// A directory entry.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Directory<'a> {
    /// Byte offset at which the span starts.
    pub start: usize,
    /// Byte offset one past the span.
    pub end: usize,
    /// The directory's path, without the trailing `/`.
    pub path: Path<'a>,
}

/// A whole archive: its entries plus an optional trailing comment.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Archive<'a> {
    /// Byte offset at which the archive starts.
    pub start: usize,
    /// Byte offset one past the archive.
    pub end: usize,
    /// Entries in source order.
    pub entries: Vec<Entry<'a>>,
    /// Comment following the last entry, if any.
    pub comment: Option<Comment<'a>>,
}

/// One archive entry: an optional leading comment and a file or directory.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Entry<'a> {
    /// Byte offset at which the entry starts.
    pub start: usize,
    /// Byte offset one past the entry.
    pub end: usize,
    /// Comment preceding the file or directory, if any.
    pub comment: Option<Comment<'a>>,
    /// The file or directory itself.
    pub body: FileOrDirectory<'a>,
}

impl<'a> Entry<'a> {
    /// Returns the entry's path; directory paths get a trailing `/`.
    pub fn path(&self) -> String {
        match &self.body {
            FileOrDirectory::File(file) => file.path.source.to_string(),
            FileOrDirectory::Directory(dir) => format!("{}/", dir.path.source),
        }
    }

    /// Returns the body text of a file entry, or `None` for a directory or a
    /// file without a body.
    pub fn content(&self) -> Option<String> {
        match &self.body {
            FileOrDirectory::File(file) => file.body.as_ref().map(|body| body.source.to_string()),
            FileOrDirectory::Directory(_) => None,
        }
    }
}

/// A comment: a boundary on its own line followed by a body.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Comment<'a> {
    /// Byte offset at which the comment starts.
    pub start: usize,
    /// Byte offset one past the comment.
    pub end: usize,
    /// Full text of the comment.
    pub source: &'a str,
    /// The boundary that introduces the comment.
    pub boundary: Boundary,
    /// The comment's body.
    pub body: Body<'a>,
}

/// The body of a file or comment.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Body<'a> {
    /// Byte offset at which the body starts.
    pub start: usize,
    /// Byte offset one past the body.
    pub end: usize,
    /// The body text.
    pub source: &'a str,
}

/// Location of a boundary marker.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Boundary {
    /// Byte offset at which the boundary starts.
    pub start: usize,
    /// Byte offset one past the boundary.
    pub end: usize,
}
270
271pub fn parse_archive<'a>(p: &mut Parser<'a>) -> Result<Archive<'a>, Error> {
272    let start = p.pos;
273    let mut entries = vec![];
274    loop {
275        let checkpoint = p.pos;
276        match parse_entry(p) {
277            Ok(entry) => {
278                entries.push(entry);
279            }
280            Err(err) => {
281                p.pos = checkpoint;
282                match err {
283                    Error::UnexpectedToken { .. } => {}
284                    Error::UnexpectedEof { .. } => {}
285                    Error::Invalid { .. } => {
286                        return Err(err);
287                    }
288                }
289                break;
290            }
291        }
292    }
293    dbg!(&p.source[p.pos..]);
294    let comment = parse_comment(p).ok();
295    dbg!(&comment);
296    if !p.eof() {
297        return Err(Error::UnexpectedToken {
298            expected: "Eof eof".to_string(),
299            // SAFETY: we know that we don't reach eof yet
300            actual: p.cur_byte().unwrap().to_string(),
301            start: p.pos,
302            end: p.pos + 1,
303        });
304    }
305    Ok(Archive {
306        start,
307        end: p.pos,
308        entries,
309        comment,
310    })
311    // let
312}
313
314pub fn parse_entry<'a>(p: &mut Parser<'a>) -> Result<Entry<'a>, Error> {
315    let start = p.pos;
316    let comment = parse_comment(p).ok();
317    if comment.is_none() {
318        p.pos = start;
319    }
320    parse_boundary(p)?;
321    p.expect_byte(b' ')?;
322    let path = parse_path(p)?;
323
324    match p.cur_byte() {
325        Some(b'/') => {
326            p.expect_byte(b'/')?;
327            while !p.eof() && p.is_line_feed() {
328                p.pos += 1;
329            }
330            let end = p.pos;
331            if !p.eof() && p.cur_byte() != Some(b'<') {
332                return Err(Error::Invalid {
333                    message: "A directory can't have text contents.".to_string(),
334                    start: p.pos,
335                    end: p.pos + 1,
336                });
337            }
338            return Ok(Entry {
339                start,
340                end,
341                comment,
342                body: FileOrDirectory::Directory(Directory { start, end, path }),
343            });
344        }
345        Some(b'\n') => {
346            p.expect_byte(b'\n')?;
347            let checkpoint = p.pos;
348
349            let body = parse_body(p).ok();
350            if body.is_none() {
351                p.pos = checkpoint;
352            }
353            let end = p.pos;
354            return Ok(Entry {
355                start,
356                end,
357                comment,
358                body: FileOrDirectory::File(File {
359                    start,
360                    end,
361                    body,
362                    path,
363                }),
364            });
365        }
366        _ if p.eof() => {
367            return Err(Error::UnexpectedEof {
368                expected: "`/` or `\n`".to_string(),
369                start: p.pos,
370            })
371        }
372        _ => {
373            return Err(Error::UnexpectedToken {
374                expected: "`/` or `\n`".to_string(),
375                // SAFETY: we know that we don't reach the eof yet.
376                actual: p.cur_byte().unwrap().to_string(),
377                start,
378                end: p.pos,
379            });
380        }
381    }
382}
383
384pub fn parse_comment<'a>(p: &mut Parser<'a>) -> Result<Comment<'a>, Error> {
385    let start = p.pos();
386    let boundary = parse_boundary(p)?;
387    p.expect_byte(b'\n')?;
388    let body = parse_body(p)?;
389    let end = p.pos;
390    Ok(Comment {
391        start,
392        end,
393        source: &p.source[start..end],
394        boundary,
395        body,
396    })
397}
398
399fn parse_path<'a>(p: &mut Parser<'a>) -> Result<Path<'a>, Error> {
400    let (start, mut end, _) = p.scan_path_component()?;
401
402    while !p.eof() && p.cur_byte() == Some(b'/') {
403        let checkpoint = p.pos;
404        p.expect_byte(b'/')?;
405        match p.scan_path_component() {
406            Ok((_, e, ..)) => {
407                end = e;
408            }
409            // rewind, maybe this is end `/` separator of directory
410            Err(_) => {
411                p.pos = checkpoint;
412                break;
413            }
414        }
415    }
416
417    Ok(Path {
418        start,
419        end,
420        source: &p.source[start..end],
421    })
422}
423
424fn parse_body<'a>(p: &mut Parser<'a>) -> Result<Body<'a>, Error> {
425    let (start, end) = p.scan_contents()?;
426    Ok(Body {
427        start,
428        end,
429        source: &p.source[start..end],
430    })
431}
432
433pub fn parse_boundary<'a>(p: &mut Parser<'a>) -> Result<Boundary, Error> {
434    let (start, end) = p.scan_boundary()?;
435    Ok(Boundary { start, end })
436}
437
438pub fn parse(source: &str) -> Result<Archive, Error> {
439    let mut parser = Parser::new(source);
440    parse_archive(&mut parser)
441}