linux_ls_parser/
lib.rs

1#![cfg_attr(docsrs, feature(doc_cfg))]
2#![doc = include_str!("../README.md")]
3
/// Parsed output of the `ls -lpa` command.
///
/// Both lists are sorted by name. The common traits are derived so that results can be
/// printed, copied and compared (for example in tests) without extra boilerplate.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct LsOutput {
    /// Sorted list of files
    pub files: Vec<LsOutputFile>,
    /// Sorted list of folders
    pub folders: Vec<String>,
}

/// A single file entry of the listing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LsOutputFile {
    /// File name
    pub name: String,
    /// File size in bytes
    pub size_bytes: i64,
}
19
20/// Parsing error with the offending input line.
21#[derive(Debug, Clone, PartialEq, Eq)]
22pub struct Error {
23    /// Specific parsing failure.
24    pub kind: ErrorKind,
25    /// The line that failed to parse.
26    pub line: String,
27}
28
/// Reasons a single line of `ls -lpa` output can fail to parse.
///
/// The `Missing*` variants name the column that was absent from the line.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ErrorKind {
    /// The line has no file mode column.
    MissingFileMode,
    /// The line has no link count column.
    MissingLinkCount,
    /// The line has no owner column.
    MissingOwner,
    /// The line has no group column.
    MissingGroup,
    /// The line has no size column.
    MissingSize,
    /// The size column is present but is not a number.
    InvalidSize {
        /// The text found in the size column.
        token: String,
    },
    /// The line has no timestamp month column.
    MissingMonth,
    /// The line has no timestamp day column.
    MissingDay,
    /// The line has no timestamp time or year column.
    MissingTimestamp,
    /// The line has no file or directory name.
    MissingName,
    /// The name is a pair of quotes with nothing between them.
    EmptyQuotedName,
    /// A quoted name ends in the middle of an escape sequence.
    InvalidEscapeSequence,
}
60
61impl std::str::FromStr for LsOutput {
62    type Err = Error;
63
64    fn from_str(s: &str) -> Result<Self, Self::Err> {
65        let mut files = Vec::new();
66        let mut folders = Vec::new();
67        let input = s
68            .strip_prefix("\\\r\n")
69            .or_else(|| s.strip_prefix("\\\n"))
70            .unwrap_or(s);
71
72        for raw_line in input.lines() {
73            let line = raw_line.trim();
74
75            let parsed = parse_line(line).map_err(|kind| Error::new(kind, line.to_string()))?;
76
77            if let Some(parsed) = parsed {
78                match parsed {
79                    ParsedLine::File(file) => files.push(file),
80                    ParsedLine::Folder(folder) => folders.push(folder),
81                }
82            }
83        }
84
85        files.sort_by(|a, b| a.name.cmp(&b.name));
86        folders.sort();
87
88        Ok(Self { files, folders })
89    }
90}
91
92fn unescape_double_quoted(input: &str) -> Result<String, ErrorKind> {
93    let mut result = String::with_capacity(input.len());
94    let mut chars = input.chars();
95
96    while let Some(ch) = chars.next() {
97        if ch == '\\' {
98            let escaped = chars.next().ok_or(ErrorKind::InvalidEscapeSequence)?;
99            result.push(match escaped {
100                'n' => '\n',
101                'r' => '\r',
102                't' => '\t',
103                other => other,
104            });
105        } else {
106            result.push(ch);
107        }
108    }
109
110    Ok(result)
111}
112
113fn parse_name(raw: &str) -> Result<String, ErrorKind> {
114    if raw.is_empty() {
115        return Err(ErrorKind::MissingName);
116    }
117
118    if raw.len() >= 2 {
119        let bytes = raw.as_bytes();
120        if bytes[0] == b'"' && bytes[raw.len() - 1] == b'"' {
121            let value = unescape_double_quoted(&raw[1..raw.len() - 1])?;
122            if value.is_empty() {
123                return Err(ErrorKind::EmptyQuotedName);
124            }
125            return Ok(value);
126        }
127
128        if bytes[0] == b'\'' && bytes[raw.len() - 1] == b'\'' {
129            let value = &raw[1..raw.len() - 1];
130            if value.is_empty() {
131                return Err(ErrorKind::EmptyQuotedName);
132            }
133            return Ok(value.to_string());
134        }
135    }
136
137    Ok(raw.to_string())
138}
139
/// Entry extracted from a single line of the listing.
enum ParsedLine {
    /// A non-directory entry with its name and size.
    File(LsOutputFile),
    /// A directory, identified by its name.
    Folder(String),
}
144
145fn parse_line(line: &str) -> Result<Option<ParsedLine>, ErrorKind> {
146    if line.is_empty() || line.starts_with("total ") {
147        return Ok(None);
148    }
149
150    let mut parts = line.split_whitespace();
151    let file_mode = parts.next().ok_or(ErrorKind::MissingFileMode)?;
152    if file_mode.len() == 10 {
153        match file_mode.as_bytes()[0] {
154            b'l' => return Ok(None), // skip symlinks
155            b'b' => return Ok(None), // skip block devices
156            b'c' => return Ok(None), // skip char devices
157            _ => {}
158        }
159    }
160
161    // Skip link count, owner and group info. We only care about size.
162    parts.next().ok_or(ErrorKind::MissingLinkCount)?;
163    parts.next().ok_or(ErrorKind::MissingOwner)?;
164    parts.next().ok_or(ErrorKind::MissingGroup)?;
165
166    let size_token = parts.next().ok_or(ErrorKind::MissingSize)?;
167    let size: i64 = size_token.parse().map_err(|_| ErrorKind::InvalidSize {
168        token: size_token.to_string(),
169    })?;
170
171    // Skip month, day and time/year columns.
172    parts.next().ok_or(ErrorKind::MissingMonth)?;
173    parts.next().ok_or(ErrorKind::MissingDay)?;
174    parts.next().ok_or(ErrorKind::MissingTimestamp)?;
175
176    let mut raw_name = parts.collect::<Vec<_>>().join(" ");
177    if raw_name.is_empty() {
178        return Err(ErrorKind::MissingName);
179    }
180
181    let is_directory = raw_name.ends_with('/');
182    if is_directory {
183        while raw_name.ends_with('/') {
184            raw_name.pop();
185        }
186    }
187
188    let name = parse_name(&raw_name)?;
189
190    if name == "." || name == ".." {
191        return Ok(None);
192    }
193
194    if is_directory {
195        if name.is_empty() {
196            return Ok(None);
197        }
198
199        Ok(Some(ParsedLine::Folder(name)))
200    } else {
201        Ok(Some(ParsedLine::File(LsOutputFile {
202            name,
203            size_bytes: size,
204        })))
205    }
206}
207
208impl Error {
209    fn new(kind: ErrorKind, line: String) -> Self {
210        Self { kind, line }
211    }
212}
213
214impl std::fmt::Display for ErrorKind {
215    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
216        match self {
217            Self::MissingFileMode => write!(f, "missing file mode field"),
218            Self::MissingLinkCount => write!(f, "missing link count field"),
219            Self::MissingOwner => write!(f, "missing owner field"),
220            Self::MissingGroup => write!(f, "missing group field"),
221            Self::MissingSize => write!(f, "missing size field"),
222            Self::InvalidSize { token } => write!(f, "invalid size value `{token}`"),
223            Self::MissingMonth => write!(f, "missing timestamp month field"),
224            Self::MissingDay => write!(f, "missing timestamp day field"),
225            Self::MissingTimestamp => write!(f, "missing timestamp time or year field"),
226            Self::MissingName => write!(f, "missing file name"),
227            Self::EmptyQuotedName => write!(f, "empty quoted file name"),
228            Self::InvalidEscapeSequence => write!(f, "unterminated escape sequence in file name"),
229        }
230    }
231}
232
233impl std::fmt::Display for Error {
234    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
235        write!(f, "{} in line `{}`", self.kind, self.line)
236    }
237}
238
// Marker implementation: `Error` uses the default methods of `std::error::Error` and
// relies on its `Debug` and `Display` implementations.
impl std::error::Error for Error {}
240
// Unit tests for the `FromStr` implementation of `LsOutput`.
//
// Inputs written as raw strings start with a literal `\` followed by a line break; the
// parser strips that prefix before reading lines.
#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use super::*;

    // Expected `(name, size)` pairs for `edge_case_files`, listed in the sorted order the
    // parser returns. `edge_case_folders` reuses the names only.
    // The names are what the parser currently produces: only the outermost pair of quotes
    // is removed and runs of whitespace are collapsed to single spaces.
    const EDGE_CASE_FILE_ENTRIES: [(&str, i64); 42] = [
        (r" -space-dash-", 13),
        (r" multiple consecutive spaces ", 34),
        (r"!exclamation!mark!", 18),
        (r#""double"quote""#, 14),
        (r"#hash#tag#", 10),
        (r"$dollar$sign$", 13),
        (r"%percent%value%", 15),
        (r"&ampersand&symbol&", 18),
        (r"'$'\n''newline'$'\n''line", 17),
        (r"'$'\r''return'$'\r''carriage'$'\r", 20),
        (r"'$'\t''tab'$'\t''indent'$'\t", 15),
        (r"'single'quote'", 14),
        (r"(paren(open(", 12),
        (r")paren)close)", 13),
        (r"*asterisk*star*", 15),
        (r"+plus+sign+", 11),
        (r",comma,list,", 12),
        (r"---dash---triple---", 19),
        (r"-hyphen-entry-", 14),
        (r"..double..dot..", 15),
        (r".hidden. with spaces.", 21),
        (r":colon:case:", 12),
        (r";semicolon;case;", 16),
        (r"<less<than<", 11),
        (r"=equals=case=", 13),
        (r">greater>than>", 14),
        (r"?question?mark?", 15),
        (r"@at@symbol@", 11),
        (r"[bracket[left[", 14),
        (r"\backslash\path\", 19),
        (r"\x20space\x20pad\x20", 20),
        (r"]bracket]right]", 15),
        (r"^caret^symbol^", 14),
        (r"_underscore_label_", 18),
        (r"`backtick`quote`", 16),
        (r"{brace{left{", 12),
        (r"|pipe|vertical|", 15),
        (r"}brace}right}", 13),
        (r"~tilde~wave~", 12),
        (r"файл", 8),
        (r"文件", 6),
        (r"🚀rocket🚀ship🚀", 22),
    ];

    // Directories are returned sorted and without the trailing `/`; `.` and `..` are dropped.
    #[test]
    fn folders() {
        let input = "\
total 16
drwxr-xr-x  5 user user  4096 Jan  1 12:00 ./
drwxr-xr-x  2 user user  4096 Jan  1 12:01 ../
drwxr-xr-x  4 user user  4096 Jan  1 12:02 zeta/
drwxr-xr-x  4 user user  4096 Jan  1 12:02 alpha/
";

        let output = LsOutput::from_str(input).unwrap();

        assert_eq!(output.folders.len(), 2);
        assert_eq!(output.files.len(), 0);
        assert_eq!(output.folders, vec!["alpha", "zeta"]);
    }

    // Regular files are returned sorted by name with their sizes. A regular file whose
    // name contains ` -> ` is kept as a file with the full name.
    #[test]
    fn files() {
        let input = "\
total 12
drwxr-xr-x  5 root root 4096 Jan  1 00:00 ./
drwxr-xr-x  5 root root 4096 Jan  1 00:00 ../
-rw-r--r--  1 root root   16 Jan  1 00:01 arrow -> name
-rw-r--r--  1 root root   16 Jan  1 00:01 notes.txt
-rw-r--r--  1 root root    8 Jan  1 00:02 .hidden
";

        let output = LsOutput::from_str(input).unwrap();

        assert_eq!(output.folders.len(), 0);
        assert_eq!(output.files.len(), 3);
        let files: Vec<(&str, i64)> = output
            .files
            .iter()
            .map(|f| (f.name.as_str(), f.size_bytes))
            .collect();
        assert_eq!(
            files,
            vec![(".hidden", 8), ("arrow -> name", 16), ("notes.txt", 16)]
        );
    }

    // Lines whose mode starts with `l` produce neither a file nor a folder.
    #[test]
    fn ignores_symlinks() {
        let input = "\
lrwxrwxrwx  1 user user     6 Jan  1 12:04 link -> target
";

        let output: LsOutput = input.parse().unwrap();
        assert_eq!(output.folders.len(), 0);
        assert_eq!(output.files.len(), 0);
    }

    // Block (`b`) and character (`c`) device lines are skipped before their
    // `major, minor` size column is read, so they do not cause an error.
    #[test]
    fn ignores_device_files() {
        let input = "\
brw-rw----  1 root disk 8, 0 Jan  1 12:00 sda
crw-rw----  1 root disk 8, 1 Jan  1 12:00 sda1
";

        let output: LsOutput = input.parse().unwrap();
        assert_eq!(output.folders.len(), 0);
        assert_eq!(output.files.len(), 0);
    }

    // Non-ASCII names are preserved for both folders and files.
    #[test]
    fn unicode_names() {
        let input = "\
drwxrwxr-x 2 imbolc imbolc 4096 Oct 14 10:43 пора/
-rw-rw-r-- 1 imbolc imbolc    0 Oct 14 10:43 спать
";

        let output: LsOutput = input.parse().unwrap();
        assert_eq!(output.folders.len(), 1);
        assert_eq!(output.folders[0], "пора");
        assert_eq!(output.files.len(), 1);
        assert_eq!(output.files[0].name, "спать");
    }

    // Names containing spaces arrive quoted (double or single quotes) and are unquoted.
    #[test]
    fn spaces() {
        let input = r#"\
drwxrwxr-x 2 imbolc imbolc 4096 Oct 14 10:49 "let's play"/
-rw-rw-r-- 1 imbolc imbolc    0 Oct 14 10:50 'давай играть'
"#;

        let output: LsOutput = input.parse().unwrap();
        assert_eq!(output.folders.len(), 1);
        assert_eq!(output.folders[0], "let's play");
        assert_eq!(output.files.len(), 1);
        assert_eq!(output.files[0].name, "давай играть");
    }

    // The error exposes the failing line both in its `line` field and in its message.
    #[test]
    fn error_includes_offending_line() {
        let err = match "broken line".parse::<LsOutput>() {
            Err(err) => err,
            Ok(_) => panic!("expected error"),
        };
        assert!(err.to_string().contains("broken line"));
        assert_eq!(err.line, "broken line");
    }

    // A line without the expected columns is rejected instead of being skipped.
    #[test]
    fn rejects_malformed_line() {
        assert!("broken line".parse::<LsOutput>().is_err());
    }

    // Tests files generated by ./edge-case-samples.sh
    // The parsed `(name, size)` pairs must equal `EDGE_CASE_FILE_ENTRIES` exactly.
    #[test]
    fn edge_case_files() {
        let input = r#"\
total 176
drwxrwxr-x 2 imbolc imbolc 4096 Oct 15 12:05  ./
drwxrwxr-x 4 imbolc imbolc 4096 Oct 15 12:05  ../
-rw-rw-r-- 1 imbolc imbolc   13 Oct 15 12:05 '$dollar$sign$'
-rw-rw-r-- 1 imbolc imbolc   18 Oct 15 12:05 '&ampersand&symbol&'
-rw-rw-r-- 1 imbolc imbolc   15 Oct 15 12:05 '*asterisk*star*'
-rw-rw-r-- 1 imbolc imbolc   11 Oct 15 12:05  @at@symbol@
-rw-rw-r-- 1 imbolc imbolc   19 Oct 15 12:05 '\backslash\path\'
-rw-rw-r-- 1 imbolc imbolc   16 Oct 15 12:05 '`backtick`quote`'
-rw-rw-r-- 1 imbolc imbolc   12 Oct 15 12:05  {brace{left{
-rw-rw-r-- 1 imbolc imbolc   13 Oct 15 12:05  }brace}right}
-rw-rw-r-- 1 imbolc imbolc   14 Oct 15 12:05 '[bracket[left['
-rw-rw-r-- 1 imbolc imbolc   15 Oct 15 12:05  ]bracket]right]
-rw-rw-r-- 1 imbolc imbolc   14 Oct 15 12:05 '^caret^symbol^'
-rw-rw-r-- 1 imbolc imbolc   12 Oct 15 12:05  :colon:case:
-rw-rw-r-- 1 imbolc imbolc   12 Oct 15 12:05  ,comma,list,
-rw-rw-r-- 1 imbolc imbolc   19 Oct 15 12:05  ---dash---triple---
-rw-rw-r-- 1 imbolc imbolc   15 Oct 15 12:05  ..double..dot..
-rw-rw-r-- 1 imbolc imbolc   14 Oct 15 12:05 '"double"quote"'
-rw-rw-r-- 1 imbolc imbolc   13 Oct 15 12:05 '=equals=case='
-rw-rw-r-- 1 imbolc imbolc   18 Oct 15 12:05 '!exclamation!mark!'
-rw-rw-r-- 1 imbolc imbolc   14 Oct 15 12:05 '>greater>than>'
-rw-rw-r-- 1 imbolc imbolc   10 Oct 15 12:05 '#hash#tag#'
-rw-rw-r-- 1 imbolc imbolc   21 Oct 15 12:05 '.hidden. with spaces.'
-rw-rw-r-- 1 imbolc imbolc   14 Oct 15 12:05  -hyphen-entry-
-rw-rw-r-- 1 imbolc imbolc   11 Oct 15 12:05 '<less<than<'
-rw-rw-r-- 1 imbolc imbolc   34 Oct 15 12:05 '  multiple  consecutive   spaces  '
-rw-rw-r-- 1 imbolc imbolc   17 Oct 15 12:05 ''$'\n''newline'$'\n''line'
-rw-rw-r-- 1 imbolc imbolc   13 Oct 15 12:05 ')paren)close)'
-rw-rw-r-- 1 imbolc imbolc   12 Oct 15 12:05 '(paren(open('
-rw-rw-r-- 1 imbolc imbolc   15 Oct 15 12:05  %percent%value%
-rw-rw-r-- 1 imbolc imbolc   15 Oct 15 12:05 '|pipe|vertical|'
-rw-rw-r-- 1 imbolc imbolc   11 Oct 15 12:05  +plus+sign+
-rw-rw-r-- 1 imbolc imbolc   15 Oct 15 12:05 '?question?mark?'
-rw-rw-r-- 1 imbolc imbolc   20 Oct 15 12:05 ''$'\r''return'$'\r''carriage'$'\r'
-rw-rw-r-- 1 imbolc imbolc   22 Oct 15 12:05  🚀rocket🚀ship🚀
-rw-rw-r-- 1 imbolc imbolc   16 Oct 15 12:05 ';semicolon;case;'
-rw-rw-r-- 1 imbolc imbolc   14 Oct 15 12:05 "'single'quote'"
-rw-rw-r-- 1 imbolc imbolc   13 Oct 15 12:05 ' -space-dash-'
-rw-rw-r-- 1 imbolc imbolc   15 Oct 15 12:05 ''$'\t''tab'$'\t''indent'$'\t'
-rw-rw-r-- 1 imbolc imbolc   12 Oct 15 12:05 '~tilde~wave~'
-rw-rw-r-- 1 imbolc imbolc   18 Oct 15 12:05  _underscore_label_
-rw-rw-r-- 1 imbolc imbolc   20 Oct 15 12:05 '\x20space\x20pad\x20'
-rw-rw-r-- 1 imbolc imbolc    8 Oct 15 12:05  файл
-rw-rw-r-- 1 imbolc imbolc    6 Oct 15 12:05  文件
"#;

        let output: LsOutput = input.parse().unwrap();
        assert!(output.folders.is_empty());
        let parsed_files: Vec<(&str, i64)> = output
            .files
            .iter()
            .map(|file| (file.name.as_str(), file.size_bytes))
            .collect();
        assert_eq!(parsed_files, EDGE_CASE_FILE_ENTRIES);
    }

    // Tests folders generated by ./edge-case-samples.sh
    // Same names as `edge_case_files`, but listed as directories (trailing `/`); the parsed
    // folder names must equal the names in `EDGE_CASE_FILE_ENTRIES`.
    #[test]
    fn edge_case_folders() {
        let input = r#"\
total 176
drwxrwxr-x 44 imbolc imbolc 4096 Oct 15 12:05  ./
drwxrwxr-x  4 imbolc imbolc 4096 Oct 15 12:05  ../
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '$dollar$sign$'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '&ampersand&symbol&'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '*asterisk*star*'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  @at@symbol@/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '\backslash\path\'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '`backtick`quote`'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  {brace{left{/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  }brace}right}/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '[bracket[left['/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  ]bracket]right]/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '^caret^symbol^'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  :colon:case:/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  ,comma,list,/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  ---dash---triple---/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  ..double..dot../
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '"double"quote"'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '=equals=case='/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '!exclamation!mark!'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '>greater>than>'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '#hash#tag#'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '.hidden. with spaces.'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  -hyphen-entry-/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '<less<than<'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '  multiple  consecutive   spaces  '/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 ''$'\n''newline'$'\n''line'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 ')paren)close)'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '(paren(open('/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  %percent%value%/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '|pipe|vertical|'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  +plus+sign+/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '?question?mark?'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 ''$'\r''return'$'\r''carriage'$'\r'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  🚀rocket🚀ship🚀/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 ';semicolon;case;'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 "'single'quote'"/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 ' -space-dash-'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 ''$'\t''tab'$'\t''indent'$'\t'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '~tilde~wave~'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  _underscore_label_/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05 '\x20space\x20pad\x20'/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  файл/
drwxrwxr-x  2 imbolc imbolc 4096 Oct 15 12:05  文件/
"#;

        let output: LsOutput = input.parse().unwrap();
        assert!(output.files.is_empty());
        let parsed_folders: Vec<&str> = output.folders.iter().map(String::as_str).collect();
        let expected_folders: Vec<&str> = EDGE_CASE_FILE_ENTRIES
            .iter()
            .map(|(name, _)| *name)
            .collect();
        assert_eq!(parsed_folders, expected_folders);
    }
}