1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
use std::borrow::{Borrow, Cow};
use std::ops::Deref;
use std::str::Utf8Error;

/// OsStr, but specifically for Linux (since we aren't always processing native dumps).
///
/// This is an unsized newtype over `[u8]`: Linux OS strings are arbitrary
/// bytes, not necessarily utf8.
///
/// `#[repr(transparent)]` guarantees the wrapper has exactly the layout of
/// its single `[u8]` field. `LinuxOsStr::from_bytes` reinterprets a
/// `*const [u8]` as a `*const LinuxOsStr`, and that cast is only sound when
/// this layout guarantee holds.
#[derive(Debug, PartialOrd, Ord, Eq, PartialEq)]
#[repr(transparent)]
pub struct LinuxOsStr([u8]);

/// Owned counterpart of `LinuxOsStr`: an `OsString` look-alike that always
/// follows Linux conventions, because the dumps being processed are not
/// necessarily native to the host.
///
/// The contents are stored as a plain `Vec<u8>`.
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct LinuxOsString(Vec<u8>);

impl LinuxOsStr {
    /// Returns an empty `LinuxOsStr` with a `'static` lifetime.
    pub fn new() -> &'static Self {
        Self::from_bytes(b"")
    }

    /// Reinterprets a byte slice as a `LinuxOsStr` without copying.
    ///
    /// The returned reference borrows from `inner` and covers exactly the
    /// same bytes.
    pub fn from_bytes(inner: &[u8]) -> &Self {
        // This is the idiom std uses for creating a type that wraps a slice;
        // there is no way to do this without unsafe.
        //
        // SAFETY: `LinuxOsStr` is a newtype whose only field is a `[u8]`, so
        // the pointer cast keeps the same address and length metadata. The
        // cast relies on the wrapper having the same layout as `[u8]`. The
        // signature ties the lifetime of the result to `inner`, so the
        // reference cannot outlive the bytes it points at.
        unsafe { &*(inner as *const [u8] as *const LinuxOsStr) }
    }

    /// Returns the underlying bytes.
    pub fn as_bytes(&self) -> &[u8] {
        &self.0
    }

    /// Tries to interpret the LinuxOsStr as a utf8 str.
    ///
    /// While linux OsStrs are "arbitrary bytes" in general, there are often
    /// parts that are known to be utf8 (ascii even).
    ///
    /// For instance, when parsing /proc/self/maps, most of the line is ascii
    /// like "r-xp" or "1a23-4fe2". However the "path" at the end of each line
    /// is a true LinuxOsStr and may not be proper utf8.
    ///
    /// # Errors
    ///
    /// Returns the `Utf8Error` from `std::str::from_utf8` if the bytes are
    /// not valid utf8.
    pub fn to_str(&self) -> Result<&str, Utf8Error> {
        std::str::from_utf8(self.as_bytes())
    }

    /// Converts to a utf8 string lossily (uses the usual std lossy algorithm).
    ///
    /// Borrows when the bytes are already valid utf8 and only allocates when
    /// invalid sequences have to be replaced.
    pub fn to_string_lossy(&self) -> Cow<'_, str> {
        // This is the end of the line for pretending Linux OS strings can
        // stay arbitrary bags of bytes: we need some way to print the value
        // in a reasonable way. The best precedent is std's `Path::display`,
        // which wraps a Path (which is just an OsStr) and provides a Display
        // impl -- and that impl just calls from_utf8_lossy.
        //
        // (Strictly speaking std wraps it up in the internal/unstable
        // Utf8Lossy iterator so it avoids the allocation, but we don't
        // have that luxury, so we might as well make the
        // allocation/conversion explicit.)
        String::from_utf8_lossy(self.as_bytes())
    }

    // ~Copies of a bunch of string APIs since [u8] doesn't have them (reasonably)

    /// Splits at the first occurrence of `separator`.
    ///
    /// Returns the parts before and after the separator (the separator itself
    /// is dropped), or `None` if `separator` does not occur.
    pub fn split_once(&self, separator: u8) -> Option<(&LinuxOsStr, &LinuxOsStr)> {
        let bytes = self.as_bytes();
        let idx = bytes.iter().position(|&b| b == separator)?;
        Some((
            Self::from_bytes(&bytes[..idx]),
            Self::from_bytes(&bytes[idx + 1..]),
        ))
    }

    /// Splits at the last occurrence of `separator`.
    ///
    /// Returns the parts before and after the separator (the separator itself
    /// is dropped), or `None` if `separator` does not occur.
    pub fn rsplit_once(&self, separator: u8) -> Option<(&LinuxOsStr, &LinuxOsStr)> {
        let bytes = self.as_bytes();
        let idx = bytes.iter().rposition(|&b| b == separator)?;
        Some((
            Self::from_bytes(&bytes[..idx]),
            Self::from_bytes(&bytes[idx + 1..]),
        ))
    }

    /// Iterates over the substrings delimited by `separator`.
    ///
    /// Adjacent, leading and trailing separators produce empty substrings.
    pub fn split(&self, separator: u8) -> impl Iterator<Item = &LinuxOsStr> {
        self.as_bytes()
            .split(move |&b| b == separator)
            .map(LinuxOsStr::from_bytes)
    }

    /// Iterates over the non-empty substrings delimited by ascii whitespace.
    pub fn split_ascii_whitespace(&self) -> impl Iterator<Item = &LinuxOsStr> {
        // Quick and dirty impl: just split on every individual whitespace
        // char but discard all the empty substrings.
        self.as_bytes()
            .split(|b| b.is_ascii_whitespace())
            .filter(|slice| !slice.is_empty())
            .map(LinuxOsStr::from_bytes)
    }

    /// Iterates over the substrings delimited by `\n`.
    ///
    /// This is a plain `split(b'\n')`, so input ending in a newline yields a
    /// final empty substring.
    pub fn lines(&self) -> impl Iterator<Item = &LinuxOsStr> {
        // Intentionally doesn't mess around with stuff like \r
        // since we're processing files generated by the OS, but maybe
        // this will be a problem later?
        self.split(b'\n')
    }

    /// Returns the substring with leading and trailing ascii whitespace
    /// removed.
    pub fn trim_ascii_whitespace(&self) -> &LinuxOsStr {
        let input = self.as_bytes();

        match input.iter().position(|b| !b.is_ascii_whitespace()) {
            Some(first) => {
                // A non-whitespace byte exists, so the reverse search always
                // succeeds; `first` is only a fallback to avoid an unwrap.
                let last = input
                    .iter()
                    .rposition(|b| !b.is_ascii_whitespace())
                    .unwrap_or(first);
                Self::from_bytes(&input[first..=last])
            }
            // String was empty or entirely whitespace: return an empty string
            // starting at its position (so that it's still strictly a
            // substring).
            None => Self::from_bytes(&input[..0]),
        }
    }
}

impl LinuxOsString {
    /// Create a new LinuxOsString from an array of bytes.
    pub fn from_vec(vec: Vec<u8>) -> Self {
        Self(vec)
    }

    pub fn new() -> Self {
        Self(Vec::new())
    }

    pub fn as_os_str(&self) -> &LinuxOsStr {
        self
    }
}

/// Lets a `LinuxOsString` be borrowed as a `LinuxOsStr` view of the same
/// bytes.
impl Borrow<LinuxOsStr> for LinuxOsString {
    fn borrow(&self) -> &LinuxOsStr {
        let bytes: &[u8] = self.0.as_slice();
        LinuxOsStr::from_bytes(bytes)
    }
}

/// Pairs `LinuxOsStr` with `LinuxOsString` as its owned form.
impl ToOwned for LinuxOsStr {
    type Owned = LinuxOsString;

    fn to_owned(&self) -> Self::Owned {
        // Copy the borrowed bytes into a fresh vector and wrap it.
        let bytes: Vec<u8> = self.0.to_vec();
        LinuxOsString::from_vec(bytes)
    }
}

impl Deref for LinuxOsString {
    type Target = LinuxOsStr;

    fn deref(&self) -> &LinuxOsStr {
        LinuxOsStr::from_bytes(&self.0)
    }
}

impl Deref for LinuxOsStr {
    type Target = [u8];

    fn deref(&self) -> &[u8] {
        &self.0
    }
}