polars_json/json/write/
utf8.rs

1// Adapted from https://github.com/serde-rs/json/blob/f901012df66811354cb1d490ad59480d8fdf77b5/src/ser.rs
2use std::io;
3
4use arrow::array::{Array, MutableBinaryViewArray, Utf8ViewArray};
5
6use crate::json::write::new_serializer;
7
8pub fn write_str<W>(writer: &mut W, value: &str) -> io::Result<()>
9where
10    W: io::Write,
11{
12    writer.write_all(b"\"")?;
13    let bytes = value.as_bytes();
14
15    let mut start = 0;
16
17    for (i, &byte) in bytes.iter().enumerate() {
18        let escape = ESCAPE[byte as usize];
19        if escape == 0 {
20            continue;
21        }
22
23        if start < i {
24            writer.write_all(&bytes[start..i])?;
25        }
26
27        let char_escape = CharEscape::from_escape_table(escape, byte);
28        write_char_escape(writer, char_escape)?;
29
30        start = i + 1;
31    }
32
33    if start != bytes.len() {
34        writer.write_all(&bytes[start..])?;
35    }
36    writer.write_all(b"\"")
37}
38
// Escape codes stored in `ESCAPE`. Each one is the byte that follows the
// backslash in the escaped form; the trailing comment names the input byte(s)
// that map to it.
const BB: u8 = b'b'; // \x08
const TT: u8 = b't'; // \x09
const NN: u8 = b'n'; // \x0A
const FF: u8 = b'f'; // \x0C
const RR: u8 = b'r'; // \x0D
const QU: u8 = b'"'; // \x22
const BS: u8 = b'\\'; // \x5C
const UU: u8 = b'u'; // \x00...\x1F except the ones above
const __: u8 = 0; // not escaped

/// Builds the contents of `ESCAPE` at compile time.
const fn build_escape_table() -> [u8; 256] {
    // Start from "nothing is escaped".
    let mut table = [__; 256];

    // Every byte below 0x20 defaults to the `\u00XX` form ...
    let mut control = 0;
    while control < 0x20 {
        table[control] = UU;
        control += 1;
    }

    // ... except the ones that have a dedicated short escape.
    table[0x08] = BB;
    table[0x09] = TT;
    table[0x0A] = NN;
    table[0x0C] = FF;
    table[0x0D] = RR;

    // The two printable bytes that must be escaped inside a string.
    table[0x22] = QU;
    table[0x5C] = BS;

    table
}

// Escape lookup table, indexed by input byte. An entry of b'x' means the byte
// is written as "\x" (with b'u' standing for the "\u00XX" form); an entry of 0
// means the byte is written unchanged.
static ESCAPE: [u8; 256] = build_escape_table();
70
/// Represents a character escape code in a type-safe manner.
///
/// There is deliberately no variant for the solidus `/`: JSON permits it to
/// appear unescaped inside a string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CharEscape {
    /// An escaped quote `"`
    Quote,
    /// An escaped reverse solidus `\`
    ReverseSolidus,
    /// An escaped backspace character (usually escaped as `\b`)
    Backspace,
    /// An escaped form feed character (usually escaped as `\f`)
    FormFeed,
    /// An escaped line feed character (usually escaped as `\n`)
    LineFeed,
    /// An escaped carriage return character (usually escaped as `\r`)
    CarriageReturn,
    /// An escaped tab character (usually escaped as `\t`)
    Tab,
    /// An escaped ASCII plane control character (usually escaped as
    /// `\u00XX` where `XX` are two hex characters). The payload is the
    /// original, unescaped byte.
    AsciiControl(u8),
}
93
94impl CharEscape {
95    #[inline]
96    fn from_escape_table(escape: u8, byte: u8) -> CharEscape {
97        match escape {
98            self::BB => CharEscape::Backspace,
99            self::TT => CharEscape::Tab,
100            self::NN => CharEscape::LineFeed,
101            self::FF => CharEscape::FormFeed,
102            self::RR => CharEscape::CarriageReturn,
103            self::QU => CharEscape::Quote,
104            self::BS => CharEscape::ReverseSolidus,
105            self::UU => CharEscape::AsciiControl(byte),
106            _ => unreachable!(),
107        }
108    }
109}
110
111#[inline]
112fn write_char_escape<W>(writer: &mut W, char_escape: CharEscape) -> io::Result<()>
113where
114    W: io::Write,
115{
116    use self::CharEscape::*;
117
118    let s = match char_escape {
119        Quote => b"\\\"",
120        ReverseSolidus => b"\\\\",
121        //Solidus => b"\\/",
122        Backspace => b"\\b",
123        FormFeed => b"\\f",
124        LineFeed => b"\\n",
125        CarriageReturn => b"\\r",
126        Tab => b"\\t",
127        AsciiControl(byte) => {
128            static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
129            let bytes = &[
130                b'\\',
131                b'u',
132                b'0',
133                b'0',
134                HEX_DIGITS[(byte >> 4) as usize],
135                HEX_DIGITS[(byte & 0xF) as usize],
136            ];
137            return writer.write_all(bytes);
138        },
139    };
140
141    writer.write_all(s)
142}
143
144pub fn serialize_to_utf8(array: &dyn Array) -> Utf8ViewArray {
145    let mut values = MutableBinaryViewArray::with_capacity(array.len());
146    let mut serializer = new_serializer(array, 0, usize::MAX);
147
148    while let Some(v) = serializer.next() {
149        unsafe { values.push_value(std::str::from_utf8_unchecked(v)) }
150    }
151    values.into()
152}