Skip to main content

dm_database_parser_sqllog/
parser.rs

1use memchr::{memchr, memrchr};
2use memmap2::Mmap;
3use std::borrow::Cow;
4use std::fs::File;
5use std::path::Path;
6
7use crate::error::ParseError;
8use crate::sqllog::Sqllog;
9use encoding::all::GB18030;
10use encoding::{DecoderTrap, Encoding};
11
/// Tells the record parser which text encoding to assume for a log's bytes.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum FileEncodingHint {
    /// Encoding is not known up front; the parser probes each record.
    /// This is the default, kept for backward compatibility.
    #[default]
    Auto,
    /// The whole file is treated as UTF-8.
    Utf8,
    /// The whole file is treated as GB18030.
    Gb18030,
}
22
23pub struct LogParser {
24    mmap: Mmap,
25    encoding: FileEncodingHint,
26}
27
28impl LogParser {
29    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, ParseError> {
30        let file = File::open(path).map_err(|e| ParseError::IoError(e.to_string()))?;
31        let mmap = unsafe { Mmap::map(&file).map_err(|e| ParseError::IoError(e.to_string()))? };
32
33        // Sample the beginning of the file to determine if it's valid UTF-8.
34        // If it's valid UTF-8, treat whole file as UTF-8 for speed. Otherwise assume GB18030.
35        let sample_len = std::cmp::min(65536, mmap.len());
36        let sample = &mmap[..sample_len];
37        let encoding = if std::str::from_utf8(sample).is_ok() {
38            FileEncodingHint::Utf8
39        } else {
40            FileEncodingHint::Gb18030
41        };
42
43        Ok(Self { mmap, encoding })
44    }
45
46    pub fn iter(&self) -> LogIterator<'_> {
47        LogIterator {
48            data: &self.mmap,
49            pos: 0,
50            encoding: self.encoding,
51        }
52    }
53}
54
55pub struct LogIterator<'a> {
56    data: &'a [u8],
57    pos: usize,
58    encoding: FileEncodingHint,
59}
60
61impl<'a> Iterator for LogIterator<'a> {
62    type Item = Result<Sqllog<'a>, ParseError>;
63
64    fn next(&mut self) -> Option<Self::Item> {
65        if self.pos >= self.data.len() {
66            return None;
67        }
68
69        let data = &self.data[self.pos..];
70        let mut scan_pos = 0;
71        let mut found_next = None;
72        let mut is_multiline = false;
73
74        while let Some(idx) = memchr(b'\n', &data[scan_pos..]) {
75            let newline_idx = scan_pos + idx;
76            let next_line_start = newline_idx + 1;
77
78            if next_line_start >= data.len() {
79                break;
80            }
81
82            // Check if next line starts with timestamp
83            let check_len = std::cmp::min(23, data.len() - next_line_start);
84            if check_len == 23 {
85                let next_bytes = &data[next_line_start..next_line_start + 23];
86                // Fast check: 20xx and separators
87                if next_bytes[0] == b'2'
88                    && next_bytes[1] == b'0'
89                    && next_bytes[4] == b'-'
90                    && next_bytes[7] == b'-'
91                    && next_bytes[10] == b' '
92                    && next_bytes[13] == b':'
93                    && next_bytes[16] == b':'
94                    && next_bytes[19] == b'.'
95                {
96                    found_next = Some(newline_idx);
97                    break;
98                }
99            }
100
101            is_multiline = true;
102            scan_pos = next_line_start;
103        }
104
105        let (record_end, next_start) = if let Some(idx) = found_next {
106            (idx, idx + 1)
107        } else {
108            (data.len(), data.len())
109        };
110
111        let record_slice = &data[..record_end];
112        self.pos += next_start;
113
114        // Trim trailing CR if present
115        let record_slice = if record_slice.ends_with(b"\r") {
116            &record_slice[..record_slice.len() - 1]
117        } else {
118            record_slice
119        };
120
121        if record_slice.is_empty() {
122            return self.next();
123        }
124
125        Some(parse_record_with_hint(
126            record_slice,
127            is_multiline,
128            self.encoding,
129        ))
130    }
131}
132
133pub fn parse_record<'a>(record_bytes: &'a [u8]) -> Result<Sqllog<'a>, ParseError> {
134    parse_record_with_hint(record_bytes, true, FileEncodingHint::Auto)
135}
136
137fn parse_record_with_hint<'a>(
138    record_bytes: &'a [u8],
139    is_multiline: bool,
140    encoding_hint: FileEncodingHint,
141) -> Result<Sqllog<'a>, ParseError> {
142    // Find end of first line
143    let (first_line, _rest) = if is_multiline {
144        match memchr(b'\n', record_bytes) {
145            Some(idx) => {
146                let mut line = &record_bytes[..idx];
147                if line.ends_with(b"\r") {
148                    line = &line[..line.len() - 1];
149                }
150                (line, &record_bytes[idx + 1..])
151            }
152            None => {
153                let mut line = record_bytes;
154                if line.ends_with(b"\r") {
155                    line = &line[..line.len() - 1];
156                }
157                (line, &[] as &[u8])
158            }
159        }
160    } else {
161        let mut line = record_bytes;
162        if line.ends_with(b"\r") {
163            line = &line[..line.len() - 1];
164        }
165        (line, &[] as &[u8])
166    };
167
168    // 1. Timestamp
169    if first_line.len() < 23 {
170        return Err(ParseError::InvalidFormat {
171            raw: String::from_utf8_lossy(first_line).to_string(),
172        });
173    }
174    // We assume ASCII/UTF-8 for timestamp
175    // SAFETY: We validated the timestamp format in LogIterator::next using is_ts_millis_bytes,
176    // which ensures it contains only digits and separators.
177    let ts = unsafe { Cow::Borrowed(std::str::from_utf8_unchecked(&first_line[0..23])) };
178
179    // 2. Meta
180    // Format: TS (META) BODY
181    // Find first '(' after TS
182    let meta_start = match memchr(b'(', &first_line[23..]) {
183        Some(idx) => 23 + idx,
184        None => {
185            return Err(ParseError::InvalidFormat {
186                raw: String::from_utf8_lossy(first_line).to_string(),
187            });
188        }
189    };
190
191    // Find closing ')' for meta.
192    // We search for ") " starting from meta_start.
193    // Optimization: use memchr loop instead of windows(2)
194    let mut search_pos = meta_start;
195    let meta_end = loop {
196        match memchr(b')', &first_line[search_pos..]) {
197            Some(idx) => {
198                let abs_idx = search_pos + idx;
199                // Check if followed by space
200                if abs_idx + 1 < first_line.len() && first_line[abs_idx + 1] == b' ' {
201                    break Some(abs_idx);
202                }
203                // If not, continue searching after this ')'
204                search_pos = abs_idx + 1;
205            }
206            None => {
207                // Fallback: find last ')' if ") " not found (robustness)
208                break memrchr(b')', &first_line[meta_start..]).map(|idx| meta_start + idx);
209            }
210        }
211    };
212
213    let meta_end = match meta_end {
214        Some(idx) => idx,
215        None => {
216            return Err(ParseError::InvalidFormat {
217                raw: String::from_utf8_lossy(first_line).to_string(),
218            });
219        }
220    };
221
222    let meta_bytes = &first_line[meta_start + 1..meta_end];
223    // Lazy parsing: store raw bytes
224    // SAFETY: meta_bytes is a sub-slice of first_line, which is 'a.
225    // Use the provided encoding hint (file-level autodetection) to decide how to decode meta bytes.
226    let meta_raw = match encoding_hint {
227        FileEncodingHint::Utf8 => match std::str::from_utf8(meta_bytes) {
228            Ok(s) => Cow::Borrowed(s),
229            Err(_) => Cow::Owned(String::from_utf8_lossy(meta_bytes).into_owned()),
230        },
231        FileEncodingHint::Gb18030 => match GB18030.decode(meta_bytes, DecoderTrap::Strict) {
232            Ok(s) => Cow::Owned(s),
233            Err(_) => Cow::Owned(String::from_utf8_lossy(meta_bytes).into_owned()),
234        },
235        FileEncodingHint::Auto => match std::str::from_utf8(meta_bytes) {
236            Ok(s) => Cow::Borrowed(s),
237            Err(_) => match GB18030.decode(meta_bytes, DecoderTrap::Strict) {
238                Ok(s) => Cow::Owned(s),
239                Err(_) => Cow::Owned(String::from_utf8_lossy(meta_bytes).into_owned()),
240            },
241        },
242    };
243
244    // 3. Body & 4. Indicators
245    let body_start_in_first_line = meta_end + 1;
246
247    let first_line_body = if body_start_in_first_line < first_line.len() {
248        &first_line[body_start_in_first_line..]
249    } else {
250        &[]
251    };
252
253    let start_idx = first_line_body
254        .iter()
255        .position(|b| !b.is_ascii_whitespace())
256        .unwrap_or(first_line_body.len());
257
258    let content_start = body_start_in_first_line + start_idx;
259
260    // Extract optional leading tag like [SEL] or [ORA]
261    let mut tag: Option<Cow<'a, str>> = None;
262    let content_slice = if content_start < record_bytes.len() {
263        let mut s = &record_bytes[content_start..];
264        // If it starts with '[', try to find matching ']' and treat inner token as tag
265        if !s.is_empty()
266            && s[0] == b'['
267            && let Some(end_idx) = memchr(b']', s)
268            && end_idx >= 1
269        {
270            let inner = &s[1..end_idx];
271            // Accept token without spaces and reasonable length
272            if !inner.contains(&b' ') && inner.len() <= 32 {
273                tag = match std::str::from_utf8(inner) {
274                    Ok(st) => Some(Cow::Borrowed(st)),
275                    Err(_) => match encoding_hint {
276                        FileEncodingHint::Gb18030 => {
277                            match GB18030.decode(inner, DecoderTrap::Strict) {
278                                Ok(s) => Some(Cow::Owned(s)),
279                                Err(_) => {
280                                    Some(Cow::Owned(String::from_utf8_lossy(inner).into_owned()))
281                                }
282                            }
283                        }
284                        _ => Some(Cow::Owned(String::from_utf8_lossy(inner).into_owned())),
285                    },
286                };
287                // Move past the closing ']' and any following ASCII whitespace
288                s = &s[end_idx + 1..];
289                let mut skip = 0usize;
290                while skip < s.len() && s[skip].is_ascii_whitespace() {
291                    skip += 1;
292                }
293                s = &s[skip..];
294            }
295        }
296        s
297    } else {
298        &[] as &[u8]
299    };
300
301    let content_raw = Cow::Borrowed(content_slice);
302
303    Ok(Sqllog {
304        ts,
305        meta_raw,
306        content_raw,
307        tag,
308        encoding: encoding_hint,
309    })
310}