dm_database_parser_sqllog/
parser.rs1use memchr::{memchr, memrchr};
2use memmap2::Mmap;
3use std::borrow::Cow;
4use std::fs::File;
5use std::path::Path;
6
7use crate::error::ParseError;
8use crate::sqllog::Sqllog;
9use encoding::all::GB18030;
10use encoding::{DecoderTrap, Encoding};
11
/// Text encoding assumed when decoding the textual fields of a log record.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum FileEncodingHint {
    /// No encoding has been fixed in advance. This is the default value and
    /// the hint used by [`parse_record`]; metadata decoding then tries UTF-8
    /// first and falls back to GB18030.
    #[default]
    Auto,
    /// Fields are decoded as UTF-8.
    Utf8,
    /// Fields are decoded as GB18030.
    Gb18030,
}
22
23pub struct LogParser {
24 mmap: Mmap,
25 encoding: FileEncodingHint,
26}
27
28impl LogParser {
29 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, ParseError> {
30 let file = File::open(path).map_err(|e| ParseError::IoError(e.to_string()))?;
31 let mmap = unsafe { Mmap::map(&file).map_err(|e| ParseError::IoError(e.to_string()))? };
32
33 let sample_len = std::cmp::min(65536, mmap.len());
36 let sample = &mmap[..sample_len];
37 let encoding = if std::str::from_utf8(sample).is_ok() {
38 FileEncodingHint::Utf8
39 } else {
40 FileEncodingHint::Gb18030
41 };
42
43 Ok(Self { mmap, encoding })
44 }
45
46 pub fn iter(&self) -> LogIterator<'_> {
47 LogIterator {
48 data: &self.mmap,
49 pos: 0,
50 encoding: self.encoding,
51 }
52 }
53}
54
55pub struct LogIterator<'a> {
56 data: &'a [u8],
57 pos: usize,
58 encoding: FileEncodingHint,
59}
60
61impl<'a> Iterator for LogIterator<'a> {
62 type Item = Result<Sqllog<'a>, ParseError>;
63
64 fn next(&mut self) -> Option<Self::Item> {
65 if self.pos >= self.data.len() {
66 return None;
67 }
68
69 let data = &self.data[self.pos..];
70 let mut scan_pos = 0;
71 let mut found_next = None;
72 let mut is_multiline = false;
73
74 while let Some(idx) = memchr(b'\n', &data[scan_pos..]) {
75 let newline_idx = scan_pos + idx;
76 let next_line_start = newline_idx + 1;
77
78 if next_line_start >= data.len() {
79 break;
80 }
81
82 let check_len = std::cmp::min(23, data.len() - next_line_start);
84 if check_len == 23 {
85 let next_bytes = &data[next_line_start..next_line_start + 23];
86 if next_bytes[0] == b'2'
88 && next_bytes[1] == b'0'
89 && next_bytes[4] == b'-'
90 && next_bytes[7] == b'-'
91 && next_bytes[10] == b' '
92 && next_bytes[13] == b':'
93 && next_bytes[16] == b':'
94 && next_bytes[19] == b'.'
95 {
96 found_next = Some(newline_idx);
97 break;
98 }
99 }
100
101 is_multiline = true;
102 scan_pos = next_line_start;
103 }
104
105 let (record_end, next_start) = if let Some(idx) = found_next {
106 (idx, idx + 1)
107 } else {
108 (data.len(), data.len())
109 };
110
111 let record_slice = &data[..record_end];
112 self.pos += next_start;
113
114 let record_slice = if record_slice.ends_with(b"\r") {
116 &record_slice[..record_slice.len() - 1]
117 } else {
118 record_slice
119 };
120
121 if record_slice.is_empty() {
122 return self.next();
123 }
124
125 Some(parse_record_with_hint(
126 record_slice,
127 is_multiline,
128 self.encoding,
129 ))
130 }
131}
132
133pub fn parse_record<'a>(record_bytes: &'a [u8]) -> Result<Sqllog<'a>, ParseError> {
134 parse_record_with_hint(record_bytes, true, FileEncodingHint::Auto)
135}
136
137fn parse_record_with_hint<'a>(
138 record_bytes: &'a [u8],
139 is_multiline: bool,
140 encoding_hint: FileEncodingHint,
141) -> Result<Sqllog<'a>, ParseError> {
142 let (first_line, _rest) = if is_multiline {
144 match memchr(b'\n', record_bytes) {
145 Some(idx) => {
146 let mut line = &record_bytes[..idx];
147 if line.ends_with(b"\r") {
148 line = &line[..line.len() - 1];
149 }
150 (line, &record_bytes[idx + 1..])
151 }
152 None => {
153 let mut line = record_bytes;
154 if line.ends_with(b"\r") {
155 line = &line[..line.len() - 1];
156 }
157 (line, &[] as &[u8])
158 }
159 }
160 } else {
161 let mut line = record_bytes;
162 if line.ends_with(b"\r") {
163 line = &line[..line.len() - 1];
164 }
165 (line, &[] as &[u8])
166 };
167
168 if first_line.len() < 23 {
170 return Err(ParseError::InvalidFormat {
171 raw: String::from_utf8_lossy(first_line).to_string(),
172 });
173 }
174 let ts = unsafe { Cow::Borrowed(std::str::from_utf8_unchecked(&first_line[0..23])) };
178
179 let meta_start = match memchr(b'(', &first_line[23..]) {
183 Some(idx) => 23 + idx,
184 None => {
185 return Err(ParseError::InvalidFormat {
186 raw: String::from_utf8_lossy(first_line).to_string(),
187 });
188 }
189 };
190
191 let mut search_pos = meta_start;
195 let meta_end = loop {
196 match memchr(b')', &first_line[search_pos..]) {
197 Some(idx) => {
198 let abs_idx = search_pos + idx;
199 if abs_idx + 1 < first_line.len() && first_line[abs_idx + 1] == b' ' {
201 break Some(abs_idx);
202 }
203 search_pos = abs_idx + 1;
205 }
206 None => {
207 break memrchr(b')', &first_line[meta_start..]).map(|idx| meta_start + idx);
209 }
210 }
211 };
212
213 let meta_end = match meta_end {
214 Some(idx) => idx,
215 None => {
216 return Err(ParseError::InvalidFormat {
217 raw: String::from_utf8_lossy(first_line).to_string(),
218 });
219 }
220 };
221
222 let meta_bytes = &first_line[meta_start + 1..meta_end];
223 let meta_raw = match encoding_hint {
227 FileEncodingHint::Utf8 => match std::str::from_utf8(meta_bytes) {
228 Ok(s) => Cow::Borrowed(s),
229 Err(_) => Cow::Owned(String::from_utf8_lossy(meta_bytes).into_owned()),
230 },
231 FileEncodingHint::Gb18030 => match GB18030.decode(meta_bytes, DecoderTrap::Strict) {
232 Ok(s) => Cow::Owned(s),
233 Err(_) => Cow::Owned(String::from_utf8_lossy(meta_bytes).into_owned()),
234 },
235 FileEncodingHint::Auto => match std::str::from_utf8(meta_bytes) {
236 Ok(s) => Cow::Borrowed(s),
237 Err(_) => match GB18030.decode(meta_bytes, DecoderTrap::Strict) {
238 Ok(s) => Cow::Owned(s),
239 Err(_) => Cow::Owned(String::from_utf8_lossy(meta_bytes).into_owned()),
240 },
241 },
242 };
243
244 let body_start_in_first_line = meta_end + 1;
246
247 let first_line_body = if body_start_in_first_line < first_line.len() {
248 &first_line[body_start_in_first_line..]
249 } else {
250 &[]
251 };
252
253 let start_idx = first_line_body
254 .iter()
255 .position(|b| !b.is_ascii_whitespace())
256 .unwrap_or(first_line_body.len());
257
258 let content_start = body_start_in_first_line + start_idx;
259
260 let mut tag: Option<Cow<'a, str>> = None;
262 let content_slice = if content_start < record_bytes.len() {
263 let mut s = &record_bytes[content_start..];
264 if !s.is_empty()
266 && s[0] == b'['
267 && let Some(end_idx) = memchr(b']', s)
268 && end_idx >= 1
269 {
270 let inner = &s[1..end_idx];
271 if !inner.contains(&b' ') && inner.len() <= 32 {
273 tag = match std::str::from_utf8(inner) {
274 Ok(st) => Some(Cow::Borrowed(st)),
275 Err(_) => match encoding_hint {
276 FileEncodingHint::Gb18030 => {
277 match GB18030.decode(inner, DecoderTrap::Strict) {
278 Ok(s) => Some(Cow::Owned(s)),
279 Err(_) => {
280 Some(Cow::Owned(String::from_utf8_lossy(inner).into_owned()))
281 }
282 }
283 }
284 _ => Some(Cow::Owned(String::from_utf8_lossy(inner).into_owned())),
285 },
286 };
287 s = &s[end_idx + 1..];
289 let mut skip = 0usize;
290 while skip < s.len() && s[skip].is_ascii_whitespace() {
291 skip += 1;
292 }
293 s = &s[skip..];
294 }
295 }
296 s
297 } else {
298 &[] as &[u8]
299 };
300
301 let content_raw = Cow::Borrowed(content_slice);
302
303 Ok(Sqllog {
304 ts,
305 meta_raw,
306 content_raw,
307 tag,
308 encoding: encoding_hint,
309 })
310}