Skip to main content

dm_database_parser_sqllog/
sqllog.rs

1use atoi::atoi;
2use encoding::DecoderTrap;
3use encoding::Encoding;
4use encoding::all::GB18030;
5use memchr::memmem::Finder;
6use memchr::{memchr, memrchr};
7use simdutf8::basic::from_utf8 as simd_from_utf8;
8use std::borrow::Cow;
9use std::sync::LazyLock;
10
11use crate::parser::FileEncodingHint;
12
/// Pre-built substring finder for the `EXECTIME:` performance-indicator keyword.
///
/// Wrapped in a `LazyLock` so the finder is constructed once, on first use,
/// instead of being rebuilt on every parse call.
static FINDER_EXECTIME: LazyLock<Finder<'static>> = LazyLock::new(|| Finder::new(b"EXECTIME:"));
/// Pre-built substring finder for the `ROWCOUNT:` performance-indicator keyword.
static FINDER_ROWCOUNT: LazyLock<Finder<'static>> = LazyLock::new(|| Finder::new(b"ROWCOUNT:"));
/// Pre-built substring finder for the `EXEC_ID:` performance-indicator keyword.
static FINDER_EXEC_ID: LazyLock<Finder<'static>> = LazyLock::new(|| Finder::new(b"EXEC_ID:"));
17
/// A single SQL log record.
///
/// Represents one complete SQL log entry: a timestamp, the raw metadata, the
/// raw content (SQL statement body plus optional performance indicators), an
/// optional bracketed tag, and the encoding hint used to decode the content.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct Sqllog<'a> {
    /// Timestamp, formatted as "YYYY-MM-DD HH:MM:SS.mmm".
    pub ts: Cow<'a, str>,

    /// Raw metadata text; it is only split into fields when `parse_meta` is called.
    pub meta_raw: Cow<'a, str>,

    /// Raw content bytes (statement body followed by the indicators); splitting
    /// and decoding are deferred until one of the accessor methods is called.
    pub content_raw: Cow<'a, [u8]>,

    /// The bracketed tag extracted from the record (for example `[SEL]` or
    /// `[ORA]`), or `None` when the record has no tag.
    pub tag: Option<Cow<'a, str>>,

    /// File-level encoding hint (detected by the parser), used to decode
    /// `content_raw` correctly.
    pub(crate) encoding: FileEncodingHint,
}
38
39impl<'a> Sqllog<'a> {
40    // ── 公开 API ─────────────────────────────────────────────────────────────
41
42    /// 获取 SQL 语句体(延迟分割)
43    pub fn body(&self) -> Cow<'a, str> {
44        let split = self.find_indicators_split();
45        let is_borrowed = matches!(&self.content_raw, Cow::Borrowed(_));
46        // SAFETY: body_bytes 是 content_raw 的子切片,与 content_raw 共享 'a 生命周期
47        unsafe { decode_content_bytes(&self.content_raw[..split], is_borrowed, self.encoding) }
48    }
49
50    /// 获取 SQL 语句体的长度(不做 UTF-8 校验,不分配)
51    #[inline]
52    pub fn body_len(&self) -> usize {
53        self.find_indicators_split()
54    }
55
56    /// 获取 SQL 语句体的原始字节切片(不分配)
57    #[inline]
58    pub fn body_bytes(&self) -> &[u8] {
59        &self.content_raw[..self.find_indicators_split()]
60    }
61
62    /// 获取原始性能指标字符串(延迟分割)
63    pub fn indicators_raw(&self) -> Option<Cow<'a, str>> {
64        let split = self.find_indicators_split();
65        let ind_bytes = &self.content_raw[split..];
66        if ind_bytes.is_empty() {
67            return None;
68        }
69        let is_borrowed = matches!(&self.content_raw, Cow::Borrowed(_));
70        // SAFETY: ind_bytes 是 content_raw 的子切片,与 content_raw 共享 'a 生命周期
71        Some(unsafe { decode_content_bytes(ind_bytes, is_borrowed, self.encoding) })
72    }
73
74    /// 解析性能指标(sql 字段为空字符串)
75    pub fn parse_indicators(&self) -> Option<PerformanceMetrics<'static>> {
76        let ind_bytes = &self.content_raw[self.find_indicators_split()..];
77        if ind_bytes.is_empty() {
78            return None;
79        }
80        parse_indicators_from_bytes(ind_bytes)
81    }
82
83    /// 解析性能指标和 SQL 语句
84    ///
85    /// 返回包含 EXECTIME、ROWCOUNT、EXEC_ID 和 SQL 语句的 [`PerformanceMetrics`]。
86    ///
87    /// 当 tag 为 `"ORA"` 时,SQL 语句开头可能带有 `": "`,本方法会自动去除。
88    ///
89    /// # 实现说明
90    /// 仅调用一次 `find_indicators_split()`,body 解码与 indicators 解析均在同一
91    /// 次遍历中完成,`Cow::Borrowed` 路径全程零分配。
92    pub fn parse_performance_metrics(&self) -> PerformanceMetrics<'a> {
93        let split = self.find_indicators_split();
94        let is_borrowed = matches!(&self.content_raw, Cow::Borrowed(_));
95
96        // SAFETY: 子切片与 content_raw 共享 'a 生命周期
97        let sql_raw =
98            unsafe { decode_content_bytes(&self.content_raw[..split], is_borrowed, self.encoding) };
99
100        let sql = if self.tag.as_deref() == Some("ORA") {
101            strip_ora_prefix(sql_raw)
102        } else {
103            sql_raw
104        };
105
106        let mut pm = parse_indicators_from_bytes(&self.content_raw[split..]).unwrap_or_default();
107        pm.sql = sql;
108        pm
109    }
110
111    /// 解析元数据
112    pub fn parse_meta(&self) -> MetaParts<'a> {
113        let meta_bytes = self.meta_raw.as_bytes();
114        let mut meta = MetaParts::default();
115        let len = meta_bytes.len();
116        let is_borrowed = matches!(&self.meta_raw, Cow::Borrowed(_));
117
118        let to_cow = |bytes: &[u8]| -> Cow<'a, str> {
119            if is_borrowed {
120                unsafe {
121                    Cow::Borrowed(std::str::from_utf8_unchecked(std::slice::from_raw_parts(
122                        bytes.as_ptr(),
123                        bytes.len(),
124                    )))
125                }
126            } else {
127                unsafe { Cow::Owned(std::str::from_utf8_unchecked(bytes).to_string()) }
128            }
129        };
130
131        let mut idx = 0;
132        while idx < len {
133            // Skip whitespace
134            while idx < len && meta_bytes[idx] == b' ' {
135                idx += 1;
136            }
137            if idx >= len {
138                break;
139            }
140
141            // Find token end
142            let start = idx;
143            while idx < len && meta_bytes[idx] != b' ' {
144                idx += 1;
145            }
146            let part = &meta_bytes[start..idx];
147
148            // Parse EP[n]
149            if part.len() > 4
150                && part[0] == b'E'
151                && part[1] == b'P'
152                && part[2] == b'['
153                && part[part.len() - 1] == b']'
154            {
155                if let Some(ep) = atoi::<u8>(&part[3..part.len() - 1]) {
156                    meta.ep = ep;
157                }
158                continue;
159            }
160
161            // Find ':'
162            if let Some(sep) = memchr(b':', part) {
163                let key = &part[..sep];
164                let val = &part[sep + 1..];
165
166                match key {
167                    b"sess" => meta.sess_id = to_cow(val),
168                    b"thrd" => meta.thrd_id = to_cow(val),
169                    b"user" => meta.username = to_cow(val),
170                    b"trxid" => meta.trxid = to_cow(val),
171                    b"stmt" => meta.statement = to_cow(val),
172                    b"ip" => meta.client_ip = to_cow(val),
173                    b"appname" => {
174                        if !val.is_empty() {
175                            meta.appname = to_cow(val);
176                        } else {
177                            // Peek next token; treat it as appname only if it is not an ip field
178                            let mut peek = idx;
179                            while peek < len && meta_bytes[peek] == b' ' {
180                                peek += 1;
181                            }
182                            if peek < len {
183                                let peek_start = peek;
184                                while peek < len && meta_bytes[peek] != b' ' {
185                                    peek += 1;
186                                }
187                                let next = &meta_bytes[peek_start..peek];
188                                if !(next.starts_with(b"ip:") || next.starts_with(b"ip::")) {
189                                    meta.appname = to_cow(next);
190                                    idx = peek;
191                                }
192                            }
193                        }
194                    }
195                    _ => {}
196                }
197            }
198        }
199        meta
200    }
201
202    // ── Private helpers ───────────────────────────────────────────────────────
203
204    fn find_indicators_split(&self) -> usize {
205        let data = &self.content_raw;
206        let len = data.len();
207        let start = len.saturating_sub(256);
208        let window = &data[start..len];
209        let mut tail = window.len();
210
211        // Search backwards for each keyword in decreasing order of position
212        for keyword in [
213            b"EXEC_ID".as_ref(),
214            b"ROWCOUNT".as_ref(),
215            b"EXECTIME".as_ref(),
216        ] {
217            tail = find_keyword_end_backward(window, tail, keyword).unwrap_or(tail);
218        }
219
220        start + tail
221    }
222}
223
224// ── Module-level helpers ──────────────────────────────────────────────────────
225
226/// Decode a sub-slice of `content_raw` bytes into a `Cow<'a, str>`.
227///
228/// # Safety
229/// `bytes` must be a sub-slice of a `'a`-lived allocation (i.e., the original
230/// `Cow::Borrowed(&'a [u8])`). The caller guarantees this by passing `is_borrowed = true`
231/// only when the source `Cow` is `Borrowed`.
232#[inline]
233unsafe fn decode_content_bytes<'a>(
234    bytes: &[u8],
235    is_borrowed: bool,
236    encoding: FileEncodingHint,
237) -> Cow<'a, str> {
238    match encoding {
239        FileEncodingHint::Utf8 | FileEncodingHint::Auto => match simd_from_utf8(bytes) {
240            Ok(s) => {
241                if is_borrowed {
242                    unsafe {
243                        Cow::Borrowed(std::str::from_utf8_unchecked(std::slice::from_raw_parts(
244                            bytes.as_ptr(),
245                            bytes.len(),
246                        )))
247                    }
248                } else {
249                    Cow::Owned(s.to_string())
250                }
251            }
252            Err(_) => Cow::Owned(String::from_utf8_lossy(bytes).into_owned()),
253        },
254        FileEncodingHint::Gb18030 => match GB18030.decode(bytes, DecoderTrap::Strict) {
255            Ok(s) => Cow::Owned(s),
256            Err(_) => Cow::Owned(String::from_utf8_lossy(bytes).into_owned()),
257        },
258    }
259}
260
261/// Scan `window[..within]` backwards for `keyword:` followed by a space.
262/// Returns the offset just before the keyword (the new split boundary) if found.
263#[inline]
264fn find_keyword_end_backward(window: &[u8], within: usize, keyword: &[u8]) -> Option<usize> {
265    let klen = keyword.len();
266    let mut search_end = within;
267    while let Some(idx) = memrchr(b':', &window[..search_end]) {
268        if idx >= klen
269            && &window[idx - klen..idx] == keyword
270            && idx + 1 < window.len()
271            && window[idx + 1] == b' '
272        {
273            return Some(idx - klen);
274        }
275        if idx == 0 {
276            break;
277        }
278        search_end = idx;
279    }
280    None
281}
282
283/// Parse `EXECTIME`, `ROWCOUNT`, `EXEC_ID` from a raw indicators byte slice.
284/// The `sql` field of the returned struct is left as the default empty string.
285/// Returns `None` if none of the three fields are present.
286fn parse_indicators_from_bytes(ind: &[u8]) -> Option<PerformanceMetrics<'static>> {
287    if ind.is_empty() {
288        return None;
289    }
290
291    let mut out = PerformanceMetrics::default();
292    let mut found = false;
293
294    if let Some(idx) = FINDER_EXECTIME.find(ind) {
295        let ss = idx + 9;
296        if let Some(pi) = memchr(b'(', &ind[ss..]) {
297            let val = ind[ss..ss + pi].trim_ascii();
298            if let Ok(t) = unsafe { std::str::from_utf8_unchecked(val) }.parse::<f32>() {
299                out.exectime = t;
300                found = true;
301            }
302        }
303    }
304
305    if let Some(idx) = FINDER_ROWCOUNT.find(ind) {
306        let ss = idx + 9;
307        if let Some(pi) = memchr(b'(', &ind[ss..])
308            && let Some(c) = atoi::<u32>(ind[ss..ss + pi].trim_ascii())
309        {
310            out.rowcount = c;
311            found = true;
312        }
313    }
314
315    if let Some(idx) = FINDER_EXEC_ID.find(ind) {
316        let ss = idx + 8;
317        let end = memchr(b'.', &ind[ss..])
318            .map(|i| ss + i)
319            .unwrap_or(ind.len());
320        if let Some(id) = atoi::<i64>(ind[ss..end].trim_ascii()) {
321            out.exec_id = id;
322            found = true;
323        }
324    }
325
326    found.then_some(out)
327}
328
/// Removes one leading `": "` from `s`, if present.
///
/// The `Cow` variant is preserved: borrowed input is re-sliced, owned input
/// is edited in place, so neither path allocates.
#[inline]
fn strip_ora_prefix(s: Cow<'_, str>) -> Cow<'_, str> {
    const PREFIX: &str = ": ";

    if !s.starts_with(PREFIX) {
        return s;
    }
    match s {
        Cow::Borrowed(text) => Cow::Borrowed(&text[PREFIX.len()..]),
        Cow::Owned(mut text) => {
            text.drain(..PREFIX.len());
            Cow::Owned(text)
        }
    }
}
342
343// ── Public types ──────────────────────────────────────────────────────────────
344
/// Parsed metadata fields of a log record.
///
/// Holds every metadata field of a record, such as the session ID and the
/// user name. Fields that do not appear in the metadata keep their default
/// (zero or empty) value.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct MetaParts<'a> {
    /// EP (Execution Point) number, in the range 0-255.
    pub ep: u8,

    /// Session ID.
    pub sess_id: Cow<'a, str>,

    /// Thread ID.
    pub thrd_id: Cow<'a, str>,

    /// User name.
    pub username: Cow<'a, str>,

    /// Transaction ID.
    pub trxid: Cow<'a, str>,

    /// Statement ID.
    pub statement: Cow<'a, str>,

    /// Application name.
    pub appname: Cow<'a, str>,

    /// Client IP address (optional).
    pub client_ip: Cow<'a, str>,
}
374
/// Performance indicators and SQL text of a log record.
///
/// Holds the indicators reported for a SQL execution — execution time,
/// affected row count and execution ID — together with the full SQL statement.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct PerformanceMetrics<'a> {
    /// Execution time, in milliseconds.
    pub exectime: f32,

    /// Number of affected rows.
    pub rowcount: u32,

    /// Execution ID.
    pub exec_id: i64,

    /// The full SQL statement.
    pub sql: Cow<'a, str>,
}