dm_database_parser_sqllog/
tools.rs

//! Utility functions.
//!
//! Helpers for validating the log format, mainly used to decide quickly whether a line is a valid record start line.
4
5use once_cell::sync::Lazy;
6
// Layout of the fixed-width `YYYY-MM-DD HH:MM:SS.mmm` timestamp prefix.

/// Exact byte length of a millisecond-precision timestamp.
const TIMESTAMP_LENGTH: usize = 23;
/// Shortest line that still holds the timestamp plus the `" ("` that follows it.
const MIN_LINE_LENGTH: usize = TIMESTAMP_LENGTH + 2;
/// `(byte offset, expected byte)` pairs for the separators inside the timestamp.
const TIMESTAMP_SEPARATOR_POSITIONS: [(usize, u8); 6] =
    [(4, b'-'), (7, b'-'), (10, b' '), (13, b':'), (16, b':'), (19, b'.')];
/// Byte offsets inside the timestamp that must hold an ASCII digit.
const TIMESTAMP_DIGIT_POSITIONS: [usize; 17] = [
    0, 1, 2, 3, // year
    5, 6, // month
    8, 9, // day
    11, 12, // hour
    14, 15, // minute
    17, 18, // second
    20, 21, 22, // millisecond
];
20
// Constants describing the parenthesised meta section.

/// Byte offset of the first meta character: the 23-byte timestamp, one space and `(` precede it.
const META_START_INDEX: usize = 25;
/// Minimum number of meta fields (covers records that carry no appname).
// Not referenced by the code visible in this file, hence the lint override.
#[allow(dead_code)]
const MIN_META_FIELDS: usize = 6;
/// Field count up to and including appname — presumably; not referenced in this file.
#[allow(dead_code)]
const REQUIRED_META_FIELDS: usize = MIN_META_FIELDS + 1;
/// Field count when the client IP is present as well — presumably; not referenced in this file.
#[allow(dead_code)]
const META_WITH_IP_FIELDS: usize = REQUIRED_META_FIELDS + 1;
29
30// 使用 Lazy 静态初始化字段前缀数组,避免每次访问时创建
31static META_FIELD_PREFIXES: Lazy<[&'static str; 8]> = Lazy::new(|| {
32    [
33        "EP[",
34        "sess:",
35        "thrd:",
36        "user:",
37        "trxid:",
38        "stmt:",
39        "appname:",
40        "ip:::ffff:",
41    ]
42});
43
// Named delimiter constants, defined once instead of repeating the literals.

/// Separator expected immediately after the timestamp.
const SPACE_BYTE: u8 = b' ';
/// Opening delimiter of the meta section.
const OPEN_PAREN_BYTE: u8 = b'(';
/// Closing delimiter of the meta section; a `char` because it is used as a `str::find` pattern.
const CLOSE_PAREN_CHAR: char = ')';
48
49/// 判断字节数组是否为有效的时间戳格式
50///
51/// 验证时间戳格式是否为 "YYYY-MM-DD HH:MM:SS.mmm"(恰好 23 字节)。
52///
53/// # 参数
54///
55/// * `bytes` - 要检查的字节数组
56///
57/// # 返回
58///
59/// 如果是有效的时间戳格式返回 `true`,否则返回 `false`
60///
61/// # 示例
62///
63/// ```
64/// use dm_database_parser_sqllog::tools::is_ts_millis_bytes;
65///
66/// let valid = b"2025-08-12 10:57:09.548";
67/// assert!(is_ts_millis_bytes(valid));
68///
69/// let invalid = b"2025-08-12";
70/// assert!(!is_ts_millis_bytes(invalid));
71/// ```
72#[inline(always)]
73pub fn is_ts_millis_bytes(bytes: &[u8]) -> bool {
74    if bytes.len() != TIMESTAMP_LENGTH {
75        return false;
76    }
77
78    // 检查分隔符位置
79    for &(pos, expected) in &TIMESTAMP_SEPARATOR_POSITIONS {
80        if bytes[pos] != expected {
81            return false;
82        }
83    }
84
85    // 检查数字位置
86    for &i in &TIMESTAMP_DIGIT_POSITIONS {
87        if !bytes[i].is_ascii_digit() {
88            return false;
89        }
90    }
91
92    true
93}
94
95/// 判断一行日志是否为记录起始行
96///
97/// 这是一个高性能的验证函数,用于快速判断一行文本是否为有效的日志记录起始行。
98///
99/// # 判断标准
100///
101/// 1. 行首 23 字节符合时间戳格式 `YYYY-MM-DD HH:mm:ss.SSS`
102/// 2. 时间戳后紧跟一个空格,然后是 meta 部分
103/// 3. Meta 部分用小括号包含
104/// 4. Meta 部分必须包含所有必需字段(client_ip 可选)
105/// 5. Meta 字段间以一个空格分隔
106/// 6. Meta 字段顺序固定:ep → sess → thrd_id → username → trxid → statement → appname → client_ip(可选)
107///
108/// # 参数
109///
110/// * `line` - 要检查的行
111///
112/// # 返回
113///
114/// 如果是有效的记录起始行返回 `true`,否则返回 `false`
115///
116/// # 示例
117///
118/// ```
119/// use dm_database_parser_sqllog::tools::is_record_start_line;
120///
121/// let valid = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) SELECT 1";
122/// assert!(is_record_start_line(valid));
123///
124/// let invalid = "This is not a log line";
125/// assert!(!is_record_start_line(invalid));
126/// ```
127/// 7. meta 部分结束后紧跟一个空格,然后是 body 部分。
128pub fn is_record_start_line(line: &str) -> bool {
129    // 早期退出:检查最小长度
130    let bytes = line.as_bytes();
131    if bytes.len() < MIN_LINE_LENGTH {
132        return false;
133    }
134
135    // 早期退出:验证时间戳格式(最快的失败路径)
136    if !is_ts_millis_bytes(&bytes[0..TIMESTAMP_LENGTH]) {
137        return false;
138    }
139
140    // 早期退出:检查时间戳后的分隔符 " ("
141    if bytes[23] != SPACE_BYTE || bytes[24] != OPEN_PAREN_BYTE {
142        return false;
143    }
144
145    // 早期退出:查找 meta 部分的右括号
146    let closing_paren_index = match line.find(CLOSE_PAREN_CHAR) {
147        Some(idx) => idx,
148        None => return false,
149    };
150
151    // 提取 meta 部分(括号内的内容)
152    let meta_part = &line[META_START_INDEX..closing_paren_index];
153
154    // 验证 meta 字段
155    validate_meta_fields(meta_part)
156}
157
158/// 验证 meta 部分的字段
159///
160/// 字段结构:
161/// - 前5个字段必需:EP[...] sess:... thrd:... user:... trxid:...
162/// - 后3个字段可选:stmt:... appname:... ip:::ffff:...
163///
164/// 字段之间用单个空格分隔
165#[inline]
166fn validate_meta_fields(meta: &str) -> bool {
167    let mut remaining = meta;
168
169    // 验证5个必需字段
170    for &prefix in META_FIELD_PREFIXES.iter().take(5) {
171        // 当前位置必须匹配字段前缀
172        if !remaining.starts_with(prefix) {
173            return false;
174        }
175
176        // 跳过前缀,查找字段值的结束位置(空格或字符串结束)
177        remaining = &remaining[prefix.len()..];
178
179        // 查找下一个空格(字段分隔符)
180        match remaining.find(' ') {
181            Some(space_idx) => {
182                // 跳过字段值和空格,移到下一个字段
183                remaining = &remaining[space_idx + 1..];
184            }
185            None => {
186                // 没有更多字段了
187                // 只有在处理完最后一个必需字段(trxid)时才是有效的
188                return prefix == "trxid:";
189            }
190        }
191    }
192
193    // 到这里说明5个必需字段都存在
194    // 检查可选字段:stmt, appname, ip
195
196    // 检查 stmt 字段(可选)
197    if remaining.is_empty() {
198        return true; // 只有5个必需字段,有效
199    }
200
201    if !remaining.starts_with("stmt:") {
202        return false; // 如果有更多内容但不是 stmt,则无效
203    }
204
205    remaining = &remaining[5..]; // 跳过 "stmt:"
206
207    // 查找 stmt 值的结束位置
208    match remaining.find(' ') {
209        Some(space_idx) => {
210            remaining = &remaining[space_idx + 1..];
211        }
212        None => {
213            return true; // stmt 是最后一个字段,有效
214        }
215    }
216
217    // 检查 appname 字段(可选)
218    if remaining.is_empty() {
219        return true; // 只到 stmt,有效
220    }
221
222    if !remaining.starts_with("appname:") {
223        return false; // 如果有更多内容但不是 appname,则无效
224    }
225
226    remaining = &remaining[8..]; // 跳过 "appname:"
227
228    // appname 的值可能包含空格,需要特殊处理
229    // 查找可能的 ip 字段标记
230    if let Some(_ip_idx) = remaining.find(" ip:::ffff:") {
231        // 有 IP 字段(IP 值后面不应该有更多内容)
232        return true;
233    }
234
235    // 没有 IP 字段,appname 后面应该直接结束
236    true
237}
238
#[cfg(test)]
mod tests {
    use super::*;

    /// Tests for `is_ts_millis_bytes`.
    mod timestamp_tests {
        use super::*;

        /// Asserts that every candidate is rejected as a timestamp.
        fn assert_all_rejected(candidates: &[&[u8]]) {
            for &candidate in candidates {
                assert!(
                    !is_ts_millis_bytes(candidate),
                    "Should fail for: {:?}",
                    candidate
                );
            }
        }

        #[test]
        fn valid_timestamps() {
            let accepted: &[&[u8]] = &[
                b"2024-06-15 12:34:56.789",
                b"2000-01-01 00:00:00.000",
                b"2099-12-31 23:59:59.999",
                b"2024-02-29 12:34:56.789", // leap year
            ];
            for &candidate in accepted {
                assert!(
                    is_ts_millis_bytes(candidate),
                    "Failed for: {:?}",
                    candidate
                );
            }
        }

        #[test]
        fn wrong_length() {
            let rejected: &[&[u8]] = &[
                b"2024-06-15 12:34:56",
                b"2024-06-15 12:34:56.7",
                b"2024-06-15 12:34:56.7890",
                b"",
                b"2024",
            ];
            assert_all_rejected(rejected);
        }

        #[test]
        fn wrong_separator() {
            let rejected: &[&[u8]] = &[
                b"2024-06-15 12:34:56,789", // comma instead of dot
                b"2024/06/15 12:34:56.789", // slashes instead of dashes
                b"2024-06-15T12:34:56.789", // 'T' instead of space
                b"2024-06-15-12:34:56.789", // dash instead of space
                b"2024-06-15 12-34-56.789", // dashes instead of colons
            ];
            assert_all_rejected(rejected);
        }

        #[test]
        fn non_digits() {
            let rejected: &[&[u8]] = &[
                b"202a-06-15 12:34:56.789",
                b"2024-0b-15 12:34:56.789",
                b"2024-06-1c 12:34:56.789",
                b"2024-06-15 1d:34:56.789",
                b"2024-06-15 12:3e:56.789",
                b"2024-06-15 12:34:5f.789",
                b"2024-06-15 12:34:56.78g",
            ];
            assert_all_rejected(rejected);
        }

        #[test]
        fn special_chars() {
            assert!(!is_ts_millis_bytes(b"2024-06-15 12:34:56.\x00\x00\x00"));
            assert!(!is_ts_millis_bytes(b"\x002024-06-15 12:34:56.789"));
        }
    }

    /// Tests for `is_record_start_line`.
    mod record_start_line_tests {
        use super::*;

        /// Asserts that every line is rejected as a record start line.
        fn assert_all_rejected(lines: &[&str]) {
            for &line in lines {
                assert!(!is_record_start_line(line), "Should fail for: {}", line);
            }
        }

        #[test]
        fn valid_complete_line() {
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:0x178ebca0 thrd:757455 user:HBTCOMS_V3_PROD trxid:0 stmt:0x285eb060 appname: ip:::ffff:10.3.100.68) [SEL] select 1 from dual EXECTIME: 0(ms) ROWCOUNT: 1(rows) EXEC_ID: 289655178.";
            assert!(is_record_start_line(record));
        }

        #[test]
        fn valid_without_ip() {
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:0x178ebca0 thrd:757455 user:HBTCOMS_V3_PROD trxid:0 stmt:0x285eb060 appname:) [SEL] select 1 from dual";
            assert!(is_record_start_line(record));
        }

        #[test]
        fn minimal_valid() {
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(is_record_start_line(record));
        }

        #[test]
        fn too_short() {
            assert_all_rejected(&[
                "2025-08-12 10:57:09.548",
                "2025-08-12 10:57:09.548 (",
                "",
                "short",
            ]);
        }

        #[test]
        fn invalid_timestamp() {
            let record = "2025-08-12 10:57:09,548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(!is_record_start_line(record));
        }

        #[test]
        fn format_errors() {
            assert_all_rejected(&[
                // no space after the timestamp
                "2025-08-12 10:57:09.548(EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                // no opening parenthesis
                "2025-08-12 10:57:09.548 EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                // no closing parenthesis
                "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app body",
            ]);
        }

        #[test]
        fn insufficient_fields() {
            // The five-field format is supported, so exercise a line with only four fields.
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice) body";
            assert!(!is_record_start_line(record));
        }

        #[test]
        fn wrong_field_order() {
            let record = "2025-08-12 10:57:09.548 (sess:123 EP[0] thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(!is_record_start_line(record));
        }

        #[test]
        fn missing_required_fields() {
            // Only the first five fields are required: EP, sess, thrd, user, trxid.
            let cases = [
                (
                    "2025-08-12 10:57:09.548 (sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                    "EP",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                    "sess",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 user:alice trxid:789 stmt:999 appname:app) body",
                    "thrd",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 trxid:789 stmt:999 appname:app) body",
                    "user",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice stmt:999 appname:app) body",
                    "trxid",
                ),
            ];
            for &(record, missing) in cases.iter() {
                assert!(
                    !is_record_start_line(record),
                    "Should fail when missing {} field",
                    missing
                );
            }
        }

        #[test]
        fn with_valid_ip() {
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app ip:::ffff:192.168.1.100) body";
            assert!(is_record_start_line(record));
        }

        #[test]
        fn with_invalid_ip_format() {
            // Malformed IP prefix (`ip:` instead of `ip:::ffff:`).
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app ip:192.168.1.100) body";
            // The line is still accepted: "ip:192.168.1.100" is treated as part of
            // the appname value, so this test pins that behavior.
            assert!(is_record_start_line(record));
        }

        #[test]
        fn complex_field_values() {
            let record = "2025-08-12 10:57:09.548 (EP[123] sess:0xABCD1234 thrd:9999999 user:USER_WITH_UNDERSCORES trxid:12345678 stmt:0xFFFFFFFF appname:app-name-with-dashes ip:::ffff:10.20.30.40) SELECT * FROM table";
            assert!(is_record_start_line(record));
        }

        #[test]
        fn empty_appname() {
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:) body";
            assert!(is_record_start_line(record));
        }

        #[test]
        fn continuation_line() {
            let body_line = "    SELECT * FROM users WHERE id = 1";
            assert!(!is_record_start_line(body_line));
        }

        #[test]
        fn double_space_in_meta() {
            // v0.1.3+: validation is strict, fields must be separated by a single
            // space, so a doubled space makes the line invalid.
            let doubled = "2025-08-12 10:57:09.548 (EP[0]  sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(!is_record_start_line(doubled));

            // The same line with single spaces is accepted.
            let single = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(is_record_start_line(single));
        }
    }
}