dm_database_parser_sqllog/
tools.rs

1//! 工具函数模块
2//!
3//! 提供了日志格式验证相关的工具函数,主要用于快速判断行是否为有效的记录起始行。
4
// Layout constants for timestamps of the form "YYYY-MM-DD HH:MM:SS.mmm".
// Exact byte length of such a timestamp.
const TIMESTAMP_LENGTH: usize = 23;
// Shortest line that is inspected at all: the 23-byte timestamp plus the two
// bytes read at offsets 23 and 24 (a space and an opening parenthesis).
const MIN_LINE_LENGTH: usize = 25;
// (offset, expected byte) pairs for the punctuation inside a timestamp.
const TIMESTAMP_SEPARATOR_POSITIONS: [(usize, u8); 6] = [
    (4, b'-'),
    (7, b'-'),
    (10, b' '),
    (13, b':'),
    (16, b':'),
    (19, b'.'),
];
// Offsets inside a timestamp that must hold ASCII digits (every offset that is
// not listed as a separator above).
const TIMESTAMP_DIGIT_POSITIONS: [usize; 17] =
    [0, 1, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 22];

// Named byte/char constants for the delimiters around the meta section, defined
// once so they are not re-created at each use.
const SPACE_BYTE: u8 = b' ';
const OPEN_PAREN_BYTE: u8 = b'(';
const CLOSE_PAREN_CHAR: char = ')';
23
24/// 判断字节数组是否为有效的时间戳格式
25///
26/// 验证时间戳格式是否为 "YYYY-MM-DD HH:MM:SS.mmm"(恰好 23 字节)。
27///
28/// # 参数
29///
30/// * `bytes` - 要检查的字节数组
31///
32/// # 返回
33///
34/// 如果是有效的时间戳格式返回 `true`,否则返回 `false`
35///
36/// # 示例
37///
38/// ```
39/// use dm_database_parser_sqllog::tools::is_ts_millis_bytes;
40///
41/// let valid = b"2025-08-12 10:57:09.548";
42/// assert!(is_ts_millis_bytes(valid));
43///
44/// let invalid = b"2025-08-12";
45/// assert!(!is_ts_millis_bytes(invalid));
46/// ```
47#[inline(always)]
48pub fn is_ts_millis_bytes(bytes: &[u8]) -> bool {
49    if bytes.len() != TIMESTAMP_LENGTH {
50        return false;
51    }
52
53    // 检查分隔符位置
54    for &(pos, expected) in &TIMESTAMP_SEPARATOR_POSITIONS {
55        if bytes[pos] != expected {
56            return false;
57        }
58    }
59
60    // 检查数字位置
61    for &i in &TIMESTAMP_DIGIT_POSITIONS {
62        if !bytes[i].is_ascii_digit() {
63            return false;
64        }
65    }
66
67    true
68}
69
70/// 判断一行日志是否为记录起始行
71///
72/// 这是一个高性能的验证函数,用于快速判断一行文本是否为有效的日志记录起始行。
73///
74/// # 判断标准
75///
76/// 1. 行首 23 字节符合时间戳格式 `YYYY-MM-DD HH:mm:ss.SSS`
77/// 2. 时间戳后紧跟一个空格,然后是 meta 部分
78/// 3. Meta 部分用小括号包含
79/// 4. Meta 部分必须包含所有必需字段(client_ip 可选)
80/// 5. Meta 字段间以一个空格分隔
81/// 6. Meta 字段顺序固定:ep → sess → thrd_id → username → trxid → statement → appname → client_ip(可选)
82///
83/// # 参数
84///
85/// * `line` - 要检查的行
86///
87/// # 返回
88///
89/// 如果是有效的记录起始行返回 `true`,否则返回 `false`
90///
91/// # 示例
92///
93/// ```
94/// use dm_database_parser_sqllog::tools::is_record_start_line;
95///
96/// let valid = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) SELECT 1";
97/// assert!(is_record_start_line(valid));
98///
99/// let invalid = "This is not a log line";
100/// assert!(!is_record_start_line(invalid));
101/// ```
102/// 7. meta 部分结束后紧跟一个空格,然后是 body 部分。
103pub fn is_record_start_line(line: &str) -> bool {
104    // 早期退出:检查最小长度
105    let bytes = line.as_bytes();
106    if bytes.len() < MIN_LINE_LENGTH {
107        return false;
108    }
109
110    // 早期退出:验证时间戳格式(最快的失败路径)
111    if !is_ts_millis_bytes(&bytes[0..TIMESTAMP_LENGTH]) {
112        return false;
113    }
114
115    // 早期退出:检查时间戳后的分隔符 " ("
116    if bytes[23] != SPACE_BYTE || bytes[24] != OPEN_PAREN_BYTE {
117        return false;
118    }
119
120    // 早期退出:查找 meta 部分的右括号
121    let closing_paren_index = match line.find(CLOSE_PAREN_CHAR) {
122        Some(idx) => idx,
123        None => return false,
124    };
125
126    // 提取 meta 部分并验证字段
127    let meta_part = &line[25..closing_paren_index];
128    validate_meta_fields_fast(meta_part)
129}
130
/// Fast structural check of a meta section (the text between the parentheses).
///
/// Only the five leading fields are checked, by prefix and order:
/// `EP[`, `sess:`, `thrd:`, `user:`, `trxid:`. Each of the first four must be
/// terminated by a space, and the next field must begin right after that
/// space, so a doubled space between fields is rejected. Field values and
/// anything after the `trxid:` prefix are not inspected.
///
/// The check works on bytes rather than on `str` operations.
///
/// # Arguments
///
/// * `meta` - The meta section without the surrounding parentheses
///
/// # Returns
///
/// `true` when the five leading fields are present in order, `false` otherwise
#[inline]
fn validate_meta_fields_fast(meta: &str) -> bool {
    // Shortest meta with a one-character value in each checked field (34
    // bytes). Deriving the threshold from the literal keeps the two in sync.
    const MIN_META_LENGTH: usize = "EP[0] sess:1 thrd:1 user:a trxid:1".len();
    // Fields that must be followed by a space and another field.
    const LEADING_PREFIXES: [&[u8]; 4] = [b"EP[", b"sess:", b"thrd:", b"user:"];
    // Last checked field; nothing is required after it.
    const FINAL_PREFIX: &[u8] = b"trxid:";

    let bytes = meta.as_bytes();
    if bytes.len() < MIN_META_LENGTH {
        return false;
    }

    // `remaining` always starts at the field currently being checked.
    let mut remaining = bytes;
    for prefix in LEADING_PREFIXES.iter() {
        if !remaining.starts_with(prefix) {
            return false;
        }
        // Skip past this field's value and the single separating space.
        match remaining.iter().position(|&b| b == b' ') {
            Some(space) => remaining = &remaining[space + 1..],
            None => return false,
        }
    }

    // An empty remainder (meta ended on a space) fails here as well.
    remaining.starts_with(FINAL_PREFIX)
}
209
#[cfg(test)]
mod tests {
    use super::*;

    /// Tests for `is_ts_millis_bytes`.
    mod timestamp_tests {
        use super::*;

        /// Asserts that `is_ts_millis_bytes` rejects every candidate.
        fn assert_all_rejected(candidates: &[&[u8]]) {
            for candidate in candidates {
                assert!(
                    !is_ts_millis_bytes(candidate),
                    "Should fail for: {:?}",
                    candidate
                );
            }
        }

        #[test]
        fn valid_timestamps() {
            let accepted: &[&[u8]] = &[
                b"2024-06-15 12:34:56.789",
                b"2000-01-01 00:00:00.000",
                b"2099-12-31 23:59:59.999",
                b"2024-02-29 12:34:56.789", // leap day
            ];
            for candidate in accepted {
                assert!(is_ts_millis_bytes(candidate), "Failed for: {:?}", candidate);
            }
        }

        #[test]
        fn wrong_length() {
            let rejected: &[&[u8]] = &[
                b"2024-06-15 12:34:56",
                b"2024-06-15 12:34:56.7",
                b"2024-06-15 12:34:56.7890",
                b"",
                b"2024",
            ];
            assert_all_rejected(rejected);
        }

        #[test]
        fn wrong_separator() {
            let rejected: &[&[u8]] = &[
                b"2024-06-15 12:34:56,789", // comma instead of dot
                b"2024/06/15 12:34:56.789", // slashes instead of hyphens
                b"2024-06-15T12:34:56.789", // 'T' instead of space
                b"2024-06-15-12:34:56.789", // hyphen instead of space
                b"2024-06-15 12-34-56.789", // hyphens instead of colons
            ];
            assert_all_rejected(rejected);
        }

        #[test]
        fn non_digits() {
            // One letter planted in each numeric field in turn.
            let rejected: &[&[u8]] = &[
                b"202a-06-15 12:34:56.789",
                b"2024-0b-15 12:34:56.789",
                b"2024-06-1c 12:34:56.789",
                b"2024-06-15 1d:34:56.789",
                b"2024-06-15 12:3e:56.789",
                b"2024-06-15 12:34:5f.789",
                b"2024-06-15 12:34:56.78g",
            ];
            assert_all_rejected(rejected);
        }

        #[test]
        fn special_chars() {
            let nul_millis: &[u8] = b"2024-06-15 12:34:56.\x00\x00\x00";
            let nul_prefixed: &[u8] = b"\x002024-06-15 12:34:56.789";
            assert!(!is_ts_millis_bytes(nul_millis));
            assert!(!is_ts_millis_bytes(nul_prefixed));
        }
    }

    /// Tests for `is_record_start_line`.
    mod record_start_line_tests {
        use super::*;

        /// Asserts that `is_record_start_line` rejects every line.
        fn assert_all_rejected(lines: &[&str]) {
            for line in lines {
                assert!(!is_record_start_line(line), "Should fail for: {}", line);
            }
        }

        #[test]
        fn valid_complete_line() {
            let full_record = "2025-08-12 10:57:09.548 (EP[0] sess:0x178ebca0 thrd:757455 user:HBTCOMS_V3_PROD trxid:0 stmt:0x285eb060 appname: ip:::ffff:10.3.100.68) [SEL] select 1 from dual EXECTIME: 0(ms) ROWCOUNT: 1(rows) EXEC_ID: 289655178.";
            assert!(is_record_start_line(full_record));
        }

        #[test]
        fn valid_without_ip() {
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:0x178ebca0 thrd:757455 user:HBTCOMS_V3_PROD trxid:0 stmt:0x285eb060 appname:) [SEL] select 1 from dual";
            assert!(is_record_start_line(record));
        }

        #[test]
        fn minimal_valid() {
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(is_record_start_line(record));
        }

        #[test]
        fn too_short() {
            let truncated = [
                "2025-08-12 10:57:09.548",
                "2025-08-12 10:57:09.548 (",
                "",
                "short",
            ];
            assert_all_rejected(&truncated);
        }

        #[test]
        fn invalid_timestamp() {
            let record = "2025-08-12 10:57:09,548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(!is_record_start_line(record));
        }

        #[test]
        fn format_errors() {
            let malformed = [
                "2025-08-12 10:57:09.548(EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body", // no space after timestamp
                "2025-08-12 10:57:09.548 EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body", // no opening parenthesis
                "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app body", // no closing parenthesis
            ];
            assert_all_rejected(&malformed);
        }

        #[test]
        fn insufficient_fields() {
            // The five-field format is accepted; this line carries only four fields.
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice) body";
            assert!(!is_record_start_line(record));
        }

        #[test]
        fn wrong_field_order() {
            let record = "2025-08-12 10:57:09.548 (sess:123 EP[0] thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(!is_record_start_line(record));
        }

        #[test]
        fn missing_required_fields() {
            // Only the first five fields are required: EP, sess, thrd, user, trxid.
            // Each entry pairs a line with the name of the field it lacks.
            let lines_with_missing_field = [
                (
                    "2025-08-12 10:57:09.548 (sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                    "EP",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                    "sess",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 user:alice trxid:789 stmt:999 appname:app) body",
                    "thrd",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 trxid:789 stmt:999 appname:app) body",
                    "user",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice stmt:999 appname:app) body",
                    "trxid",
                ),
            ];
            for &(record, missing) in lines_with_missing_field.iter() {
                assert!(
                    !is_record_start_line(record),
                    "Should fail when missing {} field",
                    missing
                );
            }
        }

        #[test]
        fn with_valid_ip() {
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app ip:::ffff:192.168.1.100) body";
            assert!(is_record_start_line(record));
        }

        #[test]
        fn with_invalid_ip_format() {
            // The ip field is malformed here (`ip:` instead of `ip:::ffff:`).
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app ip:192.168.1.100) body";
            // The line is still accepted: the validator does not inspect
            // anything after the trxid field, so the ip format has no effect.
            assert!(is_record_start_line(record));
        }

        #[test]
        fn complex_field_values() {
            let record = "2025-08-12 10:57:09.548 (EP[123] sess:0xABCD1234 thrd:9999999 user:USER_WITH_UNDERSCORES trxid:12345678 stmt:0xFFFFFFFF appname:app-name-with-dashes ip:::ffff:10.20.30.40) SELECT * FROM table";
            assert!(is_record_start_line(record));
        }

        #[test]
        fn empty_appname() {
            let record = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:) body";
            assert!(is_record_start_line(record));
        }

        #[test]
        fn continuation_line() {
            let body_only = "    SELECT * FROM users WHERE id = 1";
            assert!(!is_record_start_line(body_only));
        }

        #[test]
        fn double_space_in_meta() {
            // v0.1.3+: validation is stricter and requires exactly one space
            // between fields, so a doubled space must be rejected.
            let doubled = "2025-08-12 10:57:09.548 (EP[0]  sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(!is_record_start_line(doubled));

            // The same line with single spaces is accepted.
            let single = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(is_record_start_line(single));
        }
    }
}