dm_database_parser_sqllog/
tools.rs

1//! 工具函数模块
2//!
3//! 提供了日志格式验证相关的工具函数,主要用于快速判断行是否为有效的记录起始行。
4//!
5//! # Feature 控制
6//!
7//! 本模块所有内容仅作为库内部工具,普通用户无需直接调用。
8
// Timestamp layout constants for "YYYY-MM-DD HH:MM:SS.mmm".

/// Byte length of a millisecond-precision timestamp.
const TIMESTAMP_LENGTH: usize = "YYYY-MM-DD HH:MM:SS.mmm".len();
/// Shortest line worth inspecting: the timestamp plus the two bytes `" ("` that follow it.
const MIN_LINE_LENGTH: usize = TIMESTAMP_LENGTH + " (".len();
/// `(offset, expected byte)` pairs for the separators inside a timestamp.
const TIMESTAMP_SEPARATOR_POSITIONS: [(usize, u8); 6] =
    [(4, b'-'), (7, b'-'), (10, b' '), (13, b':'), (16, b':'), (19, b'.')];
/// Offsets inside a timestamp that must hold an ASCII digit.
const TIMESTAMP_DIGIT_POSITIONS: [usize; 17] = [
    0, 1, 2, 3, // year
    5, 6, // month
    8, 9, // day
    11, 12, // hour
    14, 15, // minute
    17, 18, // second
    20, 21, 22, // millisecond
];

// Named delimiter constants shared by the validators in this module.
const SPACE_BYTE: u8 = b' ';
const OPEN_PAREN_BYTE: u8 = b'(';
const CLOSE_PAREN_CHAR: char = ')';
27
28/// 判断字节数组是否为有效的时间戳格式
29///
30/// 验证时间戳格式是否为 "YYYY-MM-DD HH:MM:SS.mmm"(恰好 23 字节)。
31///
32/// # 参数
33///
34/// * `bytes` - 要检查的字节数组
35///
36/// # 返回
37///
38/// 如果是有效的时间戳格式返回 `true`,否则返回 `false`
39///
40/// # 示例
41///
42/// ```
43/// use dm_database_parser_sqllog::tools::is_ts_millis_bytes;
44///
45/// let valid = b"2025-08-12 10:57:09.548";
46/// assert!(is_ts_millis_bytes(valid));
47///
48/// let invalid = b"2025-08-12";
49/// assert!(!is_ts_millis_bytes(invalid));
50/// ```
51#[inline(always)]
52pub fn is_ts_millis_bytes(bytes: &[u8]) -> bool {
53    if bytes.len() != TIMESTAMP_LENGTH {
54        return false;
55    }
56
57    // 检查分隔符位置
58    for &(pos, expected) in &TIMESTAMP_SEPARATOR_POSITIONS {
59        if bytes[pos] != expected {
60            return false;
61        }
62    }
63
64    // 检查数字位置
65    for &i in &TIMESTAMP_DIGIT_POSITIONS {
66        if !bytes[i].is_ascii_digit() {
67            return false;
68        }
69    }
70
71    true
72}
73
74/// 判断一行日志是否为记录起始行
75///
76/// 这是一个高性能的验证函数,用于快速判断一行文本是否为有效的日志记录起始行。
77///
78/// # 判断标准
79///
80/// 1. 行首 23 字节符合时间戳格式 `YYYY-MM-DD HH:mm:ss.SSS`
81/// 2. 时间戳后紧跟一个空格,然后是 meta 部分
82/// 3. Meta 部分用小括号包含
83/// 4. Meta 部分必须包含所有必需字段(client_ip 可选)
84/// 5. Meta 字段间以一个空格分隔
85/// 6. Meta 字段顺序固定:ep → sess → thrd_id → username → trxid → statement → appname → client_ip(可选)
86///
87/// # 参数
88///
89/// * `line` - 要检查的行
90///
91/// # 返回
92///
93/// 如果是有效的记录起始行返回 `true`,否则返回 `false`
94///
95/// # 示例
96///
97/// ```
98/// use dm_database_parser_sqllog::tools::is_record_start_line;
99///
100/// let valid = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) SELECT 1";
101/// assert!(is_record_start_line(valid));
102///
103/// let invalid = "This is not a log line";
104/// assert!(!is_record_start_line(invalid));
105/// ```
106/// 7. meta 部分结束后紧跟一个空格,然后是 body 部分。
107pub fn is_record_start_line(line: &str) -> bool {
108    // 早期退出:检查最小长度
109    let bytes = line.as_bytes();
110    if bytes.len() < MIN_LINE_LENGTH {
111        return false;
112    }
113
114    // 早期退出:验证时间戳格式(最快的失败路径)
115    if !is_ts_millis_bytes(&bytes[0..TIMESTAMP_LENGTH]) {
116        return false;
117    }
118
119    // 早期退出:检查时间戳后的分隔符 " ("
120    if bytes[23] != SPACE_BYTE || bytes[24] != OPEN_PAREN_BYTE {
121        return false;
122    }
123
124    // 早期退出:查找 meta 部分的右括号
125    let closing_paren_index = match line.find(CLOSE_PAREN_CHAR) {
126        Some(idx) => idx,
127        None => return false,
128    };
129
130    // 提取 meta 部分并验证字段
131    let meta_part = &line[25..closing_paren_index];
132    validate_meta_fields_fast(meta_part)
133}
134
/// Fast structural check of the meta section of a record start line.
///
/// Only the five leading fields are examined: `EP[`, `sess:`, `thrd:`, `user:` and `trxid:`
/// must appear in exactly that order. Each of the first four fields ends at the next space
/// and the following field's prefix must begin immediately after that space, so doubled
/// spaces between fields are rejected. Nothing after the `trxid:` prefix is inspected.
///
/// The check operates on raw bytes; every prefix is ASCII.
///
/// # Arguments
///
/// * `meta` - The text between the parentheses of a log line, parentheses excluded.
///
/// # Returns
///
/// `true` when the five leading fields are present in the expected order, `false` otherwise.
#[inline]
fn validate_meta_fields_fast(meta: &str) -> bool {
    // Early-exit threshold: the length of a minimal five-field meta with one-character
    // values. Deriving it from the example keeps the number and the layout in sync, so
    // short but well-formed metas are not rejected.
    const MIN_META_LENGTH: usize = "EP[0] sess:1 thrd:1 user:a trxid:1".len();
    // Fields that must each be followed by a space, in order.
    const LEADING_PREFIXES: [&[u8]; 4] = [b"EP[", b"sess:", b"thrd:", b"user:"];
    // Last required field; whatever follows it is not validated.
    const FINAL_PREFIX: &[u8] = b"trxid:";

    let mut rest = meta.as_bytes();
    if rest.len() < MIN_META_LENGTH {
        return false;
    }

    for prefix in LEADING_PREFIXES.iter() {
        if !rest.starts_with(prefix) {
            return false;
        }
        // Skip past this field's value: continue right after its terminating space.
        match rest.iter().position(|&b| b == b' ') {
            Some(space) => rest = &rest[space + 1..],
            None => return false,
        }
    }

    // An empty remainder fails here, which covers a meta that ends on a trailing space.
    rest.starts_with(FINAL_PREFIX)
}
213
#[cfg(test)]
mod tests {
    use super::*;

    /// Tests for the timestamp layout check, `is_ts_millis_bytes`.
    mod timestamp_tests {
        use super::*;

        /// Asserts that `is_ts_millis_bytes` rejects every candidate.
        fn assert_all_rejected(candidates: &[&[u8]]) {
            candidates.iter().for_each(|ts| {
                assert!(!is_ts_millis_bytes(ts), "Should fail for: {:?}", ts);
            });
        }

        #[test]
        fn valid_timestamps() {
            let accepted: [&[u8]; 4] = [
                b"2024-06-15 12:34:56.789",
                b"2000-01-01 00:00:00.000",
                b"2099-12-31 23:59:59.999",
                b"2024-02-29 12:34:56.789", // leap day
            ];
            accepted.iter().for_each(|ts| {
                assert!(is_ts_millis_bytes(ts), "Failed for: {:?}", ts);
            });
        }

        #[test]
        fn wrong_length() {
            let rejected: [&[u8]; 5] = [
                b"2024-06-15 12:34:56",
                b"2024-06-15 12:34:56.7",
                b"2024-06-15 12:34:56.7890",
                b"",
                b"2024",
            ];
            assert_all_rejected(&rejected);
        }

        #[test]
        fn wrong_separator() {
            let rejected: [&[u8]; 5] = [
                b"2024-06-15 12:34:56,789", // comma instead of dot
                b"2024/06/15 12:34:56.789", // slashes instead of dashes
                b"2024-06-15T12:34:56.789", // `T` instead of space
                b"2024-06-15-12:34:56.789", // dash instead of space
                b"2024-06-15 12-34-56.789", // dashes instead of colons
            ];
            assert_all_rejected(&rejected);
        }

        #[test]
        fn non_digits() {
            // One non-digit character injected into each numeric field in turn.
            let rejected: [&[u8]; 7] = [
                b"202a-06-15 12:34:56.789",
                b"2024-0b-15 12:34:56.789",
                b"2024-06-1c 12:34:56.789",
                b"2024-06-15 1d:34:56.789",
                b"2024-06-15 12:3e:56.789",
                b"2024-06-15 12:34:5f.789",
                b"2024-06-15 12:34:56.78g",
            ];
            assert_all_rejected(&rejected);
        }

        #[test]
        fn special_chars() {
            let nul_millis = b"2024-06-15 12:34:56.\x00\x00\x00";
            let nul_prefixed = b"\x002024-06-15 12:34:56.789";
            assert!(!is_ts_millis_bytes(nul_millis));
            assert!(!is_ts_millis_bytes(nul_prefixed));
        }
    }

    /// Tests for the record start detection, `is_record_start_line`.
    mod record_start_line_tests {
        use super::*;

        /// Asserts that `is_record_start_line` rejects every line.
        fn assert_all_rejected(lines: &[&str]) {
            lines.iter().for_each(|line| {
                assert!(!is_record_start_line(line), "Should fail for: {}", line);
            });
        }

        #[test]
        fn valid_complete_line() {
            assert!(is_record_start_line(
                "2025-08-12 10:57:09.548 (EP[0] sess:0x178ebca0 thrd:757455 user:HBTCOMS_V3_PROD trxid:0 stmt:0x285eb060 appname: ip:::ffff:10.3.100.68) [SEL] select 1 from dual EXECTIME: 0(ms) ROWCOUNT: 1(rows) EXEC_ID: 289655178."
            ));
        }

        #[test]
        fn valid_without_ip() {
            assert!(is_record_start_line(
                "2025-08-12 10:57:09.548 (EP[0] sess:0x178ebca0 thrd:757455 user:HBTCOMS_V3_PROD trxid:0 stmt:0x285eb060 appname:) [SEL] select 1 from dual"
            ));
        }

        #[test]
        fn minimal_valid() {
            assert!(is_record_start_line(
                "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body"
            ));
        }

        #[test]
        fn too_short() {
            let rejected: [&str; 4] = [
                "2025-08-12 10:57:09.548",
                "2025-08-12 10:57:09.548 (",
                "",
                "short",
            ];
            assert_all_rejected(&rejected);
        }

        #[test]
        fn invalid_timestamp() {
            assert!(!is_record_start_line(
                "2025-08-12 10:57:09,548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body"
            ));
        }

        #[test]
        fn format_errors() {
            let rejected: [&str; 3] = [
                // no space after the timestamp
                "2025-08-12 10:57:09.548(EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                // no opening parenthesis
                "2025-08-12 10:57:09.548 EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                // no closing parenthesis
                "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app body",
            ];
            assert_all_rejected(&rejected);
        }

        #[test]
        fn insufficient_fields() {
            // Five leading fields are required; this meta only carries four of them.
            assert!(!is_record_start_line(
                "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice) body"
            ));
        }

        #[test]
        fn wrong_field_order() {
            assert!(!is_record_start_line(
                "2025-08-12 10:57:09.548 (sess:123 EP[0] thrd:456 user:alice trxid:789 stmt:999 appname:app) body"
            ));
        }

        #[test]
        fn missing_required_fields() {
            // Only the first five fields are required: EP, sess, thrd, user, trxid.
            // Each case drops exactly one of them.
            let cases: [(&str, &str); 5] = [
                (
                    "2025-08-12 10:57:09.548 (sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                    "EP",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                    "sess",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 user:alice trxid:789 stmt:999 appname:app) body",
                    "thrd",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 trxid:789 stmt:999 appname:app) body",
                    "user",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice stmt:999 appname:app) body",
                    "trxid",
                ),
            ];
            for &(line, field) in cases.iter() {
                assert!(
                    !is_record_start_line(line),
                    "Should fail when missing {} field",
                    field
                );
            }
        }

        #[test]
        fn with_valid_ip() {
            assert!(is_record_start_line(
                "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app ip:::ffff:192.168.1.100) body"
            ));
        }

        #[test]
        fn with_invalid_ip_format() {
            // The ip field is malformed (`ip:` instead of `ip:::ffff:`), but the validator
            // only inspects the fields up to `trxid:`, so the line is still accepted.
            assert!(is_record_start_line(
                "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app ip:192.168.1.100) body"
            ));
        }

        #[test]
        fn complex_field_values() {
            assert!(is_record_start_line(
                "2025-08-12 10:57:09.548 (EP[123] sess:0xABCD1234 thrd:9999999 user:USER_WITH_UNDERSCORES trxid:12345678 stmt:0xFFFFFFFF appname:app-name-with-dashes ip:::ffff:10.20.30.40) SELECT * FROM table"
            ));
        }

        #[test]
        fn empty_appname() {
            assert!(is_record_start_line(
                "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:) body"
            ));
        }

        #[test]
        fn continuation_line() {
            assert!(!is_record_start_line("    SELECT * FROM users WHERE id = 1"));
        }

        #[test]
        fn double_space_in_meta() {
            // Since v0.1.3 the validation is strict about separators: fields must be
            // separated by exactly one space, so a doubled space is rejected.
            let doubled = "2025-08-12 10:57:09.548 (EP[0]  sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(!is_record_start_line(doubled));

            // The same line with single spaces is accepted.
            let single = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(is_record_start_line(single));
        }
    }
}