dm_database_parser_sqllog/
tools.rs

1//! 工具函数模块
2//!
3//! 提供了日志格式验证相关的工具函数,主要用于快速判断行是否为有效的记录起始行。
4
5use once_cell::sync::Lazy;
6
// Timestamp layout: `YYYY-MM-DD HH:MM:SS.mmm`.
const TIMESTAMP_LENGTH: usize = 23;
// Shortest candidate line: the timestamp plus the space and `(` that follow it.
const MIN_LINE_LENGTH: usize = TIMESTAMP_LENGTH + 2;
// Byte offsets inside the timestamp that must hold an ASCII digit.
const TIMESTAMP_DIGIT_POSITIONS: [usize; 17] =
    [0, 1, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 22];
// `(offset, byte)` pairs for the fixed separators inside the timestamp.
const TIMESTAMP_SEPARATOR_POSITIONS: [(usize, u8); 6] = [
    (4, b'-'),
    (7, b'-'),
    (10, b' '),
    (13, b':'),
    (16, b':'),
    (19, b'.'),
];

// Meta section layout.
// Offset of the first meta byte, i.e. the byte right after the opening `(`.
const META_START_INDEX: usize = TIMESTAMP_LENGTH + 2;
// Number of meta fields that must always be present.
const REQUIRED_META_FIELDS: usize = 7;
// Number of meta fields when the optional client IP field is present as well.
const META_WITH_IP_FIELDS: usize = REQUIRED_META_FIELDS + 1;
25
26// 使用 Lazy 静态初始化字段前缀数组,避免每次访问时创建
27static META_FIELD_PREFIXES: Lazy<[&'static str; 8]> = Lazy::new(|| {
28    [
29        "EP[",
30        "sess:",
31        "thrd:",
32        "user:",
33        "trxid:",
34        "stmt:",
35        "appname:",
36        "ip:::ffff:",
37    ]
38});
39
40// 预定义的字节常量,避免重复创建
41const SPACE_BYTE: u8 = b' ';
42const OPEN_PAREN_BYTE: u8 = b'(';
43const CLOSE_PAREN_CHAR: char = ')';
44
45/// 判断字节数组是否为有效的时间戳格式
46///
47/// 验证时间戳格式是否为 "YYYY-MM-DD HH:MM:SS.mmm"(恰好 23 字节)。
48///
49/// # 参数
50///
51/// * `bytes` - 要检查的字节数组
52///
53/// # 返回
54///
55/// 如果是有效的时间戳格式返回 `true`,否则返回 `false`
56///
57/// # 示例
58///
59/// ```
60/// use dm_database_parser_sqllog::tools::is_ts_millis_bytes;
61///
62/// let valid = b"2025-08-12 10:57:09.548";
63/// assert!(is_ts_millis_bytes(valid));
64///
65/// let invalid = b"2025-08-12";
66/// assert!(!is_ts_millis_bytes(invalid));
67/// ```
68#[inline(always)]
69pub fn is_ts_millis_bytes(bytes: &[u8]) -> bool {
70    if bytes.len() != TIMESTAMP_LENGTH {
71        return false;
72    }
73
74    // 检查分隔符位置
75    for &(pos, expected) in &TIMESTAMP_SEPARATOR_POSITIONS {
76        if bytes[pos] != expected {
77            return false;
78        }
79    }
80
81    // 检查数字位置
82    for &i in &TIMESTAMP_DIGIT_POSITIONS {
83        if !bytes[i].is_ascii_digit() {
84            return false;
85        }
86    }
87
88    true
89}
90
91/// 判断一行日志是否为记录起始行
92///
93/// 这是一个高性能的验证函数,用于快速判断一行文本是否为有效的日志记录起始行。
94///
95/// # 判断标准
96///
97/// 1. 行首 23 字节符合时间戳格式 `YYYY-MM-DD HH:mm:ss.SSS`
98/// 2. 时间戳后紧跟一个空格,然后是 meta 部分
99/// 3. Meta 部分用小括号包含
100/// 4. Meta 部分必须包含所有必需字段(client_ip 可选)
101/// 5. Meta 字段间以一个空格分隔
102/// 6. Meta 字段顺序固定:ep → sess → thrd_id → username → trxid → statement → appname → client_ip(可选)
103///
104/// # 参数
105///
106/// * `line` - 要检查的行
107///
108/// # 返回
109///
110/// 如果是有效的记录起始行返回 `true`,否则返回 `false`
111///
112/// # 示例
113///
114/// ```
115/// use dm_database_parser_sqllog::tools::is_record_start_line;
116///
117/// let valid = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) SELECT 1";
118/// assert!(is_record_start_line(valid));
119///
120/// let invalid = "This is not a log line";
121/// assert!(!is_record_start_line(invalid));
122/// ```
123/// 7. meta 部分结束后紧跟一个空格,然后是 body 部分。
124pub fn is_record_start_line(line: &str) -> bool {
125    let bytes = line.as_bytes();
126    if bytes.len() < MIN_LINE_LENGTH {
127        return false;
128    }
129
130    // 检查时间戳部分
131    if !is_ts_millis_bytes(&bytes[0..TIMESTAMP_LENGTH]) {
132        return false;
133    }
134
135    // 检查时间戳后面的空格和括号
136    if bytes[23] != SPACE_BYTE || bytes[24] != OPEN_PAREN_BYTE {
137        return false;
138    }
139
140    // 查找并检查 meta 部分的右括号
141    let closing_paren_index = match line.find(CLOSE_PAREN_CHAR) {
142        Some(index) => index,
143        None => return false,
144    };
145
146    // 解析并验证 meta 字段 - 使用单次迭代验证所有字段
147    let meta_part = &line[META_START_INDEX..closing_paren_index];
148
149    // 创建迭代器并验证字段数量和内容
150    let mut split_iter = meta_part.split(' ');
151    let mut field_count = 0;
152
153    // 验证前 7 个必需字段
154    for prefix in META_FIELD_PREFIXES.iter().take(REQUIRED_META_FIELDS) {
155        match split_iter.next() {
156            Some(field) if field.contains(prefix) => {
157                field_count += 1;
158            }
159            _ => return false,
160        }
161    }
162
163    // 检查可选的 IP 字段
164    if let Some(ip_field) = split_iter.next() {
165        if !ip_field.contains(META_FIELD_PREFIXES[REQUIRED_META_FIELDS]) {
166            return false;
167        }
168        field_count += 1;
169
170        // 不应该有更多字段
171        if split_iter.next().is_some() {
172            return false;
173        }
174    }
175
176    // 字段数量必须是 7 或 8
177    field_count == REQUIRED_META_FIELDS || field_count == META_WITH_IP_FIELDS
178}
179
#[cfg(test)]
mod tests {
    use super::*;

    // Tests for `is_ts_millis_bytes`.
    mod timestamp_tests {
        use super::*;

        // Well-formed timestamps are accepted.
        #[test]
        fn valid_timestamps() {
            let valid_cases: &[&[u8]] = &[
                b"2024-06-15 12:34:56.789",
                b"2000-01-01 00:00:00.000",
                b"2099-12-31 23:59:59.999",
                b"2024-02-29 12:34:56.789", // leap year
            ];
            for ts in valid_cases {
                assert!(is_ts_millis_bytes(ts), "Failed for: {:?}", ts);
            }
        }

        // Inputs that are shorter or longer than 23 bytes are rejected.
        #[test]
        fn wrong_length() {
            let invalid_cases: &[&[u8]] = &[
                b"2024-06-15 12:34:56",
                b"2024-06-15 12:34:56.7",
                b"2024-06-15 12:34:56.7890",
                b"",
                b"2024",
            ];
            for ts in invalid_cases {
                assert!(!is_ts_millis_bytes(ts), "Should fail for: {:?}", ts);
            }
        }

        // Inputs of the right length but with a wrong separator are rejected.
        #[test]
        fn wrong_separator() {
            let invalid_cases: &[&[u8]] = &[
                b"2024-06-15 12:34:56,789", // comma instead of dot
                b"2024/06/15 12:34:56.789", // slashes instead of dashes
                b"2024-06-15T12:34:56.789", // `T` instead of space
                b"2024-06-15-12:34:56.789", // dash instead of space
                b"2024-06-15 12-34-56.789", // dashes instead of colons
            ];
            for ts in invalid_cases {
                assert!(!is_ts_millis_bytes(ts), "Should fail for: {:?}", ts);
            }
        }

        // A non-digit in any numeric component is rejected.
        #[test]
        fn non_digits() {
            let invalid_cases: &[&[u8]] = &[
                b"202a-06-15 12:34:56.789",
                b"2024-0b-15 12:34:56.789",
                b"2024-06-1c 12:34:56.789",
                b"2024-06-15 1d:34:56.789",
                b"2024-06-15 12:3e:56.789",
                b"2024-06-15 12:34:5f.789",
                b"2024-06-15 12:34:56.78g",
            ];
            for ts in invalid_cases {
                assert!(!is_ts_millis_bytes(ts), "Should fail for: {:?}", ts);
            }
        }

        // NUL bytes in digit positions, or a leading NUL that makes the input
        // 24 bytes long, are rejected.
        #[test]
        fn special_chars() {
            assert!(!is_ts_millis_bytes(b"2024-06-15 12:34:56.\x00\x00\x00"));
            assert!(!is_ts_millis_bytes(b"\x002024-06-15 12:34:56.789"));
        }
    }

    // Tests for `is_record_start_line`.
    mod record_start_line_tests {
        use super::*;

        // A full line with all eight meta fields (including the IP field) and a body.
        #[test]
        fn valid_complete_line() {
            let line = "2025-08-12 10:57:09.548 (EP[0] sess:0x178ebca0 thrd:757455 user:HBTCOMS_V3_PROD trxid:0 stmt:0x285eb060 appname: ip:::ffff:10.3.100.68) [SEL] select 1 from dual EXECTIME: 0(ms) ROWCOUNT: 1(rows) EXEC_ID: 289655178.";
            assert!(is_record_start_line(line));
        }

        // The IP field is optional.
        #[test]
        fn valid_without_ip() {
            let line = "2025-08-12 10:57:09.548 (EP[0] sess:0x178ebca0 thrd:757455 user:HBTCOMS_V3_PROD trxid:0 stmt:0x285eb060 appname:) [SEL] select 1 from dual";
            assert!(is_record_start_line(line));
        }

        // Seven required fields with short values.
        #[test]
        fn minimal_valid() {
            let line = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(is_record_start_line(line));
        }

        // Lines that end before a complete meta section can appear are rejected.
        #[test]
        fn too_short() {
            let short_lines = [
                "2025-08-12 10:57:09.548",
                "2025-08-12 10:57:09.548 (",
                "",
                "short",
            ];
            for line in &short_lines {
                assert!(!is_record_start_line(line), "Should fail for: {}", line);
            }
        }

        // A malformed timestamp (comma before the milliseconds) rejects the line.
        #[test]
        fn invalid_timestamp() {
            let line = "2025-08-12 10:57:09,548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(!is_record_start_line(line));
        }

        // Structural errors around the meta section are rejected.
        #[test]
        fn format_errors() {
            let invalid_lines = [
                "2025-08-12 10:57:09.548(EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body", // no space
                "2025-08-12 10:57:09.548 EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body", // no opening parenthesis
                "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app body", // no closing parenthesis
            ];
            for line in &invalid_lines {
                assert!(!is_record_start_line(line), "Should fail for: {}", line);
            }
        }

        // Fewer than seven meta fields is rejected.
        #[test]
        fn insufficient_fields() {
            let line =
                "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789) body";
            assert!(!is_record_start_line(line));
        }

        // Fields must appear in the fixed order.
        #[test]
        fn wrong_field_order() {
            let line = "2025-08-12 10:57:09.548 (sess:123 EP[0] thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(!is_record_start_line(line));
        }

        // Dropping any single required field rejects the line.
        // Each case pairs a line with the name of the field it omits.
        #[test]
        fn missing_required_fields() {
            let test_cases = [
                (
                    "2025-08-12 10:57:09.548 (sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                    "EP",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] thrd:456 user:alice trxid:789 stmt:999 appname:app) body",
                    "sess",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 user:alice trxid:789 stmt:999 appname:app) body",
                    "thrd",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 trxid:789 stmt:999 appname:app) body",
                    "user",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice stmt:999 appname:app) body",
                    "trxid",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 appname:app) body",
                    "stmt",
                ),
                (
                    "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999) body",
                    "appname",
                ),
            ];
            for (line, field) in &test_cases {
                assert!(
                    !is_record_start_line(line),
                    "Should fail when missing {} field",
                    field
                );
            }
        }

        // An IP field with the `ip:::ffff:` prefix is accepted.
        #[test]
        fn with_valid_ip() {
            let line = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app ip:::ffff:192.168.1.100) body";
            assert!(is_record_start_line(line));
        }

        // An IP field without the `ip:::ffff:` prefix is rejected.
        #[test]
        fn with_invalid_ip_format() {
            let line = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app ip:192.168.1.100) body";
            assert!(!is_record_start_line(line));
        }

        // Longer field values (hex, underscores, dashes) are accepted.
        #[test]
        fn complex_field_values() {
            let line = "2025-08-12 10:57:09.548 (EP[123] sess:0xABCD1234 thrd:9999999 user:USER_WITH_UNDERSCORES trxid:12345678 stmt:0xFFFFFFFF appname:app-name-with-dashes ip:::ffff:10.20.30.40) SELECT * FROM table";
            assert!(is_record_start_line(line));
        }

        // An `appname:` field with an empty value is accepted.
        #[test]
        fn empty_appname() {
            let line = "2025-08-12 10:57:09.548 (EP[0] sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:) body";
            assert!(is_record_start_line(line));
        }

        // A continuation line (no timestamp) is not a record start.
        #[test]
        fn continuation_line() {
            let continuation = "    SELECT * FROM users WHERE id = 1";
            assert!(!is_record_start_line(continuation));
        }

        // Two spaces between meta fields are rejected.
        #[test]
        fn double_space_in_meta() {
            let line = "2025-08-12 10:57:09.548 (EP[0]  sess:123 thrd:456 user:alice trxid:789 stmt:999 appname:app) body";
            assert!(!is_record_start_line(line));
        }
    }
}