dm-database-parser-sqllog 1.0.0

一个高性能的达梦数据库 sqllog 日志解析库,提供零分配或低分配的记录切分与解析功能
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
use memchr::memmem::Finder;
use memchr::{memchr, memrchr};
#[cfg(unix)]
use memmap2::Advice;
use memmap2::Mmap;
use simdutf8::basic::from_utf8 as simd_from_utf8;
use std::borrow::Cow;
use std::fs::File;
use std::path::Path;
use std::sync::LazyLock;

use crate::error::ParseError;
use crate::sqllog::Sqllog;
use encoding::all::GB18030;
use encoding::{DecoderTrap, Encoding};

/// Pre-built SIMD searcher for the `") "` meta-close pattern.
/// Avoids rebuilding the Finder on every record parse.
static FINDER_CLOSE_META: LazyLock<Finder<'static>> = LazyLock::new(|| Finder::new(b") "));

/// Pre-built SIMD searcher for the `"\n20"` record-start pattern.
/// Shared across threads via LazyLock; constructed once on first use.
static FINDER_RECORD_START: LazyLock<Finder<'static>> = LazyLock::new(|| Finder::new(b"\n20"));

#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
pub(crate) enum FileEncodingHint {
    #[default]
    Auto,
    Utf8,
    Gb18030,
}

pub struct LogParser {
    mmap: Mmap,
    encoding: FileEncodingHint,
}

/// 记录起始字节偏移列表,由 `LogParser::index()` 一次性构建。
/// 每个元素是某条记录在内存映射缓冲区内的绝对字节偏移。
/// 用于两阶段并行扫描:先建索引,再按记录数均匀分区给多线程。
pub struct RecordIndex {
    pub(crate) offsets: Vec<usize>,
}

impl RecordIndex {
    /// 记录总数
    pub fn len(&self) -> usize {
        self.offsets.len()
    }

    /// 是否为空(文件不含任何完整记录)
    pub fn is_empty(&self) -> bool {
        self.offsets.is_empty()
    }
}

impl LogParser {
    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, ParseError> {
        let file = File::open(path).map_err(|e| ParseError::IoError(e.to_string()))?;
        let mmap = unsafe { Mmap::map(&file).map_err(|e| ParseError::IoError(e.to_string()))? };

        // HOT-04: 告知 OS 以顺序模式预读 mmap 页面,减少 page fault 开销
        // Unix-only;Windows 上 advise() 方法不存在,cfg 门控跳过
        // 失败(如内核不支持)静默忽略,不影响正确性
        #[cfg(unix)]
        let _ = mmap.advise(Advice::Sequential);

        // Detect encoding by sampling the first 64 KB and the last 4 KB.
        // Sampling both ends catches the rare case where GB18030 content only
        // appears after the initial UTF-8 section (e.g. late-joined non-ASCII
        // usernames), while keeping the cost well below a full-file scan.
        //
        // Known limitation: the middle of large files (> ~68 KB) is not sampled.
        // GB18030 multi-byte sequences that appear only in the middle of a very
        // large log file may cause the file to be misclassified as UTF-8, leading
        // to garbled output for those records. In practice DM log files either use
        // GB18030 throughout or are entirely ASCII-safe UTF-8, so this edge case
        // is unlikely. A full middle-window sample could be added if misclassification
        // is observed in production.
        let head_size = mmap.len().min(64 * 1024);
        let tail_start = mmap.len().saturating_sub(4 * 1024).max(head_size);
        let head_ok = simd_from_utf8(&mmap[..head_size]).is_ok();
        let tail_ok = tail_start >= mmap.len() || simd_from_utf8(&mmap[tail_start..]).is_ok();
        let encoding = if head_ok && tail_ok {
            FileEncodingHint::Utf8
        } else {
            FileEncodingHint::Gb18030
        };

        Ok(Self { mmap, encoding })
    }

    pub fn iter(&self) -> LogIterator<'_> {
        LogIterator {
            data: &self.mmap,
            pos: 0,
            encoding: self.encoding,
        }
    }

    /// 两阶段扫描第一阶段:构建记录起始字节偏移索引。
    /// 单线程扫描整个文件,返回的 `RecordIndex` 可直接用于并行处理阶段。
    pub fn index(&self) -> RecordIndex {
        let data: &[u8] = &self.mmap;
        let mut offsets: Vec<usize> = Vec::new();

        // 第 0 条记录:仅当文件首字节即是时间戳时才单独 push
        // (find_next_record_start 会先跳过首行,所以首行就是时间戳的情况需要单独处理)
        if data.len() >= 23 && is_timestamp_start(&data[0..23]) {
            offsets.push(0);
        }

        let mut pos: usize = 0;
        loop {
            let next = find_next_record_start(data, pos);
            if next >= data.len() {
                break;
            }
            // 防止与首条记录重复 push(首字节即是时间戳的边界情况)
            if offsets.last() != Some(&next) {
                offsets.push(next);
            }
            // Pitfall 1: pos 必须前进至少 1,否则 find_next_record_start
            // 在首行就是时间戳时会返回同一个 next,无限循环
            pos = next.saturating_add(1);
        }
        RecordIndex { offsets }
    }

    /// Returns a Rayon parallel iterator over all log records.
    ///
    /// Large files (≥ PAR_THRESHOLD = 32 MB) are split into N byte-aligned chunks
    /// at record boundaries — O(threads) overhead, not O(records). Small files use a
    /// single partition so Rayon executes single-threaded without scheduling cost
    /// (PAR-03 semantics). `index()` is intentionally not called here: a full
    /// sequential pre-scan would double I/O on mmap'd, I/O-bound workloads.
    pub fn par_iter(
        &self,
    ) -> impl rayon::iter::ParallelIterator<Item = Result<Sqllog<'_>, ParseError>> + '_ {
        use rayon::prelude::*;

        const PAR_THRESHOLD: usize = 32 * 1024 * 1024;

        let data: &[u8] = &self.mmap;
        let encoding = self.encoding;

        let bounds: Vec<(usize, usize)> = if data.is_empty() {
            Vec::new()
        } else if data.len() < PAR_THRESHOLD {
            vec![(0, data.len())]
        } else {
            let num_threads = rayon::current_num_threads().max(1);
            let chunk_size = (data.len() / num_threads).max(1);
            let mut starts: Vec<usize> = vec![0];
            for i in 1..num_threads {
                let boundary = find_next_record_start(data, i * chunk_size);
                if boundary < data.len() {
                    starts.push(boundary);
                }
            }
            starts.push(data.len());
            starts.dedup();
            starts.windows(2).map(|w| (w[0], w[1])).collect()
        };

        bounds
            .into_par_iter()
            .flat_map_iter(move |(start, end)| LogIterator {
                data: &data[start..end],
                pos: 0,
                encoding,
            })
    }
}

pub struct LogIterator<'a> {
    data: &'a [u8],
    pos: usize,
    encoding: FileEncodingHint,
}

impl<'a> Iterator for LogIterator<'a> {
    type Item = Result<Sqllog<'a>, ParseError>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if self.pos >= self.data.len() {
                return None;
            }

            let data = &self.data[self.pos..];

            // 快速路径:先用 memchr 找第一个 '\n',若下一行即是时间戳则为单行记录
            // 慢速路径(多行):用 FINDER_RECORD_START.find_iter 跳过嵌入换行
            let (record_end, next_start, is_multiline) = match memchr(b'\n', data) {
                None => (data.len(), data.len(), false),
                Some(first_nl) => {
                    let ts_start = first_nl + 1;
                    if ts_start + 23 <= data.len()
                        && is_timestamp_start(&data[ts_start..ts_start + 23])
                    {
                        // 单行记录:边界就是第一个 '\n'
                        (first_nl, ts_start, false)
                    } else {
                        // 多行记录:用 memmem 跳过嵌入换行继续搜索
                        // ALGO-01: find_iter 替代逐行 while-memchr 循环
                        let mut found_boundary: Option<usize> = None;
                        for candidate in FINDER_RECORD_START.find_iter(&data[ts_start..]) {
                            let abs_ts = ts_start + candidate + 1;
                            if abs_ts + 23 <= data.len()
                                && is_timestamp_start(&data[abs_ts..abs_ts + 23])
                            {
                                found_boundary = Some(ts_start + candidate);
                                break;
                            }
                        }
                        match found_boundary {
                            Some(idx) => (idx, idx + 1, true),
                            None => (data.len(), data.len(), true),
                        }
                    }
                }
            };

            let record_slice = &data[..record_end];
            self.pos += next_start;

            // Trim trailing CR if present
            let record_slice = if record_slice.ends_with(b"\r") {
                &record_slice[..record_slice.len() - 1]
            } else {
                record_slice
            };

            // Skip empty slices iteratively instead of recursing to avoid stack overflow
            // when the file contains many consecutive blank lines.
            if record_slice.is_empty() {
                continue;
            }

            return Some(parse_record_with_hint(
                record_slice,
                is_multiline,
                self.encoding,
            ));
        }
    }
}

/// Find the position of the next record start at or after `from`.
/// A record start is a line beginning with a timestamp pattern.
fn find_next_record_start(data: &[u8], from: usize) -> usize {
    let mut pos = from;
    // Skip to start of next line
    if let Some(nl) = memchr(b'\n', &data[pos..]) {
        pos += nl + 1;
    } else {
        return data.len();
    }
    // 先检查 pos 本身是否是时间戳行(Finder 不会命中无前置 '\n' 的行首)
    if pos + 23 <= data.len() && is_timestamp_start(&data[pos..pos + 23]) {
        return pos;
    }

    // ALGO-01: memmem 单次扫描替代逐行 memchr loop
    for candidate in FINDER_RECORD_START.find_iter(&data[pos..]) {
        let ts_start = pos + candidate + 1;
        if ts_start + 23 <= data.len() && is_timestamp_start(&data[ts_start..ts_start + 23]) {
            return ts_start;
        }
    }
    data.len()
}

pub fn parse_record<'a>(record_bytes: &'a [u8]) -> Result<Sqllog<'a>, ParseError> {
    // Auto-detect multiline: inspect whether the bytes actually contain a newline
    // rather than hardcoding true, which caused a redundant memchr scan for
    // single-line records and was semantically misleading.
    let is_multiline = memchr(b'\n', record_bytes).is_some();
    parse_record_with_hint(record_bytes, is_multiline, FileEncodingHint::Auto)
}

fn parse_record_with_hint<'a>(
    record_bytes: &'a [u8],
    is_multiline: bool,
    encoding_hint: FileEncodingHint,
) -> Result<Sqllog<'a>, ParseError> {
    // Find end of first line
    let (first_line, _rest) = if is_multiline {
        match memchr(b'\n', record_bytes) {
            Some(idx) => {
                let mut line = &record_bytes[..idx];
                if line.ends_with(b"\r") {
                    line = &line[..line.len() - 1];
                }
                (line, &record_bytes[idx + 1..])
            }
            None => {
                let mut line = record_bytes;
                if line.ends_with(b"\r") {
                    line = &line[..line.len() - 1];
                }
                (line, &[] as &[u8])
            }
        }
    } else {
        let mut line = record_bytes;
        if line.ends_with(b"\r") {
            line = &line[..line.len() - 1];
        }
        (line, &[] as &[u8])
    };

    // 1. Timestamp
    if first_line.len() < 23 {
        return Err(make_invalid_format_error(first_line));
    }
    // We assume ASCII/UTF-8 for timestamp
    // SAFETY: We validated the timestamp format in LogIterator::next using is_ts_millis_bytes,
    // which ensures it contains only digits and separators.
    let ts = unsafe { Cow::Borrowed(std::str::from_utf8_unchecked(&first_line[0..23])) };

    // 2. Meta
    // Format: TS (META) BODY
    // Find first '(' after TS
    let meta_start = match memchr(b'(', &first_line[23..]) {
        Some(idx) => 23 + idx,
        None => {
            return Err(make_invalid_format_error(first_line));
        }
    };

    // Find closing ')' for meta using pre-built SIMD Finder.
    let meta_end = match FINDER_CLOSE_META.find(&first_line[meta_start..]) {
        Some(idx) => Some(meta_start + idx),
        None => memrchr(b')', &first_line[meta_start..]).map(|idx| meta_start + idx),
    };

    let meta_end = match meta_end {
        Some(idx) => idx,
        None => {
            return Err(make_invalid_format_error(first_line));
        }
    };

    let meta_bytes = &first_line[meta_start + 1..meta_end];
    // Lazy parsing: store raw bytes as a Cow<'a, str>.
    // For Utf8 / Auto-UTF8 encoding: meta_bytes is a sub-slice of the memory-mapped buffer
    // (raw UTF-8 bytes) that lives for 'a — borrowing is sound.
    // For Gb18030 / Auto-GB18030 encoding: GB18030.decode() produces a new owned String, so
    // meta_raw becomes Cow::Owned; the 'a lifetime is NOT extended to that allocation.
    let meta_raw = match encoding_hint {
        FileEncodingHint::Utf8 => {
            // File already validated as UTF-8 during `from_path`; skip per-slice re-validation.
            // SAFETY: meta_bytes is a sub-slice of record_bytes which lives for 'a.
            // No lifetime extension via from_raw_parts needed — meta_bytes already carries 'a.
            unsafe { Cow::Borrowed(std::str::from_utf8_unchecked(meta_bytes)) }
        }
        FileEncodingHint::Gb18030 => match GB18030.decode(meta_bytes, DecoderTrap::Strict) {
            Ok(s) => Cow::Owned(s),
            Err(_) => Cow::Owned(String::from_utf8_lossy(meta_bytes).into_owned()),
        },
        FileEncodingHint::Auto => match simd_from_utf8(meta_bytes) {
            Ok(_) => {
                // SAFETY: meta_bytes is a sub-slice of record_bytes which lives for 'a;
                // simd_from_utf8 confirmed it is valid UTF-8.
                unsafe { Cow::Borrowed(std::str::from_utf8_unchecked(meta_bytes)) }
            }
            Err(_) => match GB18030.decode(meta_bytes, DecoderTrap::Strict) {
                Ok(s) => Cow::Owned(s),
                Err(_) => Cow::Owned(String::from_utf8_lossy(meta_bytes).into_owned()),
            },
        },
    };

    // 3. Body & 4. Indicators
    let body_start_in_first_line = meta_end + 1;

    // The ") " pattern guarantees one space; skip it directly.
    let content_start = if body_start_in_first_line < first_line.len()
        && first_line[body_start_in_first_line] == b' '
    {
        body_start_in_first_line + 1
    } else {
        body_start_in_first_line
    };

    // Extract optional leading tag like [SEL] or [ORA]
    let mut tag: Option<Cow<'a, str>> = None;
    let content_slice = if content_start < record_bytes.len() {
        let mut s = &record_bytes[content_start..];
        // If it starts with '[', try to find matching ']' and treat inner token as tag
        if !s.is_empty()
            && s[0] == b'['
            && let Some(end_idx) = memchr(b']', s)
            && end_idx >= 1
        {
            let inner = &s[1..end_idx];
            // Accept token without spaces and reasonable length
            if !inner.contains(&b' ') && inner.len() <= 32 {
                tag = match encoding_hint {
                    FileEncodingHint::Utf8 => {
                        // File already validated as UTF-8; skip re-validation.
                        // SAFETY: inner is a sub-slice of record_bytes which lives for 'a.
                        // No from_raw_parts needed — inner already carries 'a lifetime.
                        Some(unsafe { Cow::Borrowed(std::str::from_utf8_unchecked(inner)) })
                    }
                    _ => match simd_from_utf8(inner) {
                        Ok(_) => Some(unsafe {
                            // SAFETY: inner is a sub-slice of record_bytes which lives for 'a;
                            // simd_from_utf8 confirmed it is valid UTF-8.
                            Cow::Borrowed(std::str::from_utf8_unchecked(inner))
                        }),
                        Err(_) => match encoding_hint {
                            FileEncodingHint::Gb18030 => {
                                match GB18030.decode(inner, DecoderTrap::Strict) {
                                    Ok(s) => Some(Cow::Owned(s)),
                                    Err(_) => Some(Cow::Owned(
                                        String::from_utf8_lossy(inner).into_owned(),
                                    )),
                                }
                            }
                            _ => Some(Cow::Owned(String::from_utf8_lossy(inner).into_owned())),
                        },
                    },
                };
                // Move past the closing ']' and any following ASCII whitespace
                s = &s[end_idx + 1..];
                let mut skip = 0usize;
                while skip < s.len() && s[skip].is_ascii_whitespace() {
                    skip += 1;
                }
                s = &s[skip..];
            }
        }
        s
    } else {
        &[] as &[u8]
    };

    let content_raw = Cow::Borrowed(content_slice);

    Ok(Sqllog {
        ts,
        meta_raw,
        content_raw,
        tag,
        encoding: encoding_hint,
    })
}

// u64 掩码常量:验证时间戳格式 "20YY-MM-DD HH:MM:SS.mmm"
// 字节位置:0('2'), 1('0'), 4('-'), 7('-'), 10(' '), 13(':'), 16(':'), 19('.')
const LO_MASK: u64 = 0xFF0000FF0000FFFF; // data[0..8]:位置 0,1,4,7
const LO_EXPECTED: u64 = 0x2D00002D00003032; // LE: '2'=0x32,'0'=0x30,'-'=0x2D,'-'=0x2D
const HI_MASK: u64 = 0x0000FF0000FF0000; // data[8..16]:位置 10,13(偏移 2,5)
const HI_EXPECTED: u64 = 0x00003A0000200000; // LE: ' '=0x20,':'=0x3A

/// 检查 bytes[0..23] 是否符合时间戳格式 "20YY-MM-DD HH:MM:SS.mmm"。
/// 调用前需确保 bytes.len() >= 23(由调用方做长度检查)。
#[inline(always)]
fn is_timestamp_start(bytes: &[u8]) -> bool {
    debug_assert!(bytes.len() >= 23);
    let lo = u64::from_le_bytes(bytes[0..8].try_into().unwrap());
    let hi = u64::from_le_bytes(bytes[8..16].try_into().unwrap());
    // 位置 16(':') 和 19('.') 用两次单字节比较(比第三次 u64 load 更清晰)
    (lo & LO_MASK == LO_EXPECTED)
        && (hi & HI_MASK == HI_EXPECTED)
        && bytes[16] == b':'
        && bytes[19] == b'.'
}

/// 将原始字节转换为 InvalidFormat 错误(错误路径,标注 cold 避免影响热路径代码布局)
#[cold]
fn make_invalid_format_error(raw_bytes: &[u8]) -> ParseError {
    ParseError::InvalidFormat {
        raw: String::from_utf8_lossy(raw_bytes).to_string(),
    }
}