use memchr::memmem::Finder;
use memchr::{memchr, memrchr};
use std::fs;
use std::path::Path;
use std::path::PathBuf;
use std::str;
use std::sync::LazyLock;
use crate::error::ParseError;
use crate::sqllog;
use crate::sqllog::Sqllog;
use encoding::all::GB18030;
use encoding::{DecoderTrap, Encoding};
/// Lazily built substring searcher for `") "`, used to locate the end of the
/// parenthesised metadata section on a record's first line.
static FINDER_CLOSE_META: LazyLock<Finder<'static>> = LazyLock::new(|| Finder::new(b") "));
/// Lazily built substring searcher for `"\n20"`: a line break followed by the first
/// two bytes of a candidate timestamp. Matches are only candidates; the iterator
/// confirms each one with `is_timestamp_start` before treating it as a record boundary.
static FINDER_RECORD_START: LazyLock<Finder<'static>> = LazyLock::new(|| Finder::new(b"\n20"));
/// Tells the record parser how to turn raw log bytes into text.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
pub enum FileEncodingHint {
/// Default. Each field is tried as UTF-8 first, then strict GB18030, and finally
/// decoded as lossy UTF-8.
#[default]
Auto,
/// Treat the bytes as UTF-8.
Utf8,
/// Decode with strict GB18030, falling back to lossy UTF-8 when that fails.
Gb18030,
}
/// A log file held fully in memory, together with the encoding used to decode its records.
pub struct LogParser {
// Raw bytes of the whole file.
data: Vec<u8>,
// Encoding hint handed to every iterator created from this parser.
encoding: FileEncodingHint,
}
/// Builder that collects the file path and an optional encoding hint before the
/// file is read by `build`.
pub struct LogParserBuilder {
// Path of the log file to read.
path: PathBuf,
// Caller-supplied encoding; `None` means `build` picks one by inspecting the file.
encoding_hint: Option<FileEncodingHint>,
}
impl LogParserBuilder {
    /// Starts a builder for the log file at `path`. Nothing is read until [`build`](Self::build).
    pub fn new<P: AsRef<Path>>(path: P) -> Self {
        Self {
            path: path.as_ref().to_path_buf(),
            encoding_hint: None,
        }
    }

    /// Forces the encoding used to decode records, disabling auto-detection in `build`.
    pub fn encoding_hint(mut self, hint: FileEncodingHint) -> Self {
        self.encoding_hint = Some(hint);
        self
    }

    /// Reads the whole file into memory and settles on an encoding.
    ///
    /// When no hint was supplied, the encoding is sniffed from the first 64 KiB and
    /// the last 4 KiB of the file: both samples valid UTF-8 means `Utf8`, anything
    /// else means `Gb18030`.
    ///
    /// # Errors
    ///
    /// Returns `ParseError::IoError` carrying the I/O error message when the file
    /// cannot be read.
    pub fn build(self) -> Result<LogParser, ParseError> {
        let data = fs::read(&self.path).map_err(|e| ParseError::IoError(e.to_string()))?;
        let encoding = self
            .encoding_hint
            .unwrap_or_else(|| Self::detect_encoding(&data));
        Ok(LogParser { data, encoding })
    }

    /// Guesses the file encoding from a head and a tail sample of `data`.
    ///
    /// The samples are cut at fixed byte offsets, which may land in the middle of a
    /// multi-byte UTF-8 character. Such a cut must not count as "invalid UTF-8",
    /// otherwise a valid UTF-8 file larger than the sample would be misdetected as
    /// GB18030 depending on where the cut happens to fall.
    fn detect_encoding(data: &[u8]) -> FileEncodingHint {
        const HEAD_SAMPLE: usize = 64 * 1024;
        const TAIL_SAMPLE: usize = 4 * 1024;

        let head_size = data.len().min(HEAD_SAMPLE);
        let head_ok = match str::from_utf8(&data[..head_size]) {
            Ok(_) => true,
            // `error_len() == None` means the input merely ended inside a character.
            // That is only acceptable when the sample really is a truncated prefix.
            Err(err) => head_size < data.len() && err.error_len().is_none(),
        };

        let tail_start = data.len().saturating_sub(TAIL_SAMPLE).max(head_size);
        let tail = &data[tail_start..];
        // A non-empty tail always starts mid-file, so it may begin with the trailing
        // bytes (at most three) of a character that started before the cut.
        let leading_continuations = tail
            .iter()
            .take(3)
            .take_while(|&&byte| byte & 0xC0 == 0x80)
            .count();
        // An empty tail (file no larger than the head sample) validates trivially.
        let tail_ok = str::from_utf8(&tail[leading_continuations..]).is_ok();

        if head_ok && tail_ok {
            FileEncodingHint::Utf8
        } else {
            FileEncodingHint::Gb18030
        }
    }
}
impl LogParser {
    /// Returns an iterator over the records of the loaded file, starting at the
    /// first byte with the line counter at 1.
    pub fn iter(&self) -> LogIterator<'_> {
        let Self { data, encoding } = self;
        LogIterator {
            data: data.as_slice(),
            pos: 0,
            encoding: *encoding,
            line_number: 1,
        }
    }
}
/// Iterator over the records of a byte buffer; yields one parse result per record.
pub struct LogIterator<'a> {
// Entire input buffer.
data: &'a [u8],
// Byte offset into `data` where the next record begins.
pos: usize,
// Encoding hint forwarded to the record parser.
encoding: FileEncodingHint,
// Line number at `pos` (starts at 1); passed to the parser for error reporting.
line_number: u64,
}
impl<'a> LogIterator<'a> {
    /// Drops every record that failed to parse and yields the successful ones unwrapped.
    pub fn skip_errors(self) -> impl Iterator<Item = Sqllog> + 'a {
        self.filter_map(|outcome| outcome.ok())
    }

    /// Keeps only successfully parsed records whose `exectime` is at least `min_ms`.
    ///
    /// Parse errors are filtered out as well; surviving items stay wrapped in `Ok`.
    pub fn filter_by_exec_time(
        self,
        min_ms: u64,
    ) -> impl Iterator<Item = Result<Sqllog, ParseError>> + 'a {
        let threshold = min_ms as f32;
        self.filter(move |outcome| matches!(outcome, Ok(record) if record.exectime >= threshold))
    }

    /// Keeps only successfully parsed records whose SQL text contains `pattern`.
    ///
    /// Parse errors are filtered out as well; surviving items stay wrapped in `Ok`.
    pub fn filter_by_sql_contains(
        self,
        pattern: &str,
    ) -> impl Iterator<Item = Result<Sqllog, ParseError>> + 'a {
        // The closure outlives this call, so it needs its own copy of the pattern.
        let needle = pattern.to_string();
        self.filter(move |outcome| matches!(outcome, Ok(record) if record.sql.contains(&needle)))
    }
}
impl<'a> Iterator for LogIterator<'a> {
    type Item = Result<Sqllog, ParseError>;

    /// Cuts the next record out of the buffer and parses it.
    ///
    /// A record ends at a line break that is immediately followed by a timestamp;
    /// line breaks not followed by one belong to the record body. Empty records
    /// (blank lines) are skipped.
    fn next(&mut self) -> Option<Self::Item> {
        while self.pos < self.data.len() {
            let remaining = &self.data[self.pos..];
            let record_line = self.line_number;

            // True when a full 23-byte timestamp prefix starts at offset `at`.
            let has_timestamp_at = |at: usize| {
                at + 23 <= remaining.len() && is_timestamp_start(&remaining[at..at + 23])
            };
            let to_end = (remaining.len(), remaining.len());

            // (exclusive end of the record, number of bytes to advance)
            let (record_end, advance) = match memchr(b'\n', remaining) {
                None => to_end,
                // Fast path: single-line record, the very next line is a new record.
                Some(newline) if has_timestamp_at(newline + 1) => (newline, newline + 1),
                // Multi-line record: look for a later line break followed by a timestamp.
                Some(newline) => {
                    let search_from = newline + 1;
                    FINDER_RECORD_START
                        .find_iter(&remaining[search_from..])
                        .map(|relative| search_from + relative)
                        .find(|&boundary| has_timestamp_at(boundary + 1))
                        .map_or(to_end, |boundary| (boundary, boundary + 1))
                }
            };

            self.pos += advance;
            self.line_number += remaining[..advance]
                .iter()
                .filter(|&&byte| byte == b'\n')
                .count() as u64;

            // Drop the `\r` of a CRLF terminator.
            let raw = &remaining[..record_end];
            let record = raw.strip_suffix(b"\r").unwrap_or(raw);
            if record.is_empty() {
                continue;
            }

            return Some(parse_record_with_hint(record, self.encoding, record_line));
        }
        None
    }
}
/// Parses one complete record from raw bytes using automatic encoding detection.
///
/// No file position is known here, so errors carry a line number of 0.
pub fn parse_record(record_bytes: &[u8]) -> Result<Sqllog, ParseError> {
    const NO_LINE_NUMBER: u64 = 0;
    let encoding = FileEncodingHint::Auto;
    parse_record_with_hint(record_bytes, encoding, NO_LINE_NUMBER)
}
fn parse_record_with_hint(
record_bytes: &[u8],
encoding_hint: FileEncodingHint,
line_number: u64,
) -> Result<Sqllog, ParseError> {
let is_multiline = memchr(b'\n', record_bytes).is_some();
let first_line = if is_multiline {
match memchr(b'\n', record_bytes) {
Some(idx) => {
let mut line = &record_bytes[..idx];
if line.ends_with(b"\r") {
line = &line[..line.len() - 1];
}
line
}
None => {
let mut line = record_bytes;
if line.ends_with(b"\r") {
line = &line[..line.len() - 1];
}
line
}
}
} else {
let mut line = record_bytes;
if line.ends_with(b"\r") {
line = &line[..line.len() - 1];
}
line
};
if first_line.len() < 23 {
return Err(make_invalid_format_error(first_line, line_number));
}
let ts = match str::from_utf8(&first_line[0..23]) {
Ok(s) => s.to_string(),
Err(_) => return Err(make_invalid_format_error(first_line, line_number)),
};
let meta_start = match memchr(b'(', &first_line[23..]) {
Some(idx) => 23 + idx,
None => return Err(make_invalid_format_error(first_line, line_number)),
};
let meta_end = match FINDER_CLOSE_META.find(&first_line[meta_start..]) {
Some(idx) => Some(meta_start + idx),
None => memrchr(b')', &first_line[meta_start..]).map(|idx| meta_start + idx),
};
let meta_end = match meta_end {
Some(idx) => idx,
None => return Err(make_invalid_format_error(first_line, line_number)),
};
let meta_bytes = &first_line[meta_start + 1..meta_end];
let (ep, sess_id, thrd_id, username, trxid, statement, appname, client_ip) =
match encoding_hint {
FileEncodingHint::Utf8 => {
sqllog::parse_meta_from_bytes(meta_bytes)
}
FileEncodingHint::Auto => {
match str::from_utf8(meta_bytes) {
Ok(_) => sqllog::parse_meta_from_bytes(meta_bytes),
Err(_) => match GB18030.decode(meta_bytes, DecoderTrap::Strict) {
Ok(decoded) => sqllog::parse_meta_from_bytes(decoded.as_bytes()),
Err(_) => {
let lossy = String::from_utf8_lossy(meta_bytes).into_owned();
sqllog::parse_meta_from_bytes(lossy.as_bytes())
}
},
}
}
FileEncodingHint::Gb18030 => {
match GB18030.decode(meta_bytes, DecoderTrap::Strict) {
Ok(decoded) => sqllog::parse_meta_from_bytes(decoded.as_bytes()),
Err(_) => {
let lossy = String::from_utf8_lossy(meta_bytes).into_owned();
sqllog::parse_meta_from_bytes(lossy.as_bytes())
}
}
}
};
let body_start_in_first_line = meta_end + 1;
let content_start = if body_start_in_first_line < first_line.len()
&& first_line[body_start_in_first_line] == b' '
{
body_start_in_first_line + 1
} else {
body_start_in_first_line
};
let mut tag: Option<String> = None;
let content_slice = if content_start < record_bytes.len() {
let mut s = &record_bytes[content_start..];
if !s.is_empty()
&& s[0] == b'['
&& let Some(end_idx) = memchr(b']', s)
&& end_idx >= 1
{
let inner = &s[1..end_idx];
if !inner.contains(&b' ') && inner.len() <= 32 {
tag = match encoding_hint {
FileEncodingHint::Utf8 => {
str::from_utf8(inner).ok().map(|t| t.to_string())
}
FileEncodingHint::Auto => {
str::from_utf8(inner).ok().map(|t| t.to_string())
.or_else(|| {
GB18030.decode(inner, DecoderTrap::Strict)
.ok()
})
}
FileEncodingHint::Gb18030 => {
GB18030.decode(inner, DecoderTrap::Strict)
.ok()
.or_else(|| str::from_utf8(inner).ok().map(|s| s.to_string()))
}
};
s = &s[end_idx + 1..];
let mut skip = 0usize;
while skip < s.len() && s[skip].is_ascii_whitespace() {
skip += 1;
}
s = &s[skip..];
}
}
s
} else {
&[] as &[u8]
};
let split = sqllog::find_indicators_split(content_slice);
let body_bytes = &content_slice[..split];
let ind_bytes = &content_slice[split..];
let sql_raw = match encoding_hint {
FileEncodingHint::Utf8 => {
String::from_utf8_lossy(body_bytes).into_owned()
}
FileEncodingHint::Auto => {
match str::from_utf8(body_bytes) {
Ok(s) => s.to_string(),
Err(_) => match GB18030.decode(body_bytes, DecoderTrap::Strict) {
Ok(s) => s,
Err(_) => String::from_utf8_lossy(body_bytes).into_owned(),
},
}
}
FileEncodingHint::Gb18030 => {
match GB18030.decode(body_bytes, DecoderTrap::Strict) {
Ok(s) => s,
Err(_) => String::from_utf8_lossy(body_bytes).into_owned(),
}
}
};
let sql = if tag.as_deref() == Some("ORA") {
sql_raw.strip_prefix(": ").unwrap_or(&sql_raw).to_string()
} else {
sql_raw
};
let (exectime, rowcount, exec_id) = sqllog::parse_indicators_from_bytes(ind_bytes);
Ok(Sqllog {
ts,
tag,
ep,
sess_id,
thrd_id,
username,
trxid,
statement,
appname,
client_ip,
sql,
exectime,
rowcount,
exec_id,
})
}
// Bytes 0..8 read as a little-endian u64: selects positions 0, 1, 4 and 7.
const LO_MASK: u64 = 0xFF0000FF0000FFFF;
// Expected values at those positions: '2', '0', '-', '-'.
const LO_EXPECTED: u64 = 0x2D00002D00003032;
// Bytes 8..16 read as a little-endian u64: selects positions 10 and 13.
const HI_MASK: u64 = 0x0000FF0000FF0000;
// Expected values at those positions: ' ' and ':'.
const HI_EXPECTED: u64 = 0x00003A0000200000;

/// Cheap structural check for a `20YY-MM-DD hh:mm:ss.mmm` timestamp prefix.
///
/// Only the leading `20` and the separator positions are compared; the remaining
/// bytes are not validated. `bytes` must hold at least 23 bytes.
#[inline(always)]
fn is_timestamp_start(bytes: &[u8]) -> bool {
    debug_assert!(bytes.len() >= 23);

    // Loads eight bytes starting at `offset` as a little-endian word.
    let word_at = |offset: usize| -> u64 {
        let mut raw = [0u8; 8];
        raw.copy_from_slice(&bytes[offset..offset + 8]);
        u64::from_le_bytes(raw)
    };
    let lo = word_at(0);
    let hi = word_at(8);

    if lo & LO_MASK != LO_EXPECTED || hi & HI_MASK != HI_EXPECTED {
        return false;
    }
    // The last two separators lie beyond the two words and are checked one by one.
    bytes[16] == b':' && bytes[19] == b'.'
}
/// Builds the `InvalidFormat` error for a record whose first line could not be parsed.
///
/// The offending bytes are stored as lossily decoded UTF-8 alongside `line_number`.
/// Marked `#[cold]` because it only runs on the error path.
#[cold]
fn make_invalid_format_error(raw_bytes: &[u8], line_number: u64) -> ParseError {
    let raw = String::from_utf8_lossy(raw_bytes).into_owned();
    ParseError::InvalidFormat { raw, line_number }
}
// Unit tests for the timestamp detector and the builder.
#[cfg(test)]
mod tests {
use super::*;
// A well-formed timestamp is accepted.
#[test]
fn test_is_timestamp_start_valid() {
let ts = b"2025-11-17 16:09:41.123";
assert!(is_timestamp_start(ts));
}
// The prefix must start with "20".
#[test]
fn test_is_timestamp_start_wrong_year_prefix() {
let ts = b"1025-11-17 16:09:41.123";
assert!(!is_timestamp_start(ts));
}
// Position 4 must be '-'.
#[test]
fn test_is_timestamp_start_wrong_month_separator() {
let ts = b"2025X11-17 16:09:41.123";
assert!(!is_timestamp_start(ts));
}
// Position 16 must be ':'.
#[test]
fn test_is_timestamp_start_wrong_second_separator() {
let ts = b"2025-11-17 16:09X41.123";
assert!(!is_timestamp_start(ts));
}
// Position 19 must be '.'.
#[test]
fn test_is_timestamp_start_wrong_millis_separator() {
let ts = b"2025-11-17 16:09:41X123";
assert!(!is_timestamp_start(ts));
}
// The minimum accepted input length is exactly 23 bytes.
#[test]
fn test_is_timestamp_start_exactly_23_bytes() {
let ts = b"2025-11-17 16:09:41.123";
assert_eq!(ts.len(), 23);
assert!(is_timestamp_start(ts));
}
// Bytes after the 23-byte prefix are ignored.
#[test]
fn test_is_timestamp_start_trailing_garbage() {
let ts = b"2025-11-17 16:09:41.123extra_garbage_here";
assert!(is_timestamp_start(ts));
}
// End-to-end: a single-record file parsed with an explicit UTF-8 hint.
// Excluded under Miri.
#[cfg(not(miri))]
#[test]
fn test_builder_encoding_hint_utf8() {
use std::io::Write;
use tempfile::NamedTempFile;
let mut tmp = NamedTempFile::new().expect("tmp");
write!(
tmp,
"2025-11-17 16:09:41.123 (EP[0] sess:1 thrd:2 user:u trxid:3 stmt:4 appname:a) SELECT 1"
)
.unwrap();
// Flush to disk before the builder reads the file back.
tmp.as_file().sync_all().unwrap();
let parser = LogParserBuilder::new(tmp.path())
.encoding_hint(FileEncodingHint::Utf8)
.build()
.expect("build");
let record = parser.iter().next().unwrap().unwrap();
assert_eq!(record.ts, "2025-11-17 16:09:41.123");
assert!(record.sql.contains("SELECT 1"));
}
// A missing file must surface as `ParseError::IoError`.
// Excluded under Miri.
#[cfg(not(miri))]
#[test]
fn test_builder_file_not_found() {
let result = LogParserBuilder::new("/nonexistent/path.log").build();
assert!(result.is_err());
match result {
Err(ParseError::IoError(_)) => {}
_ => panic!("Expected IoError on nonexistent file"),
}
}
}