/// Collapses SQL whitespace outside of quoted and commented regions.
///
/// The input is walked byte by byte. Line comments (`--`), block comments
/// (`/*`), `"`- and `` ` ``-quoted identifiers, `'`-quoted strings and tokens
/// starting with `$` are handed to the matching `scan_*` helper, which receives
/// `out` and returns the index at which scanning resumes. Everywhere else, each
/// run of spaces, tabs, line feeds and carriage returns is emitted as a single
/// `' '`, and leading whitespace is dropped entirely.
///
/// Finally every trailing `' '` is removed from the result, regardless of
/// whether it was produced here or appended by one of the helpers.
pub(crate) fn compact_whitespace(sql: &str) -> String {
    let bytes = sql.as_bytes();
    let mut out = String::with_capacity(sql.len());
    let mut pos = 0;
    // Starts `true` so that whitespace at the very beginning emits nothing.
    let mut suppress_space = true;

    while let Some(&byte) = bytes.get(pos) {
        let lookahead = bytes.get(pos + 1).copied();
        (pos, suppress_space) = match (byte, lookahead) {
            // A line comment suppresses the whitespace that follows it.
            (b'-', Some(b'-')) => (scan_line_comment(sql, bytes, pos, &mut out), true),
            (b'/', Some(b'*')) => (scan_block_comment(sql, bytes, pos, &mut out), false),
            (b'"' | b'`', _) => (
                scan_quoted_identifier(sql, bytes, pos, byte, &mut out),
                false,
            ),
            (b'\'', _) => (scan_string_literal(sql, bytes, pos, &mut out), false),
            (b'$', _) => (scan_dollar(sql, bytes, pos, &mut out), false),
            (b' ' | b'\t' | b'\n' | b'\r', _) => {
                // Only the first whitespace byte of a run produces a space.
                if !suppress_space {
                    out.push(' ');
                }
                (pos + 1, true)
            }
            _ => {
                // Copy one whole character so multi-byte UTF-8 stays intact.
                let ch = sql[pos..].chars().next().expect("i < bytes.len()");
                out.push(ch);
                (pos + ch.len_utf8(), false)
            }
        };
    }

    let kept = out.trim_end_matches(' ').len();
    out.truncate(kept);
    out
}
/// Copies a `--` line comment beginning at `start` into `out` unchanged.
///
/// The copied span runs from `start` through the first `\n` found at or after
/// `start + 2` (the newline is included), or to the end of the input when no
/// newline follows. Returns the index of the first byte after the copied span.
///
/// `bytes` must be the byte view of `sql`; the two opening bytes are not
/// re-checked here.
fn scan_line_comment(sql: &str, bytes: &[u8], start: usize, out: &mut String) -> usize {
    let body = start + 2;
    let end = match bytes[body..].iter().position(|&b| b == b'\n') {
        // `+ 1` keeps the terminating newline inside the copied span.
        Some(offset) => body + offset + 1,
        None => bytes.len(),
    };
    out.push_str(&sql[start..end]);
    end
}
/// Copies a `/* ... */` block comment beginning at `start` into `out` unchanged.
///
/// The search for the closing `*/` begins at `start + 2`, so the `*` of the
/// opener can never double as the `*` of the terminator (`/*/` is unterminated).
/// The first `*/` found ends the comment; nested openers are not tracked. When
/// no terminator exists the rest of the input is copied.
///
/// Returns the index of the first byte after the copied span. `bytes` must be
/// the byte view of `sql`.
fn scan_block_comment(sql: &str, bytes: &[u8], start: usize, out: &mut String) -> usize {
    const OPENER_LEN: usize = 2;
    const CLOSER_LEN: usize = 2;

    let end = bytes
        .get(start + OPENER_LEN..)
        .and_then(|rest| {
            rest.windows(CLOSER_LEN)
                .position(|pair| matches!(pair, [b'*', b'/']))
        })
        // Unterminated comment: consume everything up to the end of the input.
        .map_or(bytes.len(), |offset| start + OPENER_LEN + offset + CLOSER_LEN);
    out.push_str(&sql[start..end]);
    end
}
/// Copies a quoted identifier beginning at `start` into `out` unchanged.
///
/// `quote` is the delimiter byte; the byte at `start` is skipped without being
/// checked. Inside the identifier a doubled delimiter is treated as an escaped
/// quote and does not end it. A single delimiter ends the identifier and is
/// included in the copy. An unterminated identifier extends to the end of the
/// input.
///
/// Returns the index of the first byte after the copied span. `bytes` must be
/// the byte view of `sql`.
fn scan_quoted_identifier(
    sql: &str,
    bytes: &[u8],
    start: usize,
    quote: u8,
    out: &mut String,
) -> usize {
    let mut end = start + 1;
    loop {
        match bytes.get(end) {
            // Ran off the input without seeing a closing delimiter.
            None => break,
            Some(&byte) if byte != quote => end += 1,
            Some(_) => {
                if bytes.get(end + 1) == Some(&quote) {
                    // Doubled delimiter: an escaped quote, keep scanning.
                    end += 2;
                } else {
                    end += 1;
                    break;
                }
            }
        }
    }
    out.push_str(&sql[start..end]);
    end
}
/// Copies a single-quoted string literal beginning at `start` into `out` unchanged.
///
/// The byte at `start` is skipped without being checked. Inside the literal:
/// - a backslash skips itself and the following byte (or just itself when it
///   is the last byte of the input), so `\'` does not end the literal;
/// - a doubled `''` is treated as an escaped quote;
/// - a single `'` ends the literal and is included in the copy.
///
/// An unterminated literal extends to the end of the input. Returns the index
/// of the first byte after the copied span. `bytes` must be the byte view of
/// `sql`.
fn scan_string_literal(sql: &str, bytes: &[u8], start: usize, out: &mut String) -> usize {
    let len = bytes.len();
    let mut cursor = start + 1;
    while let Some(&byte) = bytes.get(cursor) {
        match byte {
            // Clamp so a trailing lone backslash cannot push the cursor past the end.
            b'\\' => cursor = (cursor + 2).min(len),
            b'\'' if bytes.get(cursor + 1) == Some(&b'\'') => cursor += 2,
            b'\'' => {
                cursor += 1;
                break;
            }
            _ => cursor += 1,
        }
    }
    out.push_str(&sql[start..cursor]);
    cursor
}
/// Handles a token that begins with `$` at `start`, appending it to `out`.
///
/// Three shapes are recognised, checked in this order:
/// 1. `$` followed by ASCII digits (e.g. `$1`, `$42`): the `$` and the whole
///    digit run are copied.
/// 2. A dollar-quote opener `$tag$`, where `tag` is empty or an ASCII
///    letter/underscore followed by ASCII letters, digits or underscores:
///    everything through the first matching `$tag$` closer is copied
///    unchanged. If no closer exists, the rest of the input is copied.
/// 3. Anything else: only the single `$` is copied.
///
/// Returns the index of the first byte after the copied span. `bytes` must be
/// the byte view of `sql`; the byte at `start` itself is not inspected.
fn scan_dollar(sql: &str, bytes: &[u8], start: usize, out: &mut String) -> usize {
    let after_dollar = start + 1;
    let rest = bytes.get(after_dollar..).unwrap_or(&[]);

    // Shape 1: `$` followed by a run of digits.
    if rest.first().is_some_and(u8::is_ascii_digit) {
        let digits = rest.iter().take_while(|b| b.is_ascii_digit()).count();
        let end = after_dollar + digits;
        out.push_str(&sql[start..end]);
        return end;
    }

    // Measure the optional tag; it must start with a letter or underscore.
    let tag_len = match rest.first() {
        Some(first) if first.is_ascii_alphabetic() || *first == b'_' => rest
            .iter()
            .take_while(|b| b.is_ascii_alphanumeric() || **b == b'_')
            .count(),
        _ => 0,
    };
    let tag_end = after_dollar + tag_len;

    // Shape 3: no `$` closes the opener, so this is just a lone `$`.
    if bytes.get(tag_end) != Some(&b'$') {
        out.push('$');
        return after_dollar;
    }

    // Shape 2: look for the first `$tag$` closer after the opener.
    let tag = &bytes[after_dollar..tag_end];
    let body_start = tag_end + 1;
    let closer = bytes[body_start..]
        .windows(tag.len() + 2)
        .position(|candidate| matches!(candidate, [b'$', middle @ .., b'$'] if middle == tag));

    match closer {
        Some(offset) => {
            let end = body_start + offset + tag.len() + 2;
            out.push_str(&sql[start..end]);
            end
        }
        None => {
            // Unterminated dollar-quote: consume to the end of the input.
            out.push_str(&sql[start..]);
            bytes.len()
        }
    }
}
// Tests for `compact_whitespace`: example-based cases first, then
// property-based tests in the nested `proptests` module.
#[cfg(test)]
mod tests {
use super::compact_whitespace;
// Asserts that compacting `input` yields exactly `expected`; the failure
// message includes the input in debug form.
fn check(input: &str, expected: &str) {
assert_eq!(compact_whitespace(input), expected, "input: {input:?}");
}
// --- Plain whitespace collapsing -------------------------------------
#[test]
fn multiline_select_collapses_to_single_spaces() {
check(
"SELECT *\n FROM users\n WHERE id = 1",
"SELECT * FROM users WHERE id = 1",
);
}
// --- Single-quoted string literals -----------------------------------
// Whitespace (including a newline) inside a literal is kept as-is.
#[test]
fn string_literal_preserves_internal_whitespace() {
check(
"SELECT 'hello world\n!' FROM t",
"SELECT 'hello world\n!' FROM t",
);
}
// A doubled `''` inside a literal does not end it.
#[test]
fn string_literal_doubled_quote_escape_preserved() {
check("SELECT 'it''s' FROM t", "SELECT 'it''s' FROM t");
}
// A backslash-escaped quote inside a literal does not end it.
#[test]
fn string_literal_backslash_escape_preserved() {
check(r"SELECT 'it\'s' FROM t", r"SELECT 'it\'s' FROM t");
}
// Input is a quote followed by a single backslash at end of input.
#[test]
fn string_literal_lone_trailing_backslash_unterminated() {
check("'\\", "'\\");
}
#[test]
fn string_literal_unterminated_to_eof() {
check("SELECT 'unterminated", "SELECT 'unterminated");
}
// --- Quoted identifiers (`"` and backtick) ----------------------------
#[test]
fn ansi_quoted_identifier_with_embedded_tab_preserved() {
check("SELECT \"two\twords\"", "SELECT \"two\twords\"");
}
#[test]
fn backtick_identifier_with_embedded_space_preserved() {
check("SELECT `name with spaces`", "SELECT `name with spaces`");
}
#[test]
fn ansi_doubled_quote_escape_preserved() {
check("SELECT \"with\"\"quote\"", "SELECT \"with\"\"quote\"");
}
#[test]
fn backtick_doubled_escape_preserved() {
check("SELECT `with``tick`", "SELECT `with``tick`");
}
// --- `$` tokens: dollar-quoted bodies, `$N` parameters, lone `$` ------
#[test]
fn dollar_quoted_empty_tag_preserves_internal_indent() {
check("SELECT $$multi\n line$$", "SELECT $$multi\n line$$");
}
#[test]
fn dollar_quoted_with_tag_preserves_body() {
check(
"SELECT $tag$body with\nnewline$tag$",
"SELECT $tag$body with\nnewline$tag$",
);
}
#[test]
fn dollar_positional_param_one_digit() {
check("WHERE id = $1", "WHERE id = $1");
}
#[test]
fn dollar_positional_param_multi_digit() {
check("WHERE id = $42", "WHERE id = $42");
}
// `$body` has no second `$` after the tag, so it is not an opener.
#[test]
fn dollar_lone_with_no_opener_preserved() {
check("$body", "$body");
}
#[test]
fn dollar_quoted_unterminated_to_eof_no_panic() {
check("$$ab", "$$ab");
}
// The trailing `$ta` is a truncated closer and must not match `$tag$`.
#[test]
fn dollar_quoted_unterminated_with_tag_to_eof() {
check("$tag$body$ta", "$tag$body$ta");
}
// --- Line comments ----------------------------------------------------
// The newline ending the comment is kept verbatim; the whitespace before
// the comment still collapses to one space.
#[test]
fn line_comment_terminator_preserved_so_subsequent_tokens_stay_outside() {
check("SELECT 1\n--c\nFROM x", "SELECT 1 --c\nFROM x");
}
#[test]
fn line_comment_crlf_terminator_preserves_both_bytes() {
check("SELECT 1\n--c\r\nFROM x", "SELECT 1 --c\r\nFROM x");
}
#[test]
fn line_comment_at_eof_without_newline_copied_as_is() {
check("-- trailing", "-- trailing");
}
#[test]
fn line_comment_at_eof_with_trailing_cr_preserves_cr() {
check("-- trailing\r", "-- trailing\r");
}
// Trailing spaces are stripped from the final output even when they sit
// inside a line comment that runs to end of input.
#[test]
fn line_comment_at_eof_trailing_spaces_stripped_for_trim_invariant() {
check("SELECT 1\n-- trailing ", "SELECT 1 -- trailing");
}
// --- Block comments ---------------------------------------------------
#[test]
fn block_comment_preserves_internal_whitespace_and_markers() {
check(
"/* multi\n line */ SELECT 1",
"/* multi\n line */ SELECT 1",
);
}
#[test]
fn block_comment_unterminated_consumes_to_eof() {
check("/* unterminated", "/* unterminated");
}
// --- Whitespace edge cases --------------------------------------------
#[test]
fn tabs_crlf_and_multiple_spaces_collapse() {
check("SELECT\t\t1\r\n\r\n\r\n FROM\tt", "SELECT 1 FROM t");
}
#[test]
fn empty_input_returns_empty_string() {
check("", "");
}
#[test]
fn whitespace_only_input_returns_empty_string() {
check(" \n\t ", "");
}
#[test]
fn leading_whitespace_suppressed() {
check(" SELECT 1", "SELECT 1");
}
#[test]
fn trailing_whitespace_trimmed() {
check("SELECT 1 ", "SELECT 1");
}
// --- Placeholders and non-ASCII text ----------------------------------
#[test]
fn placeholder_question_mark_flows_through_normal_state() {
check(
"SELECT ?, ? FROM t WHERE id = ?",
"SELECT ?, ? FROM t WHERE id = ?",
);
}
#[test]
fn non_ascii_identifier_preserved() {
check("WHERE café = 'value'", "WHERE café = 'value'");
}
#[test]
fn non_ascii_identifier_in_multiline_collapses_outer_whitespace() {
check("SELECT café\n FROM users", "SELECT café FROM users");
}
#[test]
fn post_obfuscation_multiline_collapses_with_placeholders() {
let obfuscated = "SELECT ?, ?\nFROM users\nWHERE id = ?";
check(obfuscated, "SELECT ?, ? FROM users WHERE id = ?");
}
// Property-based tests built on the `proptest` crate. The strategy
// functions below each generate one kind of SQL fragment as a `String`.
mod proptests {
use super::super::compact_whitespace;
use proptest::prelude::*;
// 0 to 5 characters drawn from space, tab, line feed, carriage return.
fn whitespace() -> impl Strategy<Value = String> {
"[ \t\n\r]{0,5}".prop_map(String::from)
}
// Lowercase identifier: a letter or underscore, then up to 7 of `[a-z0-9_]`.
fn ident_lower() -> impl Strategy<Value = String> {
"[a-z_][a-z0-9_]{0,7}".prop_map(String::from)
}
// Double-quoted identifier whose body contains no quote characters.
fn ansi_quoted_ident() -> impl Strategy<Value = String> {
"[a-z0-9 _]{0,8}".prop_map(|inner| format!("\"{inner}\""))
}
// Backtick-quoted identifier whose body contains no backticks.
fn backtick_quoted_ident() -> impl Strategy<Value = String> {
"[a-z0-9 _]{0,8}".prop_map(|inner| format!("`{inner}`"))
}
// One punctuation token from a fixed list; none of them is `-`, `/`,
// `$` or a quote character.
fn safe_punct() -> impl Strategy<Value = String> {
prop::sample::select(vec![",", ";", "=", "<", ">", "+", "*", "(", ")", "?"])
.prop_map(String::from)
}
// 1 to 5 decimal digits.
fn integer() -> impl Strategy<Value = String> {
"[0-9]{1,5}".prop_map(String::from)
}
// Single-quoted literal whose body contains no quotes or backslashes.
fn string_literal_plain() -> impl Strategy<Value = String> {
"[a-z0-9 _]{0,8}".prop_map(|inner| format!("'{inner}'"))
}
// Terminated dollar-quoted string `$tag$body$tag$` with a non-empty tag
// and a body that contains no `$`.
fn dollar_quoted_plain() -> impl Strategy<Value = String> {
(
"[a-z_][a-z0-9_]{0,3}".prop_map(String::from),
"[a-z0-9 _]{0,8}".prop_map(String::from),
)
.prop_map(|(tag, body)| format!("${tag}${body}${tag}$"))
}
// `--` comment that is always terminated by a line feed.
fn line_comment() -> impl Strategy<Value = String> {
"[a-z0-9 _]{0,15}".prop_map(|inner| format!("--{inner}\n"))
}
// Terminated `/* ... */` comment whose body contains no `*` or `/`.
fn block_comment() -> impl Strategy<Value = String> {
"[a-z0-9 _]{0,15}".prop_map(|inner| format!("/*{inner}*/"))
}
// Any one of the token kinds defined above.
fn token_any() -> impl Strategy<Value = String> {
prop_oneof![
ident_lower(),
whitespace(),
ansi_quoted_ident(),
backtick_quoted_ident(),
safe_punct(),
integer(),
string_literal_plain(),
dollar_quoted_plain(),
line_comment(),
block_comment(),
]
}
// Concatenation of 0 to 15 arbitrary tokens, with no separator inserted.
fn fragment_any() -> impl Strategy<Value = String> {
prop::collection::vec(token_any(), 0..16).prop_map(|tokens| tokens.concat())
}
// Tokens that contain no quote, comment or `$` syntax.
fn token_normal_state() -> impl Strategy<Value = String> {
prop_oneof![ident_lower(), whitespace(), safe_punct(), integer(),]
}
// Concatenation of 0 to 15 tokens from `token_normal_state`.
fn fragment_normal_state_only() -> impl Strategy<Value = String> {
prop::collection::vec(token_normal_state(), 0..16).prop_map(|tokens| tokens.concat())
}
// Exactly one terminated quoted/commented region. Line comments are not
// included in this set.
fn fragment_single_verbatim_region() -> impl Strategy<Value = String> {
prop_oneof![
ansi_quoted_ident(),
backtick_quoted_ident(),
string_literal_plain(),
dollar_quoted_plain(),
block_comment(),
]
}
proptest! {
// Configures each property below with 256 cases.
#![proptest_config(ProptestConfig::with_cases(256))]
// Arbitrary bytes are converted lossily to UTF-8 first, so the function
// always receives a valid `&str`; only absence of panics is checked.
#[test]
fn no_panic_on_random_bytes(bytes in prop::collection::vec(any::<u8>(), 0..256)) {
let s = String::from_utf8_lossy(&bytes).into_owned();
let _ = compact_whitespace(&s);
}
#[test]
fn no_panic_on_structured_fragments(s in fragment_any()) {
let _ = compact_whitespace(&s);
}
// Compacting an already-compacted string must not change it.
#[test]
fn idempotent(s in fragment_any()) {
let once = compact_whitespace(&s);
let twice = compact_whitespace(&once);
prop_assert_eq!(once, twice);
}
// The output is never longer (in bytes) than the input.
#[test]
fn length_monotonic(s in fragment_any()) {
let out = compact_whitespace(&s);
prop_assert!(out.len() <= s.len());
}
// The output never starts or ends with a space character.
#[test]
fn trim_invariant(s in fragment_any()) {
let out = compact_whitespace(&s);
prop_assert!(!out.starts_with(' '), "leading space: {out:?}");
prop_assert!(!out.ends_with(' '), "trailing space: {out:?}");
}
// Without quotes or comments, no tab, LF or CR may survive in the output.
#[test]
fn normal_state_collapses_all_whitespace_controls(
s in fragment_normal_state_only()
) {
let out = compact_whitespace(&s);
prop_assert!(
!out.contains('\n') && !out.contains('\r') && !out.contains('\t'),
"normal-state output contains whitespace control: {out:?}"
);
}
// A single terminated quoted/commented region comes back unchanged.
#[test]
fn single_verbatim_region_round_trips(s in fragment_single_verbatim_region()) {
prop_assert_eq!(compact_whitespace(&s), s);
}
}
}
}