use crate::error::{SecretshError, TokenizationError};

/// A single shell-style token together with the `{{KEY}}` placeholders
/// found inside it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    pub value: String,
    pub placeholders: Vec<Placeholder>,
}

/// A `{{KEY}}` placeholder inside a token. `start` and `end` are byte
/// offsets into `Token::value` spanning the whole placeholder, so
/// `value[start..end]` is `"{{KEY}}"` including both brace pairs.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Placeholder {
    pub key: String,
    pub start: usize,
    pub end: usize,
}

/// The result of a successful [`tokenize`] call.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenizeResult {
    pub tokens: Vec<Token>,
}

/// Quoting context the lexer is currently inside.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum QuoteState {
    Unquoted,
    SingleQuoted,
    DoubleQuoted,
}
pub fn tokenize(input: &str) -> Result<TokenizeResult, SecretshError> {
    let raw_tokens = lex(input)?;
    if raw_tokens.is_empty() {
        return Err(TokenizationError::EmptyCommand.into());
    }
    let mut tokens = Vec::with_capacity(raw_tokens.len());
    for value in raw_tokens {
        let placeholders = scan_placeholders(&value)?;
        tokens.push(Token { value, placeholders });
    }
    Ok(TokenizeResult { tokens })
}
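
/// Splits `input` into raw token strings, roughly following POSIX shell
/// word-splitting:
///
/// - Unquoted spaces and tabs separate tokens.
/// - Single quotes preserve their contents literally.
/// - Double quotes preserve spaces and metacharacters; `\"` and `\\` are
///   the only escapes recognized inside them.
/// - An unquoted backslash escapes the next character.
/// - Unquoted metacharacters (`|`, `<`, `>`, `&`, `;`, `` ` ``, `(`, `*`,
///   `?`, `[`) are rejected, as is `$` when it starts an expansion.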
fn lex(input: &str) -> Result<Vec<String>, SecretshError> {
    let chars: Vec<char> = input.chars().collect();
    let len = chars.len();
    // Map each char index to its byte offset so errors can report positions
    // in the original string even when it contains multibyte characters.
    let byte_offsets: Vec<usize> = {
        let mut offsets = Vec::with_capacity(len + 1);
        let mut byte_pos = 0usize;
        for &ch in &chars {
            offsets.push(byte_pos);
            byte_pos += ch.len_utf8();
        }
        offsets.push(byte_pos);
        offsets
    };
    let mut tokens: Vec<String> = Vec::new();
    let mut current = String::new();
    let mut in_token = false;
    let mut state = QuoteState::Unquoted;
    let mut quote_start: usize = 0;
    let mut i = 0usize;
    while i < len {
        let ch = chars[i];
        let byte_offset = byte_offsets[i];
        match state {
            QuoteState::Unquoted => match ch {
                ' ' | '\t' => {
                    if in_token {
                        tokens.push(std::mem::take(&mut current));
                        in_token = false;
                    }
                    i += 1;
                }
                '\'' => {
                    state = QuoteState::SingleQuoted;
                    quote_start = byte_offset;
                    in_token = true;
                    i += 1;
                }
                '"' => {
                    state = QuoteState::DoubleQuoted;
                    quote_start = byte_offset;
                    in_token = true;
                    i += 1;
                }
                '\\' => {
                    if i + 1 >= len {
                        return Err(TokenizationError::TrailingBackslash.into());
                    }
                    let next = chars[i + 1];
                    current.push(next);
                    in_token = true;
                    i += 2;
                }
                // Shell metacharacters that could alter the command's meaning
                // are rejected outright rather than passed through.
                '|' | '>' | '<' | '&' | ';' | '`' | '(' | '*' | '?' | '[' => {
                    return Err(TokenizationError::RejectedMetacharacter {
                        character: ch,
                        offset: byte_offset,
                    }
                    .into());
                }
                '$' => {
                    // `$` is only dangerous when it starts an expansion
                    // (`$VAR`, `${VAR}`, `$(cmd)`); a bare `$` is literal.
                    let next = chars.get(i + 1).copied();
                    let is_expansion = match next {
                        Some(c) => c.is_alphanumeric() || c == '_' || c == '{' || c == '(',
                        None => false,
                    };
                    if is_expansion {
                        return Err(TokenizationError::RejectedMetacharacter {
                            character: '$',
                            offset: byte_offset,
                        }
                        .into());
                    }
                    current.push('$');
                    in_token = true;
                    i += 1;
                }
                _ => {
                    current.push(ch);
                    in_token = true;
                    i += 1;
                }
            },
            QuoteState::SingleQuoted => match ch {
                '\'' => {
                    state = QuoteState::Unquoted;
                    i += 1;
                }
                _ => {
                    current.push(ch);
                    i += 1;
                }
            },
            QuoteState::DoubleQuoted => match ch {
                '"' => {
                    state = QuoteState::Unquoted;
                    i += 1;
                }
                '\\' => {
                    // Inside double quotes only `\"` and `\\` are escapes;
                    // any other backslash is kept literally.
                    match chars.get(i + 1).copied() {
                        Some('"') => {
                            current.push('"');
                            i += 2;
                        }
                        Some('\\') => {
                            current.push('\\');
                            i += 2;
                        }
                        Some(_) | None => {
                            current.push('\\');
                            i += 1;
                        }
                    }
                }
                _ => {
                    current.push(ch);
                    i += 1;
                }
            },
        }
    }
    match state {
        QuoteState::SingleQuoted => {
            return Err(TokenizationError::UnclosedSingleQuote {
                offset: quote_start,
            }
            .into());
        }
        QuoteState::DoubleQuoted => {
            return Err(TokenizationError::UnclosedDoubleQuote {
                offset: quote_start,
            }
            .into());
        }
        QuoteState::Unquoted => {}
    }
    if in_token {
        tokens.push(current);
    }
    Ok(tokens)
}
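
/// Scans one token for `{{KEY}}` placeholders, returning them with byte
/// offsets into `value`. Lone `{` or `}` characters are literal; only a
/// `{{` opener demands a matching `}}` closer and a valid key in between.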
fn scan_placeholders(value: &str) -> Result<Vec<Placeholder>, SecretshError> {
    let mut placeholders = Vec::new();
    let bytes = value.as_bytes();
    let len = bytes.len();
    let mut i = 0usize;
    while i < len {
        if i + 1 < len && bytes[i] == b'{' && bytes[i + 1] == b'{' {
            let open_start = i;
            i += 2;
            let key_start = i;
            let mut found_close = false;
            while i < len {
                if i + 1 < len && bytes[i] == b'}' && bytes[i + 1] == b'}' {
                    found_close = true;
                    break;
                }
                i += 1;
            }
            if !found_close {
                // Report the raw remainder of the token, starting at the
                // unmatched `{{`, exactly as it appeared in the input.
                let fragment = value[open_start..].to_owned();
                return Err(TokenizationError::MalformedPlaceholder { fragment }.into());
            }
            let key = &value[key_start..i];
            let close_end = i + 2;
            if !is_valid_key(key) {
                let fragment = format!("{{{{{key}}}}}");
                return Err(TokenizationError::InvalidKeyName { fragment }.into());
            }
            placeholders.push(Placeholder {
                key: key.to_owned(),
                start: open_start,
                end: close_end,
            });
            i = close_end;
        } else {
            i += 1;
        }
    }
    Ok(placeholders)
}
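
/// A key is valid if it matches `[A-Za-z_][A-Za-z0-9_]*` (ASCII only).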
#[inline]
fn is_valid_key(key: &str) -> bool {
    let mut chars = key.chars();
    match chars.next() {
        None => false,
        Some(first) => {
            (first.is_ascii_alphabetic() || first == '_')
                && chars.all(|c| c.is_ascii_alphanumeric() || c == '_')
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::{SecretshError, TokenizationError};

    fn tok(input: &str) -> TokenizeResult {
        tokenize(input).unwrap_or_else(|e| panic!("tokenize({input:?}) failed: {e}"))
    }

    fn tok_err(input: &str) -> TokenizationError {
        match tokenize(input) {
            Err(SecretshError::Tokenization(e)) => e,
            Err(other) => panic!("expected TokenizationError, got: {other}"),
            Ok(r) => panic!("expected error, got tokens: {r:?}"),
        }
    }

    fn values(r: &TokenizeResult) -> Vec<&str> {
        r.tokens.iter().map(|t| t.value.as_str()).collect()
    }

    #[test]
    fn empty_string_is_error() {
        assert!(matches!(tok_err(""), TokenizationError::EmptyCommand));
    }

    #[test]
    fn all_whitespace_is_error() {
        assert!(matches!(tok_err(" \t "), TokenizationError::EmptyCommand));
    }

    #[test]
    fn single_word() {
        assert_eq!(values(&tok("hello")), ["hello"]);
    }

    #[test]
    fn two_words_single_space() {
        assert_eq!(values(&tok("foo bar")), ["foo", "bar"]);
    }

    #[test]
    fn multiple_spaces_between_words() {
        assert_eq!(values(&tok("foo   bar")), ["foo", "bar"]);
    }

    #[test]
    fn tab_separates_tokens() {
        assert_eq!(values(&tok("foo\tbar")), ["foo", "bar"]);
    }

    #[test]
    fn leading_and_trailing_whitespace_ignored() {
        assert_eq!(values(&tok(" foo bar ")), ["foo", "bar"]);
    }

    #[test]
    fn many_tokens() {
        assert_eq!(values(&tok("a b c d e")), ["a", "b", "c", "d", "e"]);
    }

    #[test]
    fn single_quoted_preserves_spaces() {
        assert_eq!(values(&tok("'hello world'")), ["hello world"]);
    }

    #[test]
    fn single_quoted_preserves_backslash() {
        assert_eq!(values(&tok(r"'foo\nbar'")), [r"foo\nbar"]);
    }

    #[test]
    fn single_quoted_preserves_double_quote() {
        assert_eq!(values(&tok("'say \"hi\"'")), ["say \"hi\""]);
    }

    #[test]
    fn single_quoted_preserves_metacharacters() {
        assert_eq!(values(&tok("'|><&;`(*?[$'")), ["|><&;`(*?[$"]);
    }

    #[test]
    fn single_quoted_preserves_dollar_expansion() {
        assert_eq!(values(&tok("'$HOME'")), ["$HOME"]);
    }

    #[test]
    fn empty_single_quotes_produce_empty_token() {
        assert_eq!(values(&tok("''")), [""]);
    }

    #[test]
    fn single_quote_adjacent_to_word() {
        assert_eq!(values(&tok("foo'bar'baz")), ["foobarbaz"]);
    }

    #[test]
    fn unclosed_single_quote_is_error() {
        let e = tok_err("'unclosed");
        assert!(
            matches!(e, TokenizationError::UnclosedSingleQuote { offset: 0 }),
            "got: {e:?}"
        );
    }

    #[test]
    fn unclosed_single_quote_offset_is_correct() {
        let e = tok_err("foo 'bar");
        assert!(
            matches!(e, TokenizationError::UnclosedSingleQuote { offset: 4 }),
            "got: {e:?}"
        );
    }

    #[test]
    fn double_quoted_preserves_spaces() {
        assert_eq!(values(&tok(r#""hello world""#)), ["hello world"]);
    }

    #[test]
    fn double_quoted_escape_double_quote() {
        assert_eq!(values(&tok(r#""\"""#)), ["\""]);
    }

    #[test]
    fn double_quoted_escape_backslash() {
        assert_eq!(values(&tok(r#""\\""#)), ["\\"]);
    }

    #[test]
    fn double_quoted_backslash_before_other_char_is_literal() {
        assert_eq!(values(&tok(r#""\n""#)), [r"\n"]);
    }

    #[test]
    fn double_quoted_preserves_metacharacters() {
        assert_eq!(values(&tok(r#""|><&;`(*?[$""#)), ["|><&;`(*?[$"]);
    }

    #[test]
    fn double_quoted_preserves_dollar_expansion() {
        assert_eq!(values(&tok(r#""$HOME""#)), ["$HOME"]);
    }

    #[test]
    fn empty_double_quotes_produce_empty_token() {
        assert_eq!(values(&tok(r#""""#)), [""]);
    }

    #[test]
    fn double_quote_adjacent_to_word() {
        assert_eq!(values(&tok(r#"foo"bar"baz"#)), ["foobarbaz"]);
    }

    #[test]
    fn unclosed_double_quote_is_error() {
        let e = tok_err(r#""unclosed"#);
        assert!(
            matches!(e, TokenizationError::UnclosedDoubleQuote { offset: 0 }),
            "got: {e:?}"
        );
    }

    #[test]
    fn unclosed_double_quote_offset_is_correct() {
        let e = tok_err(r#"foo "bar"#);
        assert!(
            matches!(e, TokenizationError::UnclosedDoubleQuote { offset: 4 }),
            "got: {e:?}"
        );
    }

    #[test]
    fn backslash_escapes_space() {
        assert_eq!(values(&tok(r"foo\ bar")), ["foo bar"]);
    }

    #[test]
    fn backslash_escapes_pipe() {
        assert_eq!(values(&tok(r"\|")), ["|"]);
    }

    #[test]
    fn backslash_escapes_dollar() {
        assert_eq!(values(&tok(r"\$HOME")), ["$HOME"]);
    }

    #[test]
    fn backslash_escapes_backslash() {
        assert_eq!(values(&tok(r"\\")), ["\\"]);
    }

    #[test]
    fn backslash_escapes_asterisk() {
        assert_eq!(values(&tok(r"\*")), ["*"]);
    }

    #[test]
    fn trailing_backslash_is_error() {
        assert!(matches!(tok_err(r"\"), TokenizationError::TrailingBackslash));
    }

    #[test]
    fn trailing_backslash_after_token_is_error() {
        assert!(matches!(tok_err(r"foo \"), TokenizationError::TrailingBackslash));
    }

    #[test]
    fn rejects_pipe() {
        let e = tok_err("foo | bar");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '|', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_redirect_out() {
        let e = tok_err("foo > /dev/null");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '>', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_redirect_in() {
        let e = tok_err("foo < /dev/null");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '<', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_ampersand() {
        let e = tok_err("foo & bar");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '&', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_double_ampersand() {
        let e = tok_err("foo && bar");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '&', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_semicolon() {
        let e = tok_err("foo; bar");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: ';', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_backtick() {
        let e = tok_err("foo `bar`");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '`', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_open_paren() {
        let e = tok_err("foo (bar)");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '(', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_dollar_paren_subshell() {
        let e = tok_err("foo $(cmd)");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '$', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_dollar_brace_expansion() {
        let e = tok_err("foo ${VAR}");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '$', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_dollar_alphanumeric_expansion() {
        let e = tok_err("foo $HOME");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '$', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_glob_star() {
        let e = tok_err("ls *");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '*', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_glob_question() {
        let e = tok_err("ls ?");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '?', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejects_glob_bracket() {
        let e = tok_err("ls [abc]");
        assert!(
            matches!(e, TokenizationError::RejectedMetacharacter { character: '[', .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn rejected_metacharacter_offset_is_correct() {
        let e = tok_err("foo |");
        assert!(
            matches!(
                e,
                TokenizationError::RejectedMetacharacter { character: '|', offset: 4 }
            ),
            "got: {e:?}"
        );
    }

    #[test]
    fn bare_dollar_at_eof_is_literal() {
        assert_eq!(values(&tok("foo$")), ["foo$"]);
    }

    #[test]
    fn bare_dollar_before_space_is_literal() {
        assert_eq!(values(&tok("$ foo")), ["$", "foo"]);
    }

    #[test]
    fn bare_dollar_before_quote_is_literal() {
        assert_eq!(values(&tok("$'foo'")), ["$foo"]);
    }

    #[test]
    fn pipe_inside_single_quotes_is_ok() {
        assert_eq!(values(&tok("'foo|bar'")), ["foo|bar"]);
    }

    #[test]
    fn redirect_inside_double_quotes_is_ok() {
        assert_eq!(values(&tok(r#""foo>bar""#)), ["foo>bar"]);
    }

    #[test]
    fn glob_inside_single_quotes_is_ok() {
        assert_eq!(values(&tok("'*.rs'")), ["*.rs"]);
    }

    #[test]
    fn dollar_expansion_inside_double_quotes_is_ok() {
        assert_eq!(values(&tok(r#""$HOME""#)), ["$HOME"]);
    }

    #[test]
    fn semicolon_inside_double_quotes_is_ok() {
        assert_eq!(values(&tok(r#""foo;bar""#)), ["foo;bar"]);
    }

    #[test]
    fn consecutive_single_quotes_concatenate() {
        assert_eq!(values(&tok("'foo''bar'")), ["foobar"]);
    }

    #[test]
    fn consecutive_double_quotes_concatenate() {
        assert_eq!(values(&tok(r#""foo""bar""#)), ["foobar"]);
    }

    #[test]
    fn mixed_quotes_concatenate() {
        assert_eq!(values(&tok(r#"'foo'"bar""#)), ["foobar"]);
    }

    #[test]
    fn single_then_double_then_unquoted() {
        assert_eq!(values(&tok(r#"'a'"b"c"#)), ["abc"]);
    }

    #[test]
    fn placeholder_as_full_token() {
        let r = tok("{{DB_PASS}}");
        assert_eq!(r.tokens.len(), 1);
        let t = &r.tokens[0];
        assert_eq!(t.value, "{{DB_PASS}}");
        assert_eq!(t.placeholders.len(), 1);
        let p = &t.placeholders[0];
        assert_eq!(p.key, "DB_PASS");
        assert_eq!(p.start, 0);
        assert_eq!(p.end, 11);
    }

    #[test]
    fn placeholder_embedded_in_token() {
        let r = tok("admin:{{PASS}}");
        assert_eq!(r.tokens.len(), 1);
        let t = &r.tokens[0];
        assert_eq!(t.value, "admin:{{PASS}}");
        assert_eq!(t.placeholders.len(), 1);
        let p = &t.placeholders[0];
        assert_eq!(p.key, "PASS");
        assert_eq!(p.start, 6);
        assert_eq!(p.end, 14);
    }

    #[test]
    fn placeholder_with_underscore_key() {
        let r = tok("{{_PRIVATE_KEY}}");
        assert_eq!(r.tokens[0].placeholders[0].key, "_PRIVATE_KEY");
    }

    #[test]
    fn placeholder_with_mixed_case_key() {
        let r = tok("{{MySecret123}}");
        assert_eq!(r.tokens[0].placeholders[0].key, "MySecret123");
    }

    #[test]
    fn two_placeholders_in_one_token() {
        let r = tok("{{USER}}:{{PASS}}");
        let t = &r.tokens[0];
        assert_eq!(t.placeholders.len(), 2);
        assert_eq!(t.placeholders[0].key, "USER");
        assert_eq!(t.placeholders[1].key, "PASS");
        assert_eq!(t.placeholders[0].start, 0);
        assert_eq!(t.placeholders[0].end, 8);
        assert_eq!(t.placeholders[1].start, 9);
        assert_eq!(t.placeholders[1].end, 17);
    }

    #[test]
    fn placeholder_in_second_token() {
        let r = tok("cmd --password={{SECRET}}");
        assert_eq!(r.tokens.len(), 2);
        assert_eq!(r.tokens[0].placeholders.len(), 0);
        assert_eq!(r.tokens[1].placeholders.len(), 1);
        assert_eq!(r.tokens[1].placeholders[0].key, "SECRET");
    }

    #[test]
    fn no_placeholders_in_plain_token() {
        let r = tok("hello world");
        for t in &r.tokens {
            assert!(t.placeholders.is_empty());
        }
    }

    #[test]
    fn placeholder_survives_double_quote_context() {
        let r = tok(r#""{{DB_PASS}}""#);
        assert_eq!(r.tokens[0].value, "{{DB_PASS}}");
        assert_eq!(r.tokens[0].placeholders[0].key, "DB_PASS");
    }

    #[test]
    fn placeholder_survives_single_quote_context() {
        let r = tok("'{{DB_PASS}}'");
        assert_eq!(r.tokens[0].value, "{{DB_PASS}}");
        assert_eq!(r.tokens[0].placeholders[0].key, "DB_PASS");
    }

    #[test]
    fn unclosed_placeholder_is_error() {
        let e = tok_err("{{FOO");
        assert!(
            matches!(e, TokenizationError::MalformedPlaceholder { .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn unclosed_placeholder_fragment_contains_opening() {
        let e = tok_err("{{FOO");
        if let TokenizationError::MalformedPlaceholder { fragment } = e {
            assert!(fragment.starts_with("{{"), "fragment: {fragment:?}");
            assert!(fragment.contains("FOO"), "fragment: {fragment:?}");
        } else {
            panic!("wrong error variant");
        }
    }

    #[test]
    fn unclosed_placeholder_embedded_in_token() {
        let e = tok_err("admin:{{PASS");
        assert!(
            matches!(e, TokenizationError::MalformedPlaceholder { .. }),
            "got: {e:?}"
        );
    }

    #[test]
    fn placeholder_with_empty_key_is_error() {
        let e = tok_err("{{}}");
        assert!(matches!(e, TokenizationError::InvalidKeyName { .. }), "got: {e:?}");
    }

    #[test]
    fn placeholder_with_numeric_start_key_is_error() {
        let e = tok_err("{{1FOO}}");
        assert!(matches!(e, TokenizationError::InvalidKeyName { .. }), "got: {e:?}");
    }

    #[test]
    fn placeholder_with_hyphen_in_key_is_error() {
        let e = tok_err("{{FOO-BAR}}");
        assert!(matches!(e, TokenizationError::InvalidKeyName { .. }), "got: {e:?}");
    }

    #[test]
    fn single_open_brace_is_literal() {
        assert_eq!(values(&tok("{foo}")), ["{foo}"]);
    }

    #[test]
    fn single_close_brace_is_literal() {
        assert_eq!(values(&tok("foo}")), ["foo}"]);
    }

    #[test]
    fn single_character_token() {
        assert_eq!(values(&tok("x")), ["x"]);
    }

    #[test]
    fn token_with_only_escaped_space() {
        assert_eq!(values(&tok(r"\ ")), [" "]);
    }

    #[test]
    fn multiple_escaped_spaces_form_one_token() {
        assert_eq!(values(&tok(r"a\ b\ c")), ["a b c"]);
    }

    #[test]
    fn empty_single_quote_between_words() {
        assert_eq!(values(&tok("foo''bar")), ["foobar"]);
    }

    #[test]
    fn empty_double_quote_between_words() {
        assert_eq!(values(&tok(r#"foo""bar"#)), ["foobar"]);
    }

    #[test]
    fn newline_inside_single_quotes_is_literal() {
        assert_eq!(values(&tok("'foo\nbar'")), ["foo\nbar"]);
    }

    #[test]
    fn newline_inside_double_quotes_is_literal() {
        assert_eq!(values(&tok("\"foo\nbar\"")), ["foo\nbar"]);
    }

    #[test]
    fn unicode_characters_pass_through() {
        assert_eq!(values(&tok("héllo wörld")), ["héllo", "wörld"]);
    }

    #[test]
    fn unicode_inside_single_quotes() {
        assert_eq!(values(&tok("'héllo wörld'")), ["héllo wörld"]);
    }

    #[test]
    fn backslash_escapes_unicode() {
        assert_eq!(values(&tok("\\é")), ["é"]);
    }

    #[test]
    fn complex_real_world_command() {
        let r = tok(r#"psql "postgresql://{{DB_USER}}:{{DB_PASS}}@localhost/mydb""#);
        assert_eq!(r.tokens.len(), 2);
        assert_eq!(r.tokens[0].value, "psql");
        assert_eq!(
            r.tokens[1].value,
            "postgresql://{{DB_USER}}:{{DB_PASS}}@localhost/mydb"
        );
        assert_eq!(r.tokens[1].placeholders.len(), 2);
        assert_eq!(r.tokens[1].placeholders[0].key, "DB_USER");
        assert_eq!(r.tokens[1].placeholders[1].key, "DB_PASS");
    }

    #[test]
    fn command_with_escaped_metachar_and_placeholder() {
        let r = tok(r#"curl \-d '{"key":"{{API_KEY}}"}'"#);
        assert_eq!(r.tokens.len(), 3);
        assert_eq!(r.tokens[0].value, "curl");
        assert_eq!(r.tokens[1].value, "-d");
        assert_eq!(r.tokens[2].value, r#"{"key":"{{API_KEY}}"}"#);
        assert_eq!(r.tokens[2].placeholders[0].key, "API_KEY");
    }

    #[test]
    fn placeholder_byte_offsets_are_correct_with_multibyte_prefix() {
        let r = tok("héllo:{{KEY}}");
        let p = &r.tokens[0].placeholders[0];
        assert_eq!(p.key, "KEY");
        assert_eq!(p.start, 7);
        assert_eq!(p.end, 14);
    }
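
    // Illustrative sketch only: shows how a caller could use the byte
    // offsets to splice a secret into a token. The real substitution
    // logic presumably lives elsewhere in the crate.
    #[test]
    fn placeholder_offsets_support_substitution() {
        let r = tok("admin:{{PASS}}");
        let t = &r.tokens[0];
        let p = &t.placeholders[0];
        let mut out = String::new();
        out.push_str(&t.value[..p.start]);
        out.push_str("s3cret"); // hypothetical secret value
        out.push_str(&t.value[p.end..]);
        assert_eq!(out, "admin:s3cret");
    }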
}