// connection-string 0.1.1
//
// Connection string parsing in Rust
// Documentation
use std::collections::HashMap;
use std::ops::{Deref, DerefMut};
use std::str::FromStr;

use crate::{bail, ensure};

/// An ADO.net connection string
///
/// The parsed `key=value` pairs are stored in a [`HashMap`] which is exposed
/// through [`Deref`] and [`DerefMut`], so all of the map's methods (`get`,
/// `insert`, `iter`, ...) can be called directly on an `AdoNetString`.
///
/// Keys are stored exactly as they were written in the input (lookups are
/// case-sensitive), and when a key occurs more than once the last value wins.
///
/// # Limitations
///
/// This parser does not support [ODBC connection
/// strings](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax#odbc-connection-strings)
/// and [Excel connection strings with extended
/// properties](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax#connecting-to-excel).
///
/// [Read more](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax)
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AdoNetString {
    /// The parsed key-value pairs.
    pairs: HashMap<String, String>,
}

impl Deref for AdoNetString {
    type Target = HashMap<String, String>;

    /// Borrow the underlying map of key-value pairs.
    fn deref(&self) -> &Self::Target {
        &self.pairs
    }
}

impl DerefMut for AdoNetString {
    /// Mutably borrow the underlying map of key-value pairs.
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.pairs
    }
}

// NOTE(yosh): Splitting the input on `;` is not enough because ADO.net
// connection strings support escaping: a value such as `{;}` is valid. The
// input is therefore tokenized first and the pairs are read from the tokens.
impl FromStr for AdoNetString {
    type Err = crate::Error;

    /// Parse `input` according to the grammar
    /// `[property=[value][;property=value][;]]`.
    ///
    /// An error is returned when tokenizing fails, when two pairs are not
    /// separated by a `;`, when a key is empty, or when a key is not followed
    /// by a `=`. A later occurrence of a key replaces an earlier one.
    fn from_str(input: &str) -> Result<Self, Self::Err> {
        let mut lexer = Lexer::tokenize(input)?;
        let mut pairs = HashMap::new();
        let mut first_pair = true;

        // Keep reading pairs until the token queue is exhausted.
        while lexer.peek().kind() != &TokenKind::Eof {
            if first_pair {
                first_pair = false;
            } else {
                // Every pair after the first must be preceded by a `;`.
                let msg = "Key-value pairs must be separated by a `;`";
                ensure!(lexer.next().kind() == &TokenKind::Semi, msg);

                // A trailing `;` right before the end of the input is allowed.
                if lexer.peek().kind() == &TokenKind::Eof {
                    break;
                }
            }

            // The property name.
            let key = read_ident(&mut lexer)?;
            ensure!(!key.is_empty(), "Key must not be empty");

            // The `=` that joins the name and the value.
            let msg = "key-value pairs must be joined by a `=`";
            ensure!(lexer.next().kind() == &TokenKind::Eq, msg);

            // The value, which is allowed to be empty.
            let value = read_ident(&mut lexer)?;

            pairs.insert(key, value);
        }

        Ok(Self { pairs })
    }
}

/// Read either a valid key or value from the lexer.
///
/// Tokens are consumed until a `;`, a `=` or the end of the input is reached;
/// that terminating token is left in the lexer for the caller to inspect.
///
/// Whitespace is handled as follows:
///
/// - unquoted whitespace before the first piece of text is skipped,
/// - unquoted whitespace after the last piece of text is dropped,
/// - newline tokens are skipped wherever they appear,
/// - whitespace between two pieces of text is kept,
/// - text coming from an escaped token (quotes or braces) is kept verbatim,
///   including any whitespace at its start or end.
///
/// The returned string may be empty.
fn read_ident(lexer: &mut Lexer) -> crate::Result<String> {
    let mut output = String::new();
    // Length of `output` up to the end of the last piece of text that must
    // survive the removal of trailing whitespace.
    let mut keep_len = 0;
    loop {
        let Token { kind, .. } = lexer.peek();
        match kind {
            // Terminators are not consumed here.
            TokenKind::Semi | TokenKind::Eq | TokenKind::Eof => break,
            TokenKind::Atom(c) => {
                let _ = lexer.next();
                if c.is_whitespace() {
                    // Whitespace other than ' ' (e.g. '\t' or '\r') arrives
                    // as an atom; treat it like a space token so leading and
                    // trailing whitespace are handled symmetrically.
                    if !output.is_empty() {
                        output.push(c);
                    }
                } else {
                    output.push(c);
                    keep_len = output.len();
                }
            }
            TokenKind::Escaped(seq) => {
                let _ = lexer.next();
                if !seq.is_empty() {
                    // Escaped text is significant as a whole, so whitespace
                    // inside the quotes/braces is never trimmed.
                    output.extend(seq);
                    keep_len = output.len();
                }
            }
            TokenKind::Newline => {
                // NOTE(yosh): unsure if this is the correct behavior
                let _ = lexer.next();
            }
            TokenKind::Whitespace => {
                let _ = lexer.next();
                // Ignore leading whitespace.
                if !output.is_empty() {
                    output.push(' ');
                }
            }
        }
    }
    // Remove trailing unquoted whitespace.
    output.truncate(keep_len);
    Ok(output)
}

/// A single lexed unit of the connection string together with its location.
#[derive(Debug, Clone)]
struct Token {
    /// What kind of token this is.
    kind: TokenKind,
    /// Where the token was found.
    loc: Location,
}

impl Token {
    /// Build a token of the given `kind` located at `loc`.
    fn new(kind: TokenKind, loc: Location) -> Self {
        Token { kind, loc }
    }

    /// Borrow the kind of this token.
    fn kind(&self) -> &TokenKind {
        let Token { kind, .. } = self;
        kind
    }
}

/// The kinds of token the lexer can produce.
#[derive(Debug, Clone, Eq, PartialEq)]
enum TokenKind {
    /// A `;` character.
    Semi,
    /// A `=` character.
    Eq,
    /// A single ASCII character that has no special meaning to the lexer.
    Atom(char),
    /// The characters found between a pair of quotes (`"…"`, `'…'`) or
    /// braces (`{…}`).
    Escaped(Vec<char>),
    /// A `\n` character.
    Newline,
    /// A single space character.
    Whitespace,
    /// End of input; handed out by the lexer once no tokens are left.
    Eof,
}

/// A queue of tokens produced from a connection string.
///
/// The tokens are stored in reverse order, so the next token to hand out is
/// always the last element of `tokens` and can be removed with `Vec::pop`.
#[derive(Debug)]
struct Lexer {
    tokens: Vec<Token>,
}

impl Lexer {
    /// Parse a string into a sequence of tokens.
    ///
    /// Quoted (`"…"`, `'…'`) and braced (`{…}`) sections each become a single
    /// [`TokenKind::Escaped`] token; every other character becomes a token of
    /// its own.
    ///
    /// # Errors
    ///
    /// Returns an error when a quote or brace is never closed, or when the
    /// input contains a non-ASCII character.
    fn tokenize(mut input: &str) -> crate::Result<Self> {
        let mut tokens = vec![];
        let mut loc = Location::default();
        loop {
            let old_input = input;
            let mut chars = input.chars();
            let first = match chars.next() {
                Some(c) => c,
                None => break,
            };
            let kind = match first {
                '"' | '\'' => TokenKind::Escaped(Self::read_quoted(&mut chars, first)?),
                '{' => {
                    let mut buf = Vec::new();
                    // Read any ASCII character until we find a closing curly.
                    loop {
                        match chars.next() {
                            None => bail!("unclosed escape literal"),
                            Some('}') => break,
                            Some(c) if c.is_ascii() => buf.push(c),
                            Some(c) => bail!("Invalid ado.net token `{}`", c),
                        }
                    }
                    TokenKind::Escaped(buf)
                }
                ';' => TokenKind::Semi,
                '=' => TokenKind::Eq,
                '\n' => TokenKind::Newline,
                ' ' => TokenKind::Whitespace,
                c if c.is_ascii() => TokenKind::Atom(c),
                c => bail!("Invalid character found: {}", c),
            };
            tokens.push(Token::new(kind, loc));
            input = chars.as_str();

            // Advance the location by the text this token was made from.
            let consumed = old_input.len() - input.len();
            loc.advance(&old_input[..consumed]);
        }
        // Reverse so that `pop` yields the tokens in input order.
        tokens.reverse();
        Ok(Self { tokens })
    }

    /// Read the body of a quoted section whose opening `quote` character has
    /// already been consumed from `chars`, up to and including the closing
    /// quote.
    ///
    /// A doubled quote inside the section is an escape sequence and does not
    /// close it. Fails when the input ends before the closing quote or when a
    /// non-ASCII character is found.
    fn read_quoted(chars: &mut std::str::Chars<'_>, quote: char) -> crate::Result<Vec<char>> {
        let name = if quote == '"' { "double" } else { "single" };
        let mut buf = Vec::new();
        loop {
            match chars.next() {
                None => bail!("unclosed {} quote", name),
                // When we read a quote inside a quoted section we need to
                // lookahead to determine whether it's an escape sequence or
                // a closing delimiter.
                Some(c) if c == quote => {
                    // A doubled quote directly after the opening quote is not
                    // an escape: the first one closes an empty section and
                    // the second one starts a new token.
                    if lookahead(chars) == Some(quote) && !buf.is_empty() {
                        let _ = chars.next();
                        // NOTE(review): both quote characters are stored
                        // as-is; the ADO.NET syntax docs suggest a doubled
                        // quote stands for a single one — confirm before
                        // changing.
                        buf.push(quote);
                        buf.push(quote);
                    } else {
                        break;
                    }
                }
                Some(c) if c.is_ascii() => buf.push(c),
                Some(c) => bail!("Invalid ado.net token `{}`", c),
            }
        }
        Ok(buf)
    }

    /// The token handed out once the queue is empty.
    fn eof() -> Token {
        Token {
            kind: TokenKind::Eof,
            loc: Location::default(),
        }
    }

    /// Get the next token from the queue.
    ///
    /// Returns an `Eof` token once the queue is empty.
    #[must_use]
    pub(crate) fn next(&mut self) -> Token {
        self.tokens.pop().unwrap_or_else(Self::eof)
    }

    /// Peek at the next token in the queue.
    ///
    /// Returns a copy of the next token without removing it, or an `Eof`
    /// token when the queue is empty.
    #[must_use]
    pub(crate) fn peek(&self) -> Token {
        self.tokens.last().cloned().unwrap_or_else(Self::eof)
    }
}

/// Return the char that `iter` would yield next, without advancing `iter`.
fn lookahead(iter: &std::str::Chars<'_>) -> Option<char> {
    // Advancing a clone leaves the caller's iterator untouched.
    iter.clone().next()
}

/// The position of a token inside the input string.
#[derive(Copy, Clone, Default, Debug)]
pub(crate) struct Location {
    /// Column counted in `char`s; the default value is `0`.
    pub(crate) column: usize,
}

impl Location {
    /// Move the column forward by the number of `char`s in `text`.
    fn advance(&mut self, text: &str) {
        let width = text.chars().count();
        self.column += width;
    }
}

#[cfg(test)]
mod test {
    use super::AdoNetString;

    /// Assert that `ado` contains `key` and that it maps to exactly `value`.
    fn assert_kv(ado: &AdoNetString, key: &str, value: &str) {
        assert_eq!(ado.get(key), Some(&value.to_owned()));
    }

    // Source: https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax#windows-authentication-with-sqlclient
    // The input contains a newline after the second `;`, which the parser skips.
    #[test]
    fn windows_auth_with_sql_client() -> crate::Result<()> {
        let input = "Persist Security Info=False;Integrated Security=true;\nInitial Catalog=AdventureWorks;Server=MSSQL1";
        let ado: AdoNetString = input.parse()?;
        assert_kv(&ado, "Persist Security Info", "False");
        assert_kv(&ado, "Integrated Security", "true");
        assert_kv(&ado, "Server", "MSSQL1");
        assert_kv(&ado, "Initial Catalog", "AdventureWorks");
        Ok(())
    }

    // https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax#sql-server-authentication-with-sqlclient
    #[test]
    fn sql_server_auth_with_sql_client() -> crate::Result<()> {
        let input = "Persist Security Info=False;User ID=*****;Password=*****;Initial Catalog=AdventureWorks;Server=MySqlServer";
        let ado: AdoNetString = input.parse()?;
        assert_kv(&ado, "Persist Security Info", "False");
        assert_kv(&ado, "User ID", "*****");
        assert_kv(&ado, "Password", "*****");
        assert_kv(&ado, "Initial Catalog", "AdventureWorks");
        assert_kv(&ado, "Server", "MySqlServer");
        Ok(())
    }

    // https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax#connect-to-a-named-instance-of-sql-server
    // Covers a single pair followed by a trailing `;`.
    #[test]
    fn connect_to_named_sql_server_instance() -> crate::Result<()> {
        let input = r#"Data Source=MySqlServer\MSSQL1;"#;
        let ado: AdoNetString = input.parse()?;
        assert_kv(&ado, "Data Source", r#"MySqlServer\MSSQL1"#);
        Ok(())
    }

    // https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax#oledb-connection-string-syntax
    // The first input also covers a space after a `;` and an empty value (`Password=`).
    #[test]
    fn oledb_connection_string_syntax() -> crate::Result<()> {
        let input = r#"Provider=Microsoft.Jet.OLEDB.4.0; Data Source=d:\Northwind.mdb;User ID=Admin;Password=;"#;
        let ado: AdoNetString = input.parse()?;
        assert_kv(&ado, "Provider", r#"Microsoft.Jet.OLEDB.4.0"#);
        assert_kv(&ado, "Data Source", r#"d:\Northwind.mdb"#);
        assert_kv(&ado, "User ID", r#"Admin"#);
        assert_kv(&ado, "Password", r#""#);

        let input = r#"Provider=Microsoft.Jet.OLEDB.4.0;Data Source=d:\Northwind.mdb;Jet OLEDB:System Database=d:\NorthwindSystem.mdw;User ID=*****;Password=*****;"#;
        let ado: AdoNetString = input.parse()?;
        assert_kv(&ado, "Provider", r#"Microsoft.Jet.OLEDB.4.0"#);
        assert_kv(&ado, "Data Source", r#"d:\Northwind.mdb"#);
        assert_kv(
            &ado,
            "Jet OLEDB:System Database",
            r#"d:\NorthwindSystem.mdw"#,
        );
        assert_kv(&ado, "User ID", r#"*****"#);
        assert_kv(&ado, "Password", r#"*****"#);
        Ok(())
    }

    // https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax#using-datadirectory-to-connect-to-accessjet
    // The raw string spans several lines, so each `;` is followed by spaces,
    // a newline and indentation before the next key.
    #[test]
    fn connect_to_access_jet() -> crate::Result<()> {
        let input = r#"Provider=Microsoft.Jet.OLEDB.4.0;  
                       Data Source=|DataDirectory|\Northwind.mdb;  
                       Jet OLEDB:System Database=|DataDirectory|\System.mdw;"#;
        let ado: AdoNetString = input.parse()?;
        assert_kv(&ado, "Data Source", r#"|DataDirectory|\Northwind.mdb"#);
        assert_kv(&ado, "Provider", r#"Microsoft.Jet.OLEDB.4.0"#);
        assert_kv(
            &ado,
            "Jet OLEDB:System Database",
            r#"|DataDirectory|\System.mdw"#,
        );
        Ok(())
    }

    // NOTE(yosh): we do not support Excel connection strings yet because the
    // double quote escaping is a small nightmare to parse.
    // // https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax#connecting-to-excel
    // #[test]
    // fn connect_to_excel() -> crate::Result<()> {
    //     let input = r#"Provider=Microsoft.Jet.OLEDB.4.0;Data Source=D:\MyExcel.xls;Extended Properties=""Excel 8.0;HDR=Yes;IMEX=1"""#;
    //     let ado: AdoNetString = input.parse()?;
    //     assert_kv(&ado, "Provider", r#"Microsoft.Jet.OLEDB.4.0"#);
    //     assert_kv(&ado, "Data Source", r#"D:\MyExcel.xls"#);
    //     assert_kv(
    //         &ado,
    //         "Extended Properties",
    //         r#"""Excel 8.0;HDR=Yes;IMEX=1"""#,
    //     );
    //     Ok(())
    // }

    // https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax#data-shape-provider-connection-string-syntax
    #[test]
    fn data_shape_provider() -> crate::Result<()> {
        let input = r#"Provider=MSDataShape;Data Provider=SQLOLEDB;Data Source=(local);Initial Catalog=pubs;Integrated Security=SSPI;"#;
        let ado: AdoNetString = input.parse()?;
        assert_kv(&ado, "Provider", r#"MSDataShape"#);
        assert_kv(&ado, "Data Provider", r#"SQLOLEDB"#);
        assert_kv(&ado, "Data Source", r#"(local)"#);
        assert_kv(&ado, "Initial Catalog", r#"pubs"#);
        assert_kv(&ado, "Integrated Security", r#"SSPI"#);
        Ok(())
    }

    // NOTE(review): the original note here was cut off mid-sentence ("we do
    // not support ODBC connection strings because the first part of the").
    // This test only exercises a brace-escaped `Driver={...}` value that
    // contains a `;`, followed by a plain `key=value` pair.
    // https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax#odbc-connection-strings
    #[test]
    fn odbc_connection_strings() -> crate::Result<()> {
        let input = r#"Driver={Microsoft Text Driver (*.txt; *.csv)};DBQ=d:\bin"#;
        let ado: AdoNetString = input.parse()?;
        assert_kv(&ado, "Driver", r#"Microsoft Text Driver (*.txt; *.csv)"#);
        assert_kv(&ado, "DBQ", r#"d:\bin"#);
        Ok(())
    }

    // https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/connection-string-syntax#oracle-connection-strings
    #[test]
    fn oracle_connection_strings() -> crate::Result<()> {
        let input = "Data Source=Oracle9i;User ID=*****;Password=*****;";
        let ado: AdoNetString = input.parse()?;
        assert_kv(&ado, "Data Source", "Oracle9i");
        assert_kv(&ado, "User ID", "*****");
        assert_kv(&ado, "Password", "*****");
        Ok(())
    }
}