use std::{
collections::{btree_map, BTreeMap},
path::PathBuf,
sync::Arc,
};
use eyre::Context;
use maplit::btreemap;
use regex::Regex;
use serde::Serialize;
use walkdir::WalkDir;
#[derive(Debug, Clone, Serialize)]
pub struct DocComment {
pub comment: String,
pub line_number: usize,
}
#[derive(Debug, Serialize)]
pub struct TableInfo {
#[serde(skip)]
pub file_path: Arc<String>,
pub name: String,
pub doc_comment: Option<DocComment>,
pub columns: BTreeMap<String, ColumnInfo>,
}
#[derive(Debug, Serialize)]
pub struct ColumnInfo {
#[serde(skip)]
pub file_path: Arc<String>,
pub name: String,
pub doc_comment: Option<DocComment>,
pub table_name: String,
}
#[derive(Debug, Serialize)]
pub struct HarvestedFile {
#[serde(skip)]
pub file_path: Arc<String>,
pub tables: Vec<TableInfo>,
pub alter_columns: Vec<ColumnInfo>,
}
#[derive(Debug, Serialize)]
pub struct TotalHarvest {
pub tables: BTreeMap<String, TableInfo>,
}
pub fn harvest_from_paths(paths: &Vec<PathBuf>) -> eyre::Result<TotalHarvest> {
fn harvest(paths: &Vec<PathBuf>) -> eyre::Result<BTreeMap<Arc<String>, HarvestedFile>> {
let mut out = BTreeMap::new();
for path in paths {
for entry in WalkDir::new(&path) {
let entry = entry.with_context(|| format!("failed to walk {path:?}"))?;
let Some(path_str) = entry.path().to_str() else {
continue;
};
if !(path_str.ends_with(".sql")
|| path_str.ends_with(".sql.postgres")
|| path_str.ends_with(".sql.postgresql"))
{
continue;
}
let content = std::fs::read_to_string(entry.path())
.with_context(|| format!("could not read {path_str}"))?;
let path_str = Arc::new(path_str.to_owned());
out.insert(path_str.clone(), harvest_sql_file(&content, path_str));
}
}
Ok(out)
}
let all = harvest(paths)?;
let mut combined = TotalHarvest {
tables: BTreeMap::new(),
};
for (filename, file) in all {
for table in file.tables {
combined.tables.insert(table.name.clone(), table);
}
for column in file.alter_columns {
match combined.tables.entry(column.table_name.clone()) {
btree_map::Entry::Vacant(ve) => {
ve.insert(TableInfo {
file_path: filename.clone(),
name: column.table_name.clone(),
doc_comment: None,
columns: btreemap![column.name.clone() => column],
});
}
btree_map::Entry::Occupied(mut oe) => {
oe.get_mut().columns.insert(column.name.to_owned(), column);
}
}
}
}
Ok(combined)
}
pub fn harvest_sql_file(content: &str, file_path: Arc<String>) -> HarvestedFile {
let lines: Vec<&str> = content.lines().collect();
let mut tables = Vec::new();
let mut alter_columns = Vec::new();
let doc_comment_re = Regex::new(r"^\s*--\s*(.*)$").unwrap();
let create_table_re = Regex::new(r"(?i)^\s*CREATE\s+TABLE\s+(\w+)").unwrap();
let alter_table_re = Regex::new(r"(?i)^\s*ALTER\s+TABLE\s+(\w+)").unwrap();
let add_column_re = Regex::new(r"(?i)^\s*ADD\s+COLUMN\s+(\w+)").unwrap();
let column_re = Regex::new(r"^\s*(\w+)\s+").unwrap();
let mut i = 0;
while i < lines.len() {
let line = lines[i];
if let Some(captures) = create_table_re.captures(line) {
let table_name = captures.get(1).unwrap().as_str().to_string();
let table_doc = find_preceding_doc_comment(&lines, i, &doc_comment_re);
let (columns, end_idx) = parse_create_table_columns(
&lines,
i + 1,
&doc_comment_re,
&column_re,
&table_name,
file_path.clone(),
);
tables.push(TableInfo {
file_path: file_path.clone(),
name: table_name,
doc_comment: table_doc,
columns,
});
i = end_idx;
}
else if line.to_uppercase().contains("ALTER TABLE")
&& line.to_uppercase().contains("ADD COLUMN")
{
let single_line_alter_re =
Regex::new(r"(?i)^\s*ALTER\s+TABLE\s+(\w+)\s+ADD\s+COLUMN\s+(\w+)").unwrap();
if let Some(captures) = single_line_alter_re.captures(line) {
let table_name = captures.get(1).unwrap().as_str().to_string();
let column_name = captures.get(2).unwrap().as_str().to_string();
let column_doc = find_preceding_doc_comment(&lines, i, &doc_comment_re);
alter_columns.push(ColumnInfo {
file_path: file_path.clone(),
name: column_name,
doc_comment: column_doc,
table_name,
});
}
i += 1;
}
else if let Some(captures) = alter_table_re.captures(line) {
let table_name = captures.get(1).unwrap().as_str().to_string();
let mut j = i + 1;
while j < lines.len() {
let next_line = lines[j];
if let Some(add_captures) = add_column_re.captures(next_line) {
let column_name = add_captures.get(1).unwrap().as_str().to_string();
let column_doc = find_preceding_doc_comment(&lines, j, &doc_comment_re);
alter_columns.push(ColumnInfo {
file_path: file_path.clone(),
name: column_name,
doc_comment: column_doc,
table_name: table_name.clone(),
});
i = j;
break;
}
let trimmed = next_line.trim();
if !trimmed.is_empty() && !doc_comment_re.is_match(trimmed) {
break;
}
j += 1;
}
i += 1;
} else {
i += 1;
}
}
HarvestedFile {
file_path: file_path.clone(),
tables,
alter_columns,
}
}
fn find_preceding_doc_comment(
lines: &[&str],
current_idx: usize,
doc_comment_re: &Regex,
) -> Option<DocComment> {
if current_idx == 0 {
return None;
}
let mut comment_lines = Vec::new();
let mut first_comment_line = None;
for i in (0..current_idx).rev() {
let line = lines[i].trim();
if line.is_empty() {
continue;
}
if let Some(captures) = doc_comment_re.captures(line) {
let comment_text = captures.get(1).unwrap().as_str().trim().to_string();
comment_lines.push(comment_text);
first_comment_line = Some(i + 1); } else {
break;
}
}
if comment_lines.is_empty() {
return None;
}
comment_lines.reverse();
let combined_comment = comment_lines.join("\n");
Some(DocComment {
comment: combined_comment,
line_number: first_comment_line.unwrap(),
})
}
fn parse_create_table_columns(
lines: &[&str],
start_idx: usize,
doc_comment_re: &Regex,
column_re: &Regex,
table_name: &str,
file_path: Arc<String>,
) -> (BTreeMap<String, ColumnInfo>, usize) {
let mut columns = BTreeMap::new();
let mut i = start_idx;
while i < lines.len() {
let line = lines[i].trim();
if line.ends_with(");") {
break;
}
if line.is_empty()
|| line.to_uppercase().starts_with("PRIMARY KEY")
|| line.to_uppercase().starts_with("FOREIGN KEY")
|| line.to_uppercase().starts_with("UNIQUE")
|| line.to_uppercase().starts_with("CHECK")
|| line.to_uppercase().starts_with("CONSTRAINT")
{
i += 1;
continue;
}
if let Some(captures) = column_re.captures(line) {
let column_name = captures.get(1).unwrap().as_str().to_string();
let column_doc = find_preceding_doc_comment(lines, i, doc_comment_re);
columns.insert(
column_name.clone(),
ColumnInfo {
file_path: file_path.clone(),
name: column_name,
doc_comment: column_doc,
table_name: table_name.to_string(),
},
);
}
i += 1;
}
(columns, i)
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use insta::assert_yaml_snapshot;
use super::HarvestedFile;
fn harvest_sql(sql: &str) -> HarvestedFile {
super::harvest_sql_file(sql, Arc::new("test.sql".to_owned()))
}
#[test]
fn test_harvest() {
let h = harvest_sql(
r#"
-- Stores users
-- Does not include magic users.
CREATE TABLE users (
-- ID from the auth system
sub_id UUID NOT NULL,
-- Hash of user's password,
-- or null if no password is set.
-- We use Argon2 in the PHC format
password_hash TEXT
);
ALTER TABLE users
-- whether the user is barred from logging in
ADD COLUMN locked BOOLEAN DEFAULT FALSE;
-- whether the user is all powerful
-- and able to do anything they want
ALTER TABLE users ADD COLUMN admin BOOLEAN DEFAULT FALSE;
"#,
);
assert_yaml_snapshot!(h, @r#"
tables:
- name: users
doc_comment:
comment: "Stores users\nDoes not include magic users."
line_number: 2
columns:
password_hash:
name: password_hash
doc_comment:
comment: "Hash of user's password,\nor null if no password is set.\nWe use Argon2 in the PHC format"
line_number: 8
table_name: users
sub_id:
name: sub_id
doc_comment:
comment: ID from the auth system
line_number: 5
table_name: users
alter_columns:
- name: locked
doc_comment:
comment: whether the user is barred from logging in
line_number: 15
table_name: users
- name: admin
doc_comment:
comment: "whether the user is all powerful\nand able to do anything they want"
line_number: 18
table_name: users
"#);
}
}