pgcrab 0.2.0

Linting and documentation generation tool for Postgres database schemas
Documentation
// SPDX-FileCopyrightText: 2025 Olivier 'reivilibre'
//
// SPDX-License-Identifier: GPL-3.0-or-later

use std::{collections::BTreeSet, path::Path};

use eyre::{Context, ContextCompat};
use itertools::Itertools;
use postgres::Transaction;
use strum::{EnumIter, EnumString, IntoStaticStr};
use toml_edit::{Array, DocumentMut, Entry, Item, Table, TableLike, Value};

use super::DiagnosticClassification;

mod lint_donotdothis;
mod lint_pgcrab;
mod lint_pgindexhealth;
mod macros;

#[derive(Debug)]
pub struct SchemaDiagnostic {
    pub loc: SchemaLoc,
    pub rule: SchemaDiagnosticRule,
}

#[derive(Debug)]
pub enum SchemaLoc {
    Table {
        table: String,
    },
    Column {
        table: String,
        column: String,
    },
    Object {
        object: String,
        kind: String,
    },
    ForeignKey {
        table: String,
        target_table: String,
        foreign_key: String,
    },
    Index {
        table: String,
        index: String,
    },
    Indexes {
        table: String,
        indexes: BTreeSet<String>,
    },
    ForeignKeys {
        table: String,
        target_table: String,
        foreign_keys: BTreeSet<String>,
    },
}

impl SchemaLoc {
    /// Convert to a rule that can be matched by a concession rule.
    pub fn to_concession_matchable_string(&self) -> String {
        match self {
            SchemaLoc::Table { table } => table.clone(),
            SchemaLoc::Column { table, column } => format!("{table}/{column}"),
            SchemaLoc::Object { object, kind: _ } => object.to_owned(),
            SchemaLoc::ForeignKey {
                table,
                target_table: _,
                foreign_key,
            } => format!("{table}/{foreign_key}"),
            SchemaLoc::Index { table, index } => format!("{table}/{index}"),
            SchemaLoc::Indexes { table, indexes } => {
                format!("{table}/{}", indexes.iter().join(","))
            }
            SchemaLoc::ForeignKeys {
                table,
                target_table: _,
                foreign_keys,
            } => format!("{table}/{}", foreign_keys.iter().join(",")),
        }
    }
}

#[derive(EnumString, strum::Display, EnumIter, IntoStaticStr, Copy, Clone, Debug)]
#[strum(serialize_all = "snake_case")]
pub enum SchemaDiagnosticRule {
    DontUseTimestampWithoutTimeZone,
    DontUseMoney,
    DontUseSerial,
    DontUseVarcharNByDefault,
    ColumnRequiresQuotation,
    ObjectRequiresQuotation,
    PossibleObjectNameTruncation,
    DuplicateIndexes,
    DuplicateForeignKeys,
    ForeignKeyWithoutIndex,
    ForeignKeyWithUnmatchedColumnType,
    IndexWithRedundantWhereClause,
    OverlappingIndexes,
    OverlappingForeignKeys,
    TableWithoutPrimaryKey,
    BtreeIndexOnArrayColumn,
    IndexWithBoolean,
    TableWithoutReplicaIdentity,
}

impl SchemaDiagnosticRule {
    pub fn describe(self) -> &'static str {
        match self {
            SchemaDiagnosticRule::DontUseTimestampWithoutTimeZone => {
                "Don't use timestamp (without time zone), not even to store UTC times
`timestamptz` stores a point in time without any more storage overhead.
`timestamptz` is more ergonomic for doing useful time calculations, that would otherwise be very
awkward to do with `timestamp without timezone`, even if you are storing times in UTC!
Unless what you really want is a picture of a clock, use `timestamptz` (`timestamp with time zone`).
See: <https://wiki.postgresql.org/wiki/Don't_Do_This#Don.27t_use_timestamp_.28without_time_zone.29>
and: <https://wiki.postgresql.org/wiki/Don't_Do_This#Don.27t_use_timestamp_.28without_time_zone.29_to_store_UTC_times>"
            }
            SchemaDiagnosticRule::DontUseMoney => {
                "Don't use money
There are usually more appropriate types to use depending on your desired semantics.
See: <https://wiki.postgresql.org/wiki/Don't_Do_This#Don.27t_use_money>"
            }
            SchemaDiagnosticRule::DontUseSerial => {
                "Don't use SERIAL/SMALLSERIAL/BIGSERIAL
These types have slightly weird ownership semantics.
For new applications, prefer a `GENERATED BY DEFAULT AS IDENTITY` column instead.
See: <https://wiki.postgresql.org/wiki/Don't_Do_This#Don.27t_use_serial>
and: <https://www.enterprisedb.com/blog/postgresql-10-identity-columns-explained>"
            }
            SchemaDiagnosticRule::DontUseVarcharNByDefault => {
                "Don't use VARCHAR(n) by default
There are no benefits to using VARCHAR(n) by default,
unlike other RDBMSes.
See: <https://wiki.postgresql.org/wiki/Don%27t_Do_This#Don.27t_use_varchar.28n.29_by_default>"
            }
            SchemaDiagnosticRule::ColumnRequiresQuotation => {
                "Column name always requires quotation
This column name does not follow the naming convention and therefore always has to be quoted
whenever referred to by SQL.
It is preferable to use a compliant name and avoid this.
See: <https://wiki.postgresql.org/wiki/Don%27t_Do_This#Don.27t_use_upper_case_table_or_column_names>"
            },
            SchemaDiagnosticRule::ObjectRequiresQuotation => {
                "Object name always requires quotation
This object name does not follow the naming convention and therefore always has to be quoted
whenever referred to by SQL.
It is preferable to use a compliant name and avoid this.
See: <https://wiki.postgresql.org/wiki/Don%27t_Do_This#Don.27t_use_upper_case_table_or_column_names>"
            },
            SchemaDiagnosticRule::PossibleObjectNameTruncation => {
                "Object name may have been truncated
This object's name is of the maximum length (usually 63) and therefore may have been silently
truncated.
In conjunction with constructs such as `IF NOT EXISTS`, this could result in a loss
of intended semantics and therefore could be quite dangerous.
It is preferable to use a shorter name that does not risk being truncated.
See: <https://www.postgresql.org/docs/current/runtime-config-preset.html#GUC-MAX-IDENTIFIER-LENGTH>"
            },
            SchemaDiagnosticRule::DuplicateIndexes => {
                "Duplicate indexes
These indexes are identical. This is just a waste of space and disk bandwidth.
Remove all except one of the indexes."
            },
            SchemaDiagnosticRule::DuplicateForeignKeys => {
                "Duplicate foreign keys
These foreign keys are identical.
Unsure if this causes extra I/O overhead, but I doubt it's doing any good!
Remove all except one of the foreign keys."
            },
            SchemaDiagnosticRule::ForeignKeyWithoutIndex => {
                "Foreign key without index
This foreign key is not indexed.
If rows are deleted in the target table, then the resultant
foreign key backreference checks will need to scan the entire source table,
which can cause extremely poor performance for the deletions and other
concurrent database operations.
If the source table is not small and you care about deleting rows from the
target table, then it is likely best to add an index."
            },
            SchemaDiagnosticRule::ForeignKeyWithUnmatchedColumnType => {
                "Foreign key with unmatched column type(s)
This foreign key links two (sets of) columns, but those don't have the same types!
This is suspicious, might cause confusion in the future and will also result in
implicit DBMS-level type coercions."
            },
            SchemaDiagnosticRule::IndexWithRedundantWhereClause => {
                "Index with redundant WHERE ... IS NOT NULL clause
This is a partial index with a 'WHERE ... IS NOT NULL' predicate, but
that predicate is redundant as the column is already `NOT NULL`.
Other than being confusing, this can lead to worse performance if a query
omits the redundant predicate in a query, meaning the index doesn't get used
despite being appropriate."
            }
            SchemaDiagnosticRule::OverlappingIndexes => {
                "Overlapping indexes
For b-tree indexes:
    One of these indexes is a prefix of the other.
    Because b-trees are useful for searching along any prefix of the included columns,
    this means one of these indexes is redundant.
    In the vast majority of cases the obsolete index should just be deleted,
    because this frees up disk space and (for actively-written tables) disk bandwidth.

For other kinds of index:
    These rules may not apply for some types of index.
    Use your own judgment and research; it may be correct to add a concession for this case."
/* but if one index is concise and the other one super bloated, you might prefer reading
the concise one. But this case should be pretty rare, no? */
            }
            SchemaDiagnosticRule::OverlappingForeignKeys => {
                "Overlapping foreign keys
One of these foreign key constraints is made redundant by the other one,
because it contains a subset of the constrained columns, meaning it is a
stronger constraint.
I don't see any reason why the looser constraint (the one with more
columns) cannot just be removed."
            }
            SchemaDiagnosticRule::TableWithoutPrimaryKey => {
                "Table without primary key
It's very unlikely that a table should NOT have a `PRIMARY KEY`.
A primary key provides a lot of sanity for humans and database tools,
as well as providing some basic protection against duplicate rows
accidentally being introduced, e.g. during a non-atomic database restore
(like restoring a `pg_dump` in the default way).

If you already have a `UNIQUE` constraint, consider promoting that to a `PRIMARY KEY`."
            }
            SchemaDiagnosticRule::BtreeIndexOnArrayColumn => {
                "B-tree index on array column
A b-tree index on an array column considers the array as an entire atom,
when you more likely wanted a GIN index so that you can search for
individual elements within the array."
            }
            SchemaDiagnosticRule::IndexWithBoolean => {
                "Non-unique index on just a boolean column
Given there are only 2 boolean values, this index is not very selective.
This means it is unlikely to be very useful in relation to the disk space
taken.
Consider whether you can create a partial index, with the boolean column
as a predicate, for this case instead?"
            },
            SchemaDiagnosticRule::TableWithoutReplicaIdentity => {
                "Table without replica identity
This table does not have a replica identity.
As a result, `UPDATE` and `DELETE` operations will be forbidden on
this table whilst it is in logical replication.
Logical replication is useful for online migration, online database upgrades,
backups, hot standbys and so on.
Consider adding a primary key to this table, which will also give an
implied `REPLICA IDENTITY`.
A `REPLICA IDENTITY` can instead be specified manually; it may be possible to
re-use another unique index or, failing that, `REPLICA IDENTITY FULL`.
See: <https://www.postgresql.org/docs/17/logical-replication-publication.html>"
            }
        }
    }

    pub fn default_classification(self) -> DiagnosticClassification {
        use DiagnosticClassification::*;
        match self {
            SchemaDiagnosticRule::DontUseTimestampWithoutTimeZone => Warning,
            SchemaDiagnosticRule::DontUseMoney => Warning,
            SchemaDiagnosticRule::DontUseSerial => Warning,
            SchemaDiagnosticRule::DontUseVarcharNByDefault => Note,
            SchemaDiagnosticRule::ColumnRequiresQuotation => Warning,
            SchemaDiagnosticRule::ObjectRequiresQuotation => Warning,
            SchemaDiagnosticRule::PossibleObjectNameTruncation => Error,
            SchemaDiagnosticRule::DuplicateIndexes => Error,
            SchemaDiagnosticRule::DuplicateForeignKeys => Error,
            SchemaDiagnosticRule::ForeignKeyWithoutIndex => Warning,
            SchemaDiagnosticRule::ForeignKeyWithUnmatchedColumnType => Error,
            SchemaDiagnosticRule::IndexWithRedundantWhereClause => Error,
            SchemaDiagnosticRule::OverlappingIndexes => Warning,
            SchemaDiagnosticRule::OverlappingForeignKeys => Warning,
            SchemaDiagnosticRule::TableWithoutPrimaryKey => Error,
            SchemaDiagnosticRule::BtreeIndexOnArrayColumn => Warning,
            SchemaDiagnosticRule::IndexWithBoolean => Warning,
            SchemaDiagnosticRule::TableWithoutReplicaIdentity => Warning,
        }
    }
}

pub fn lint_all(txn: &mut Transaction) -> eyre::Result<Vec<SchemaDiagnostic>> {
    let mut out = Vec::new();
    for lintgroup in &[
        lint_donotdothis::LINTS,
        lint_pgindexhealth::LINTS,
        lint_pgcrab::LINTS,
    ] {
        for lint in *lintgroup {
            out.extend(lint(txn)?);
        }
    }
    Ok(out)
}

fn table_mut<'a>(item: &'a mut Item, debug_name: &str) -> eyre::Result<&'a mut dyn TableLike> {
    if item.is_none() {
        let mut tbl = Table::new();
        // don't need to expose the root necessarily
        tbl.set_implicit(true);
        *item = Item::Table(tbl);
    }
    item.as_table_like_mut().with_context(|| {
        format!("error in existing config file: {debug_name} exists but is not a table")
    })
}
fn table_mut_entry<'a>(
    table: &'a mut dyn TableLike,
    debug_name: &str,
    key_name: &str,
) -> eyre::Result<&'a mut dyn TableLike> {
    match table.entry(key_name) {
        Entry::Occupied(occupied_entry) => occupied_entry
            .into_mut()
            .as_table_like_mut()
            .with_context(|| {
                format!("error in existing config file: {debug_name}.{key_name} exists but is not a table")
            }),
        Entry::Vacant(vacant_entry) => Ok(vacant_entry.insert(Item::Table(Table::new()))
            .as_table_like_mut()
            .unwrap()),
    }
}

pub fn add_concessions(diagnostics: &[SchemaDiagnostic], config_path: &Path) -> eyre::Result<()> {
    let config_text = std::fs::read_to_string(config_path).context("could not read config file")?;
    let mut config_doc: DocumentMut = config_text.parse().context("could not parse config TOML")?;
    let concessions_table = table_mut_entry(
        table_mut(&mut config_doc["schema"], "schema")?,
        "schema",
        "concessions",
    )?;

    // if concessions_item.is_none() {
    //     *concessions_item = Item::Table(Table::new());
    // }

    // let concessions_table = concessions_item.as_table_like_mut().with_context(|| {
    //     format!("error in existing config file: schema.concessions exists but is not a table")
    // })?;

    for diagnostic in diagnostics {
        let rule_id = <&'static str>::from(diagnostic.rule);
        let matchable = diagnostic
            .loc
            .to_concession_matchable_string()
            .replace("/", ".");

        let rule_array = concessions_table
            .entry(rule_id)
            .or_insert(Item::Value(Value::Array(Array::new())));
        let rule_array = rule_array.as_array_mut().with_context(|| format!("error in existing config file: schema.concessions.{rule_id} exists but is not an array"))?;
        rule_array.push(Value::from(matchable));
        rule_array.fmt();
    }

    concessions_table.fmt();

    let tmp_path = config_path.parent().unwrap().join("pgcrab.toml~tmp-new");
    std::fs::write(&tmp_path, config_doc.to_string().as_bytes())
        .context("could not write new TOML file")?;

    std::fs::rename(&tmp_path, config_path)
        .context("could not move config file to replace old one")?;

    Ok(())
}