diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml index 57ff333159f0..88d260795b8b 100644 --- a/doc/src/sgml/func/func-admin.sgml +++ b/doc/src/sgml/func/func-admin.sgml @@ -2960,4 +2960,75 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8'); + + Data Checksum Functions + + + The functions shown in can + be used to enable or disable data checksums in a running cluster. + See for details. + + + + Data Checksum Functions + + + + + Function + + + Description + + + + + + + + + pg_enable_data_checksums + + pg_enable_data_checksums ( cost_delay int, cost_limit int ) + void + + + Initiates data checksums for the cluster. This will switch the data + checksums mode to inprogress-on as well as start a + background worker that will process all pages in the database and + enable checksums on them. When all data pages have had checksums + enabled, the cluster will automatically switch data checksums mode to + on. + + + If cost_delay and cost_limit are + specified, the speed of the process is throttled using the same principles as + Cost-based Vacuum Delay. + + + + + + + pg_disable_data_checksums + + pg_disable_data_checksums () + void + + + Disables data checksum validation and calculation for the cluster. This + will switch the data checksum mode to inprogress-off + while data checksums are being disabled. When all active backends have + stopped validating data checksums, the data checksum mode will be + changed to off. At this point the data pages will + still have checksums recorded but they are not updated when pages are + modified. + + + + +
+ +
+ diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml index b88cac598e90..a4e16d03aaec 100644 --- a/doc/src/sgml/glossary.sgml +++ b/doc/src/sgml/glossary.sgml @@ -184,6 +184,8 @@ (but not the autovacuum workers), the background writer, the checkpointer, + the data checksums worker, + the data checksums worker launcher, the logger, the startup process, the WAL archiver, @@ -573,6 +575,27 @@ + + Data Checksums Worker + + + An auxiliary process + which enables or disables data checksums in a specific database. + + + + + + Data Checksums Worker Launcher + + + An auxiliary process + which starts processes + for each database. + + + + Database cluster diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 3f4a27a736e2..6082d991497e 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -3527,8 +3527,9 @@ description | Waiting for a newly initialized WAL file to reach durable storage Number of data page checksum failures detected in this - database (or on a shared object), or NULL if data checksums are - disabled. + database (or on a shared object). + Detected failures are reported regardless of the + setting. @@ -3538,8 +3539,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage Time at which the last data page checksum failure was detected in - this database (or on a shared object), or NULL if data checksums are - disabled. + this database (or on a shared object). Last failure is reported + regardless of the setting. @@ -6877,6 +6878,205 @@ FROM pg_stat_get_backend_idset() AS backendid; + + Data Checksum Progress Reporting + + + pg_stat_progress_data_checksums + + + + When data checksums are being enabled on a running cluster, the + pg_stat_progress_data_checksums view will contain + a row for the launcher process, and one row for each worker process which + is currently calculating checksums for the data pages in one database. 
+ + + <structname>pg_stat_progress_data_checksums</structname> View + + + + + + Column Type + + + Description + + + + + + + + + + pid integer + + + Process ID of a datachecksumworker process. + + + + + + + datid oid + + + OID of this database, or 0 for the launcher process. + + + + + + + datname name + + + Name of this database, or NULL for the + launcher process. + + + + + + + phase text + + + Current processing phase, see + for description of the phases. + + + + + + + + databases_total integer + + + The total number of databases which will be processed. Only the + launcher worker has this value set, the other worker processes + have this set to NULL. + + + + + + + + databases_done integer + + + The number of databases which have been processed. Only the + launcher worker has this value set, the other worker processes + have this set to NULL. + + + + + + + + relations_total integer + + + The total number of relations which will be processed, or + NULL if the data checksums worker process hasn't + calculated the number of relations yet. The launcher process has + this NULL. + + + + + + + + relations_done integer + + + The number of relations which have been processed. The launcher + process has this NULL. + + + + + + + + blocks_total integer + + + The number of blocks in the current relation which will be processed, + or NULL if the data checksums worker process hasn't + calculated the number of blocks yet. The launcher process has + this NULL. + + + + + + + + blocks_done integer + + + The number of blocks in the current relation which have been processed. + The launcher process has this NULL. + + + + + + +
+ + + Data Checksum Phases + + + + + + Phase + Description + + + + + enabling + + The command is currently enabling data checksums on the cluster. + + + + disabling + + The command is currently disabling data checksums on the cluster. + + + + waiting on temporary tables + + The command is currently waiting for all temporary tables which existed + at the time the command was started to be removed. + + + + waiting on checkpoint + + The command is currently waiting for a checkpoint to update the checksum + state before finishing. + + + + +
+
+ diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml index 95043aa329c0..0343710af53d 100644 --- a/doc/src/sgml/ref/pg_checksums.sgml +++ b/doc/src/sgml/ref/pg_checksums.sgml @@ -45,6 +45,12 @@ PostgreSQL documentation exit status is nonzero if the operation failed. + + When enabling checksums, if checksums were in the process of being enabled + when the cluster was shut down, pg_checksums + will still process all relations regardless of the online processing. + + When verifying checksums, every file in the cluster is scanned. When enabling checksums, each relation file block with a changed checksum is diff --git a/doc/src/sgml/regress.sgml b/doc/src/sgml/regress.sgml index 8838fe7f0225..7074751834ea 100644 --- a/doc/src/sgml/regress.sgml +++ b/doc/src/sgml/regress.sgml @@ -263,6 +263,18 @@ make check-world PG_TEST_EXTRA='kerberos ldap ssl load_balance libpq_encryption' The following values are currently supported: + + checksum_extended + + + Runs additional tests for enabling data checksums which inject delays + and retries in the processing, as well as tests that run pgbench + concurrently and randomly restart the cluster. Some of these test + suites require injection points to be enabled in the installation. + + + + kerberos diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index f3b86b26be90..0ada90ca0b16 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -246,9 +246,10 @@ Checksums can be disabled when the cluster is initialized using initdb. - They can also be enabled or disabled at a later time as an offline - operation. Data checksums are enabled or disabled at the full cluster - level, and cannot be specified individually for databases or tables. + They can also be enabled or disabled at a later time either as an offline + operation or online in a running cluster allowing concurrent access. 
Data + checksums are enabled or disabled at the full cluster level, and cannot be + specified individually for databases or tables. @@ -265,7 +266,7 @@ - Off-line Enabling of Checksums + Offline Enabling of Checksums The pg_checksums @@ -274,6 +275,56 @@ + + + Online Enabling of Checksums + + + Checksums can be enabled or disabled online, by calling the appropriate + functions. + + + + Enabling checksums will put the cluster checksum mode in + inprogress-on mode. During this time, checksums will be + written but not verified. In addition to this, a background worker process + is started that enables checksums on all existing data in the cluster. Once + this worker has completed processing all databases in the cluster, the + checksum mode will automatically switch to on. The + processing will consume two background worker processes; make sure that + max_worker_processes allows for at least two + additional processes. + + + + The process will initially wait for all open transactions to finish before + it starts, so that it can be certain that there are no tables that have been + created inside a transaction that has not committed yet and thus would not + be visible to the process enabling checksums. It will also, for each database, + wait for all pre-existing temporary tables to get removed before it finishes. + If long-lived temporary tables are used in the application, it may be necessary + to terminate these application connections to allow the process to complete. + + + + If the cluster is stopped while in inprogress-on mode, for + any reason, then this process must be restarted manually. To do this, + re-execute the function pg_enable_data_checksums() + once the cluster has been restarted. The process will start over; there is + no support for resuming work from where it was interrupted. 
+ + + + + Enabling checksums can cause significant I/O to the system, as most of the + database pages will need to be rewritten, and will be written both to the + data files and the WAL. The impact may be limited by throttling using the + cost_delay and cost_limit + parameters of the pg_enable_data_checksums function. + + + + diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index cd6c2a2f650a..c50d654db30e 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -18,6 +18,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/pg_control.h" +#include "storage/bufpage.h" #include "utils/guc.h" #include "utils/timestamp.h" @@ -167,6 +168,26 @@ xlog_desc(StringInfo buf, XLogReaderState *record) memcpy(&wal_level, rec, sizeof(int)); appendStringInfo(buf, "wal_level %s", get_wal_level_string(wal_level)); } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state xlrec; + + memcpy(&xlrec, rec, sizeof(xl_checksum_state)); + switch (xlrec.new_checksumtype) + { + case PG_DATA_CHECKSUM_VERSION: + appendStringInfoString(buf, "on"); + break; + case PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION: + appendStringInfoString(buf, "inprogress-off"); + break; + case PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION: + appendStringInfoString(buf, "inprogress-on"); + break; + default: + appendStringInfoString(buf, "off"); + } + } } const char * @@ -218,6 +239,9 @@ xlog_identify(uint8 info) case XLOG_CHECKPOINT_REDO: id = "CHECKPOINT_REDO"; break; + case XLOG_CHECKSUMS: + id = "CHECKSUMS"; + break; } return id; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7ffb21791519..46edf5313591 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -550,6 +550,9 @@ typedef struct XLogCtlData */ XLogRecPtr lastFpwDisableRecPtr; + /* last data_checksum_version we've seen */ + uint32 data_checksum_version; + slock_t info_lck; /* 
locks shared variables shown above */ } XLogCtlData; @@ -647,6 +650,36 @@ static XLogRecPtr LocalMinRecoveryPoint; static TimeLineID LocalMinRecoveryPointTLI; static bool updateMinRecoveryPoint = true; +/* + * Local state for ControlFile data_checksum_version. After initialization + * this is only updated when absorbing a procsignal barrier during interrupt + * processing. The reason for keeping a copy in backend-private memory is to + * avoid locking for interrogating checksum state. Possible values are the + * checksum versions defined in storage/bufpage.h as well as zero when data + * checksums are disabled. + */ +static uint32 LocalDataChecksumVersion = 0; + +/* + * Flag to remember if the procsignalbarrier being absorbed for checksums is + * the first one. The first procsignalbarrier can in rare cases be for the + * state we've initialized, i.e. a duplicate. This may happen for any + * data_checksum_version value, but for PG_DATA_CHECKSUM_VERSION this would + * trigger an assert failure (this is the only transition with an assert) when + * processing the barrier. This may happen if the process is spawned between + * the update of XLogCtl->data_checksum_version and the barrier being emitted. + * This can only happen on the very first barrier so mark that with this flag. + */ +static bool InitialDataChecksumTransition = true; + +/* + * Variable backing the GUC, keep it in sync with LocalDataChecksumVersion. + * See SetLocalDataChecksumVersion(). 
+ */ +int data_checksums = 0; + +static void SetLocalDataChecksumVersion(uint32 data_checksum_version); + /* For WALInsertLockAcquire/Release functions */ static int MyLockNo = 0; static bool holdingAllLocks = false; @@ -715,6 +748,8 @@ static void WALInsertLockAcquireExclusive(void); static void WALInsertLockRelease(void); static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); +static void XLogChecksums(uint32 new_type); + /* * Insert an XLOG record represented by an already-constructed chain of data * chunks. This is a low-level routine; to construct the WAL record header @@ -828,9 +863,10 @@ XLogInsertRecord(XLogRecData *rdata, * only happen just after a checkpoint, so it's better to be slow in * this case and fast otherwise. * - * Also check to see if fullPageWrites was just turned on or there's a - * running backup (which forces full-page writes); if we weren't - * already doing full-page writes then go back and recompute. + * Also check to see if fullPageWrites was just turned on, there's a + * running backup or if checksums are enabled (all of which forces + * full-page writes); if we weren't already doing full-page writes + * then go back and recompute. 
* * If we aren't doing full-page writes then RedoRecPtr doesn't * actually affect the contents of the XLOG record, so we'll update @@ -843,7 +879,9 @@ XLogInsertRecord(XLogRecData *rdata, Assert(RedoRecPtr < Insert->RedoRecPtr); RedoRecPtr = Insert->RedoRecPtr; } - doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0); + doPageWrites = (Insert->fullPageWrites || + Insert->runningBackups > 0 || + DataChecksumsNeedWrite()); if (doPageWrites && (!prevDoPageWrites || @@ -4229,6 +4267,12 @@ InitControlFile(uint64 sysidentifier, uint32 data_checksum_version) ControlFile->wal_log_hints = wal_log_hints; ControlFile->track_commit_timestamp = track_commit_timestamp; ControlFile->data_checksum_version = data_checksum_version; + + /* + * Set the data_checksum_version value into XLogCtl, which is where all + * processes get the current value from. (Maybe it should go just there?) + */ + XLogCtl->data_checksum_version = data_checksum_version; } static void @@ -4552,10 +4596,6 @@ ReadControlFile(void) (SizeOfXLogLongPHD - SizeOfXLogShortPHD); CalculateCheckpointSegments(); - - /* Make the initdb settings visible as GUC variables, too */ - SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no", - PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); } /* @@ -4589,13 +4629,374 @@ GetMockAuthenticationNonce(void) } /* - * Are checksums enabled for data pages? + * DataChecksumsNeedWrite + * Returns whether data checksums must be written or not + * + * Returns true iff data checksums are enabled, or are in the process of being + * enabled or disabled. During "inprogress-on" and "inprogress-off" states checksums must + * be written even though they are not verified (see datachecksumsworker.c for + * a longer discussion). + * + * This function is intended for callsites which are about to write a data page + * to storage, and need to know whether to re-calculate the checksum for the + * page header. 
Calling this function must be performed as close to the write + * operation as possible to keep the critical section short. + */ +bool +DataChecksumsNeedWrite(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION || + LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION || + LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); +} + +/* + * DataChecksumsNeedVerify + * Returns whether data checksums must be verified or not + * + * Data checksums are only verified if they are fully enabled in the cluster. + * During the "inprogress-on" and "inprogress-off" states they are only + * updated, not verified (see datachecksumsworker.c for a longer discussion). + * + * This function is intended for callsites which have read data and are about + * to perform checksum validation based on the result of this. Calling this + * function must be performed as close to the validation call as possible to + * keep the critical section short. This is in order to protect against time of + * check/time of use situations around data checksum validation. + */ +bool +DataChecksumsNeedVerify(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION); +} + +/* + * DataChecksumsOnInProgress + * Returns whether data checksums are being enabled + * + * Most operations don't need to worry about the "inprogress" states, and + * should use DataChecksumsNeedVerify() or DataChecksumsNeedWrite(). The + * "inprogress-on" state for enabling checksums is used when the checksum + * worker is setting checksums on all pages, it can thus be used to check for + * aborted checksum processing which need to be restarted. 
+ */ +inline bool +DataChecksumsOnInProgress(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); +} + +/* + * DataChecksumsOffInProgress + * Returns whether data checksums are being disabled + * + * The "inprogress-off" state for disabling checksums is used for when the + * worker resets the catalog state. DataChecksumsNeedVerify() or + * DataChecksumsNeedWrite() should be used for deciding whether to read/write + * checksums. */ bool -DataChecksumsEnabled(void) +DataChecksumsOffInProgress(void) +{ + return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); +} + +/* + * SetDataChecksumsOnInProgress + * Sets the data checksum state to "inprogress-on" to enable checksums + * + * To start the process of enabling data checksums in a running cluster the + * data_checksum_version state must be changed to "inprogress-on". See + * SetDataChecksumsOn below for a description on how this state change works. + * This function blocks until all backends in the cluster have acknowledged the + * state transition. + */ +void +SetDataChecksumsOnInProgress(void) +{ + uint64 barrier; + + Assert(ControlFile != NULL); + + /* + * The state transition is performed in a critical section with + * checkpoints held off to provide crash safety. + */ + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + START_CRIT_SECTION(); + + XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); + + END_CRIT_SECTION(); + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + /* + * Await state change in all backends to ensure that all backends are in + * "inprogress-on". Once done we know that all backends are writing data + * checksums. 
+ */ + WaitForProcSignalBarrier(barrier); +} + +/* + * SetDataChecksumsOn + * Enables data checksums cluster-wide + * + * Enabling data checksums is performed using two barriers, the first one to + * set the state to "inprogress-on" (done by SetDataChecksumsOnInProgress()) + * and the second one to set the state to "on" (done here). Below is a short + * description of the processing, a more detailed write-up can be found in + * datachecksumsworker.c. + * + * To start the process of enabling data checksums in a running cluster the + * data_checksum_version state must be changed to "inprogress-on". This state + * requires data checksums to be written but not verified. This ensures that + * all data pages can be checksummed without the risk of false negatives in + * validation during the process. When all existing pages are guaranteed to + * have checksums, and all new pages will be initiated with checksums, the + * state can be changed to "on". Once the state is "on" checksums will be both + * written and verified. See datachecksumsworker.c for a longer discussion on + * how data checksums can be enabled in a running cluster. + * + * This function blocks until all backends in the cluster have acknowledged the + * state transition. + */ +void +SetDataChecksumsOn(void) { + uint64 barrier; + Assert(ControlFile != NULL); - return (ControlFile->data_checksum_version > 0); + + SpinLockAcquire(&XLogCtl->info_lck); + + /* + * The only allowed state transition to "on" is from "inprogress-on" since + * that state ensures that all pages will have data checksums written. 
+ */ + if (XLogCtl->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + { + SpinLockRelease(&XLogCtl->info_lck); + elog(ERROR, "checksums not in \"inprogress-on\" mode"); + } + + SpinLockRelease(&XLogCtl->info_lck); + + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + INJECTION_POINT("datachecksums-enable-checksums-delay", NULL); + START_CRIT_SECTION(); + + XLogChecksums(PG_DATA_CHECKSUM_VERSION); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); + + END_CRIT_SECTION(); + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + /* + * Await state transition of "on" in all backends. When done we know that + * data checksums are enabled in all backends and data checksums are both + * written and verified. + */ + WaitForProcSignalBarrier(barrier); +} + +/* + * SetDataChecksumsOff + * Disables data checksums cluster-wide + * + * Disabling data checksums must be performed with two sets of barriers, each + * carrying a different state. The state is first set to "inprogress-off" + * during which checksums are still written but not verified. This ensures that + * backends which have yet to observe the state change from "on" won't get + * validation errors on concurrently modified pages. Once all backends have + * changed to "inprogress-off", the barrier for moving to "off" can be emitted. + * This function blocks until all backends in the cluster have acknowledged the + * state transition. 
+ */ +void +SetDataChecksumsOff(void) +{ + uint64 barrier; + + Assert(ControlFile); + + SpinLockAcquire(&XLogCtl->info_lck); + + /* If data checksums are already disabled there is nothing to do */ + if (XLogCtl->data_checksum_version == 0) + { + SpinLockRelease(&XLogCtl->info_lck); + return; + } + + /* + * If data checksums are currently enabled we first transition to the + * "inprogress-off" state during which backends continue to write + * checksums without verifying them. When all backends are in + * "inprogress-off" the next transition to "off" can be performed, after + * which all data checksum processing is disabled. + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_VERSION) + { + SpinLockRelease(&XLogCtl->info_lck); + + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + START_CRIT_SECTION(); + + XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF); + + END_CRIT_SECTION(); + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + /* + * Update local state in all backends to ensure that any backend in + * "on" state is changed to "inprogress-off". + */ + WaitForProcSignalBarrier(barrier); + + /* + * At this point we know that no backends are verifying data checksums + * during reading. Next, we can safely move to state "off" to also + * stop writing checksums. + */ + } + else + { + /* + * Ending up here implies that the checksums state is "inprogress-on" + * or "inprogress-off" and we can transition directly to "off" from + * there. + */ + SpinLockRelease(&XLogCtl->info_lck); + } + + /* + * Ensure that we don't incur a checkpoint during disabling checksums. 
+ */ + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + START_CRIT_SECTION(); + + XLogChecksums(0); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SpinLockRelease(&XLogCtl->info_lck); + + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); + + END_CRIT_SECTION(); + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + WaitForProcSignalBarrier(barrier); +} + +/* + * ProcSignalBarrier absorption functions for enabling and disabling data + * checksums in a running cluster. The procsignalbarriers are emitted in the + * SetDataChecksums* functions. + */ +bool +AbsorbChecksumsOnInProgressBarrier(void) +{ + Assert(LocalDataChecksumVersion != PG_DATA_CHECKSUM_VERSION); + SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION); + return true; +} + +bool +AbsorbChecksumsOnBarrier(void) +{ + /* + * If the process was spawned between updating XLogCtl and emitting the + * barrier it will have seen the updated value, so for the first barrier + * we accept both "on" and "inprogress-on". + */ + Assert((LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) || + (InitialDataChecksumTransition && + (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION))); + + SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_VERSION); + InitialDataChecksumTransition = false; + return true; +} + +bool +AbsorbChecksumsOffInProgressBarrier(void) +{ + Assert(LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION); + SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION); + return true; +} + +bool +AbsorbChecksumsOffBarrier(void) +{ + /* + * We should never get here directly from a cluster with data checksums + * enabled, an inprogress state should be in between. When there are no + * failures the inprogress-off state should precede, but in case of error + * in processing we can also reach here from the inprogress-on state. 
+ */ + Assert((LocalDataChecksumVersion != PG_DATA_CHECKSUM_VERSION) && + (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION || + LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION)); + SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_OFF); + return true; +} + +/* + * InitLocalControlData + * + * Set up backend local caches of controldata variables which may change at + * any point during runtime and thus require special cased locking. So far + * this only applies to data_checksum_version, but it's intended to be general + * purpose enough to handle future cases. + */ +void +InitLocalDataChecksumVersion(void) +{ + SpinLockAcquire(&XLogCtl->info_lck); + SetLocalDataChecksumVersion(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); +} + +void +SetLocalDataChecksumVersion(uint32 data_checksum_version) +{ + LocalDataChecksumVersion = data_checksum_version; + + data_checksums = data_checksum_version; +} + +/* guc hook */ +const char * +show_data_checksums(void) +{ + if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION) + return "on"; + else if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + return "inprogress-on"; + else if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + return "inprogress-off"; + else + return "off"; } /* @@ -4870,6 +5271,7 @@ LocalProcessControlFile(bool reset) Assert(reset || ControlFile == NULL); ControlFile = palloc(sizeof(ControlFileData)); ReadControlFile(); + SetLocalDataChecksumVersion(ControlFile->data_checksum_version); } /* @@ -5039,6 +5441,11 @@ XLOGShmemInit(void) XLogCtl->InstallXLogFileSegmentActive = false; XLogCtl->WalWriterSleeping = false; + /* Use the checksum info from control file */ + XLogCtl->data_checksum_version = ControlFile->data_checksum_version; + + SetLocalDataChecksumVersion(XLogCtl->data_checksum_version); + SpinLockInit(&XLogCtl->Insert.insertpos_lck); SpinLockInit(&XLogCtl->info_lck); 
pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr); @@ -6180,6 +6587,47 @@ StartupXLOG(void) pfree(endOfRecoveryInfo->recoveryStopReason); pfree(endOfRecoveryInfo); + /* + * If we reach this point with checksums in the state inprogress-on, it + * means that data checksums were in the process of being enabled when the + * cluster shut down. Since processing didn't finish, the operation will + * have to be restarted from scratch since there is no capability to + * continue where it was when the cluster shut down. Thus, revert the + * state back to off, and inform the user with a warning message. Being + * able to restart processing is a TODO, but it wouldn't be possible to + * restart here since we cannot launch a dynamic background worker + * directly from here (it has to be from a regular backend). + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) + { + XLogChecksums(0); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SetLocalDataChecksumVersion(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); + + ereport(WARNING, + (errmsg("data checksums state has been set to off"), + errhint("If checksums were being enabled during shutdown then processing must be manually restarted."))); + } + + /* + * If data checksums were being disabled when the cluster was shut down, + * we know that we have a state where all backends have stopped validating + * checksums and we can move to off instead of prompting the user to + * perform any action. + */ + if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION) + { + XLogChecksums(0); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = 0; + SetLocalDataChecksumVersion(XLogCtl->data_checksum_version); + SpinLockRelease(&XLogCtl->info_lck); + } + /* * All done with end-of-recovery actions. 
* @@ -6471,7 +6919,7 @@ GetRedoRecPtr(void) XLogRecPtr ptr; /* - * The possibly not up-to-date copy in XlogCtl is enough. Even if we + * The possibly not up-to-date copy in XLogCtl is enough. Even if we * grabbed a WAL insertion lock to read the authoritative value in * Insert->RedoRecPtr, someone might update it just after we've released * the lock. @@ -7035,6 +7483,12 @@ CreateCheckPoint(int flags) checkPoint.fullPageWrites = Insert->fullPageWrites; checkPoint.wal_level = wal_level; + /* + * Get the current data_checksum_version value from xlogctl, valid at the + * time of the checkpoint. + */ + checkPoint.data_checksum_version = XLogCtl->data_checksum_version; + if (shutdown) { XLogRecPtr curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos); @@ -7290,6 +7744,9 @@ CreateCheckPoint(int flags) ControlFile->minRecoveryPoint = InvalidXLogRecPtr; ControlFile->minRecoveryPointTLI = 0; + /* make sure we start with the checksum version as of the checkpoint */ + ControlFile->data_checksum_version = checkPoint.data_checksum_version; + /* * Persist unloggedLSN value. 
It's reset on crash recovery, so this goes * unused on non-shutdown checkpoints, but seems useful to store it always @@ -7435,6 +7892,10 @@ CreateEndOfRecoveryRecord(void) LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->minRecoveryPoint = recptr; ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID; + + /* start with the latest checksum version (as of the end of recovery) */ + ControlFile->data_checksum_version = XLogCtl->data_checksum_version; + UpdateControlFile(); LWLockRelease(ControlFileLock); @@ -7776,6 +8237,10 @@ CreateRestartPoint(int flags) if (flags & CHECKPOINT_IS_SHUTDOWN) ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; } + + /* we shall start with the latest checksum version */ + ControlFile->data_checksum_version = lastCheckPoint.data_checksum_version; + UpdateControlFile(); } LWLockRelease(ControlFileLock); @@ -8187,6 +8652,24 @@ XLogReportParameters(void) } } +/* + * Log the new state of checksums + */ +static void +XLogChecksums(uint32 new_type) +{ + xl_checksum_state xlrec; + XLogRecPtr recptr; + + xlrec.new_checksumtype = new_type; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state)); + + recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKSUMS); + XLogFlush(recptr); +} + /* * Update full_page_writes in shared memory, and write an * XLOG_FPW_CHANGE record if necessary. @@ -8605,6 +9088,46 @@ xlog_redo(XLogReaderState *record) { /* nothing to do here, just for informational purposes */ } + else if (info == XLOG_CHECKSUMS) + { + xl_checksum_state state; + uint64 barrier; + + memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state)); + + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->data_checksum_version = state.new_checksumtype; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * Block on a procsignalbarrier to await all processes having seen the + * change to checksum status. Once the barrier has been passed we can + * initiate the corresponding processing. 
+ */ + switch (state.new_checksumtype) + { + case PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON); + WaitForProcSignalBarrier(barrier); + break; + + case PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF); + WaitForProcSignalBarrier(barrier); + break; + + case PG_DATA_CHECKSUM_VERSION: + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON); + WaitForProcSignalBarrier(barrier); + break; + + default: + Assert(state.new_checksumtype == 0); + barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF); + WaitForProcSignalBarrier(barrier); + break; + } + } } /* diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 8c3090165f00..337932a89e5e 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -26,6 +26,7 @@ #include "funcapi.h" #include "miscadmin.h" #include "pgstat.h" +#include "postmaster/datachecksumsworker.h" #include "replication/walreceiver.h" #include "storage/fd.h" #include "storage/latch.h" @@ -748,3 +749,45 @@ pg_promote(PG_FUNCTION_ARGS) wait_seconds))); PG_RETURN_BOOL(false); } + +/* + * Disables data checksums for the cluster, if applicable. Starts a background + * worker which turns off the data checksums. + */ +Datum +disable_data_checksums(PG_FUNCTION_ARGS) +{ + bool fast = PG_GETARG_BOOL(0); + + if (!superuser()) + ereport(ERROR, errmsg("must be superuser")); + + StartDataChecksumsWorkerLauncher(DISABLE_DATACHECKSUMS, 0, 0, fast); + PG_RETURN_VOID(); +} + +/* + * Enables data checksums for the cluster, if applicable. Supports vacuum- + * like cost based throttling to limit system load. Starts a background worker + * which updates data checksums on existing data. 
+ */ +Datum +enable_data_checksums(PG_FUNCTION_ARGS) +{ + int cost_delay = PG_GETARG_INT32(0); + int cost_limit = PG_GETARG_INT32(1); + bool fast = PG_GETARG_BOOL(2); + + if (!superuser()) + ereport(ERROR, errmsg("must be superuser")); + + if (cost_delay < 0) + ereport(ERROR, errmsg("cost delay cannot be a negative value")); + + if (cost_limit <= 0) + ereport(ERROR, errmsg("cost limit must be greater than zero")); + + StartDataChecksumsWorkerLauncher(ENABLE_DATACHECKSUMS, cost_delay, cost_limit, fast); + + PG_RETURN_VOID(); +} diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index bb7d90aa5d96..54dcfbcb3334 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -1613,7 +1613,8 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, * enabled for this cluster, and if this is a relation file, then verify * the checksum. */ - if (!noverify_checksums && DataChecksumsEnabled() && + if (!noverify_checksums && + DataChecksumsNeedWrite() && RelFileNumberIsValid(relfilenumber)) verify_checksum = true; @@ -2007,6 +2008,9 @@ verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno, if (PageIsNew(page) || PageGetLSN(page) >= start_lsn) return true; + if (!DataChecksumsNeedVerify()) + return true; + /* Perform the actual checksum calculation. 
*/ checksum = pg_checksum_page(page, blkno); diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql index 566f308e4439..dea7ad3cf30c 100644 --- a/src/backend/catalog/system_functions.sql +++ b/src/backend/catalog/system_functions.sql @@ -650,6 +650,18 @@ LANGUAGE INTERNAL CALLED ON NULL INPUT VOLATILE PARALLEL SAFE AS 'pg_stat_reset_slru'; +CREATE OR REPLACE FUNCTION + pg_enable_data_checksums(cost_delay integer DEFAULT 0, + cost_limit integer DEFAULT 100, + fast boolean DEFAULT false) + RETURNS void STRICT VOLATILE LANGUAGE internal AS 'enable_data_checksums' + PARALLEL RESTRICTED; + +CREATE OR REPLACE FUNCTION + pg_disable_data_checksums(fast boolean DEFAULT false) + RETURNS void STRICT VOLATILE LANGUAGE internal AS 'disable_data_checksums' + PARALLEL RESTRICTED; + -- -- The default permissions for functions mean that anyone can execute them. -- A number of functions shouldn't be executable by just anyone, but rather @@ -775,6 +787,10 @@ REVOKE EXECUTE ON FUNCTION pg_ls_logicalmapdir() FROM PUBLIC; REVOKE EXECUTE ON FUNCTION pg_ls_replslotdir(text) FROM PUBLIC; +REVOKE EXECUTE ON FUNCTION pg_enable_data_checksums(integer, integer, boolean) FROM PUBLIC; + +REVOKE EXECUTE ON FUNCTION pg_disable_data_checksums(boolean) FROM PUBLIC; + -- -- We also set up some things as accessible to standard roles. 
-- diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 1b3c5a55882d..22f67c7ee4ac 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1354,6 +1354,26 @@ CREATE VIEW pg_stat_progress_copy AS FROM pg_stat_get_progress_info('COPY') AS S LEFT JOIN pg_database D ON S.datid = D.oid; +CREATE VIEW pg_stat_progress_data_checksums AS + SELECT + S.pid AS pid, S.datid, D.datname AS datname, + CASE S.param1 WHEN 0 THEN 'enabling' + WHEN 1 THEN 'disabling' + WHEN 2 THEN 'waiting' + WHEN 3 THEN 'waiting on temporary tables' + WHEN 4 THEN 'waiting on checkpoint' + WHEN 5 THEN 'done' + END AS phase, + CASE S.param2 WHEN -1 THEN NULL ELSE S.param2 END AS databases_total, + S.param3 AS databases_done, + CASE S.param4 WHEN -1 THEN NULL ELSE S.param4 END AS relations_total, + CASE S.param5 WHEN -1 THEN NULL ELSE S.param5 END AS relations_done, + CASE S.param6 WHEN -1 THEN NULL ELSE S.param6 END AS blocks_total, + CASE S.param7 WHEN -1 THEN NULL ELSE S.param7 END AS blocks_done + FROM pg_stat_get_progress_info('DATACHECKSUMS') AS S + LEFT JOIN pg_database D ON S.datid = D.oid + ORDER BY S.datid; -- return the launcher process first + CREATE VIEW pg_user_mappings AS SELECT U.oid AS umid, diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index 0f4435d2d97c..0c36765acfe1 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -18,6 +18,7 @@ OBJS = \ bgworker.o \ bgwriter.o \ checkpointer.o \ + datachecksumsworker.o \ fork_process.o \ interrupt.o \ launch_backend.o \ diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index a6d3630398f4..5742a1dd724e 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -15,6 +15,7 @@ #include #include +#include "access/xlog.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/auxprocess.h" @@ -68,6 +69,24 @@ 
AuxiliaryProcessMainCommon(void) ProcSignalInit(NULL, 0); + /* + * Initialize a local cache of the data_checksum_version, to be updated by + * the procsignal-based barriers. + * + * This intentionally happens after initializing the procsignal, otherwise + * we might miss a state change. This means we can get a barrier for the + * state we've just initialized - but it can happen only once. + * + * The postmaster (which is what gets forked into the new child process) + * does not handle barriers, therefore it may not have the current value + * of LocalDataChecksumVersion value (it'll have the value read from the + * control file, which may be arbitrarily old). + * + * NB: Even if the postmaster handled barriers, the value might still be + * stale, as it might have changed after this process forked. + */ + InitLocalDataChecksumVersion(); + /* * Auxiliary processes don't run transactions, but they may need a * resource owner anyway to manage buffer pins acquired outside diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 1ad65c237c34..0d2ade1f9057 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -18,6 +18,7 @@ #include "pgstat.h" #include "port/atomics.h" #include "postmaster/bgworker_internals.h" +#include "postmaster/datachecksumsworker.h" #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" @@ -132,6 +133,12 @@ static const struct }, { "TablesyncWorkerMain", TablesyncWorkerMain + }, + { + "DataChecksumsWorkerLauncherMain", DataChecksumsWorkerLauncherMain + }, + { + "DataChecksumsWorkerMain", DataChecksumsWorkerMain } }; diff --git a/src/backend/postmaster/datachecksumsworker.c b/src/backend/postmaster/datachecksumsworker.c new file mode 100644 index 000000000000..ff451d502ba7 --- /dev/null +++ b/src/backend/postmaster/datachecksumsworker.c @@ -0,0 +1,1463 @@ 
+/*------------------------------------------------------------------------- + * + * datachecksumsworker.c + * Background worker for enabling or disabling data checksums online + * + * When enabling data checksums on a database at initdb time or when shut down + * with pg_checksums, no extra process is required as each page is checksummed, + * and verified, when accessed. When enabling checksums on an already running + * cluster, this worker will ensure that all pages are checksummed before + * verification of the checksums is turned on. In the case of disabling + * checksums, the state transition is performed only in the control file, no + * changes are performed on the data pages. + * + * Checksums can be either enabled or disabled cluster-wide, with on/off being + * the end state for data_checksums. + * + * Enabling checksums + * ------------------ + * When enabling checksums in an online cluster, data_checksums will be set to + * "inprogress-on" which signals that write operations MUST compute and write + * the checksum on the data page, but during reading the checksum SHALL NOT be + * verified. This ensures that all objects created during checksumming will + * have checksums set, but no reads will fail due to incorrect checksum. The + * DataChecksumsWorker will compile a list of databases which exist at the + * start of checksumming, and all of these which haven't been dropped during + * the processing MUST have been processed successfully in order for checksums + * to be enabled. Any new relation created during processing will see the + * in-progress state and will automatically be checksummed. + * + * For each database, all relations which have storage are read and every data + * page is marked dirty to force a write with the checksum. This will generate + * a lot of WAL as the entire database is read and written. + * + * If the processing is interrupted by a cluster restart, it will be restarted + * from the beginning again as state isn't persisted. 
+ * + * Disabling checksums + * ------------------- + * When disabling checksums, data_checksums will be set to "inprogress-off" + * which signals that checksums are written but no longer verified. This ensures + * that backends which have yet to move from the "on" state will still be able + * to process data checksum validation. + * + * Synchronization and Correctness + * ------------------------------- + * The processes involved in enabling, or disabling, data checksums in an + * online cluster must be properly synchronized with the normal backends + * serving concurrent queries to ensure correctness. Correctness is defined + * as the following: + * + * - Backends SHALL NOT violate local data_checksums state + * - Data checksums SHALL NOT be considered enabled cluster-wide until all + * currently connected backends have the local state "on" + * + * There are two levels of synchronization required for enabling data checksums + * in an online cluster: (i) changing state in the active backends ("on", + * "off", "inprogress-on" and "inprogress-off"), and (ii) ensuring no + * incompatible objects and processes are left in a database when workers end. + * The former deals with cluster-wide agreement on data checksum state and the + * latter with ensuring that any concurrent activity cannot break the data + * checksum contract during processing. + * + * Synchronizing the state change is done with procsignal barriers, where the + * WAL logging backend updating the global state in the controlfile will wait + * for all other backends to absorb the barrier. Barrier absorption will happen + * during interrupt processing, which means that connected backends will change + * state at different times. To prevent data checksum state changes when + * writing and verifying checksums, interrupts shall be held off before + * interrogating state and resumed when the IO operation has been performed. 
+ * + * When Enabling Data Checksums + * ---------------------------- + * A process which fails to observe data checksums being enabled can induce + * two types of errors: failing to write the checksum when modifying the page + * and failing to validate the data checksum on the page when reading it. + * + * When processing starts all backends belong to one of the below sets, with + * one set being empty: + * + * Bd: Backends in "off" state + * Bi: Backends in "inprogress-on" state + * + * If processing is started in an online cluster then all backends are in Bd. + * If processing was halted by the cluster shutting down, the controlfile + * state "inprogress-on" will be observed on system startup and all backends + * will be in Bd. Backends transition Bd -> Bi via a procsignalbarrier. When + * the DataChecksumsWorker has finished writing checksums on all pages and + * enables data checksums cluster-wide, there are four sets of backends where + * Bd shall be an empty set: + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Be: Backends in "on" state + * Bi: Backends in "inprogress-on" state + * + * Backends in Bi and Be will write checksums when modifying a page, but only + * backends in Be will verify the checksum during reading. The Bg backend is + * blocked waiting for all backends in Bi to process interrupts and move to + * Be. Any backend starting while Bg is waiting on the procsignalbarrier will + * observe the global state being "on" and will thus automatically belong to + * Be. Checksums are enabled cluster-wide when Bi is an empty set. Bi and Be + * are compatible sets while still operating based on their local state as + * both write data checksums. 
+ * + * When Disabling Data Checksums + * ----------------------------- + * A process which fails to observe that data checksums have been disabled + * can induce two types of errors: writing the checksum when modifying the + * page and validating a data checksum which is no longer correct due to + * modifications to the page. + * + * Bg: Backend updating the global state and emitting the procsignalbarrier + * Bd: Backends in "off" state + * Be: Backends in "on" state + * Bo: Backends in "inprogress-off" state + * + * Backends transition from the Be state to Bd like so: Be -> Bo -> Bd + * + * The goal is to transition all backends to Bd making the others empty sets. + * Backends in Bo write data checksums, but don't validate them, such that + * backends still in Be can continue to validate pages until the barrier has + * been absorbed such that they are in Bo. Once all backends are in Bo, the + * barrier to transition to "off" can be raised and all backends can safely + * stop writing data checksums as no backend is enforcing data checksum + * validation any longer. + * + * + * Potential optimizations + * ----------------------- + * Below are some potential optimizations and improvements which were brought + * up during reviews of this feature, but which weren't implemented in the + * initial version. These are ideas listed without any validation on their + * feasibility or potential payoff. More discussion on these can be found on + * the -hackers threads linked to in the commit message of this feature. + * + * * Launching datachecksumsworker for resuming operation from the startup + * process: Currently users have to restart processing manually after a + * restart since dynamic background worker cannot be started from the + * postmaster. Changing the startup process could make restarting the + * processing automatic on cluster restart. 
+ * * Avoid dirtying the page when checksums already match: Iff the checksum + * on the page happens to already match we still dirty the page. It should + * be enough to only do the log_newpage_buffer() call in that case. + * * Invent a lightweight WAL record that doesn't contain the full-page + * image but just the block number: On replay, the redo routine would read + * the page from disk. + * * Teach pg_checksums to avoid checksummed pages when pg_checksums is used + * to enable checksums on a cluster which is in inprogress-on state and + * may have checksummed pages (make pg_checksums be able to resume an + * online operation). + * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/datachecksumsworker.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/xact.h" +#include "access/xloginsert.h" +#include "catalog/indexing.h" +#include "catalog/pg_class.h" +#include "catalog/pg_database.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "common/relpath.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "storage/lwlock.h" +#include "storage/procarray.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/fmgroids.h" +#include "utils/injection_point.h" +#include "utils/lsyscache.h" +#include "utils/ps_status.h" +#include "utils/syscache.h" + +/* + * Number of times we retry to open a database before giving up and consider + * it to have failed processing. 
+ */ +#define DATACHECKSUMSWORKER_MAX_DB_RETRIES 5 + +/* + * Signaling between backends calling pg_enable/disable_data_checksums, the + * checksums launcher process, and the checksums worker process. + * + * This struct is protected by DataChecksumsWorkerLock + */ +typedef struct DataChecksumsWorkerShmemStruct +{ + /* + * These are set by pg_enable/disable_data_checksums(), to tell the + * launcher what the target state is. + */ + DataChecksumsWorkerOperation launch_operation; + int launch_cost_delay; + int launch_cost_limit; + bool launch_fast; + + /* + * Is a launcher process currently running? + * + * This is set by the launcher process, after it has read the above + * launch_* parameters. + */ + bool launcher_running; + + /* + * These fields indicate the target state that the launcher is currently + * working towards. They can be different from the corresponding launch_* + * fields, if a new pg_enable/disable_data_checksums() call was made while + * the launcher/worker was already running. + * + * The below members are set when the launcher starts, and are only + * accessed read-only by the single worker. Thus, we can access these + * without a lock. If multiple workers, or dynamic cost parameters, are + * supported at some point then this would need to be revisited. + */ + DataChecksumsWorkerOperation operation; + int cost_delay; + int cost_limit; + bool immediate_checkpoint; + + /* + * Signaling between the launcher and the worker process. + * + * As there is only a single worker, and the launcher won't read these + * until the worker exits, they can be accessed without the need for a + * lock. If multiple workers are supported then this will have to be + * revisited. 
+ */ + + /* result, set by worker before exiting */ + DataChecksumsWorkerResult success; + + /* + * tells the worker process whether it should also process the shared + * catalogs + */ + bool process_shared_catalogs; +} DataChecksumsWorkerShmemStruct; + +/* Shared memory segment for datachecksumsworker */ +static DataChecksumsWorkerShmemStruct *DataChecksumsWorkerShmem; + +typedef struct DataChecksumsWorkerDatabase +{ + Oid dboid; + char *dbname; +} DataChecksumsWorkerDatabase; + +typedef struct DataChecksumsWorkerResultEntry +{ + Oid dboid; + DataChecksumsWorkerResult result; + int retries; +} DataChecksumsWorkerResultEntry; + + +/* + * Flag set by the interrupt handler + */ +static volatile sig_atomic_t abort_requested = false; + +/* + * Have we set the DataChecksumsWorkerShmemStruct->launcher_running flag? + * If we have, we need to clear it before exiting! + */ +static volatile sig_atomic_t launcher_running = false; + +/* + * Are we enabling data checksums, or disabling them? + */ +static DataChecksumsWorkerOperation operation; + +/* Prototypes */ +static List *BuildDatabaseList(void); +static List *BuildRelationList(bool temp_relations, bool include_shared); +static void FreeDatabaseList(List *dblist); +static DataChecksumsWorkerResult ProcessDatabase(DataChecksumsWorkerDatabase *db); +static bool ProcessAllDatabases(bool immediate_checkpoint); +static bool ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy); +static void launcher_cancel_handler(SIGNAL_ARGS); +static void WaitForAllTransactionsToFinish(void); + +/* + * StartDataChecksumsWorkerLauncher + * Main entry point for datachecksumsworker launcher process + * + * The main entrypoint for starting data checksums processing for enabling as + * well as disabling. 
+ */ +void +StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op, + int cost_delay, + int cost_limit, + bool fast) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + bool already_running; /* snapshot of the shmem flag */ + +#ifdef USE_ASSERT_CHECKING + /* The cost delay settings have no effect when disabling */ + if (op == DISABLE_DATACHECKSUMS) + Assert(cost_delay == 0 && cost_limit == 0); +#endif + + /* Store the desired state in shared memory */ + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + + DataChecksumsWorkerShmem->launch_operation = op; + DataChecksumsWorkerShmem->launch_cost_delay = cost_delay; + DataChecksumsWorkerShmem->launch_cost_limit = cost_limit; + DataChecksumsWorkerShmem->launch_fast = fast; + + /* is the launcher already running? */ + already_running = DataChecksumsWorkerShmem->launcher_running; + + LWLockRelease(DataChecksumsWorkerLock); + + /* + * Launch a new launcher process, if it's not running already. + * + * If the launcher is currently busy enabling the checksums, and we want + * them disabled (or vice versa), the launcher will notice that at latest + * when it's about to exit, and will loop back to process the new request. + * So if the launcher is already running, we don't need to do anything more + * here to abort it. + * + * If you call pg_enable/disable_data_checksums() twice in a row, before + * the launcher has had a chance to start up, we still end up launching it + * twice. That's OK, the second invocation will see that a launcher is + * already running and exit quickly. + * + * TODO: We could optimize here and skip launching the launcher, if we are + * already in the desired state, i.e. if the checksums are already enabled + * and you call pg_enable_data_checksums(). + */ + if (!already_running) + { + /* + * Prepare the BackgroundWorker and launch it. 
+ */ + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DataChecksumsWorkerLauncherMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksumsworker launcher"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksumsworker launcher"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = (Datum) 0; + + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + ereport(ERROR, + errmsg("failed to start background worker to process data checksums")); + } +} + +/* + * ProcessSingleRelationFork + * Enable data checksums in a single relation/fork. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. + */ +static bool +ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy) +{ + BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum); + char activity[NAMEDATALEN * 2 + 128]; + char *relns; + + relns = get_namespace_name(RelationGetNamespace(reln)); + + if (!relns) + return false; + + /* Report the current relation to pgstat_activity */ + snprintf(activity, sizeof(activity) - 1, "processing: %s.%s (%s, %dblocks)", + relns, RelationGetRelationName(reln), forkNames[forkNum], numblocks); + pgstat_report_activity(STATE_RUNNING, activity); + + /* + * As of now we only update the block counter for main forks in order to + * not cause too frequent calls. TODO: investigate whether we should do it + * more frequent? 
+ */ + if (forkNum == MAIN_FORKNUM) + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL, + numblocks); + + /* + * We are looping over the blocks which existed at the time of process + * start, which is safe since new blocks are created with checksums set + * already due to the state being "inprogress-on". + */ + for (BlockNumber blknum = 0; blknum < numblocks; blknum++) + { + Buffer buf = ReadBufferExtended(reln, forkNum, blknum, RBM_NORMAL, strategy); + + /* Need to get an exclusive lock before we can flag as dirty */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Mark the buffer as dirty and force a full page write. We have to + * re-write the page to WAL even if the checksum hasn't changed, + * because if there is a replica it might have a slightly different + * version of the page with an invalid checksum, caused by unlogged + * changes (e.g. hintbits) on the master happening while checksums + * were off. This can happen if there was a valid checksum on the page + * at one point in the past, so only when checksums are first on, then + * off, and then turned on again. TODO: investigate if this could be + * avoided if the checksum is calculated to be correct and wal_level + * is set to "minimal", + */ + START_CRIT_SECTION(); + MarkBufferDirty(buf); + log_newpage_buffer(buf, false); + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buf); + + /* + * This is the only place where we check if we are asked to abort, the + * abortion will bubble up from here. It's safe to check this without + * a lock, because if we miss it being set, we will try again soon. + */ + Assert(operation == ENABLE_DATACHECKSUMS); + if (DataChecksumsWorkerShmem->launch_operation == DISABLE_DATACHECKSUMS) + abort_requested = true; + + if (abort_requested) + return false; + + /* + * As of now we only update the block counter for main forks in order + * to not cause too frequent calls. TODO: investigate whether we + * should do it more frequent? 
+ */ + if (forkNum == MAIN_FORKNUM) + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_DONE, + (blknum + 1)); + + vacuum_delay_point(false); + } + + pfree(relns); + return true; +} + +/* + * ProcessSingleRelationByOid + * Process a single relation based on oid. + * + * Returns true if successful, and false if *aborted*. On error, an actual + * error is raised in the lower levels. + */ +static bool +ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy) +{ + Relation rel; + bool aborted = false; + + StartTransactionCommand(); + + rel = try_relation_open(relationId, AccessShareLock); + if (rel == NULL) + { + /* + * Relation no longer exists. We don't consider this an error since + * there are no pages in it that need data checksums, and thus return + * true. The worker operates off a list of relations generated at the + * start of processing, so relations being dropped in the meantime is + * to be expected. + */ + CommitTransactionCommand(); + pgstat_report_activity(STATE_IDLE, NULL); + return true; + } + RelationGetSmgr(rel); + + for (ForkNumber fnum = 0; fnum <= MAX_FORKNUM; fnum++) + { + if (smgrexists(rel->rd_smgr, fnum)) + { + if (!ProcessSingleRelationFork(rel, fnum, strategy)) + { + aborted = true; + break; + } + } + } + relation_close(rel, AccessShareLock); + elog(DEBUG2, + "data checksum processing done for relation with OID %u: %s", + relationId, (aborted ? "aborted" : "finished")); + + CommitTransactionCommand(); + + pgstat_report_activity(STATE_IDLE, NULL); + + return !aborted; +} + +/* + * ProcessDatabase + * Enable data checksums in a single database. + * + * We do this by launching a dynamic background worker into this database, and + * waiting for it to finish. We have to do this in a separate worker, since + * each process can only be connected to one database during its lifetime. 
+ */ +static DataChecksumsWorkerResult +ProcessDatabase(DataChecksumsWorkerDatabase *db) +{ + BackgroundWorker bgw; + BackgroundWorkerHandle *bgw_handle; + BgwHandleStatus status; + pid_t pid; + char activity[NAMEDATALEN + 64]; + + DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_FAILED; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "%s", "DataChecksumsWorkerMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksumsworker worker"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksumsworker worker"); + bgw.bgw_restart_time = BGW_NEVER_RESTART; + bgw.bgw_notify_pid = MyProcPid; + bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid); + + /* + * If there are no worker slots available, make sure we retry processing + * this database. This will make the datachecksumsworker move on to the + * next database and quite likely fail with the same problem. TODO: Maybe + * we need a backoff to avoid running through all the databases here in + * short order. + */ + if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle)) + { + ereport(WARNING, + errmsg("failed to start worker for enabling data checksums in database \"%s\", retrying", + db->dbname), + errhint("The max_worker_processes setting might be too low.")); + return DATACHECKSUMSWORKER_RETRYDB; + } + + status = WaitForBackgroundWorkerStartup(bgw_handle, &pid); + if (status == BGWH_STOPPED) + { + ereport(WARNING, + errmsg("could not start background worker for enabling data checksums in database \"%s\"", + db->dbname), + errhint("More details on the error might be found in the server log.")); + return DATACHECKSUMSWORKER_FAILED; + } + + /* + * If the postmaster crashed we cannot end up with a processed database so + * we have no alternative other than exiting. 
When enabling checksums we + * won't at this time have changed the pg_control version to enabled so + * when the cluster comes back up processing will have to be restarted. + * When disabling, the pg_control version will be set to off before this + * so when the cluster comes up checksums will be off as expected. + */ + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + errmsg("cannot enable data checksums without the postmaster process"), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums().")); + + Assert(status == BGWH_STARTED); + ereport(DEBUG1, + errmsg("initiating data checksum processing in database \"%s\"", + db->dbname)); + + snprintf(activity, sizeof(activity) - 1, + "Waiting for worker in database %s (pid %ld)", db->dbname, (long) pid); + pgstat_report_activity(STATE_RUNNING, activity); + + status = WaitForBackgroundWorkerShutdown(bgw_handle); + if (status == BGWH_POSTMASTER_DIED) + ereport(FATAL, + errmsg("postmaster exited during data checksum processing in \"%s\"", + db->dbname), + errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums().")); + + if (DataChecksumsWorkerShmem->success == DATACHECKSUMSWORKER_ABORTED) + ereport(LOG, + errmsg("data checksums processing was aborted in database \"%s\"", + db->dbname)); + + pgstat_report_activity(STATE_IDLE, NULL); + + return DataChecksumsWorkerShmem->success; +} + +/* + * launcher_exit + * + * Internal routine for cleaning up state when the launcher process exits. We + * need to clean up the abort flag to ensure that processing can be restarted + * again after it was previously aborted. 
+ */
+static void
+launcher_exit(int code, Datum arg)
+{
+	/* Nothing to undo unless this process had registered as the launcher */
+	if (!launcher_running)
+		return;
+
+	LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+	launcher_running = false;
+	DataChecksumsWorkerShmem->launcher_running = false;
+	LWLockRelease(DataChecksumsWorkerLock);
+}
+
+/*
+ * launcher_cancel_handler
+ *
+ * Signal handler installed for SIGINT in the launcher. It only raises the
+ * abort flag and sets the process latch; the abort is not acted upon here
+ * but by the code which polls the flag.
+ */
+static void
+launcher_cancel_handler(SIGNAL_ARGS)
+{
+	int			saved_errno = errno;
+
+	abort_requested = true;
+
+	/*
+	 * The main loop doesn't sleep, the flag is instead checked periodically
+	 * in ProcessSingleRelationFork. Waiting for concurrent transactions to
+	 * end does however sleep on the latch, so it must be set in order to
+	 * wake up that wait.
+	 */
+	SetLatch(MyLatch);
+
+	/* Leave errno as we found it for the interrupted code */
+	errno = saved_errno;
+}
+
+/*
+ * WaitForAllTransactionsToFinish
+ *		Blocks awaiting all current transactions to finish
+ *
+ * Returns when all transactions which are active at the call of the function
+ * have ended. If the postmaster dies while waiting, the process exits with a
+ * FATAL error instead of returning.
+ *
+ * NB: this will return early, with the abort flag set, if aborted by SIGINT
+ * or if the target state is changed while we're running.
+ */
+static void
+WaitForAllTransactionsToFinish(void)
+{
+	TransactionId waitforxid;
+
+	/*
+	 * Any transaction with an xid preceding the next xid to be assigned was
+	 * started before this point, and is thus one we need to wait for.
+	 */
+	LWLockAcquire(XidGenLock, LW_SHARED);
+	waitforxid = XidFromFullTransactionId(TransamVariables->nextXid);
+	LWLockRelease(XidGenLock);
+
+	while (TransactionIdPrecedes(GetOldestActiveTransactionId(false, true), waitforxid))
+	{
+		/*
+		 * The fixed part of the message below is 57 characters and the xid
+		 * can add another 10, so 64 bytes would truncate it.
+		 */
+		char		activity[128];
+		int			rc;
+
+		/* Oldest running xid is older than us, so wait */
+		snprintf(activity,
+				 sizeof(activity),
+				 "Waiting for current transactions to finish (waiting for %u)",
+				 waitforxid);
+		pgstat_report_activity(STATE_RUNNING, activity);
+
+		/* Retry every 3 seconds */
+		ResetLatch(MyLatch);
+		rc = WaitLatch(MyLatch,
+					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+					   3000,
+					   WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION);
+
+		/*
+		 * If the postmaster died we won't be able to enable checksums
+		 * cluster-wide so abort and hope to continue when restarted.
+		 */
+		if (rc & WL_POSTMASTER_DEATH)
+			ereport(FATAL,
+					errmsg("postmaster exited during data checksum processing"),
+					errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
+
+		/*
+		 * Nothing else in this loop services pending interrupts, so without
+		 * this a termination request would go unanswered for as long as an
+		 * old transaction stays open.
+		 */
+		CHECK_FOR_INTERRUPTS();
+
+		/* A changed target operation is handled the same way as an abort */
+		LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
+		if (DataChecksumsWorkerShmem->launch_operation != operation)
+			abort_requested = true;
+		LWLockRelease(DataChecksumsWorkerLock);
+		if (abort_requested)
+			break;
+	}
+
+	pgstat_report_activity(STATE_IDLE, NULL);
+	return;
+}
+
+/*
+ * DataChecksumsWorkerLauncherMain
+ *
+ * Main function for launching dynamic background workers for processing data
+ * checksums in databases. This function has the bgworker management, with
+ * ProcessAllDatabases being responsible for looping over the databases and
+ * initiating processing.
+ */ +void +DataChecksumsWorkerLauncherMain(Datum arg) +{ + on_shmem_exit(launcher_exit, 0); + + ereport(DEBUG1, + errmsg("background worker \"datachecksumsworker\" launcher started")); + + pqsignal(SIGTERM, die); + pqsignal(SIGINT, launcher_cancel_handler); + + BackgroundWorkerUnblockSignals(); + + MyBackendType = B_DATACHECKSUMSWORKER_LAUNCHER; + init_ps_display(NULL); + + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + + if (DataChecksumsWorkerShmem->launcher_running) + { + /* Launcher was already running, let it finish */ + LWLockRelease(DataChecksumsWorkerLock); + return; + } + + launcher_running = true; + + /* + * Initialize a connection to shared catalogs only. + */ + BackgroundWorkerInitializeConnectionByOid(InvalidOid, InvalidOid, 0); + + operation = DataChecksumsWorkerShmem->launch_operation; + DataChecksumsWorkerShmem->launcher_running = true; + DataChecksumsWorkerShmem->operation = operation; + DataChecksumsWorkerShmem->cost_delay = DataChecksumsWorkerShmem->launch_cost_delay; + DataChecksumsWorkerShmem->cost_limit = DataChecksumsWorkerShmem->launch_cost_limit; + DataChecksumsWorkerShmem->immediate_checkpoint = DataChecksumsWorkerShmem->launch_fast; + LWLockRelease(DataChecksumsWorkerLock); + + /* + * The target state can change while we are busy enabling/disabling + * checksums, if the user calls pg_disable/enable_data_checksums() before + * we are finished with the previous request. In that case, we will loop + * back here, to process the new request. + */ +again: + + pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS, + InvalidOid); + + if (operation == ENABLE_DATACHECKSUMS) + { + /* + * If we are asked to enable checksums in a cluster which already has + * checksums enabled, exit immediately as there is nothing more to do. + * Hold interrupts to make sure state doesn't change during checking. 
+ */ + HOLD_INTERRUPTS(); + if (DataChecksumsNeedVerify()) + { + RESUME_INTERRUPTS(); + goto done; + } + RESUME_INTERRUPTS(); + + /* + * Set the state to inprogress-on and wait on the procsignal barrier. + */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_ENABLING); + SetDataChecksumsOnInProgress(); + + /* + * All backends are now in inprogress-on state and are writing data + * checksums. Start processing all data at rest. + */ + if (!ProcessAllDatabases(DataChecksumsWorkerShmem->immediate_checkpoint)) + { + /* + * If the target state changed during processing then it's not a + * failure, so restart processing instead. + */ + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + if (DataChecksumsWorkerShmem->launch_operation != operation) + { + LWLockRelease(DataChecksumsWorkerLock); + goto done; + } + LWLockRelease(DataChecksumsWorkerLock); + ereport(ERROR, + errmsg("unable to enable data checksums in cluster")); + } + + /* + * Data checksums have been set on all pages, set the state to on in + * order to instruct backends to validate checksums on reading. + */ + SetDataChecksumsOn(); + } + else + { + int flags; + + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_DISABLING); + SetDataChecksumsOff(); + + flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT; + if (DataChecksumsWorkerShmem->immediate_checkpoint) + flags = flags | CHECKPOINT_FAST; + RequestCheckpoint(flags); + } + +done: + + /* + * All done. But before we exit, check if the target state was changed + * while we were running. In that case we will have to start all over + * again. 
+ */ + LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE); + if (DataChecksumsWorkerShmem->launch_operation != operation) + { + DataChecksumsWorkerShmem->operation = DataChecksumsWorkerShmem->launch_operation; + operation = DataChecksumsWorkerShmem->launch_operation; + DataChecksumsWorkerShmem->cost_delay = DataChecksumsWorkerShmem->launch_cost_delay; + DataChecksumsWorkerShmem->cost_limit = DataChecksumsWorkerShmem->launch_cost_limit; + LWLockRelease(DataChecksumsWorkerLock); + goto again; + } + + /* Shut down progress reporting as we are done */ + pgstat_progress_end_command(); + + launcher_running = false; + DataChecksumsWorkerShmem->launcher_running = false; + LWLockRelease(DataChecksumsWorkerLock); +} + +/* + * ProcessAllDatabases + * Compute the list of all databases and process checksums in each + * + * This will repeatedly generate a list of databases to process for enabling + * checksums. Until no new databases are found, this will loop around computing + * a new list and comparing it to the already seen ones. + * + * If immediate_checkpoint is set to true then a CHECKPOINT_FAST will be + * issued. This is useful for testing but should be avoided in production use + * as it may affect cluster performance drastically. + */ +static bool +ProcessAllDatabases(bool immediate_checkpoint) +{ + List *DatabaseList; + HTAB *ProcessedDatabases = NULL; + HASHCTL hash_ctl; + bool found_failed = false; + int flags; + + /* Initialize a hash tracking all processed databases */ + memset(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(DataChecksumsWorkerResultEntry); + ProcessedDatabases = hash_create("Processed databases", + 64, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + + /* + * Set up so first run processes shared catalogs, but not once in every + * db. + */ + DataChecksumsWorkerShmem->process_shared_catalogs = true; + + /* + * Get a list of all databases to process. 
This may include databases that + * were created during our runtime. Since a database can be created as a + * copy of any other database (which may not have existed in our last + * run), we have to repeat this loop until no new databases show up in the + * list. + */ + DatabaseList = BuildDatabaseList(); + + /* Allow a test case to modify the initial list of databases */ + INJECTION_POINT("datachecksumsworker-initial-dblist", DatabaseList); + + /* + * Update progress reporting with the total number of databases we need to + * process. This number should not be changed during processing, the + * columns for processed databases is instead increased such that it can + * be compared against the total. + */ + { + const int index[] = { + PROGRESS_DATACHECKSUMS_DBS_TOTAL, + PROGRESS_DATACHECKSUMS_DBS_DONE, + PROGRESS_DATACHECKSUMS_RELS_TOTAL, + PROGRESS_DATACHECKSUMS_RELS_DONE, + PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL, + PROGRESS_DATACHECKSUMS_BLOCKS_DONE, + }; + + int64 vals[6]; + + vals[0] = list_length(DatabaseList); + vals[1] = 0; + + /* translated to NULL */ + vals[2] = -1; + vals[3] = -1; + vals[4] = -1; + vals[5] = -1; + + pgstat_progress_update_multi_param(6, index, vals); + } + + while (true) + { + int processed_databases = 0; + + foreach_ptr(DataChecksumsWorkerDatabase, db, DatabaseList) + { + DataChecksumsWorkerResult result; + DataChecksumsWorkerResultEntry *entry; + bool found; + + /* + * Check if this database has been processed already, and if so + * whether it should be retried or skipped. + */ + entry = (DataChecksumsWorkerResultEntry *) hash_search(ProcessedDatabases, &db->dboid, + HASH_FIND, NULL); + + if (entry) + { + if (entry->result == DATACHECKSUMSWORKER_RETRYDB) + { + /* + * Limit the number of retries to avoid infinite looping + * in case there simply won't be enough workers in the + * cluster to finish this operation. 
+ */ + if (entry->retries > DATACHECKSUMSWORKER_MAX_DB_RETRIES) + entry->result = DATACHECKSUMSWORKER_FAILED; + } + + /* Skip if this database has been processed already */ + if (entry->result != DATACHECKSUMSWORKER_RETRYDB) + continue; + } + + result = ProcessDatabase(db); + processed_databases++; + + /* + * Update the number of processed databases in the progress + * report. + */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE, + processed_databases); + + /* Allow a test process to alter the result of the operation */ + INJECTION_POINT("datachecksumsworker-fail-db", &result); + + if (result == DATACHECKSUMSWORKER_SUCCESSFUL) + { + /* + * If one database has completed shared catalogs, we don't + * have to process them again. + */ + if (DataChecksumsWorkerShmem->process_shared_catalogs) + DataChecksumsWorkerShmem->process_shared_catalogs = false; + } + else if (result == DATACHECKSUMSWORKER_ABORTED) + { + /* Abort flag set, so exit the whole process */ + return false; + } + + entry = hash_search(ProcessedDatabases, &db->dboid, HASH_ENTER, &found); + entry->dboid = db->dboid; + entry->result = result; + if (!found) + entry->retries = 0; + else + entry->retries++; + } + + elog(DEBUG1, + "%i databases processed for data checksum enabling, %s", + processed_databases, + (processed_databases ? "process with restart" : "process completed")); + + FreeDatabaseList(DatabaseList); + + /* + * If no databases were processed in this run of the loop, we have now + * finished all databases and no concurrently created ones can exist. + */ + if (processed_databases == 0) + break; + + /* + * Re-generate the list of databases for another pass. Since we wait + * for all pre-existing transactions finish, this way we can be + * certain that there are no databases left without checksums. + */ + WaitForAllTransactionsToFinish(); + DatabaseList = BuildDatabaseList(); + } + + /* + * ProcessedDatabases now has all databases and the results of their + * processing. 
Failure to enable checksums for a database can be because + * they actually failed for some reason, or because the database was + * dropped between us getting the database list and trying to process it. + * Get a fresh list of databases to detect the second case where the + * database was dropped before we had started processing it. If a database + * still exists, but enabling checksums failed then we fail the entire + * checksumming process and exit with an error. + */ + WaitForAllTransactionsToFinish(); + DatabaseList = BuildDatabaseList(); + + foreach_ptr(DataChecksumsWorkerDatabase, db, DatabaseList) + { + DataChecksumsWorkerResultEntry *entry; + bool found; + + entry = hash_search(ProcessedDatabases, (void *) &db->dboid, + HASH_FIND, &found); + + /* + * We are only interested in the processed databases which failed, and + * where the failed database still exists. This indicates that + * enabling checksums actually failed, and not that the failure was + * due to the db being concurrently dropped. + */ + if (found && entry->result == DATACHECKSUMSWORKER_FAILED) + { + ereport(WARNING, + errmsg("failed to enable data checksums in \"%s\"", db->dbname)); + found_failed = found; + continue; + } + } + + FreeDatabaseList(DatabaseList); + + if (found_failed) + { + /* Disable checksums on cluster, because we failed */ + SetDataChecksumsOff(); + /* Force a checkpoint to make everything consistent */ + flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT; + if (immediate_checkpoint) + flags = flags | CHECKPOINT_FAST; + RequestCheckpoint(flags); + ereport(ERROR, + errmsg("data checksums failed to get enabled in all databases, aborting"), + errhint("The server log might have more information on the cause of the error.")); + } + + /* + * When enabling checksums, we have to wait for a checkpoint for the + * checksums to change from in-progress to on. 
+ */ + pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE, + PROGRESS_DATACHECKSUMS_PHASE_WAITING_CHECKPOINT); + + /* + * Force a checkpoint to get everything out to disk. The use of immediate + * checkpoints is for running tests, as they would otherwise not execute + * in such a way that they can reliably be placed under timeout control. + */ + flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT; + if (immediate_checkpoint) + flags = flags | CHECKPOINT_FAST; + RequestCheckpoint(flags); + + return true; +} + +/* + * DataChecksumsWorkerShmemSize + * Compute required space for datachecksumsworker-related shared memory + */ +Size +DataChecksumsWorkerShmemSize(void) +{ + Size size; + + size = sizeof(DataChecksumsWorkerShmemStruct); + size = MAXALIGN(size); + + return size; +} + +/* + * DataChecksumsWorkerShmemInit + * Allocate and initialize datachecksumsworker-related shared memory + */ +void +DataChecksumsWorkerShmemInit(void) +{ + bool found; + + DataChecksumsWorkerShmem = (DataChecksumsWorkerShmemStruct *) + ShmemInitStruct("DataChecksumsWorker Data", + DataChecksumsWorkerShmemSize(), + &found); + + if (!found) + { + MemSet(DataChecksumsWorkerShmem, 0, DataChecksumsWorkerShmemSize()); + + /* + * Even if this is a redundant assignment, we want to be explicit + * about our intent for readability, since we want to be able to query + * this state in case of restartability. + */ + DataChecksumsWorkerShmem->launch_operation = false; + DataChecksumsWorkerShmem->launcher_running = false; + DataChecksumsWorkerShmem->launch_fast = false; + } +} + +/* + * BuildDatabaseList + * Compile a list of all currently available databases in the cluster + * + * This creates the list of databases for the datachecksumsworker workers to + * add checksums to. If the caller wants to ensure that no concurrently + * running CREATE DATABASE calls exist, this needs to be preceded by a call + * to WaitForAllTransactionsToFinish(). 
+ */
+static List *
+BuildDatabaseList(void)
+{
+	MemoryContext callerctx = CurrentMemoryContext;
+	List	   *result = NIL;
+	Relation	pg_database;
+	TableScanDesc scandesc;
+	HeapTuple	tuple;
+
+	StartTransactionCommand();
+
+	pg_database = table_open(DatabaseRelationId, AccessShareLock);
+	scandesc = table_beginscan_catalog(pg_database, 0, NULL);
+
+	for (tuple = heap_getnext(scandesc, ForwardScanDirection);
+		 HeapTupleIsValid(tuple);
+		 tuple = heap_getnext(scandesc, ForwardScanDirection))
+	{
+		Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);
+		DataChecksumsWorkerDatabase *item;
+		MemoryContext scanctx;
+
+		/*
+		 * Allocate the list and its entries in the memory context which was
+		 * current when we were called, rather than in whichever context is
+		 * current during the scan.
+		 */
+		scanctx = MemoryContextSwitchTo(callerctx);
+
+		item = (DataChecksumsWorkerDatabase *) palloc0(sizeof(DataChecksumsWorkerDatabase));
+		item->dboid = dbform->oid;
+		item->dbname = pstrdup(NameStr(dbform->datname));
+		result = lappend(result, item);
+
+		MemoryContextSwitchTo(scanctx);
+	}
+
+	table_endscan(scandesc);
+	table_close(pg_database, AccessShareLock);
+
+	CommitTransactionCommand();
+
+	return result;
+}
+
+/*
+ * FreeDatabaseList
+ *		Release a list built by BuildDatabaseList, including the copied
+ *		database names
+ */
+static void
+FreeDatabaseList(List *dblist)
+{
+	if (dblist == NULL)
+		return;
+
+	/* The names are separate allocations, so they go first */
+	foreach_ptr(DataChecksumsWorkerDatabase, item, dblist)
+	{
+		if (item->dbname)
+			pfree(item->dbname);
+	}
+
+	list_free_deep(dblist);
+}
+
+/*
+ * BuildRelationList
+ *		Compile a list of relations in the database
+ *
+ * Returns a list of OIDs for the requested relation types. If temp_relations
+ * is true then only temporary relations are returned. If temp_relations is
+ * false then non-temporary relations which have storage are returned. If
+ * include_shared is true then shared relations are included as well in a
+ * non-temporary list. include_shared has no relevance when building a list of
+ * temporary relations.
+ */
+static List *
+BuildRelationList(bool temp_relations, bool include_shared)
+{
+	MemoryContext callerctx = CurrentMemoryContext;
+	List	   *reloids = NIL;
+	Relation	pg_class;
+	TableScanDesc scandesc;
+	HeapTuple	tuple;
+
+	StartTransactionCommand();
+
+	pg_class = table_open(RelationRelationId, AccessShareLock);
+	scandesc = table_beginscan_catalog(pg_class, 0, NULL);
+
+	for (tuple = heap_getnext(scandesc, ForwardScanDirection);
+		 HeapTupleIsValid(tuple);
+		 tuple = heap_getnext(scandesc, ForwardScanDirection))
+	{
+		Form_pg_class classform = (Form_pg_class) GETSTRUCT(tuple);
+		bool		is_temp = (classform->relpersistence == RELPERSISTENCE_TEMP);
+		MemoryContext scanctx;
+
+		/*
+		 * A temp relation list holds only temporary relations, and any other
+		 * list holds none of them.
+		 */
+		if (is_temp != temp_relations)
+			continue;
+
+		/*
+		 * For non-temporary relations, skip the ones without storage as well
+		 * as shared relations which the caller didn't ask for.
+		 */
+		if (!is_temp)
+		{
+			if (!RELKIND_HAS_STORAGE(classform->relkind))
+				continue;
+
+			if (classform->relisshared && !include_shared)
+				continue;
+		}
+
+		/* Build the list in the memory context of the caller */
+		scanctx = MemoryContextSwitchTo(callerctx);
+		reloids = lappend_oid(reloids, classform->oid);
+		MemoryContextSwitchTo(scanctx);
+	}
+
+	table_endscan(scandesc);
+	table_close(pg_class, AccessShareLock);
+
+	CommitTransactionCommand();
+
+	return reloids;
+}
+
+/*
+ * DataChecksumsWorkerMain
+ *
+ * Main function for enabling checksums in a single database. This is the
+ * function set as the bgw_function_name in the dynamic background worker
+ * process initiated for each database by the worker launcher. After enabling
+ * data checksums in each applicable relation in the database, it will wait for
+ * all temporary relations that were present when the function started to
+ * disappear before returning. This is required since we cannot rewrite
+ * existing temporary relations with data checksums.
+ */
+void
+DataChecksumsWorkerMain(Datum arg)
+{
+	Oid			dboid = DatumGetObjectId(arg);
+	List	   *RelationList = NIL;
+	List	   *InitialTempTableList = NIL;
+	BufferAccessStrategy strategy;
+	bool		aborted = false;
+	int64		rels_done;
+
+	operation = ENABLE_DATACHECKSUMS;
+
+	pqsignal(SIGTERM, die);
+
+	BackgroundWorkerUnblockSignals();
+
+	MyBackendType = B_DATACHECKSUMSWORKER_WORKER;
+	init_ps_display(NULL);
+
+	BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid,
+											  BGWORKER_BYPASS_ALLOWCONN);
+
+	/* worker will have a separate entry in pg_stat_progress_data_checksums */
+	pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS,
+								  InvalidOid);
+
+	/*
+	 * Get a list of all temp tables present as we start in this database. We
+	 * need to wait until they are all gone until we are done, since we cannot
+	 * access these relations and modify them.
+	 */
+	InitialTempTableList = BuildRelationList(true, false);
+
+	/*
+	 * Enable vacuum cost delay, if any. The delay and limit are taken from
+	 * the values which the launcher stored in shared memory, and cost-based
+	 * delay is only made active when the delay is larger than zero. The cost
+	 * balance and the per-page costs are all reset to zero.
+	 */
+	Assert(DataChecksumsWorkerShmem->operation == ENABLE_DATACHECKSUMS);
+	VacuumCostDelay = DataChecksumsWorkerShmem->cost_delay;
+	VacuumCostLimit = DataChecksumsWorkerShmem->cost_limit;
+	VacuumCostActive = (VacuumCostDelay > 0);
+	VacuumCostBalance = 0;
+	VacuumCostPageHit = 0;
+	VacuumCostPageMiss = 0;
+	VacuumCostPageDirty = 0;
+
+	/*
+	 * Create and set the vacuum strategy as our buffer strategy.
+	 */
+	strategy = GetAccessStrategy(BAS_VACUUM);
+
+	RelationList = BuildRelationList(false,
+									 DataChecksumsWorkerShmem->process_shared_catalogs);
+
+	/* Update the total number of relations to be processed in this DB. */
+	{
+		const int	index[] = {
+			PROGRESS_DATACHECKSUMS_RELS_TOTAL,
+			PROGRESS_DATACHECKSUMS_RELS_DONE
+		};
+
+		int64		vals[2];
+
+		vals[0] = list_length(RelationList);
+		vals[1] = 0;
+
+		pgstat_progress_update_multi_param(2, index, vals);
+	}
+
+	/*
+	 * Process the relations one at a time, updating the progress counter
+	 * after each of them. A false return from ProcessSingleRelationByOid is
+	 * treated as an abort and ends the loop early.
+	 */
+	rels_done = 0;
+	foreach_oid(reloid, RelationList)
+	{
+		if (!ProcessSingleRelationByOid(reloid, strategy))
+		{
+			aborted = true;
+			break;
+		}
+
+		pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_RELS_DONE,
+									 ++rels_done);
+	}
+	list_free(RelationList);
+
+	if (aborted)
+	{
+		DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_ABORTED;
+		ereport(DEBUG1,
+				errmsg("data checksum processing aborted in database OID %u",
+					   dboid));
+		return;
+	}
+
+	/* The worker is about to wait for temporary tables to go away. */
+	pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
+								 PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL);
+
+	/*
+	 * Wait for all temp tables that existed when we started to go away. This
+	 * is necessary since we cannot "reach" them to enable checksums. Any temp
+	 * tables created after we started will already have checksums in them
+	 * (due to the "inprogress-on" state), so no need to wait for those.
+	 */
+	for (;;)
+	{
+		List	   *CurrentTempTables;
+		int			numleft;
+		char		activity[64];
+
+		CurrentTempTables = BuildRelationList(true, false);
+		numleft = 0;
+		foreach_oid(tmptbloid, InitialTempTableList)
+		{
+			if (list_member_oid(CurrentTempTables, tmptbloid))
+				numleft++;
+		}
+		list_free(CurrentTempTables);
+
+		INJECTION_POINT("datachecksumsworker-fake-temptable-wait", &numleft);
+
+		if (numleft == 0)
+			break;
+
+		/*
+		 * At least one temp table is left to wait for, indicate in pgstat
+		 * activity and progress reporting.
+		 */
+		snprintf(activity,
+				 sizeof(activity),
+				 "Waiting for %d temp tables to be removed", numleft);
+		pgstat_report_activity(STATE_RUNNING, activity);
+
+		/*
+		 * Retry every 3 seconds, or sooner if the latch gets set. The wait
+		 * uses WL_EXIT_ON_PM_DEATH, so postmaster death is not handled here.
+		 * After waking up, the requested operation is re-read under the lock
+		 * and a change of it is treated as an abort of this worker.
+		 */
+		ResetLatch(MyLatch);
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+						 3000,
+						 WAIT_EVENT_CHECKSUM_ENABLE_TEMPTABLE_WAIT);
+
+		LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+		aborted = DataChecksumsWorkerShmem->launch_operation != operation;
+		LWLockRelease(DataChecksumsWorkerLock);
+
+		if (aborted || abort_requested)
+		{
+			DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_ABORTED;
+			ereport(DEBUG1,
+					errmsg("data checksum processing aborted in database OID %u",
+						   dboid));
+			return;
+		}
+	}
+
+	list_free(InitialTempTableList);
+
+	/* worker done */
+	pgstat_progress_end_command();
+
+	DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_SUCCESSFUL;
+}
diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c
index bf6b55ee8304..955df32be5d0 100644
--- a/src/backend/postmaster/launch_backend.c
+++ b/src/backend/postmaster/launch_backend.c
@@ -204,6 +204,9 @@ static child_process_kind child_process_kinds[] = {
 	[B_WAL_SUMMARIZER] = {"wal_summarizer", WalSummarizerMain, true},
 	[B_WAL_WRITER] = {"wal_writer", WalWriterMain, true},
 
+	[B_DATACHECKSUMSWORKER_LAUNCHER] = {"datachecksum launcher", NULL, false},
+	[B_DATACHECKSUMSWORKER_WORKER] = {"datachecksum worker", NULL, false},
+
 	[B_LOGGER] = {"syslogger", SysLoggerMain, false},
 };
 
diff --git a/src/backend/postmaster/meson.build b/src/backend/postmaster/meson.build
index 0008603cfee9..ce10ef1059a8 100644
--- a/src/backend/postmaster/meson.build
+++ b/src/backend/postmaster/meson.build
@@ -6,6 +6,7 @@ backend_sources += files(
   'bgworker.c',
   'bgwriter.c',
   'checkpointer.c',
+  'datachecksumsworker.c',
   'fork_process.c',
   'interrupt.c',
   'launch_backend.c',
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 
e1d643b013d7..3d15a894c3a4 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -2983,6 +2983,11 @@ PostmasterStateMachine(void) B_INVALID, B_STANDALONE_BACKEND); + /* also add checksumming processes */ + remainMask = btmask_add(remainMask, + B_DATACHECKSUMSWORKER_LAUNCHER, + B_DATACHECKSUMSWORKER_WORKER); + /* All types should be included in targetMask or remainMask */ Assert((remainMask.mask | targetMask.mask) == BTYPE_MASK_ALL.mask); } diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index cc03f0706e9c..f9f06821a8f9 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -186,6 +186,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) case XLOG_FPW_CHANGE: case XLOG_FPI_FOR_HINT: case XLOG_FPI: + case XLOG_CHECKSUMS: case XLOG_OVERWRITE_CONTRECORD: case XLOG_CHECKPOINT_REDO: break; diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 2fa045e6b0f6..44213d140aee 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -30,6 +30,8 @@ #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" +#include "postmaster/datachecksumsworker.h" +#include "postmaster/postmaster.h" #include "postmaster/walsummarizer.h" #include "replication/logicallauncher.h" #include "replication/origin.h" @@ -150,6 +152,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); size = add_size(size, AioShmemSize()); + size = add_size(size, DataChecksumsWorkerShmemSize()); /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); @@ -332,6 +335,7 @@ CreateOrAttachShmemStructs(void) PgArchShmemInit(); ApplyLauncherShmemInit(); SlotSyncShmemInit(); + DataChecksumsWorkerShmemInit(); /* * Set up other modules 
that need some shared memory space diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 087821311cce..6881c6f4069a 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -18,6 +18,7 @@ #include #include "access/parallel.h" +#include "access/xlog.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -576,6 +577,18 @@ ProcessProcSignalBarrier(void) case PROCSIGNAL_BARRIER_SMGRRELEASE: processed = ProcessBarrierSmgrRelease(); break; + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON: + processed = AbsorbChecksumsOnInProgressBarrier(); + break; + case PROCSIGNAL_BARRIER_CHECKSUM_ON: + processed = AbsorbChecksumsOnBarrier(); + break; + case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF: + processed = AbsorbChecksumsOffInProgressBarrier(); + break; + case PROCSIGNAL_BARRIER_CHECKSUM_OFF: + processed = AbsorbChecksumsOffBarrier(); + break; } /* diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README index e30d7ac59adc..73c36a639086 100644 --- a/src/backend/storage/page/README +++ b/src/backend/storage/page/README @@ -10,7 +10,9 @@ http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed 2010/12/22 on -hackers list. Current implementation requires this be enabled system-wide at initdb time, or -by using the pg_checksums tool on an offline cluster. +by using the pg_checksums tool on an offline cluster. Checksums can also be +enabled at runtime using pg_enable_data_checksums(), and disabled by using +pg_disable_data_checksums(). The checksum is not valid at all times on a data page!! 
The checksum is valid when the page leaves the shared pool and is checked diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index dbb49ed9197d..19cf6512e520 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -107,7 +107,7 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail */ if (!PageIsNew(page)) { - if (DataChecksumsEnabled()) + if (DataChecksumsNeedVerify()) { checksum = pg_checksum_page(page, blkno); @@ -1511,7 +1511,7 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) static char *pageCopy = NULL; /* If we don't need a checksum, just return the passed-in data */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) return page; /* @@ -1541,7 +1541,7 @@ void PageSetChecksumInplace(Page page, BlockNumber blkno) { /* If we don't need a checksum, just return */ - if (PageIsNew(page) || !DataChecksumsEnabled()) + if (PageIsNew(page) || !DataChecksumsNeedWrite()) return; ((PageHeader) page)->pd_checksum = pg_checksum_page(page, blkno); diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 8714a85e2d93..edc2512d79f7 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -378,6 +378,8 @@ pgstat_tracks_backend_bktype(BackendType bktype) case B_CHECKPOINTER: case B_IO_WORKER: case B_STARTUP: + case B_DATACHECKSUMSWORKER_LAUNCHER: + case B_DATACHECKSUMSWORKER_WORKER: return false; case B_AUTOVAC_WORKER: diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index 13ae57ed6498..a290d56f4096 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -362,6 +362,8 @@ pgstat_tracks_io_bktype(BackendType bktype) case B_LOGGER: return false; + case B_DATACHECKSUMSWORKER_LAUNCHER: + case B_DATACHECKSUMSWORKER_WORKER: case 
B_AUTOVAC_LAUNCHER: case B_AUTOVAC_WORKER: case B_BACKEND: diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 5427da5bc1b1..7f26d78cb77c 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -116,6 +116,9 @@ CHECKPOINT_DELAY_COMPLETE "Waiting for a backend that blocks a checkpoint from c CHECKPOINT_DELAY_START "Waiting for a backend that blocks a checkpoint from starting." CHECKPOINT_DONE "Waiting for a checkpoint to complete." CHECKPOINT_START "Waiting for a checkpoint to start." +CHECKSUM_ENABLE_STARTCONDITION "Waiting for data checksums enabling to start." +CHECKSUM_ENABLE_FINISHCONDITION "Waiting for data checksums to be enabled." +CHECKSUM_ENABLE_TEMPTABLE_WAIT "Waiting for temporary tables to be dropped for data checksums to be enabled." EXECUTE_GATHER "Waiting for activity from a child process while executing a Gather plan node." HASH_BATCH_ALLOCATE "Waiting for an elected Parallel Hash participant to allocate a hash table." HASH_BATCH_ELECT "Waiting to elect a Parallel Hash participant to allocate a hash table." @@ -352,6 +355,7 @@ DSMRegistry "Waiting to read or update the dynamic shared memory registry." InjectionPoint "Waiting to read or update information related to injection points." SerialControl "Waiting to read or update shared pg_serial state." AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." +DataChecksumsWorker "Waiting for data checksumsworker." 
# # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index c756c2bebaaa..f4e264ebf33c 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -274,6 +274,8 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS) cmdtype = PROGRESS_COMMAND_BASEBACKUP; else if (pg_strcasecmp(cmd, "COPY") == 0) cmdtype = PROGRESS_COMMAND_COPY; + else if (pg_strcasecmp(cmd, "DATACHECKSUMS") == 0) + cmdtype = PROGRESS_COMMAND_DATACHECKSUMS; else ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -1146,9 +1148,6 @@ pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS) int64 result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else @@ -1164,9 +1163,6 @@ pg_stat_get_db_checksum_last_failure(PG_FUNCTION_ARGS) TimestampTz result; PgStat_StatDBEntry *dbentry; - if (!DataChecksumsEnabled()) - PG_RETURN_NULL(); - if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL) result = 0; else diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 545d1e90fbd4..34cce2ce0bed 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -293,9 +293,18 @@ GetBackendTypeDesc(BackendType backendType) case B_CHECKPOINTER: backendDesc = gettext_noop("checkpointer"); break; + case B_IO_WORKER: backendDesc = gettext_noop("io worker"); break; + + case B_DATACHECKSUMSWORKER_LAUNCHER: + backendDesc = "datachecksumsworker launcher"; + break; + case B_DATACHECKSUMSWORKER_WORKER: + backendDesc = "datachecksumsworker worker"; + break; + case B_LOGGER: backendDesc = gettext_noop("logger"); break; @@ -895,7 +904,8 @@ InitializeSessionUserIdStandalone(void) * workers, in slot sync worker and in background workers. 
*/ Assert(!IsUnderPostmaster || AmAutoVacuumWorkerProcess() || - AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess()); + AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess() || + AmDataChecksumsWorkerProcess()); /* call only once */ Assert(!OidIsValid(AuthenticatedUserId)); diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 641e535a73c7..589e7eab9e84 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -750,6 +750,24 @@ InitPostgres(const char *in_dbname, Oid dboid, ProcSignalInit(MyCancelKey, MyCancelKeyLength); + /* + * Initialize a local cache of the data_checksum_version, to be updated by + * the procsignal-based barriers. + * + * This intentionally happens after initializing the procsignal, otherwise + * we might miss a state change. This means we can get a barrier for the + * state we've just initialized - but it can happen only once. + * + * The postmaster (which is what gets forked into the new child process) + * does not handle barriers, therefore it may not have the current value + * of LocalDataChecksumVersion value (it'll have the value read from the + * control file, which may be arbitrarily old). + * + * NB: Even if the postmaster handled barriers, the value might still be + * stale, as it might have changed after this process forked. + */ + InitLocalDataChecksumVersion(); + /* * Also set up timeout handlers needed for backend operation. We need * these in every case except bootstrap. @@ -878,7 +896,7 @@ InitPostgres(const char *in_dbname, Oid dboid, errhint("You should immediately run CREATE USER \"%s\" SUPERUSER;.", username != NULL ? 
username : "postgres"))); } - else if (AmBackgroundWorkerProcess()) + else if (AmBackgroundWorkerProcess() || AmDataChecksumsWorkerProcess()) { if (username == NULL && !OidIsValid(useroid)) { diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index f137129209f6..36fba8496df2 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -491,6 +491,14 @@ static const struct config_enum_entry file_copy_method_options[] = { {NULL, 0, false} }; +static const struct config_enum_entry data_checksums_options[] = { + {"on", PG_DATA_CHECKSUM_VERSION, true}, + {"off", PG_DATA_CHECKSUM_OFF, true}, + {"inprogress-on", PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION, true}, + {"inprogress-off", PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION, true}, + {NULL, 0, false} +}; + /* * Options for enum values stored in other modules */ @@ -616,7 +624,6 @@ static int shared_memory_size_mb; static int shared_memory_size_in_huge_pages; static int wal_block_size; static int num_os_semaphores; -static bool data_checksums; static bool integer_datetimes; #ifdef USE_ASSERT_CHECKING @@ -2043,17 +2050,6 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, - { - {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS, - gettext_noop("Shows whether data checksums are turned on for this cluster."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED - }, - &data_checksums, - false, - NULL, NULL, NULL - }, - { {"syslog_sequence_numbers", PGC_SIGHUP, LOGGING_WHERE, gettext_noop("Add sequence number to syslog messages to avoid duplicate suppression."), @@ -5489,6 +5485,16 @@ struct config_enum ConfigureNamesEnum[] = DEFAULT_IO_METHOD, io_method_options, NULL, assign_io_method, NULL }, + { + {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS, + gettext_noop("Shows whether data checksums are turned on for this cluster."), + NULL, + GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED + }, + &data_checksums, + 
PG_DATA_CHECKSUM_OFF, data_checksums_options, + NULL, NULL, show_data_checksums + }, /* End-of-list marker */ { diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index f20be82862a2..8411cecf3ffb 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -568,7 +568,7 @@ main(int argc, char *argv[]) ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) pg_fatal("cluster must be shut down"); - if (ControlFile->data_checksum_version == 0 && + if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_VERSION && mode == PG_MODE_CHECK) pg_fatal("data checksums are not enabled in cluster"); @@ -576,7 +576,7 @@ main(int argc, char *argv[]) mode == PG_MODE_DISABLE) pg_fatal("data checksums are already disabled in cluster"); - if (ControlFile->data_checksum_version > 0 && + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION && mode == PG_MODE_ENABLE) pg_fatal("data checksums are already enabled in cluster"); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 10de058ce91f..acf5c7b026e7 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -280,6 +280,8 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.oldestCommitTsXid); printf(_("Latest checkpoint's newestCommitTsXid:%u\n"), ControlFile->checkPointCopy.newestCommitTsXid); + printf(_("Latest checkpoint's data_checksum_version:%u\n"), + ControlFile->checkPointCopy.data_checksum_version); printf(_("Time of latest checkpoint: %s\n"), ckpttime_str); printf(_("Fake LSN counter for unlogged rels: %X/%08X\n"), diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index 90cef0864de7..29684e824401 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -15,6 +15,7 @@ #include "access/xlog_internal.h" #include "common/string.h" #include "pg_upgrade.h" +#include "storage/bufpage.h" /* @@ 
-736,6 +737,14 @@ check_control_data(ControlData *oldctrl, * check_for_isn_and_int8_passing_mismatch(). */ + /* + * If data checksums are in any in-progress state then disallow the + * upgrade. The user should either let the process finish, or turn off + * data checksums, before retrying. + */ + if (oldctrl->data_checksum_version > PG_DATA_CHECKSUM_VERSION) + pg_fatal("checksums are being enabled in the old cluster"); + /* * We might eventually allow upgrades from checksum to no-checksum * clusters. diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d12798be3d80..8bcc5aa8a63e 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -56,6 +56,7 @@ extern PGDLLIMPORT int CommitDelay; extern PGDLLIMPORT int CommitSiblings; extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; +extern PGDLLIMPORT int data_checksums; extern PGDLLIMPORT int CheckPointSegments; @@ -117,7 +118,7 @@ extern PGDLLIMPORT int wal_level; * of the bits make it to disk, but the checksum wouldn't match. Also WAL-log * them if forced by wal_log_hints=on. */ -#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints) +#define XLogHintBitIsNeeded() (wal_log_hints || DataChecksumsNeedWrite()) /* Do we need to WAL-log information required only for Hot Standby and logical replication? 
*/ #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA) @@ -229,7 +230,19 @@ extern XLogRecPtr GetXLogWriteRecPtr(void); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); -extern bool DataChecksumsEnabled(void); +extern bool DataChecksumsNeedWrite(void); +extern bool DataChecksumsNeedVerify(void); +extern bool DataChecksumsOnInProgress(void); +extern bool DataChecksumsOffInProgress(void); +extern void SetDataChecksumsOnInProgress(void); +extern void SetDataChecksumsOn(void); +extern void SetDataChecksumsOff(void); +extern bool AbsorbChecksumsOnInProgressBarrier(void); +extern bool AbsorbChecksumsOffInProgressBarrier(void); +extern bool AbsorbChecksumsOnBarrier(void); +extern bool AbsorbChecksumsOffBarrier(void); +extern const char *show_data_checksums(void); +extern void InitLocalDataChecksumVersion(void); extern bool GetDefaultCharSignedness(void); extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern Size XLOGShmemSize(void); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index cc06fc29ab2b..cc78b00fe4cc 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -25,6 +25,7 @@ #include "lib/stringinfo.h" #include "pgtime.h" #include "storage/block.h" +#include "storage/checksum.h" #include "storage/relfilelocator.h" @@ -289,6 +290,12 @@ typedef struct xl_restore_point char rp_name[MAXFNAMELEN]; } xl_restore_point; +/* Information logged when data checksum level is changed */ +typedef struct xl_checksum_state +{ + uint32 new_checksumtype; +} xl_checksum_state; + /* Overwrite of prior contrecord */ typedef struct xl_overwrite_contrecord { diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 63e834a6ce47..a8877fb87d1a 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -62,6 +62,9 @@ typedef struct CheckPoint * set to InvalidTransactionId. 
*/ TransactionId oldestActiveXid; + + /* data checksums at the time of the checkpoint */ + uint32 data_checksum_version; } CheckPoint; /* XLOG info values for XLOG rmgr */ @@ -80,6 +83,7 @@ typedef struct CheckPoint /* 0xC0 is used in Postgres 9.5-11 */ #define XLOG_OVERWRITE_CONTRECORD 0xD0 #define XLOG_CHECKPOINT_REDO 0xE0 +#define XLOG_CHECKSUMS 0xF0 /* @@ -219,7 +223,7 @@ typedef struct ControlFileData bool float8ByVal; /* float8, int8, etc pass-by-value? */ /* Are data pages protected by checksums? Zero if no checksum version */ - uint32 data_checksum_version; + uint32 data_checksum_version; /* persistent */ /* * True if the default signedness of char is "signed" on a platform where diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 118d6da1ace0..c6f4e31a12fe 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12356,6 +12356,25 @@ proname => 'jsonb_subscript_handler', prorettype => 'internal', proargtypes => 'internal', prosrc => 'jsonb_subscript_handler' }, +# data checksum management functions +{ oid => '9258', + descr => 'disable data checksums', + proname => 'pg_disable_data_checksums', provolatile => 'v', prorettype => 'void', + proparallel => 'r', + proargtypes => 'bool', proallargtypes => '{bool}', + proargmodes => '{i}', + proargnames => '{fast}', + prosrc => 'disable_data_checksums' }, + +{ oid => '9257', + descr => 'enable data checksums', + proname => 'pg_enable_data_checksums', provolatile => 'v', prorettype => 'void', + proparallel => 'r', + proargtypes => 'int4 int4 bool', proallargtypes => '{int4,int4,bool}', + proargmodes => '{i,i,i}', + proargnames => '{cost_delay,cost_limit,fast}', + prosrc => 'enable_data_checksums' }, + # collation management functions { oid => '3445', descr => 'import collations from operating system', proname => 'pg_import_system_collations', procost => '100', diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 
1cde4bd9bcf1..cf6de4ef12d6 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -162,4 +162,20 @@ #define PROGRESS_COPY_TYPE_PIPE 3 #define PROGRESS_COPY_TYPE_CALLBACK 4 +/* Progress parameters for PROGRESS_DATACHECKSUMS */ +#define PROGRESS_DATACHECKSUMS_PHASE 0 +#define PROGRESS_DATACHECKSUMS_DBS_TOTAL 1 +#define PROGRESS_DATACHECKSUMS_DBS_DONE 2 +#define PROGRESS_DATACHECKSUMS_RELS_TOTAL 3 +#define PROGRESS_DATACHECKSUMS_RELS_DONE 4 +#define PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL 5 +#define PROGRESS_DATACHECKSUMS_BLOCKS_DONE 6 + +/* Phases of datachecksumsworker operation */ +#define PROGRESS_DATACHECKSUMS_PHASE_ENABLING 0 +#define PROGRESS_DATACHECKSUMS_PHASE_DISABLING 1 +#define PROGRESS_DATACHECKSUMS_PHASE_WAITING 2 +#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL 3 +#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_CHECKPOINT 4 + #endif diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 1bef98471c36..2a0d7b6de420 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -366,6 +366,9 @@ typedef enum BackendType B_WAL_SUMMARIZER, B_WAL_WRITER, + B_DATACHECKSUMSWORKER_LAUNCHER, + B_DATACHECKSUMSWORKER_WORKER, + /* * Logger is not connected to shared memory and does not have a PGPROC * entry. 
@@ -391,6 +394,9 @@ extern PGDLLIMPORT BackendType MyBackendType; #define AmWalSummarizerProcess() (MyBackendType == B_WAL_SUMMARIZER) #define AmWalWriterProcess() (MyBackendType == B_WAL_WRITER) #define AmIoWorkerProcess() (MyBackendType == B_IO_WORKER) +#define AmDataChecksumsWorkerProcess() \ + (MyBackendType == B_DATACHECKSUMSWORKER_LAUNCHER || \ + MyBackendType == B_DATACHECKSUMSWORKER_WORKER) #define AmSpecialWorkerProcess() \ (AmAutoVacuumLauncherProcess() || \ diff --git a/src/include/postmaster/datachecksumsworker.h b/src/include/postmaster/datachecksumsworker.h new file mode 100644 index 000000000000..2cd066fd0feb --- /dev/null +++ b/src/include/postmaster/datachecksumsworker.h @@ -0,0 +1,51 @@ +/*------------------------------------------------------------------------- + * + * datachecksumsworker.h + * header file for data checksum helper background worker + * + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/postmaster/datachecksumsworker.h + * + *------------------------------------------------------------------------- + */ +#ifndef DATACHECKSUMSWORKER_H +#define DATACHECKSUMSWORKER_H + +/* Shared memory */ +extern Size DataChecksumsWorkerShmemSize(void); +extern void DataChecksumsWorkerShmemInit(void); + +/* Possible operations the Datachecksumsworker can perform */ +typedef enum DataChecksumsWorkerOperation +{ + ENABLE_DATACHECKSUMS, + DISABLE_DATACHECKSUMS, + /* TODO: VERIFY_DATACHECKSUMS, */ +} DataChecksumsWorkerOperation; + +/* + * Possible states for a database entry which has been processed. Exported + * here since we want to be able to reference this from injection point tests. 
+ */ +typedef enum +{ + DATACHECKSUMSWORKER_SUCCESSFUL = 0, + DATACHECKSUMSWORKER_ABORTED, + DATACHECKSUMSWORKER_FAILED, + DATACHECKSUMSWORKER_RETRYDB, +} DataChecksumsWorkerResult; + +/* Start the background processes for enabling or disabling checksums */ +void StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op, + int cost_delay, + int cost_limit, + bool fast); + +/* Background worker entrypoints */ +void DataChecksumsWorkerLauncherMain(Datum arg); +void DataChecksumsWorkerMain(Datum arg); + +#endif /* DATACHECKSUMSWORKER_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index aeb67c498c59..30fb0f62d4c0 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -16,6 +16,7 @@ #include "access/xlogdefs.h" #include "storage/block.h" +#include "storage/checksum.h" #include "storage/item.h" #include "storage/off.h" @@ -205,7 +206,6 @@ typedef PageHeaderData *PageHeader; * handling pages. */ #define PG_PAGE_LAYOUT_VERSION 4 -#define PG_DATA_CHECKSUM_VERSION 1 /* ---------------------------------------------------------------- * page support functions diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index 25d13a798d10..b3f368a15b52 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@ -15,6 +15,20 @@ #include "storage/block.h" +/* + * Checksum version 0 is used for when data checksums are disabled (OFF). + * PG_DATA_CHECKSUM_VERSION defines that data checksums are enabled in the + * cluster and PG_DATA_CHECKSUM_INPROGRESS_{ON|OFF}_VERSION defines that data + * checksums are either currently being enabled or disabled. + */ +typedef enum ChecksumType +{ + PG_DATA_CHECKSUM_OFF = 0, + PG_DATA_CHECKSUM_VERSION, + PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION, + PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION +} ChecksumType; + /* * Compute the checksum for a Postgres page. The page must be aligned on a * 4-byte boundary. 
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index 06a1ffd4b08b..b8f7ba0be517 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -85,6 +85,7 @@ PG_LWLOCK(50, DSMRegistry) PG_LWLOCK(51, InjectionPoint) PG_LWLOCK(52, SerialControl) PG_LWLOCK(53, AioWorkerSubmissionQueue) +PG_LWLOCK(54, DataChecksumsWorker) /* * There also exist several built-in LWLock tranches. As with the predefined diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index c6f5ebceefdd..d90d35b1d6fa 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -463,11 +463,11 @@ extern PGDLLIMPORT PGPROC *PreparedXactProcs; * Background writer, checkpointer, WAL writer, WAL summarizer, and archiver * run during normal operation. Startup process and WAL receiver also consume * 2 slots, but WAL writer is launched only after startup has exited, so we - * only need 6 slots. + * only need 6 slots to cover these. The DataChecksums worker and launcher + * can consume 2 slots when data checksums are enabled or disabled. 
*/ #define MAX_IO_WORKERS 32 -#define NUM_AUXILIARY_PROCS (6 + MAX_IO_WORKERS) - +#define NUM_AUXILIARY_PROCS (8 + MAX_IO_WORKERS) /* configurable options */ extern PGDLLIMPORT int DeadlockTimeout; diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index afeeb1ca019f..c54c61e2cd8a 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -54,6 +54,11 @@ typedef enum typedef enum { PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */ + + PROCSIGNAL_BARRIER_CHECKSUM_OFF, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON, + PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF, + PROCSIGNAL_BARRIER_CHECKSUM_ON, } ProcSignalBarrierType; /* diff --git a/src/include/utils/backend_progress.h b/src/include/utils/backend_progress.h index dda813ab4076..c664e92dbfe7 100644 --- a/src/include/utils/backend_progress.h +++ b/src/include/utils/backend_progress.h @@ -28,6 +28,7 @@ typedef enum ProgressCommandType PROGRESS_COMMAND_CREATE_INDEX, PROGRESS_COMMAND_BASEBACKUP, PROGRESS_COMMAND_COPY, + PROGRESS_COMMAND_DATACHECKSUMS, } ProgressCommandType; #define PGSTAT_NUM_PROGRESS_PARAM 20 diff --git a/src/test/Makefile b/src/test/Makefile index 511a72e6238a..278ce3e8a86e 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -12,7 +12,16 @@ subdir = src/test top_builddir = ../.. 
include $(top_builddir)/src/Makefile.global -SUBDIRS = perl postmaster regress isolation modules authentication recovery subscription +SUBDIRS = \ + perl \ + postmaster \ + regress \ + isolation \ + modules \ + authentication \ + recovery \ + subscription \ + checksum ifeq ($(with_icu),yes) SUBDIRS += icu diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 903a8ac151aa..c8f2747b2612 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -17,6 +17,7 @@ SUBDIRS = \ test_aio \ test_binaryheap \ test_bloomfilter \ + test_checksums \ test_copy_callbacks \ test_custom_rmgrs \ test_ddl_deparse \ diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 93be0f57289a..6b4450eb4733 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -16,6 +16,7 @@ subdir('ssl_passphrase_callback') subdir('test_aio') subdir('test_binaryheap') subdir('test_bloomfilter') +subdir('test_checksums') subdir('test_copy_callbacks') subdir('test_custom_rmgrs') subdir('test_ddl_deparse') diff --git a/src/test/modules/test_checksums/.gitignore b/src/test/modules/test_checksums/.gitignore new file mode 100644 index 000000000000..871e943d50e1 --- /dev/null +++ b/src/test/modules/test_checksums/.gitignore @@ -0,0 +1,2 @@ +# Generated by test suite +/tmp_check/ diff --git a/src/test/modules/test_checksums/Makefile b/src/test/modules/test_checksums/Makefile new file mode 100644 index 000000000000..a5b6259a7288 --- /dev/null +++ b/src/test/modules/test_checksums/Makefile @@ -0,0 +1,40 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/test/modules/test_checksums +# +# Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/test/modules/test_checksums/Makefile +# +#------------------------------------------------------------------------- + +EXTRA_INSTALL = src/test/modules/injection_points + +export
enable_injection_points + +MODULE_big = test_checksums +OBJS = \ + $(WIN32RES) \ + test_checksums.o +PGFILEDESC = "test_checksums - test code for data checksums" + +EXTENSION = test_checksums +DATA = test_checksums--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_checksums +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) diff --git a/src/test/modules/test_checksums/README b/src/test/modules/test_checksums/README new file mode 100644 index 000000000000..0f0317060b38 --- /dev/null +++ b/src/test/modules/test_checksums/README @@ -0,0 +1,22 @@ +src/test/modules/test_checksums/README + +Regression tests for data checksums +=================================== + +This directory contains a test suite for enabling data checksums +in a running cluster. + +Running the tests +================= + + make check + +or + + make installcheck + +NOTE: This creates a temporary installation (in the case of "check"), +with multiple nodes, be they primary or standby(s) for the purpose of +the tests. + +NOTE: This requires the --enable-tap-tests argument to configure. 
diff --git a/src/test/modules/test_checksums/meson.build b/src/test/modules/test_checksums/meson.build new file mode 100644 index 000000000000..57156b63599b --- /dev/null +++ b/src/test/modules/test_checksums/meson.build @@ -0,0 +1,35 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +test_checksums_sources = files( + 'test_checksums.c', +) + +test_checksums = shared_module('test_checksums', + test_checksums_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_checksums + +test_install_data += files( + 'test_checksums.control', + 'test_checksums--1.0.sql', +) + +tests += { + 'name': 'test_checksums', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, + 'tests': [ + 't/001_basic.pl', + 't/002_restarts.pl', + 't/003_standby_restarts.pl', + 't/004_offline.pl', + 't/005_injection.pl', + 't/006_concurrent_pgbench.pl', + ], + }, +} diff --git a/src/test/modules/test_checksums/t/001_basic.pl b/src/test/modules/test_checksums/t/001_basic.pl new file mode 100644 index 000000000000..728a5c4510c3 --- /dev/null +++ b/src/test/modules/test_checksums/t/001_basic.pl @@ -0,0 +1,63 @@ + +# Copyright (c) 2025, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. 
+my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init(no_data_checksums => 1); +$node->start; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are turned off +test_checksum_state($node, 'off'); + +# Enable data checksums and wait for the state transition to 'on' +enable_data_checksums($node, wait => 'on'); + +# Run a dummy query just to make sure we can read back data +my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1 "); +is($result, '9999', 'ensure checksummed pages can be read back'); + +# Enable data checksums again which should be a no-op so we explicitly don't +# wait for any state transition as none should happen here +enable_data_checksums($node); +test_checksum_state($node, 'on'); +# ..and make sure we can still read/write data +$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +# Disable checksums again and wait for the state transition +disable_data_checksums($node, wait => 'on'); + +# Test reading data again +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure previously checksummed pages can be read back'); + +# Re-enable checksums and make sure that the underlying data has changed to +# ensure that checksums will be different. 
+$node->safe_psql('postgres', "UPDATE t SET a = a + 1;"); +enable_data_checksums($node, wait => 'on'); + +# Run a dummy query just to make sure we can read back the data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '10000', 'ensure checksummed pages can be read back'); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/002_restarts.pl b/src/test/modules/test_checksums/t/002_restarts.pl new file mode 100644 index 000000000000..75599cf41f25 --- /dev/null +++ b/src/test/modules/test_checksums/t/002_restarts.pl @@ -0,0 +1,110 @@ + +# Copyright (c) 2024, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with a +# restart which breaks processing. +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init(no_data_checksums => 1); +$node->start; + +# Initialize result storage for queries +my $result; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +SKIP: +{ + skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 6 + if (!$ENV{PG_TEST_EXTRA} + || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/); + + # Create a barrier for checksumming to block on, in this case a pre- + # existing temporary table which is kept open while processing is started. + # We can accomplish this by setting up an interactive psql process which + # keeps the temporary table created as we enable checksums in another psql + # process. 
+ # + # This is a similar test to the synthetic variant in 005_injection.pl + # which fakes this scenario. + my $bsession = $node->background_psql('postgres'); + $bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);'); + + # In another session, make sure we can see the blocking temp table but + # start processing anyways and check that we are blocked with a proper + # wait event. + $result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';" + ); + is($result, 't', 'ensure we can see the temporary table'); + + # Enabling data checksums shouldn't work as the process is blocked on the + # temporary table held open by $bsession. Ensure that we reach inprogress- + # on before we do more tests. + enable_data_checksums($node, wait => 'inprogress-on'); + + # Wait for processing to finish and the worker waiting for leftover temp + # relations to be able to actually finish + $result = $node->poll_query_until( + 'postgres', + "SELECT wait_event FROM pg_catalog.pg_stat_activity " + . "WHERE backend_type = 'datachecksumsworker worker';", + 'ChecksumEnableTemptableWait'); + + # The datachecksumsworker waits for temporary tables to disappear for 3 + # seconds before retrying, so sleep for 4 seconds to be guaranteed to see + # a retry cycle + sleep(4); + + # Re-check the wait event to ensure we are blocked on the right thing. + $result = $node->safe_psql('postgres', + "SELECT wait_event FROM pg_catalog.pg_stat_activity " + . "WHERE backend_type = 'datachecksumsworker worker';"); + is($result, 'ChecksumEnableTemptableWait', + 'ensure the correct wait condition is set'); + test_checksum_state($node, 'inprogress-on'); + + # Stop the cluster while bsession is still attached. We can't close the + # session first since the brief period between closing and stopping might + # be enough for checksums to get enabled. + $node->stop; + $bsession->quit; + $node->start; + + # Ensure the checksums aren't enabled across the restart. 
This leaves the + # cluster in the same state as before we entered the SKIP block. + test_checksum_state($node, 'off'); +} + +enable_data_checksums($node, wait => 'on'); + +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +$result = $node->poll_query_until( + 'postgres', + "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksumsworker%';", + '0'); +is($result, 1, 'await datachecksums worker/launcher termination'); + +disable_data_checksums($node, wait => 1); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/003_standby_restarts.pl b/src/test/modules/test_checksums/t/003_standby_restarts.pl new file mode 100644 index 000000000000..fe34b4d7d05c --- /dev/null +++ b/src/test/modules/test_checksums/t/003_standby_restarts.pl @@ -0,0 +1,114 @@ + +# Copyright (c) 2024, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# streaming replication +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1, no_data_checksums => 1); +$node_primary->start; + +my $slotname = 'physical_slot'; +$node_primary->safe_psql('postgres', + "SELECT pg_create_physical_replication_slot('$slotname')"); + +# Take backup +my $backup_name = 'my_backup'; +$node_primary->backup($backup_name); + +# Create streaming standby linking to primary +my $node_standby_1 = PostgreSQL::Test::Cluster->new('standby_1'); +$node_standby_1->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby_1->append_conf( + 'postgresql.conf', qq[ +primary_slot_name = '$slotname' +]); +$node_standby_1->start; + +# Create some 
content on the primary to have un-checksummed data in the cluster +$node_primary->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Wait for standbys to catch up +$node_primary->wait_for_catchup($node_standby_1, 'replay', + $node_primary->lsn('insert')); + +# Check that checksums are turned off on all nodes +test_checksum_state($node_primary, 'off'); +test_checksum_state($node_standby_1, 'off'); + +# --------------------------------------------------------------------------- +# Enable checksums for the cluster, and make sure that both the primary and +# standby change state. +# + +# Ensure that the primary switches to "inprogress-on" +enable_data_checksums($node_primary, wait => 'inprogress-on'); +# Wait for checksum enable to be replayed +$node_primary->wait_for_catchup($node_standby_1, 'replay'); + +# Ensure that the standby has switched to "inprogress-on" or "on". Normally it +# would be "inprogress-on", but it is theoretically possible for the primary to +# complete the checksum enabling *and* have the standby replay that record +# before we reach the check below. 
+my $result = $node_standby_1->poll_query_until( + 'postgres', + "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + 'f'); +is($result, 1, 'ensure standby has absorbed the inprogress-on barrier'); +$result = $node_standby_1->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" +); + +is(($result eq 'inprogress-on' || $result eq 'on'), + 1, 'ensure checksums are on, or in progress, on standby_1'); + +# Insert some more data which should be checksummed on INSERT +$node_primary->safe_psql('postgres', + "INSERT INTO t VALUES (generate_series(1, 10000));"); + +# Wait for checksums enabled on the primary and standby +wait_for_checksum_state($node_primary, 'on'); +wait_for_checksum_state($node_standby_1, 'on'); + +$result = + $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1"); +is($result, '19998', 'ensure we can safely read all data with checksums'); + +$result = $node_primary->poll_query_until( + 'postgres', + "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksumsworker%';", + '0'); +is($result, 1, 'await datachecksums worker/launcher termination'); + +# +# Disable checksums and ensure it's propagated to standby and that we can +# still read all data +# + +# Disable checksums and wait for the operation to be replayed +disable_data_checksums($node_primary); +$node_primary->wait_for_catchup($node_standby_1, 'replay'); +# Ensure that the primary and standby have switched to off +wait_for_checksum_state($node_primary, 'off'); +wait_for_checksum_state($node_standby_1, 'off'); +# Double-check reading data without errors +$result = + $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1"); +is($result, "19998", 'ensure we can safely read all data without checksums'); + +$node_standby_1->stop; +$node_primary->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/004_offline.pl 
b/src/test/modules/test_checksums/t/004_offline.pl new file mode 100644 index 000000000000..e9fbcf77eab5 --- /dev/null +++ b/src/test/modules/test_checksums/t/004_offline.pl @@ -0,0 +1,82 @@ + +# Copyright (c) 2024, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums offline from various states +# of checksum processing +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +# Initialize node with checksums disabled. +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init(no_data_checksums => 1); +$node->start; + +# Create some content to have un-checksummed data in the cluster +$node->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;"); + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +# Enable checksums offline using pg_checksums +$node->stop; +$node->checksum_enable_offline; +$node->start; + +# Ensure that checksums are enabled +test_checksum_state($node, 'on'); + +# Run a dummy query just to make sure we can read back some data +my $result = + $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +# Disable checksums offline again using pg_checksums +$node->stop; +$node->checksum_disable_offline; +$node->start; + +# Ensure that checksums are disabled +test_checksum_state($node, 'off'); + +# Create a barrier for checksumming to block on, in this case a pre-existing +# temporary table which is kept open while processing is started. We can +# accomplish this by setting up an interactive psql process which keeps the +# temporary table created as we enable checksums in another psql process. 
+ +my $bsession = $node->background_psql('postgres'); +$bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);'); + +# In another session, make sure we can see the blocking temp table but start +# processing anyways and check that we are blocked with a proper wait event. +$result = $node->safe_psql('postgres', + "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"); +is($result, 't', 'ensure we can see the temporary table'); + +enable_data_checksums($node, wait => 'inprogress-on'); + +# Turn the cluster off and enable checksums offline, then start back up +$bsession->quit; +$node->stop; +$node->checksum_enable_offline; +$node->start; + +# Ensure that checksums are now enabled even though processing wasn't +# restarted +test_checksum_state($node, 'on'); + +# Run a dummy query just to make sure we can read back some data +$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '9999', 'ensure checksummed pages can be read back'); + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/005_injection.pl b/src/test/modules/test_checksums/t/005_injection.pl new file mode 100644 index 000000000000..f4459e0e6363 --- /dev/null +++ b/src/test/modules/test_checksums/t/005_injection.pl @@ -0,0 +1,76 @@ + +# Copyright (c) 2025, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# injection point tests injecting failures into the processing + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +# --------------------------------------------------------------------------- +# Test cluster setup +# + +# Initiate testcluster +my $node = PostgreSQL::Test::Cluster->new('main'); 
+$node->init(no_data_checksums => 1); +$node->start; + +# Set up test environment +$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); + +# --------------------------------------------------------------------------- +# Inducing failures in processing + +# Force enabling checksums to fail by marking one of the databases as having +# failed in processing. +disable_data_checksums($node, wait => 1); +$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(true);'); +enable_data_checksums($node, wait => 'off'); +$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(false);'); + +# Force the enable checksums processing to make multiple passes by removing +# one database from the list in the first pass. This will simulate a CREATE +# DATABASE during processing. Doing this via fault injection makes the test +# not be subject to exact timing. +$node->safe_psql('postgres', 'SELECT dcw_prune_dblist(true);'); +enable_data_checksums($node, wait => 'on'); + +# --------------------------------------------------------------------------- +# Timing and retry related tests +# +SKIP: +{ + skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 4 + if (!$ENV{PG_TEST_EXTRA} + || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/); + + # Inject a delay in the barrier for enabling checksums + disable_data_checksums($node, wait => 1); + $node->safe_psql('postgres', 'SELECT dcw_inject_delay_barrier();'); + enable_data_checksums($node, wait => 'on'); + + # Fake the existence of a temporary table at the start of processing, which + # will force the processing to wait and retry in order to wait for it to + # disappear. 
+ disable_data_checksums($node, wait => 1); + $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(true);'); + enable_data_checksums($node, wait => 'on'); +} + +$node->stop; +done_testing(); diff --git a/src/test/modules/test_checksums/t/006_concurrent_pgbench.pl b/src/test/modules/test_checksums/t/006_concurrent_pgbench.pl new file mode 100644 index 000000000000..b33ca6e0c260 --- /dev/null +++ b/src/test/modules/test_checksums/t/006_concurrent_pgbench.pl @@ -0,0 +1,326 @@ + +# Copyright (c) 2025, PostgreSQL Global Development Group + +# Test suite for testing enabling data checksums in an online cluster with +# concurrent activity via pgbench runs + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +use FindBin; +use lib $FindBin::RealBin; + +use DataChecksums::Utils; + +my $node_primary_slot = 'physical_slot'; +my $node_primary_backup = 'primary_backup'; +my $node_primary; +my $node_primary_loglocation = 0; +my $node_standby_1; +my $node_standby_1_loglocation = 0; + +# The number of full test iterations which will be performed. The exact number +# of tests performed and the wall time taken is non-deterministic as the test +# performs a lot of randomized actions, but 50 iterations will be a long test +# run regardless. 
+my $TEST_ITERATIONS = 50; + +# Variables which record the current state of the cluster +my $data_checksum_state = 'off'; +my $pgbench_running = 0; + +# Variables holding state for managing the cluster and aux processes in +# various ways +my @stop_modes = (); +# The stop mode last used for each node. Offline verification with pg_checksums +# requires a cleanly shut down cluster, so it can only follow a fast stop. +my %last_stop_mode = (primary => '', standby_1 => ''); +my ($pgb_primary_stdin, $pgb_primary_stdout, $pgb_primary_stderr) = + ('', '', ''); +my ($pgb_standby_1_stdin, $pgb_standby_1_stdout, $pgb_standby_1_stderr) = + ('', '', ''); + +if (!$ENV{PG_TEST_EXTRA} || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/) +{ + plan skip_all => 'Extended tests not enabled'; +} + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +# Helper for retrieving a binary value with random distribution for deciding +# whether to turn things off during testing. rand(2) returns a fractional +# number in the range [0,2) which is truncated into either 0 or 1. +sub cointoss +{ + return int(rand(2)); +} + +# Helper for injecting random sleeps here and there in the testrun. The sleep +# duration won't be predictable in order to avoid sleep patterns that manage to +# avoid race conditions and timing bugs. 
+sub random_sleep +{ + return if cointoss; + sleep(int(rand(3))); +} + +# Start a read-only pgbench run in the background against the server specified +# via the port passed as parameter +sub background_ro_pgbench +{ + my ($port, $stdin, $stdout, $stderr) = @_; + + my $pgbench_primary = IPC::Run::start( + [ 'pgbench', '-p', $port, '-S', '-T', '600', '-c', '10', 'postgres' ], + '<' => \$stdin, + '>' => \$stdout, + '2>' => \$stderr, + IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default)); +} + +# Start a pgbench run in the background against the server specified via the +# port passed as parameter +sub background_rw_pgbench +{ + my ($port, $stdin, $stdout, $stderr) = @_; + + my $pgbench_primary = IPC::Run::start( + [ 'pgbench', '-p', $port, '-T', '600', '-c', '10', 'postgres' ], + '<' => \$stdin, + '>' => \$stdout, + '2>' => \$stderr, + IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default)); +} + +# Invert the state of data checksums in the cluster, if data checksums are on +# then disable them and vice versa. Also performs proper validation of the +# before and after state. +sub flip_data_checksums +{ + test_checksum_state($node_primary, $data_checksum_state); + test_checksum_state($node_standby_1, $data_checksum_state); + + if ($data_checksum_state eq 'off') + { + # Coin-toss to see if we are injecting a retry due to a temptable + $node_primary->safe_psql('postgres', + 'SELECT dcw_fake_temptable(true);') + if cointoss(); + + # Ensure that the primary switches to "inprogress-on" + enable_data_checksums($node_primary, wait => 'inprogress-on'); + random_sleep(); + # Wait for checksum enable to be replayed + $node_primary->wait_for_catchup($node_standby_1, 'replay'); + + # Ensure that the standby has switched to "inprogress-on" or "on". + # Normally it would be "inprogress-on", but it is theoretically + # possible for the primary to complete the checksum enabling *and* have + # the standby replay that record before we reach the check below. 
+ my $result = $node_standby_1->poll_query_until( + 'postgres', + "SELECT setting = 'off' " + . "FROM pg_catalog.pg_settings " + . "WHERE name = 'data_checksums';", + 'f'); + is($result, 1, + 'ensure standby has absorbed the inprogress-on barrier'); + random_sleep(); + $result = $node_standby_1->safe_psql('postgres', + "SELECT setting " + . "FROM pg_catalog.pg_settings " + . "WHERE name = 'data_checksums';"); + + is(($result eq 'inprogress-on' || $result eq 'on'), + 1, 'ensure checksums are on, or in progress, on standby_1'); + + # Wait for checksums enabled on the primary and standby + wait_for_checksum_state($node_primary, 'on'); + random_sleep(); + wait_for_checksum_state($node_standby_1, 'on'); + + $node_primary->safe_psql('postgres', + 'SELECT dcw_fake_temptable(false);'); + $data_checksum_state = 'on'; + } + elsif ($data_checksum_state eq 'on') + { + random_sleep(); + disable_data_checksums($node_primary); + $node_primary->wait_for_catchup($node_standby_1, 'replay'); + + # Wait for checksums disabled on the primary and standby + wait_for_checksum_state($node_primary, 'off'); + random_sleep(); + wait_for_checksum_state($node_standby_1, 'off'); + + $data_checksum_state = 'off'; + } + else + { + # This should only happen due to programmer error when hacking on the + # test code, but since that might pass subtly by let's ensure it gets + # caught with a test error if so. + is(1, 0, 'data_checksum_state variable has invalid state'); + } +} + +# Prepare an array with pg_ctl stop modes which we later can randomly select +# from in order to stop the cluster in some way. +for (my $i = 1; $i <= 100; $i++) +{ + if (int(rand($i * 2)) > $i) + { + push(@stop_modes, "immediate"); + } + else + { + push(@stop_modes, "fast"); + } +} + +# Create and start a cluster with one primary and one standby node, and ensure +# they are caught up and in sync. 
+$node_primary = PostgreSQL::Test::Cluster->new('main'); +$node_primary->init(allows_streaming => 1, no_data_checksums => 1); +# max_connections needs to be bumped in order to accommodate pgbench clients +# and log_statement is dialled down since it otherwise will generate enormous +# amounts of logging. Page verification failures are still logged. +$node_primary->append_conf( + 'postgresql.conf', + qq[ +max_connections = 30 +log_statement = none +]); +$node_primary->start; +$node_primary->safe_psql('postgres', 'CREATE EXTENSION test_checksums;'); +# Create some content to have un-checksummed data in the cluster +$node_primary->safe_psql('postgres', + "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;"); +$node_primary->safe_psql('postgres', + "SELECT pg_create_physical_replication_slot('$node_primary_slot');"); +$node_primary->backup($node_primary_backup); + +$node_standby_1 = PostgreSQL::Test::Cluster->new('standby_1'); +$node_standby_1->init_from_backup($node_primary, $node_primary_backup, + has_streaming => 1); +$node_standby_1->append_conf( + 'postgresql.conf', qq[ +primary_slot_name = '$node_primary_slot' +]); +$node_standby_1->start; + +$node_primary->command_ok([ 'pgbench', '-i', '-s', '100', '-q', 'postgres' ]); +$node_primary->wait_for_catchup($node_standby_1, 'replay'); + +# Start the test suite with pgbench running. +background_ro_pgbench( + $node_standby_1->port, $pgb_standby_1_stdin, + $pgb_standby_1_stdout, $pgb_standby_1_stderr); +background_rw_pgbench( + $node_primary->port, $pgb_primary_stdin, + $pgb_primary_stdout, $pgb_primary_stderr); + +# Main test suite. This loop will start a pgbench run on the cluster and while +# that's running flip the state of data checksums concurrently. It will then +# randomly restart the cluster (in fast or immediate mode) and then check for +# the desired state. The idea behind doing things randomly is to stress out +# any timing related issues by subjecting the cluster to varied workloads. 
+# A TODO is to generate a trace such that any test failure can be traced to +# its order of operations for debugging. +for (my $i = 0; $i < $TEST_ITERATIONS; $i++) +{ + if (!$node_primary->is_alive) + { + # Since the log isn't being written to now, parse the log and check + # for instances of checksum verification failures. + my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile, + $node_primary_loglocation); + unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in primary log"); + $node_primary_loglocation = -s $node_primary->logfile; + + # If data checksums are enabled, take the opportunity to verify them + # while the cluster is offline. pg_checksums can only operate on a + # cleanly shut down cluster so this is skipped after immediate stops. + $node_primary->checksum_verify_offline() + if $data_checksum_state ne 'off' + && $last_stop_mode{primary} eq 'fast'; + random_sleep(); + $node_primary->start; + # Start a pgbench in the background against the primary + background_rw_pgbench($node_primary->port, $pgb_primary_stdin, + $pgb_primary_stdout, $pgb_primary_stderr); + } + + if (!$node_standby_1->is_alive) + { + # Since the log isn't being written to now, parse the log and check + # for instances of checksum verification failures. 
+ my $log = + PostgreSQL::Test::Utils::slurp_file($node_standby_1->logfile, + $node_standby_1_loglocation); + unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in standby_1 log"); + $node_standby_1_loglocation = -s $node_standby_1->logfile; + + # If data checksums are enabled, take the opportunity to verify them + # while the cluster is offline. pg_checksums can only operate on a + # cleanly shut down cluster so this is skipped after immediate stops. + $node_standby_1->checksum_verify_offline() + if $data_checksum_state ne 'off' + && $last_stop_mode{standby_1} eq 'fast'; + random_sleep(); + $node_standby_1->start; + # Start a select-only pgbench in the background on the standby + background_ro_pgbench($node_standby_1->port, $pgb_standby_1_stdin, + $pgb_standby_1_stdout, $pgb_standby_1_stderr); + } + + $node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;"); + + flip_data_checksums(); + random_sleep(); + my $result = $node_primary->safe_psql('postgres', + "SELECT count(*) FROM t WHERE a > 1"); + is($result, '100000', 'ensure data pages can be read back on primary'); + random_sleep(); + $node_primary->wait_for_catchup($node_standby_1, 'write'); + + # Potentially powercycle the cluster, except in the final iteration since + # the verification after the loop requires both nodes to be running. The + # stop mode used is recorded for the offline verification at restart. + if ($i < $TEST_ITERATIONS - 1 && cointoss()) + { + $last_stop_mode{primary} = $stop_modes[ int(rand(100)) ]; + $node_primary->stop($last_stop_mode{primary}); + } + random_sleep(); + if ($i < $TEST_ITERATIONS - 1 && cointoss()) + { + $last_stop_mode{standby_1} = $stop_modes[ int(rand(100)) ]; + $node_standby_1->stop($last_stop_mode{standby_1}); + } +} + +# Testrun is over, ensure that data reads back as expected and perform a final +# verification of the data checksum state. 
+my $result = + $node_primary->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1"); +is($result, '100000', 'ensure data pages can be read back on primary'); +test_checksum_state($node_primary, $data_checksum_state); +test_checksum_state($node_standby_1, $data_checksum_state); + +# Perform one final pass over the logs and hunt for unexpected errors +my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile, + $node_primary_loglocation); +unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in primary log"); +$node_primary_loglocation = -s $node_primary->logfile; +$log = PostgreSQL::Test::Utils::slurp_file($node_standby_1->logfile, + $node_standby_1_loglocation); +unlike( + $log, + qr/page verification failed/, + "no checksum validation errors in standby_1 log"); +$node_standby_1_loglocation = -s $node_standby_1->logfile; + +$node_standby_1->teardown_node; +$node_primary->teardown_node; + +done_testing(); diff --git a/src/test/modules/test_checksums/t/DataChecksums/Utils.pm b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm new file mode 100644 index 000000000000..ee2f2a1428fd --- /dev/null +++ b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm @@ -0,0 +1,185 @@ + +# Copyright (c) 2025, PostgreSQL Global Development Group + +=pod + +=head1 NAME + +DataChecksums::Utils - Utility functions for testing data checksums in a running cluster + +=head1 SYNOPSIS + + use PostgreSQL::Test::Cluster; + use DataChecksums::Utils qw( .. 
); + + # Create, and start, a new cluster + my $node = PostgreSQL::Test::Cluster->new('primary'); + $node->init; + $node->start; + + test_checksum_state($node, 'off'); + + enable_data_checksums($node); + + wait_for_checksum_state($node, 'on'); + + +=cut + +package DataChecksums::Utils; + +use strict; +use warnings FATAL => 'all'; +use Exporter 'import'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +our @EXPORT = qw( + test_checksum_state + wait_for_checksum_state + enable_data_checksums + disable_data_checksums +); + +=pod + +=head1 METHODS + +=over + +=item test_checksum_state(node, state) + +Test that the current value of the data checksum GUC in the server running +at B<node> matches B<state>. If the values differ, a test failure is logged. +Returns True if the values match, otherwise False. + +=cut + +sub test_checksum_state +{ + my ($postgresnode, $state) = @_; + + my $result = $postgresnode->safe_psql('postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';" + ); + is($result, $state, 'ensure checksums are set to ' . $state); + return $result eq $state; +} + +=item wait_for_checksum_state(node, state) + +Test the value of the data checksum GUC in the server running at B<node> +repeatedly until it matches B<state> or times out. Processing will run for +$PostgreSQL::Test::Utils::timeout_default seconds before timing out. If the +values differ when the process times out, False is returned and a test failure +is logged, otherwise True. + +=cut + +sub wait_for_checksum_state +{ + my ($postgresnode, $state) = @_; + + my $res = $postgresnode->poll_query_until( + 'postgres', + "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';", + $state); + is($res, 1, 'ensure data checksums are transitioned to ' . $state); + return $res == 1; +} + +=item enable_data_checksums($node, %params) + +Function for enabling data checksums in the cluster running at B<node>. 
+ +=over + +=item cost_delay + +The C<cost_delay> to use when enabling data checksums, default is 0. + +=item cost_limit + +The C<cost_limit> to use when enabling data checksums, default is 100. + +=item fast + +If set to C<true> an immediate checkpoint will be issued after data +checksums are enabled. Setting this to false will lead to slower tests. +The default is true. + +=item wait + +If defined, the function will wait for the state defined in this parameter, +or until timing out, before returning. The function will wait for +$PostgreSQL::Test::Utils::timeout_default seconds before timing out. + +=back + +=cut + +sub enable_data_checksums +{ + my $postgresnode = shift; + my %params = @_; + + # Set sane defaults for the parameters + $params{cost_delay} = 0 unless (defined($params{cost_delay})); + $params{cost_limit} = 100 unless (defined($params{cost_limit})); + $params{fast} = 'true' unless (defined($params{fast})); + + my $query = <<'EOQ'; +SELECT pg_enable_data_checksums(%s, %s, %s); +EOQ + + $postgresnode->safe_psql( + 'postgres', + sprintf($query, + $params{cost_delay}, $params{cost_limit}, $params{fast})); + + wait_for_checksum_state($postgresnode, $params{wait}) + if (defined($params{wait})); +} + +=item disable_data_checksums($node, %params) + +Function for disabling data checksums in the cluster running at B<node>. + +=over + +=item wait + +If defined, the function will wait for the state to turn to B<off>, or +until timing out, before returning. The function will wait for +$PostgreSQL::Test::Utils::timeout_default seconds before timing out. +Unlike in C<enable_data_checksums> the value of the parameter is discarded. 
+ +=back + +=cut + +sub disable_data_checksums +{ + my $postgresnode = shift; + my %params = @_; + + # Set sane defaults for the parameters + $params{fast} = 'true' unless (defined($params{fast})); + + my $query = <<'EOQ'; +SELECT pg_disable_data_checksums(%s); +EOQ + + $postgresnode->safe_psql('postgres', sprintf($query, $params{fast})); + + wait_for_checksum_state($postgresnode, 'off') if (defined($params{wait})); +} + +=pod + +=back + +=cut + +1; diff --git a/src/test/modules/test_checksums/test_checksums--1.0.sql b/src/test/modules/test_checksums/test_checksums--1.0.sql new file mode 100644 index 000000000000..704b45a31866 --- /dev/null +++ b/src/test/modules/test_checksums/test_checksums--1.0.sql @@ -0,0 +1,20 @@ +/* src/test/modules/test_checksums/test_checksums--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_checksums" to load this file. \quit + +CREATE FUNCTION dcw_inject_delay_barrier(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_inject_fail_database(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_prune_dblist(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION dcw_fake_temptable(attach boolean DEFAULT true) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_checksums/test_checksums.c b/src/test/modules/test_checksums/test_checksums.c new file mode 100644 index 000000000000..26897bff960d --- /dev/null +++ b/src/test/modules/test_checksums/test_checksums.c @@ -0,0 +1,173 @@ +/*-------------------------------------------------------------------------- + * + * test_checksums.c + * Test data checksums + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_checksums/test_checksums.c + * + * 
------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "funcapi.h" +#include "miscadmin.h" +#include "postmaster/datachecksumsworker.h" +#include "storage/latch.h" +#include "utils/injection_point.h" +#include "utils/wait_event.h" + +#define USEC_PER_SEC 1000000 + + +PG_MODULE_MAGIC; + +extern PGDLLEXPORT void dc_delay_barrier(const char *name, const void *private_data, void *arg); +extern PGDLLEXPORT void dc_fail_database(const char *name, const void *private_data, void *arg); +extern PGDLLEXPORT void dc_dblist(const char *name, const void *private_data, void *arg); +extern PGDLLEXPORT void dc_fake_temptable(const char *name, const void *private_data, void *arg); + +/* + * Test for delaying emission of procsignalbarriers. + */ +void +dc_delay_barrier(const char *name, const void *private_data, void *arg) +{ + (void) name; + (void) private_data; + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + (3 * 1000), + WAIT_EVENT_PG_SLEEP); +} + +PG_FUNCTION_INFO_V1(dcw_inject_delay_barrier); +Datum +dcw_inject_delay_barrier(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksums-enable-checksums-delay", + "test_checksums", + "dc_delay_barrier", + NULL, + 0); + else + InjectionPointDetach("datachecksums-enable-checksums-delay"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +void +dc_fail_database(const char *name, const void *private_data, void *arg) +{ + static bool first_pass = true; + DataChecksumsWorkerResult *res = (DataChecksumsWorkerResult *) arg; + + if (first_pass) + *res = DATACHECKSUMSWORKER_FAILED; + first_pass = false; +} + +PG_FUNCTION_INFO_V1(dcw_inject_fail_database); +Datum +dcw_inject_fail_database(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + 
InjectionPointAttach("datachecksumsworker-fail-db", + "test_checksums", + "dc_fail_database", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-fail-db"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +/* + * Test to remove an entry from the Databaselist to force re-processing since + * not all databases could be processed in the first iteration of the loop. + */ +void +dc_dblist(const char *name, const void *private_data, void *arg) +{ + static bool first_pass = true; + List *DatabaseList = (List *) arg; + + if (first_pass) + DatabaseList = list_delete_last(DatabaseList); + first_pass = false; +} + +PG_FUNCTION_INFO_V1(dcw_prune_dblist); +Datum +dcw_prune_dblist(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksumsworker-initial-dblist", + "test_checksums", + "dc_dblist", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-initial-dblist"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} + +/* + * Test to force waiting for existing temptables. 
+ */ +void +dc_fake_temptable(const char *name, const void *private_data, void *arg) +{ + static bool first_pass = true; + int *numleft = (int *) arg; + + if (first_pass) + *numleft = 1; + first_pass = false; +} + +PG_FUNCTION_INFO_V1(dcw_fake_temptable); +Datum +dcw_fake_temptable(PG_FUNCTION_ARGS) +{ +#ifdef USE_INJECTION_POINTS + bool attach = PG_GETARG_BOOL(0); + + if (attach) + InjectionPointAttach("datachecksumsworker-fake-temptable-wait", + "test_checksums", + "dc_fake_temptable", + NULL, + 0); + else + InjectionPointDetach("datachecksumsworker-fake-temptable-wait"); +#else + elog(ERROR, + "test is not working as intended when injection points are disabled"); +#endif + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_checksums/test_checksums.control b/src/test/modules/test_checksums/test_checksums.control new file mode 100644 index 000000000000..84b4cc035a78 --- /dev/null +++ b/src/test/modules/test_checksums/test_checksums.control @@ -0,0 +1,4 @@ +comment = 'Test code for data checksums' +default_version = '1.0' +module_pathname = '$libdir/test_checksums' +relocatable = true diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 35413f140198..3af7944aceac 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -3872,6 +3872,51 @@ sub advance_wal } } +=item $node->checksum_enable_offline() + +Enable data page checksums in an offline cluster with B<pg_checksums>. The +caller is responsible for ensuring that the cluster is in the right state for +this operation. + +=cut + +sub checksum_enable_offline +{ + my ($self) = @_; + + print "# Enabling checksums in \"" . $self->data_dir . "\"\n"; + PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D', + $self->data_dir, '-e'); + return; +} + +=item $node->checksum_disable_offline() + +Disable data page checksums in an offline cluster with B<pg_checksums>. The +caller is responsible for ensuring that the cluster is in the right state for +this operation. 
+ +=cut + +sub checksum_disable_offline +{ + my ($self) = @_; + + print "# Disabling checksums in \"" . $self->data_dir . "\"\n"; + PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D', + $self->data_dir, '-d'); + return; +} + +sub checksum_verify_offline +{ + my ($self) = @_; + + PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D', + $self->data_dir, '-c'); + return; +} + =pod =back diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 35e8aad7701b..4b9c5526e50c 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2071,6 +2071,42 @@ pg_stat_progress_create_index| SELECT s.pid, s.param15 AS partitions_done FROM (pg_stat_get_progress_info('CREATE INDEX'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) LEFT JOIN pg_database d ON ((s.datid = d.oid))); +pg_stat_progress_data_checksums| SELECT s.pid, + s.datid, + d.datname, + CASE s.param1 + WHEN 0 THEN 'enabling'::text + WHEN 1 THEN 'disabling'::text + WHEN 2 THEN 'waiting'::text + WHEN 3 THEN 'waiting on temporary tables'::text + WHEN 4 THEN 'waiting on checkpoint'::text + WHEN 5 THEN 'done'::text + ELSE NULL::text + END AS phase, + CASE s.param2 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param2 + END AS databases_total, + s.param3 AS databases_done, + CASE s.param4 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param4 + END AS relations_total, + CASE s.param5 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param5 + END AS relations_done, + CASE s.param6 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param6 + END AS blocks_total, + CASE s.param7 + WHEN '-1'::integer THEN NULL::bigint + ELSE s.param7 + END AS blocks_done + FROM (pg_stat_get_progress_info('DATACHECKSUMS'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, 
param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) + LEFT JOIN pg_database d ON ((s.datid = d.oid))) + ORDER BY s.datid; pg_stat_progress_vacuum| SELECT s.pid, s.datid, d.datname, diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index 605f50703769..9042e4d38e30 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -59,6 +59,22 @@ io worker|relation|vacuum io worker|temp relation|normal io worker|wal|init io worker|wal|normal +datachecksumsworker launcher|relation|bulkread +datachecksumsworker launcher|relation|bulkwrite +datachecksumsworker launcher|relation|init +datachecksumsworker launcher|relation|normal +datachecksumsworker launcher|relation|vacuum +datachecksumsworker launcher|temp relation|normal +datachecksumsworker launcher|wal|init +datachecksumsworker launcher|wal|normal +datachecksumsworker worker|relation|bulkread +datachecksumsworker worker|relation|bulkwrite +datachecksumsworker worker|relation|init +datachecksumsworker worker|relation|normal +datachecksumsworker worker|relation|vacuum +datachecksumsworker worker|temp relation|normal +datachecksumsworker worker|wal|init +datachecksumsworker worker|wal|normal slotsync worker|relation|bulkread slotsync worker|relation|bulkwrite slotsync worker|relation|init @@ -95,7 +111,7 @@ walsummarizer|wal|init walsummarizer|wal|normal walwriter|wal|init walwriter|wal|normal -(79 rows) +(87 rows) \a -- ensure that both seqscan and indexscan plans are allowed SET enable_seqscan TO on; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index a13e81628902..df0f49ea2aab 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -416,6 +416,7 @@ CheckPointStmt CheckpointStatsData CheckpointerRequest CheckpointerShmemStruct +ChecksumType Chromosome CkptSortItem CkptTsStatus @@ -608,6 +609,10 @@ DataPageDeleteStack DataTypesUsageChecks 
DataTypesUsageVersionCheck DatabaseInfo +DataChecksumsWorkerDatabase +DataChecksumsWorkerResult +DataChecksumsWorkerResultEntry +DataChecksumsWorkerShmemStruct DateADT DateTimeErrorExtra Datum @@ -4243,6 +4248,7 @@ xl_btree_split xl_btree_unlink_page xl_btree_update xl_btree_vacuum +xl_checksum_state xl_clog_truncate xl_commit_ts_truncate xl_dbase_create_file_copy_rec