diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml
index 57ff333159f0..88d260795b8b 100644
--- a/doc/src/sgml/func/func-admin.sgml
+++ b/doc/src/sgml/func/func-admin.sgml
@@ -2960,4 +2960,75 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8');
+
+ Data Checksum Functions
+
+
+ The functions shown in can
+ be used to enable or disable data checksums in a running cluster.
+ See for details.
+
+
+
+ Data Checksum Functions
+
+
+
+
+ Function
+
+
+ Description
+
+
+
+
+
+
+
+
+ pg_enable_data_checksums
+
+ pg_enable_data_checksums ( cost_delayint, cost_limitint )
+ void
+
+
+ Initiates data checksums for the cluster. This will switch the data
+ checksums mode to inprogress-on as well as start a
+ background worker that will process all pages in the database and
+ enable checksums on them. When all data pages have had checksums
+ enabled, the cluster will automatically switch data checksums mode to
+ on.
+
+
+ If cost_delay and cost_limit are
+ specified, the speed of the process is throttled using the same principles as
+ Cost-based Vacuum Delay.
+
+
+
+
+
+
+ pg_disable_data_checksums
+
+ pg_disable_data_checksums ()
+ void
+
+
+ Disables data checksum validation and calculation for the cluster. This
+ will switch the data checksum mode to inprogress-off
+ while data checksums are being disabled. When all active backends have
+ stopped validating data checksums, the data checksum mode will be
+ changed to off. At this point the data pages will
+ still have checksums recorded but they are not updated when pages are
+ modified.
+
+
+
+
+
+
+
+
diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml
index b88cac598e90..a4e16d03aaec 100644
--- a/doc/src/sgml/glossary.sgml
+++ b/doc/src/sgml/glossary.sgml
@@ -184,6 +184,8 @@
(but not the autovacuum workers),
the background writer,
the checkpointer,
+ the data checksums worker,
+ the data checksums worker launcher,
the logger,
the startup process,
the WAL archiver,
@@ -573,6 +575,27 @@
+
+ Data Checksums Worker
+
+
+ An auxiliary process
+ which enables or disables data checksums in a specific database.
+
+
+
+
+
+ Data Checksums Worker Launcher
+
+
+ An auxiliary process
+ which starts data checksums worker processes
+ for each database.
+
+
+
+
Database cluster
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 3f4a27a736e2..6082d991497e 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -3527,8 +3527,9 @@ description | Waiting for a newly initialized WAL file to reach durable storage
Number of data page checksum failures detected in this
- database (or on a shared object), or NULL if data checksums are
- disabled.
+ database (or on a shared object).
+ Detected failures are reported regardless of the
+ setting.
@@ -3538,8 +3539,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage
Time at which the last data page checksum failure was detected in
- this database (or on a shared object), or NULL if data checksums are
- disabled.
+ this database (or on a shared object). Last failure is reported
+ regardless of the setting.
@@ -6877,6 +6878,205 @@ FROM pg_stat_get_backend_idset() AS backendid;
+
+ Data Checksum Progress Reporting
+
+
+ pg_stat_progress_data_checksums
+
+
+
+ When data checksums are being enabled on a running cluster, the
+ pg_stat_progress_data_checksums view will contain
+ a row for the launcher process, and one row for each worker process which
+ is currently calculating checksums for the data pages in one database.
+
+
+
+ pg_stat_progress_data_checksums View
+
+
+
+
+
+ Column Type
+
+
+ Description
+
+
+
+
+
+
+
+
+
+ pidinteger
+
+
+ Process ID of a datachecksumworker process.
+
+
+
+
+
+
+ datidoid
+
+
+ OID of this database, or 0 for the launcher process.
+
+
+
+
+
+ datnamename
+
+
+ Name of this database, or NULL for the
+ launcher process.
+
+
+
+
+
+
+ phasetext
+
+
+ Current processing phase, see
+ for description of the phases.
+
+
+
+
+
+
+
+ databases_totalinteger
+
+
+ The total number of databases which will be processed. Only the
+ launcher worker has this value set, the other worker processes
+ have this set to NULL.
+
+
+
+
+
+
+
+ databases_doneinteger
+
+
+ The number of databases which have been processed. Only the
+ launcher worker has this value set, the other worker processes
+ have this set to NULL.
+
+
+
+
+
+
+
+ relations_totalinteger
+
+
+ The total number of relations which will be processed, or
+ NULL if the data checksums worker process hasn't
+ calculated the number of relations yet. The launcher process has
+ this NULL.
+
+
+
+
+
+
+
+ relations_doneinteger
+
+
+ The number of relations which have been processed. The launcher
+ process has this NULL.
+
+
+
+
+
+
+
+ blocks_totalinteger
+
+
+ The number of blocks in the current relation which will be processed,
+ or NULL if the data checksums worker process hasn't
+ calculated the number of blocks yet. The launcher process has
+ this NULL.
+
+
+
+
+
+
+
+ blocks_doneinteger
+
+
+ The number of blocks in the current relation which have been processed.
+ The launcher process has this NULL.
+
+
+
+
+
+
+
+
+
+ Data Checksum Phases
+
+
+
+
+
+ Phase
+ Description
+
+
+
+
+ enabling
+
+ The command is currently enabling data checksums on the cluster.
+
+
+
+ disabling
+
+ The command is currently disabling data checksums on the cluster.
+
+
+
+ waiting on temporary tables
+
+ The command is currently waiting for all temporary tables which existed
+ at the time the command was started to be removed.
+
+
+
+ waiting on checkpoint
+
+ The command is currently waiting for a checkpoint to update the checksum
+ state before finishing.
+
+
+
+
+
+
+
diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml
index 95043aa329c0..0343710af53d 100644
--- a/doc/src/sgml/ref/pg_checksums.sgml
+++ b/doc/src/sgml/ref/pg_checksums.sgml
@@ -45,6 +45,12 @@ PostgreSQL documentation
exit status is nonzero if the operation failed.
+
+ When enabling checksums, if checksums were in the process of being enabled
+ when the cluster was shut down, pg_checksums
+ will still process all relations regardless of the online processing.
+
+
When verifying checksums, every file in the cluster is scanned. When
enabling checksums, each relation file block with a changed checksum is
diff --git a/doc/src/sgml/regress.sgml b/doc/src/sgml/regress.sgml
index 8838fe7f0225..7074751834ea 100644
--- a/doc/src/sgml/regress.sgml
+++ b/doc/src/sgml/regress.sgml
@@ -263,6 +263,18 @@ make check-world PG_TEST_EXTRA='kerberos ldap ssl load_balance libpq_encryption'
The following values are currently supported:
+
+ checksum_extended
+
+
+ Runs additional tests for enabling data checksums which inject delays
+ and re-tries in the processing, as well as tests that run pgbench
+ concurrently and randomly restarts the cluster. Some of these test
+ suites require injection points enabled in the installation.
+
+
+
+
kerberos
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index f3b86b26be90..0ada90ca0b16 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -246,9 +246,10 @@
Checksums can be disabled when the cluster is initialized using initdb.
- They can also be enabled or disabled at a later time as an offline
- operation. Data checksums are enabled or disabled at the full cluster
- level, and cannot be specified individually for databases or tables.
+ They can also be enabled or disabled at a later time either as an offline
+ operation or online in a running cluster allowing concurrent access. Data
+ checksums are enabled or disabled at the full cluster level, and cannot be
+ specified individually for databases or tables.
@@ -265,7 +266,7 @@
- Off-line Enabling of Checksums
+ Offline Enabling of Checksums
The pg_checksums
@@ -274,6 +275,56 @@
+
+
+ Online Enabling of Checksums
+
+
+ Checksums can be enabled or disabled online, by calling the appropriate
+ functions.
+
+
+
+ Enabling checksums will put the cluster checksum mode in
+ inprogress-on mode. During this time, checksums will be
+ written but not verified. In addition to this, a background worker process
+ is started that enables checksums on all existing data in the cluster. Once
+ this worker has completed processing all databases in the cluster, the
+ checksum mode will automatically switch to on. The
+ processing will consume two background worker processes; make sure that
+ max_worker_processes allows for at least two
+ additional processes.
+
+
+
+ The process will initially wait for all open transactions to finish before
+ it starts, so that it can be certain that there are no tables that have been
+ created inside a transaction that has not committed yet and thus would not
+ be visible to the process enabling checksums. It will also, for each database,
+ wait for all pre-existing temporary tables to get removed before it finishes.
+ If long-lived temporary tables are used in the application it may be necessary
+ to terminate these application connections to allow the process to complete.
+
+
+
+ If the cluster is stopped while in inprogress-on mode, for
+ any reason, then this process must be restarted manually. To do this,
+ re-execute the function pg_enable_data_checksums()
+ once the cluster has been restarted. The process will start over, there is
+ no support for resuming work from where it was interrupted.
+
+
+
+
+ Enabling checksums can cause significant I/O to the system, as most of the
+ database pages will need to be rewritten, and will be written both to the
+ data files and the WAL. The impact may be limited by throttling using the
+ cost_delay and cost_limit
+ parameters of the pg_enable_data_checksums function.
+
+
+
+
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index cd6c2a2f650a..c50d654db30e 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -18,6 +18,7 @@
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "catalog/pg_control.h"
+#include "storage/bufpage.h"
#include "utils/guc.h"
#include "utils/timestamp.h"
@@ -167,6 +168,26 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
memcpy(&wal_level, rec, sizeof(int));
appendStringInfo(buf, "wal_level %s", get_wal_level_string(wal_level));
}
+ else if (info == XLOG_CHECKSUMS)
+ {
+ xl_checksum_state xlrec;
+
+ memcpy(&xlrec, rec, sizeof(xl_checksum_state));
+ switch (xlrec.new_checksumtype)
+ {
+ case PG_DATA_CHECKSUM_VERSION:
+ appendStringInfoString(buf, "on");
+ break;
+ case PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION:
+ appendStringInfoString(buf, "inprogress-off");
+ break;
+ case PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION:
+ appendStringInfoString(buf, "inprogress-on");
+ break;
+ default:
+ appendStringInfoString(buf, "off");
+ }
+ }
}
const char *
@@ -218,6 +239,9 @@ xlog_identify(uint8 info)
case XLOG_CHECKPOINT_REDO:
id = "CHECKPOINT_REDO";
break;
+ case XLOG_CHECKSUMS:
+ id = "CHECKSUMS";
+ break;
}
return id;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 7ffb21791519..46edf5313591 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -550,6 +550,9 @@ typedef struct XLogCtlData
*/
XLogRecPtr lastFpwDisableRecPtr;
+ /* last data_checksum_version we've seen */
+ uint32 data_checksum_version;
+
slock_t info_lck; /* locks shared variables shown above */
} XLogCtlData;
@@ -647,6 +650,36 @@ static XLogRecPtr LocalMinRecoveryPoint;
static TimeLineID LocalMinRecoveryPointTLI;
static bool updateMinRecoveryPoint = true;
+/*
+ * Local state for Controlfile data_checksum_version. After initialization
+ * this is only updated when absorbing a procsignal barrier during interrupt
+ * processing. The reason for keeping a copy in backend-private memory is to
+ * avoid locking for interrogating checksum state. Possible values are the
+ * checksum versions defined in storage/bufpage.h as well as zero when data
+ * checksums are disabled.
+ */
+static uint32 LocalDataChecksumVersion = 0;
+
+/*
+ * Flag to remember if the procsignalbarrier being absorbed for checksums is
+ * the first one. The first procsignalbarrier can in rare cases be for the
+ * state we've initialized, i.e. a duplicate. This may happen for any
+ * data_checksum_version value, but for PG_DATA_CHECKSUM_ON_VERSION this would
+ * trigger an assert failure (this is the only transition with an assert) when
+ * processing the barrier. This may happen if the process is spawned between
+ * the update of XLogCtl->data_checksum_version and the barrier being emitted.
+ * This can only happen on the very first barrier so mark that with this flag.
+ */
+static bool InitialDataChecksumTransition = true;
+
+/*
+ * Variable backing the GUC, keep it in sync with LocalDataChecksumVersion.
+ * See SetLocalDataChecksumVersion().
+ */
+int data_checksums = 0;
+
+static void SetLocalDataChecksumVersion(uint32 data_checksum_version);
+
/* For WALInsertLockAcquire/Release functions */
static int MyLockNo = 0;
static bool holdingAllLocks = false;
@@ -715,6 +748,8 @@ static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
+static void XLogChecksums(uint32 new_type);
+
/*
* Insert an XLOG record represented by an already-constructed chain of data
* chunks. This is a low-level routine; to construct the WAL record header
@@ -828,9 +863,10 @@ XLogInsertRecord(XLogRecData *rdata,
* only happen just after a checkpoint, so it's better to be slow in
* this case and fast otherwise.
*
- * Also check to see if fullPageWrites was just turned on or there's a
- * running backup (which forces full-page writes); if we weren't
- * already doing full-page writes then go back and recompute.
+ * Also check to see if fullPageWrites was just turned on, there's a
+ * running backup or if checksums are enabled (all of which forces
+ * full-page writes); if we weren't already doing full-page writes
+ * then go back and recompute.
*
* If we aren't doing full-page writes then RedoRecPtr doesn't
* actually affect the contents of the XLOG record, so we'll update
@@ -843,7 +879,9 @@ XLogInsertRecord(XLogRecData *rdata,
Assert(RedoRecPtr < Insert->RedoRecPtr);
RedoRecPtr = Insert->RedoRecPtr;
}
- doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0);
+ doPageWrites = (Insert->fullPageWrites ||
+ Insert->runningBackups > 0 ||
+ DataChecksumsNeedWrite());
if (doPageWrites &&
(!prevDoPageWrites ||
@@ -4229,6 +4267,12 @@ InitControlFile(uint64 sysidentifier, uint32 data_checksum_version)
ControlFile->wal_log_hints = wal_log_hints;
ControlFile->track_commit_timestamp = track_commit_timestamp;
ControlFile->data_checksum_version = data_checksum_version;
+
+ /*
+ * Set the data_checksum_version value into XLogCtl, which is where all
+ * processes get the current value from. (Maybe it should go just there?)
+ */
+ XLogCtl->data_checksum_version = data_checksum_version;
}
static void
@@ -4552,10 +4596,6 @@ ReadControlFile(void)
(SizeOfXLogLongPHD - SizeOfXLogShortPHD);
CalculateCheckpointSegments();
-
- /* Make the initdb settings visible as GUC variables, too */
- SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
- PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
}
/*
@@ -4589,13 +4629,374 @@ GetMockAuthenticationNonce(void)
}
/*
- * Are checksums enabled for data pages?
+ * DataChecksumsNeedWrite
+ * Returns whether data checksums must be written or not
+ *
+ * Returns true iff data checksums are enabled or are in the process of being
+ * enabled. During "inprogress-on" and "inprogress-off" states checksums must
+ * be written even though they are not verified (see datachecksumsworker.c for
+ * a longer discussion).
+ *
+ * This function is intended for callsites which are about to write a data page
+ * to storage, and need to know whether to re-calculate the checksum for the
+ * page header. Calling this function must be performed as close to the write
+ * operation as possible to keep the critical section short.
+ */
+bool
+DataChecksumsNeedWrite(void)
+{
+ return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION ||
+ LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION ||
+ LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION);
+}
+
+/*
+ * DataChecksumsNeedVerify
+ * Returns whether data checksums must be verified or not
+ *
+ * Data checksums are only verified if they are fully enabled in the cluster.
+ * During the "inprogress-on" and "inprogress-off" states they are only
+ * updated, not verified (see datachecksumsworker.c for a longer discussion).
+ *
+ * This function is intended for callsites which have read data and are about
+ * to perform checksum validation based on the result of this. Calling this
+ * function must be performed as close to the validation call as possible to
+ * keep the critical section short. This is in order to protect against time of
+ * check/time of use situations around data checksum validation.
+ */
+bool
+DataChecksumsNeedVerify(void)
+{
+ return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION);
+}
+
+/*
+ * DataChecksumsOnInProgress
+ * Returns whether data checksums are being enabled
+ *
+ * Most operations don't need to worry about the "inprogress" states, and
+ * should use DataChecksumsNeedVerify() or DataChecksumsNeedWrite(). The
+ * "inprogress-on" state for enabling checksums is used when the checksum
+ * worker is setting checksums on all pages, it can thus be used to check for
+ * aborted checksum processing which need to be restarted.
+ */
+inline bool
+DataChecksumsOnInProgress(void)
+{
+ return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION);
+}
+
+/*
+ * DataChecksumsOffInProgress
+ * Returns whether data checksums are being disabled
+ *
+ * The "inprogress-off" state for disabling checksums is used for when the
+ * worker resets the catalog state. DataChecksumsNeedVerify() or
+ * DataChecksumsNeedWrite() should be used for deciding whether to read/write
+ * checksums.
*/
bool
-DataChecksumsEnabled(void)
+DataChecksumsOffInProgress(void)
+{
+ return (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION);
+}
+
+/*
+ * SetDataChecksumsOnInProgress
+ * Sets the data checksum state to "inprogress-on" to enable checksums
+ *
+ * To start the process of enabling data checksums in a running cluster the
+ * data_checksum_version state must be changed to "inprogress-on". See
+ * SetDataChecksumsOn below for a description on how this state change works.
+ * This function blocks until all backends in the cluster have acknowledged the
+ * state transition.
+ */
+void
+SetDataChecksumsOnInProgress(void)
+{
+ uint64 barrier;
+
+ Assert(ControlFile != NULL);
+
+ /*
+ * The state transition is performed in a critical section with
+ * checkpoints held off to provide crash safety.
+ */
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+ START_CRIT_SECTION();
+
+ XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
+
+ END_CRIT_SECTION();
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+
+ /*
+ * Await state change in all backends to ensure that all backends are in
+ * "inprogress-on". Once done we know that all backends are writing data
+ * checksums.
+ */
+ WaitForProcSignalBarrier(barrier);
+}
+
+/*
+ * SetDataChecksumsOn
+ * Enables data checksums cluster-wide
+ *
+ * Enabling data checksums is performed using two barriers, the first one to
+ * set the state to "inprogress-on" (done by SetDataChecksumsOnInProgress())
+ * and the second one to set the state to "on" (done here). Below is a short
+ * description of the processing, a more detailed write-up can be found in
+ * datachecksumsworker.c.
+ *
+ * To start the process of enabling data checksums in a running cluster the
+ * data_checksum_version state must be changed to "inprogress-on". This state
+ * requires data checksums to be written but not verified. This ensures that
+ * all data pages can be checksummed without the risk of false negatives in
+ * validation during the process. When all existing pages are guaranteed to
+ * have checksums, and all new pages will be initiated with checksums, the
+ * state can be changed to "on". Once the state is "on" checksums will be both
+ * written and verified. See datachecksumsworker.c for a longer discussion on
+ * how data checksums can be enabled in a running cluster.
+ *
+ * This function blocks until all backends in the cluster have acknowledged the
+ * state transition.
+ */
+void
+SetDataChecksumsOn(void)
{
+ uint64 barrier;
+
Assert(ControlFile != NULL);
- return (ControlFile->data_checksum_version > 0);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+
+ /*
+ * The only allowed state transition to "on" is from "inprogress-on" since
+ * that state ensures that all pages will have data checksums written.
+ */
+ if (XLogCtl->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION)
+ {
+ SpinLockRelease(&XLogCtl->info_lck);
+ elog(ERROR, "checksums not in \"inprogress-on\" mode");
+ }
+
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+ INJECTION_POINT("datachecksums-enable-checksums-delay", NULL);
+ START_CRIT_SECTION();
+
+ XLogChecksums(PG_DATA_CHECKSUM_VERSION);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
+
+ END_CRIT_SECTION();
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+
+ /*
+ * Await state transition of "on" in all backends. When done we know that
+ * data checksums are enabled in all backends and data checksums are both
+ * written and verified.
+ */
+ WaitForProcSignalBarrier(barrier);
+}
+
+/*
+ * SetDataChecksumsOff
+ * Disables data checksums cluster-wide
+ *
+ * Disabling data checksums must be performed with two sets of barriers, each
+ * carrying a different state. The state is first set to "inprogress-off"
+ * during which checksums are still written but not verified. This ensures that
+ * backends which have yet to observe the state change from "on" won't get
+ * validation errors on concurrently modified pages. Once all backends have
+ * changed to "inprogress-off", the barrier for moving to "off" can be emitted.
+ * This function blocks until all backends in the cluster have acknowledged the
+ * state transition.
+ */
+void
+SetDataChecksumsOff(void)
+{
+ uint64 barrier;
+
+ Assert(ControlFile);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+
+ /* If data checksums are already disabled there is nothing to do */
+ if (XLogCtl->data_checksum_version == 0)
+ {
+ SpinLockRelease(&XLogCtl->info_lck);
+ return;
+ }
+
+ /*
+ * If data checksums are currently enabled we first transition to the
+ * "inprogress-off" state during which backends continue to write
+ * checksums without verifying them. When all backends are in
+ * "inprogress-off" the next transition to "off" can be performed, after
+ * which all data checksum processing is disabled.
+ */
+ if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
+ {
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+ START_CRIT_SECTION();
+
+ XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
+
+ END_CRIT_SECTION();
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+
+ /*
+ * Update local state in all backends to ensure that any backend in
+ * "on" state is changed to "inprogress-off".
+ */
+ WaitForProcSignalBarrier(barrier);
+
+ /*
+ * At this point we know that no backends are verifying data checksums
+ * during reading. Next, we can safely move to state "off" to also
+ * stop writing checksums.
+ */
+ }
+ else
+ {
+ /*
+ * Ending up here implies that the checksums state is "inprogress-on"
+ * or "inprogress-off" and we can transition directly to "off" from
+ * there.
+ */
+ SpinLockRelease(&XLogCtl->info_lck);
+ }
+
+ /*
+ * Ensure that we don't incur a checkpoint during disabling checksums.
+ */
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+ START_CRIT_SECTION();
+
+ XLogChecksums(0);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = 0;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
+
+ END_CRIT_SECTION();
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+
+ WaitForProcSignalBarrier(barrier);
+}
+
+/*
+ * ProcSignalBarrier absorption functions for enabling and disabling data
+ * checksums in a running cluster. The procsignalbarriers are emitted in the
+ * SetDataChecksums* functions.
+ */
+bool
+AbsorbChecksumsOnInProgressBarrier(void)
+{
+ Assert(LocalDataChecksumVersion != PG_DATA_CHECKSUM_VERSION);
+ SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION);
+ return true;
+}
+
+bool
+AbsorbChecksumsOnBarrier(void)
+{
+ /*
+ * If the process was spawned between updating XLogCtl and emitting the
+ * barrier it will have seen the updated value, so for the first barrier
+ * we accept both "on" and "inprogress-on".
+ */
+ Assert((LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) ||
+ (InitialDataChecksumTransition &&
+ (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION)));
+
+ SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_VERSION);
+ InitialDataChecksumTransition = false;
+ return true;
+}
+
+bool
+AbsorbChecksumsOffInProgressBarrier(void)
+{
+ Assert(LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION);
+ SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION);
+ return true;
+}
+
+bool
+AbsorbChecksumsOffBarrier(void)
+{
+ /*
+ * We should never get here directly from a cluster with data checksums
+ * enabled, an inprogress state should be in between. When there are no
+ * failures the inprogress-off state should precede, but in case of error
+ * in processing we can also reach here from the inprogress-on state.
+ */
+ Assert((LocalDataChecksumVersion != PG_DATA_CHECKSUM_VERSION) &&
+ (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION ||
+ LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION));
+ SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_OFF);
+ return true;
+}
+
+/*
+ * InitLocalDataChecksumVersion
+ *
+ * Set up backend local caches of controldata variables which may change at
+ * any point during runtime and thus require special cased locking. So far
+ * this only applies to data_checksum_version, but it's intended to be general
+ * purpose enough to handle future cases.
+ */
+void
+InitLocalDataChecksumVersion(void)
+{
+ SpinLockAcquire(&XLogCtl->info_lck);
+ SetLocalDataChecksumVersion(XLogCtl->data_checksum_version);
+ SpinLockRelease(&XLogCtl->info_lck);
+}
+
+void
+SetLocalDataChecksumVersion(uint32 data_checksum_version)
+{
+ LocalDataChecksumVersion = data_checksum_version;
+
+ data_checksums = data_checksum_version;
+}
+
+/* guc hook */
+const char *
+show_data_checksums(void)
+{
+ if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION)
+ return "on";
+ else if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION)
+ return "inprogress-on";
+ else if (LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION)
+ return "inprogress-off";
+ else
+ return "off";
}
/*
@@ -4870,6 +5271,7 @@ LocalProcessControlFile(bool reset)
Assert(reset || ControlFile == NULL);
ControlFile = palloc(sizeof(ControlFileData));
ReadControlFile();
+ SetLocalDataChecksumVersion(ControlFile->data_checksum_version);
}
/*
@@ -5039,6 +5441,11 @@ XLOGShmemInit(void)
XLogCtl->InstallXLogFileSegmentActive = false;
XLogCtl->WalWriterSleeping = false;
+ /* Use the checksum info from control file */
+ XLogCtl->data_checksum_version = ControlFile->data_checksum_version;
+
+ SetLocalDataChecksumVersion(XLogCtl->data_checksum_version);
+
SpinLockInit(&XLogCtl->Insert.insertpos_lck);
SpinLockInit(&XLogCtl->info_lck);
pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr);
@@ -6180,6 +6587,47 @@ StartupXLOG(void)
pfree(endOfRecoveryInfo->recoveryStopReason);
pfree(endOfRecoveryInfo);
+ /*
+ * If we reach this point with checksums in the state inprogress-on, it
+ * means that data checksums were in the process of being enabled when the
+ * cluster shut down. Since processing didn't finish, the operation will
+ * have to be restarted from scratch since there is no capability to
+ * continue where it was when the cluster shut down. Thus, revert the
+ * state back to off, and inform the user with a warning message. Being
+ * able to restart processing is a TODO, but it wouldn't be possible to
+ * restart here since we cannot launch a dynamic background worker
+ * directly from here (it has to be from a regular backend).
+ */
+ if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION)
+ {
+ XLogChecksums(0);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = 0;
+ SetLocalDataChecksumVersion(XLogCtl->data_checksum_version);
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /* fix typo in user-facing message: "set of off" -> "set to off" */
+ ereport(WARNING,
+ (errmsg("data checksums state has been set to off"),
+ errhint("If checksums were being enabled during shutdown then processing must be manually restarted.")));
+ }
+
+ /*
+ * If data checksums were being disabled when the cluster was shut down,
+ * we know that we have a state where all backends have stopped validating
+ * checksums and we can move to off instead of prompting the user to
+ * perform any action.
+ */
+ if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION)
+ {
+ XLogChecksums(0);
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = 0;
+ SetLocalDataChecksumVersion(XLogCtl->data_checksum_version);
+ SpinLockRelease(&XLogCtl->info_lck);
+ }
+
/*
* All done with end-of-recovery actions.
*
@@ -6471,7 +6919,7 @@ GetRedoRecPtr(void)
XLogRecPtr ptr;
/*
- * The possibly not up-to-date copy in XlogCtl is enough. Even if we
+ * The possibly not up-to-date copy in XLogCtl is enough. Even if we
* grabbed a WAL insertion lock to read the authoritative value in
* Insert->RedoRecPtr, someone might update it just after we've released
* the lock.
@@ -7035,6 +7483,12 @@ CreateCheckPoint(int flags)
checkPoint.fullPageWrites = Insert->fullPageWrites;
checkPoint.wal_level = wal_level;
+ /*
+ * Get the current data_checksum_version value from xlogctl, valid at the
+ * time of the checkpoint.
+ */
+ checkPoint.data_checksum_version = XLogCtl->data_checksum_version;
+
if (shutdown)
{
XLogRecPtr curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
@@ -7290,6 +7744,9 @@ CreateCheckPoint(int flags)
ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
ControlFile->minRecoveryPointTLI = 0;
+ /* make sure we start with the checksum version as of the checkpoint */
+ ControlFile->data_checksum_version = checkPoint.data_checksum_version;
+
/*
* Persist unloggedLSN value. It's reset on crash recovery, so this goes
* unused on non-shutdown checkpoints, but seems useful to store it always
@@ -7435,6 +7892,10 @@ CreateEndOfRecoveryRecord(void)
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->minRecoveryPoint = recptr;
ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
+
+ /* start with the latest checksum version (as of the end of recovery) */
+ ControlFile->data_checksum_version = XLogCtl->data_checksum_version;
+
UpdateControlFile();
LWLockRelease(ControlFileLock);
@@ -7776,6 +8237,10 @@ CreateRestartPoint(int flags)
if (flags & CHECKPOINT_IS_SHUTDOWN)
ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
}
+
+ /* we shall start with the latest checksum version */
+ ControlFile->data_checksum_version = lastCheckPoint.data_checksum_version;
+
UpdateControlFile();
}
LWLockRelease(ControlFileLock);
@@ -8187,6 +8652,24 @@ XLogReportParameters(void)
}
}
+/*
+ * Log the new state of checksums
+ */
+static void
+XLogChecksums(uint32 new_type)
+{
+ xl_checksum_state xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.new_checksumtype = new_type;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(xl_checksum_state));
+
+ recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKSUMS);
+ XLogFlush(recptr);
+}
+
/*
* Update full_page_writes in shared memory, and write an
* XLOG_FPW_CHANGE record if necessary.
@@ -8605,6 +9088,46 @@ xlog_redo(XLogReaderState *record)
{
/* nothing to do here, just for informational purposes */
}
+ else if (info == XLOG_CHECKSUMS)
+ {
+ xl_checksum_state state;
+ uint64 barrier;
+
+ memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state));
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ XLogCtl->data_checksum_version = state.new_checksumtype;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ /*
+ * Block on a procsignalbarrier to await all processes having seen the
+ * change to checksum status. Once the barrier has been passed we can
+ * initiate the corresponding processing.
+ */
+ switch (state.new_checksumtype)
+ {
+ case PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION:
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
+ WaitForProcSignalBarrier(barrier);
+ break;
+
+ case PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION:
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
+ WaitForProcSignalBarrier(barrier);
+ break;
+
+ case PG_DATA_CHECKSUM_VERSION:
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
+ WaitForProcSignalBarrier(barrier);
+ break;
+
+ default:
+ Assert(state.new_checksumtype == 0);
+ barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
+ WaitForProcSignalBarrier(barrier);
+ break;
+ }
+ }
}
/*
diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c
index 8c3090165f00..337932a89e5e 100644
--- a/src/backend/access/transam/xlogfuncs.c
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -26,6 +26,7 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "postmaster/datachecksumsworker.h"
#include "replication/walreceiver.h"
#include "storage/fd.h"
#include "storage/latch.h"
@@ -748,3 +749,45 @@ pg_promote(PG_FUNCTION_ARGS)
wait_seconds)));
PG_RETURN_BOOL(false);
}
+
+/*
+ * Disables data checksums for the cluster, if applicable. Starts a background
+ * worker which turns off the data checksums.
+ */
+Datum
+disable_data_checksums(PG_FUNCTION_ARGS)
+{
+ bool fast = PG_GETARG_BOOL(0);
+
+ if (!superuser())
+ ereport(ERROR, errmsg("must be superuser"));
+
+ StartDataChecksumsWorkerLauncher(DISABLE_DATACHECKSUMS, 0, 0, fast);
+ PG_RETURN_VOID();
+}
+
+/*
+ * Enables data checksums for the cluster, if applicable. Supports vacuum-
+ * like cost based throttling to limit system load. Starts a background worker
+ * which updates data checksums on existing data.
+ */
+Datum
+enable_data_checksums(PG_FUNCTION_ARGS)
+{
+ int cost_delay = PG_GETARG_INT32(0);
+ int cost_limit = PG_GETARG_INT32(1);
+ bool fast = PG_GETARG_BOOL(2);
+
+ if (!superuser())
+ ereport(ERROR, errmsg("must be superuser"));
+
+ if (cost_delay < 0)
+ ereport(ERROR, errmsg("cost delay cannot be a negative value"));
+
+ if (cost_limit <= 0)
+ ereport(ERROR, errmsg("cost limit must be greater than zero"));
+
+ StartDataChecksumsWorkerLauncher(ENABLE_DATACHECKSUMS, cost_delay, cost_limit, fast);
+
+ PG_RETURN_VOID();
+}
diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c
index bb7d90aa5d96..54dcfbcb3334 100644
--- a/src/backend/backup/basebackup.c
+++ b/src/backend/backup/basebackup.c
@@ -1613,7 +1613,8 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
* enabled for this cluster, and if this is a relation file, then verify
* the checksum.
*/
- if (!noverify_checksums && DataChecksumsEnabled() &&
+ if (!noverify_checksums &&
+ DataChecksumsNeedWrite() &&
RelFileNumberIsValid(relfilenumber))
verify_checksum = true;
@@ -2007,6 +2008,9 @@ verify_page_checksum(Page page, XLogRecPtr start_lsn, BlockNumber blkno,
if (PageIsNew(page) || PageGetLSN(page) >= start_lsn)
return true;
+ if (!DataChecksumsNeedVerify())
+ return true;
+
/* Perform the actual checksum calculation. */
checksum = pg_checksum_page(page, blkno);
diff --git a/src/backend/catalog/system_functions.sql b/src/backend/catalog/system_functions.sql
index 566f308e4439..dea7ad3cf30c 100644
--- a/src/backend/catalog/system_functions.sql
+++ b/src/backend/catalog/system_functions.sql
@@ -650,6 +650,18 @@ LANGUAGE INTERNAL
CALLED ON NULL INPUT VOLATILE PARALLEL SAFE
AS 'pg_stat_reset_slru';
+CREATE OR REPLACE FUNCTION
+ pg_enable_data_checksums(cost_delay integer DEFAULT 0,
+ cost_limit integer DEFAULT 100,
+ fast boolean DEFAULT false)
+ RETURNS void STRICT VOLATILE LANGUAGE internal AS 'enable_data_checksums'
+ PARALLEL RESTRICTED;
+
+CREATE OR REPLACE FUNCTION
+ pg_disable_data_checksums(fast boolean DEFAULT false)
+ RETURNS void STRICT VOLATILE LANGUAGE internal AS 'disable_data_checksums'
+ PARALLEL RESTRICTED;
+
--
-- The default permissions for functions mean that anyone can execute them.
-- A number of functions shouldn't be executable by just anyone, but rather
@@ -775,6 +787,10 @@ REVOKE EXECUTE ON FUNCTION pg_ls_logicalmapdir() FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION pg_ls_replslotdir(text) FROM PUBLIC;
+REVOKE EXECUTE ON FUNCTION pg_enable_data_checksums(integer, integer, boolean) FROM public;
+
+REVOKE EXECUTE ON FUNCTION pg_disable_data_checksums(boolean) FROM public;
+
--
-- We also set up some things as accessible to standard roles.
--
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 1b3c5a55882d..22f67c7ee4ac 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1354,6 +1354,26 @@ CREATE VIEW pg_stat_progress_copy AS
FROM pg_stat_get_progress_info('COPY') AS S
LEFT JOIN pg_database D ON S.datid = D.oid;
+CREATE VIEW pg_stat_progress_data_checksums AS
+ SELECT
+ S.pid AS pid, S.datid, D.datname AS datname,
+ CASE S.param1 WHEN 0 THEN 'enabling'
+ WHEN 1 THEN 'disabling'
+ WHEN 2 THEN 'waiting'
+ WHEN 3 THEN 'waiting on temporary tables'
+ WHEN 4 THEN 'waiting on checkpoint'
+ WHEN 5 THEN 'done'
+ END AS phase,
+ CASE S.param2 WHEN -1 THEN NULL ELSE S.param2 END AS databases_total,
+ S.param3 AS databases_done,
+ CASE S.param4 WHEN -1 THEN NULL ELSE S.param4 END AS relations_total,
+ CASE S.param5 WHEN -1 THEN NULL ELSE S.param5 END AS relations_done,
+ CASE S.param6 WHEN -1 THEN NULL ELSE S.param6 END AS blocks_total,
+ CASE S.param7 WHEN -1 THEN NULL ELSE S.param7 END AS blocks_done
+ FROM pg_stat_get_progress_info('DATACHECKSUMS') AS S
+ LEFT JOIN pg_database D ON S.datid = D.oid
+ ORDER BY S.datid; -- return the launcher process first
+
CREATE VIEW pg_user_mappings AS
SELECT
U.oid AS umid,
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile
index 0f4435d2d97c..0c36765acfe1 100644
--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -18,6 +18,7 @@ OBJS = \
bgworker.o \
bgwriter.o \
checkpointer.o \
+ datachecksumsworker.o \
fork_process.o \
interrupt.o \
launch_backend.o \
diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c
index a6d3630398f4..5742a1dd724e 100644
--- a/src/backend/postmaster/auxprocess.c
+++ b/src/backend/postmaster/auxprocess.c
@@ -15,6 +15,7 @@
#include
#include
+#include "access/xlog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/auxprocess.h"
@@ -68,6 +69,24 @@ AuxiliaryProcessMainCommon(void)
ProcSignalInit(NULL, 0);
+ /*
+ * Initialize a local cache of the data_checksum_version, to be updated by
+ * the procsignal-based barriers.
+ *
+ * This intentionally happens after initializing the procsignal, otherwise
+ * we might miss a state change. This means we can get a barrier for the
+ * state we've just initialized - but it can happen only once.
+ *
+ * The postmaster (which is what gets forked into the new child process)
+ * does not handle barriers, therefore it may not have the current value
+ * of LocalDataChecksumVersion value (it'll have the value read from the
+ * control file, which may be arbitrarily old).
+ *
+ * NB: Even if the postmaster handled barriers, the value might still be
+ * stale, as it might have changed after this process forked.
+ */
+ InitLocalDataChecksumVersion();
+
/*
* Auxiliary processes don't run transactions, but they may need a
* resource owner anyway to manage buffer pins acquired outside
diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c
index 1ad65c237c34..0d2ade1f9057 100644
--- a/src/backend/postmaster/bgworker.c
+++ b/src/backend/postmaster/bgworker.c
@@ -18,6 +18,7 @@
#include "pgstat.h"
#include "port/atomics.h"
#include "postmaster/bgworker_internals.h"
+#include "postmaster/datachecksumsworker.h"
#include "postmaster/postmaster.h"
#include "replication/logicallauncher.h"
#include "replication/logicalworker.h"
@@ -132,6 +133,12 @@ static const struct
},
{
"TablesyncWorkerMain", TablesyncWorkerMain
+ },
+ {
+ "DataChecksumsWorkerLauncherMain", DataChecksumsWorkerLauncherMain
+ },
+ {
+ "DataChecksumsWorkerMain", DataChecksumsWorkerMain
}
};
diff --git a/src/backend/postmaster/datachecksumsworker.c b/src/backend/postmaster/datachecksumsworker.c
new file mode 100644
index 000000000000..ff451d502ba7
--- /dev/null
+++ b/src/backend/postmaster/datachecksumsworker.c
@@ -0,0 +1,1463 @@
+/*-------------------------------------------------------------------------
+ *
+ * datachecksumsworker.c
+ * Background worker for enabling or disabling data checksums online
+ *
+ * When enabling data checksums on a database at initdb time or when shut down
+ * with pg_checksums, no extra process is required as each page is checksummed,
+ * and verified, when accessed. When enabling checksums on an already running
+ * cluster, this worker will ensure that all pages are checksummed before
+ * verification of the checksums is turned on. In the case of disabling
+ * checksums, the state transition is performed only in the control file, no
+ * changes are performed on the data pages.
+ *
+ * Checksums can be either enabled or disabled cluster-wide, with on/off being
+ * the end state for data_checksums.
+ *
+ * Enabling checksums
+ * ------------------
+ * When enabling checksums in an online cluster, data_checksums will be set to
+ * "inprogress-on" which signals that write operations MUST compute and write
+ * the checksum on the data page, but during reading the checksum SHALL NOT be
+ * verified. This ensures that all objects created during checksumming will
+ * have checksums set, but no reads will fail due to incorrect checksum. The
+ * DataChecksumsWorker will compile a list of databases which exist at the
+ * start of checksumming, and all of these which haven't been dropped during
+ * the processing MUST have been processed successfully in order for checksums
+ * to be enabled. Any new relation created during processing will see the
+ * in-progress state and will automatically be checksummed.
+ *
+ * For each database, all relations which have storage are read and every data
+ * page is marked dirty to force a write with the checksum. This will generate
+ * a lot of WAL as the entire database is read and written.
+ *
+ * If the processing is interrupted by a cluster restart, it will be restarted
+ * from the beginning again as state isn't persisted.
+ *
+ * Disabling checksums
+ * -------------------
+ * When disabling checksums, data_checksums will be set to "inprogress-off"
+ * which signals that checksums are written but no longer verified. This ensures
+ * that backends which have yet to move from the "on" state will still be able
+ * to process data checksum validation.
+ *
+ * Synchronization and Correctness
+ * -------------------------------
+ * The processes involved in enabling, or disabling, data checksums in an
+ * online cluster must be properly synchronized with the normal backends
+ * serving concurrent queries to ensure correctness. Correctness is defined
+ * as the following:
+ *
+ * - Backends SHALL NOT violate local data_checksums state
+ * - Data checksums SHALL NOT be considered enabled cluster-wide until all
+ * currently connected backends have the local state "enabled"
+ *
+ * There are two levels of synchronization required for enabling data checksums
+ * in an online cluster: (i) changing state in the active backends ("on",
+ * "off", "inprogress-on" and "inprogress-off"), and (ii) ensuring no
+ * incompatible objects and processes are left in a database when workers end.
+ * The former deals with cluster-wide agreement on data checksum state and the
+ * latter with ensuring that any concurrent activity cannot break the data
+ * checksum contract during processing.
+ *
+ * Synchronizing the state change is done with procsignal barriers, where the
+ * WAL logging backend updating the global state in the controlfile will wait
+ * for all other backends to absorb the barrier. Barrier absorption will happen
+ * during interrupt processing, which means that connected backends will change
+ * state at different times. To prevent data checksum state changes when
+ * writing and verifying checksums, interrupts shall be held off before
+ * interrogating state and resumed when the IO operation has been performed.
+ *
+ * When Enabling Data Checksums
+ * ----------------------------
+ * A process which fails to observe data checksums being enabled can induce
+ * two types of errors: failing to write the checksum when modifying the page
+ * and failing to validate the data checksum on the page when reading it.
+ *
+ * When processing starts all backends belong to one of the below sets, with
+ * one set being empty:
+ *
+ * Bd: Backends in "off" state
+ * Bi: Backends in "inprogress-on" state
+ *
+ * If processing is started in an online cluster then all backends are in Bd.
+ * If processing was halted by the cluster shutting down, the controlfile
+ * state "inprogress-on" will be observed on system startup and all backends
+ * will be in Bd. Backends transition Bd -> Bi via a procsignalbarrier. When
+ * the DataChecksumsWorker has finished writing checksums on all pages and
+ * enables data checksums cluster-wide, there are four sets of backends where
+ * Bd shall be an empty set:
+ *
+ * Bg: Backend updating the global state and emitting the procsignalbarrier
+ * Bd: Backends in "off" state
+ * Be: Backends in "on" state
+ * Bi: Backends in "inprogress-on" state
+ *
+ * Backends in Bi and Be will write checksums when modifying a page, but only
+ * backends in Be will verify the checksum during reading. The Bg backend is
+ * blocked waiting for all backends in Bi to process interrupts and move to
+ * Be. Any backend starting while Bg is waiting on the procsignalbarrier will
+ * observe the global state being "on" and will thus automatically belong to
+ * Be. Checksums are enabled cluster-wide when Bi is an empty set. Bi and Be
+ * are compatible sets while still operating based on their local state as
+ * both write data checksums.
+ *
+ * When Disabling Data Checksums
+ * -----------------------------
+ * A process which fails to observe that data checksums have been disabled
+ * can induce two types of errors: writing the checksum when modifying the
+ * page and validating a data checksum which is no longer correct due to
+ * modifications to the page.
+ *
+ * Bg: Backend updating the global state and emitting the procsignalbarrier
+ * Bd: Backends in "off" state
+ * Be: Backends in "on" state
+ * Bo: Backends in "inprogress-off" state
+ *
+ * Backends transition from the Be state to Bd like so: Be -> Bo -> Bd
+ *
+ * The goal is to transition all backends to Bd making the others empty sets.
+ * Backends in Bo write data checksums, but don't validate them, such that
+ * backends still in Be can continue to validate pages until the barrier has
+ * been absorbed such that they are in Bo. Once all backends are in Bo, the
+ * barrier to transition to "off" can be raised and all backends can safely
+ * stop writing data checksums as no backend is enforcing data checksum
+ * validation any longer.
+ *
+ *
+ * Potential optimizations
+ * -----------------------
+ * Below are some potential optimizations and improvements which were brought
+ * up during reviews of this feature, but which weren't implemented in the
+ * initial version. These are ideas listed without any validation on their
+ * feasibility or potential payoff. More discussion on these can be found on
+ * the -hackers threads linked to in the commit message of this feature.
+ *
+ * * Launching datachecksumsworker for resuming operation from the startup
+ * process: Currently users have to restart processing manually after a
+ * restart since dynamic background worker cannot be started from the
+ * postmaster. Changing the startup process could make restarting the
+ * processing automatic on cluster restart.
+ * * Avoid dirtying the page when checksums already match: If the checksum
+ * on the page happens to already match we still dirty the page. It should
+ * be enough to only do the log_newpage_buffer() call in that case.
+ * * Invent a lightweight WAL record that doesn't contain the full-page
+ * image but just the block number: On replay, the redo routine would read
+ * the page from disk.
+ * * Teach pg_checksums to avoid checksummed pages when pg_checksums is used
+ * to enable checksums on a cluster which is in inprogress-on state and
+ * may have checksummed pages (make pg_checksums be able to resume an
+ * online operation).
+ *
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/postmaster/datachecksumsworker.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/xact.h"
+#include "access/xloginsert.h"
+#include "catalog/indexing.h"
+#include "catalog/pg_class.h"
+#include "catalog/pg_database.h"
+#include "commands/progress.h"
+#include "commands/vacuum.h"
+#include "common/relpath.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/datachecksumsworker.h"
+#include "storage/bufmgr.h"
+#include "storage/checksum.h"
+#include "storage/ipc.h"
+#include "storage/lmgr.h"
+#include "storage/lwlock.h"
+#include "storage/procarray.h"
+#include "storage/smgr.h"
+#include "tcop/tcopprot.h"
+#include "utils/fmgroids.h"
+#include "utils/injection_point.h"
+#include "utils/lsyscache.h"
+#include "utils/ps_status.h"
+#include "utils/syscache.h"
+
+/*
+ * Number of times we retry to open a database before giving up and consider
+ * it to have failed processing.
+ */
+#define DATACHECKSUMSWORKER_MAX_DB_RETRIES 5
+
+/*
+ * Signaling between backends calling pg_enable/disable_data_checksums, the
+ * checksums launcher process, and the checksums worker process.
+ *
+ * This struct is protected by DataChecksumsWorkerLock
+ */
+typedef struct DataChecksumsWorkerShmemStruct
+{
+ /*
+ * These are set by pg_{enable|disable}_data_checksums, to tell the
+ * launcher what the target state is.
+ */
+ DataChecksumsWorkerOperation launch_operation;
+ int launch_cost_delay;
+ int launch_cost_limit;
+ bool launch_fast;
+
+ /*
+ * Is a launcher process currently running?
+ *
+ * This is set by the launcher process, after it has read the above
+ * launch_* parameters.
+ */
+ bool launcher_running;
+
+ /*
+ * These fields indicate the target state that the launcher is currently
+ * working towards. They can be different from the corresponding launch_*
+ * fields, if a new pg_enable/disable_data_checksums() call was made while
+ * the launcher/worker was already running.
+ *
+ * The below members are set when the launcher starts, and are only
+ * accessed read-only by the single worker. Thus, we can access these
+ * without a lock. If multiple workers, or dynamic cost parameters, are
+ * supported at some point then this would need to be revisited.
+ */
+ DataChecksumsWorkerOperation operation;
+ int cost_delay;
+ int cost_limit;
+ bool immediate_checkpoint;
+
+ /*
+ * Signaling between the launcher and the worker process.
+ *
+ * As there is only a single worker, and the launcher won't read these
+ * until the worker exits, they can be accessed without the need for a
+ * lock. If multiple workers are supported then this will have to be
+ * revisited.
+ */
+
+ /* result, set by worker before exiting */
+ DataChecksumsWorkerResult success;
+
+ /*
+ * tells the worker process whether it should also process the shared
+ * catalogs
+ */
+ bool process_shared_catalogs;
+} DataChecksumsWorkerShmemStruct;
+
+/* Shared memory segment for datachecksumsworker */
+static DataChecksumsWorkerShmemStruct *DataChecksumsWorkerShmem;
+
+typedef struct DataChecksumsWorkerDatabase
+{
+ Oid dboid;
+ char *dbname;
+} DataChecksumsWorkerDatabase;
+
+typedef struct DataChecksumsWorkerResultEntry
+{
+ Oid dboid;
+ DataChecksumsWorkerResult result;
+ int retries;
+} DataChecksumsWorkerResultEntry;
+
+
+/*
+ * Flag set by the interrupt handler
+ */
+static volatile sig_atomic_t abort_requested = false;
+
+/*
+ * Have we set the DataChecksumsWorkerShmemStruct->launcher_running flag?
+ * If we have, we need to clear it before exiting!
+ */
+static volatile sig_atomic_t launcher_running = false;
+
+/*
+ * Are we enabling data checksums, or disabling them?
+ */
+static DataChecksumsWorkerOperation operation;
+
+/* Prototypes */
+static List *BuildDatabaseList(void);
+static List *BuildRelationList(bool temp_relations, bool include_shared);
+static void FreeDatabaseList(List *dblist);
+static DataChecksumsWorkerResult ProcessDatabase(DataChecksumsWorkerDatabase *db);
+static bool ProcessAllDatabases(bool immediate_checkpoint);
+static bool ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy);
+static void launcher_cancel_handler(SIGNAL_ARGS);
+static void WaitForAllTransactionsToFinish(void);
+
+/*
+ * StartDataChecksumsWorkerLauncher
+ * Main entry point for datachecksumsworker launcher process
+ *
+ * The main entrypoint for starting data checksums processing for enabling as
+ * well as disabling.
+ */
+void
+StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op,
+ int cost_delay,
+ int cost_limit,
+ bool fast)
+{
+ BackgroundWorker bgw;
+ BackgroundWorkerHandle *bgw_handle;
+ bool launcher_running;
+
+#ifdef USE_ASSERT_CHECKING
+ /* The cost delay settings have no effect when disabling */
+ if (op == DISABLE_DATACHECKSUMS)
+ Assert(cost_delay == 0 && cost_limit == 0);
+#endif
+
+ /* Store the desired state in shared memory */
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+
+ DataChecksumsWorkerShmem->launch_operation = op;
+ DataChecksumsWorkerShmem->launch_cost_delay = cost_delay;
+ DataChecksumsWorkerShmem->launch_cost_limit = cost_limit;
+ DataChecksumsWorkerShmem->launch_fast = fast;
+
+ /* is the launcher already running? */
+ launcher_running = DataChecksumsWorkerShmem->launcher_running;
+
+ LWLockRelease(DataChecksumsWorkerLock);
+
+ /*
+ * Launch a new launcher process, if it's not running already.
+ *
+ * If the launcher is currently busy enabling the checksums, and we want
+ * them disabled (or vice versa), the launcher will notice that at the latest
+ * when it's about to exit, and will loop back to process the new request. So
+ * if the launcher is already running, we don't need to do anything more
+ * here to abort it.
+ *
+ * If you call pg_enable/disable_data_checksums() twice in a row, before
+ * the launcher has had a chance to start up, we still end up launching it
+ * twice. That's OK, the second invocation will see that a launcher is
+ * already running and exit quickly.
+ *
+ * TODO: We could optimize here and skip launching the launcher, if we are
+ * already in the desired state, i.e. if the checksums are already enabled
+ * and you call pg_enable_data_checksums().
+ */
+ if (!launcher_running)
+ {
+ /*
+ * Prepare the BackgroundWorker and launch it.
+ */
+ memset(&bgw, 0, sizeof(bgw));
+ bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
+ bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+ snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+ snprintf(bgw.bgw_function_name, BGW_MAXLEN, "DataChecksumsWorkerLauncherMain");
+ snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksumsworker launcher");
+ snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksumsworker launcher");
+ bgw.bgw_restart_time = BGW_NEVER_RESTART;
+ bgw.bgw_notify_pid = MyProcPid;
+ bgw.bgw_main_arg = (Datum) 0;
+
+ if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+ ereport(ERROR,
+ errmsg("failed to start background worker to process data checksums"));
+ }
+}
+
+/*
+ * ProcessSingleRelationFork
+ * Enable data checksums in a single relation/fork.
+ *
+ * Returns true if successful, and false if *aborted*. On error, an actual
+ * error is raised in the lower levels.
+ */
+static bool
+ProcessSingleRelationFork(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy)
+{
+ BlockNumber numblocks = RelationGetNumberOfBlocksInFork(reln, forkNum);
+ char activity[NAMEDATALEN * 2 + 128];
+ char *relns;
+
+ relns = get_namespace_name(RelationGetNamespace(reln));
+
+ if (!relns)
+ return false;
+
+ /* Report the current relation to pgstat_activity */
+ snprintf(activity, sizeof(activity) - 1, "processing: %s.%s (%s, %dblocks)",
+ relns, RelationGetRelationName(reln), forkNames[forkNum], numblocks);
+ pgstat_report_activity(STATE_RUNNING, activity);
+
+ /*
+ * As of now we only update the block counter for main forks in order to
+ * not cause too frequent calls. TODO: investigate whether we should do it
+ * more frequently?
+ */
+ if (forkNum == MAIN_FORKNUM)
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL,
+ numblocks);
+
+ /*
+ * We are looping over the blocks which existed at the time of process
+ * start, which is safe since new blocks are created with checksums set
+ * already due to the state being "inprogress-on".
+ */
+ for (BlockNumber blknum = 0; blknum < numblocks; blknum++)
+ {
+ Buffer buf = ReadBufferExtended(reln, forkNum, blknum, RBM_NORMAL, strategy);
+
+ /* Need to get an exclusive lock before we can flag as dirty */
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * Mark the buffer as dirty and force a full page write. We have to
+ * re-write the page to WAL even if the checksum hasn't changed,
+ * because if there is a replica it might have a slightly different
+ * version of the page with an invalid checksum, caused by unlogged
+ * changes (e.g. hintbits) on the master happening while checksums
+ * were off. This can happen if there was a valid checksum on the page
+ * at one point in the past, so only when checksums are first on, then
+ * off, and then turned on again. TODO: investigate if this could be
+ * avoided if the checksum is calculated to be correct and wal_level
+ * is set to "minimal",
+ */
+ START_CRIT_SECTION();
+ MarkBufferDirty(buf);
+ log_newpage_buffer(buf, false);
+ END_CRIT_SECTION();
+
+ UnlockReleaseBuffer(buf);
+
+ /*
+ * This is the only place where we check if we are asked to abort; the
+ * abort will bubble up from here. It's safe to check this without
+ * a lock, because if we miss it being set, we will try again soon.
+ */
+ Assert(operation == ENABLE_DATACHECKSUMS);
+ if (DataChecksumsWorkerShmem->launch_operation == DISABLE_DATACHECKSUMS)
+ abort_requested = true;
+
+ if (abort_requested)
+ return false;
+
+ /*
+ * As of now we only update the block counter for main forks in order
+ * to not cause too frequent calls. TODO: investigate whether we
+ * should do it more frequently?
+ */
+ if (forkNum == MAIN_FORKNUM)
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_BLOCKS_DONE,
+ (blknum + 1));
+
+ vacuum_delay_point(false);
+ }
+
+ pfree(relns);
+ return true;
+}
+
+/*
+ * ProcessSingleRelationByOid
+ * Process a single relation based on oid.
+ *
+ * Returns true if successful, and false if *aborted*. On error, an actual
+ * error is raised in the lower levels.
+ */
+static bool
+ProcessSingleRelationByOid(Oid relationId, BufferAccessStrategy strategy)
+{
+ Relation rel;
+ bool aborted = false;
+
+ StartTransactionCommand();
+
+ rel = try_relation_open(relationId, AccessShareLock);
+ if (rel == NULL)
+ {
+ /*
+ * Relation no longer exists. We don't consider this an error since
+ * there are no pages in it that need data checksums, and thus return
+ * true. The worker operates off a list of relations generated at the
+ * start of processing, so relations being dropped in the meantime is
+ * to be expected.
+ */
+ CommitTransactionCommand();
+ pgstat_report_activity(STATE_IDLE, NULL);
+ return true;
+ }
+ RelationGetSmgr(rel);
+
+ for (ForkNumber fnum = 0; fnum <= MAX_FORKNUM; fnum++)
+ {
+ if (smgrexists(rel->rd_smgr, fnum))
+ {
+ if (!ProcessSingleRelationFork(rel, fnum, strategy))
+ {
+ aborted = true;
+ break;
+ }
+ }
+ }
+ relation_close(rel, AccessShareLock);
+ elog(DEBUG2,
+ "data checksum processing done for relation with OID %u: %s",
+ relationId, (aborted ? "aborted" : "finished"));
+
+ CommitTransactionCommand();
+
+ pgstat_report_activity(STATE_IDLE, NULL);
+
+ return !aborted;
+}
+
+/*
+ * ProcessDatabase
+ * Enable data checksums in a single database.
+ *
+ * We do this by launching a dynamic background worker into this database, and
+ * waiting for it to finish. We have to do this in a separate worker, since
+ * each process can only be connected to one database during its lifetime.
+ */
+static DataChecksumsWorkerResult
+ProcessDatabase(DataChecksumsWorkerDatabase *db)
+{
+ BackgroundWorker bgw;
+ BackgroundWorkerHandle *bgw_handle;
+ BgwHandleStatus status;
+ pid_t pid;
+ char activity[NAMEDATALEN + 64];
+
+ DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_FAILED;
+
+ memset(&bgw, 0, sizeof(bgw));
+ bgw.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
+ bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+ snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+ snprintf(bgw.bgw_function_name, BGW_MAXLEN, "%s", "DataChecksumsWorkerMain");
+ snprintf(bgw.bgw_name, BGW_MAXLEN, "datachecksumsworker worker");
+ snprintf(bgw.bgw_type, BGW_MAXLEN, "datachecksumsworker worker");
+ bgw.bgw_restart_time = BGW_NEVER_RESTART;
+ bgw.bgw_notify_pid = MyProcPid;
+ bgw.bgw_main_arg = ObjectIdGetDatum(db->dboid);
+
+ /*
+ * If there are no worker slots available, make sure we retry processing
+ * this database. This will make the datachecksumsworker move on to the
+ * next database and quite likely fail with the same problem. TODO: Maybe
+ * we need a backoff to avoid running through all the databases here in
+ * short order.
+ */
+ if (!RegisterDynamicBackgroundWorker(&bgw, &bgw_handle))
+ {
+ ereport(WARNING,
+ errmsg("failed to start worker for enabling data checksums in database \"%s\", retrying",
+ db->dbname),
+ errhint("The max_worker_processes setting might be too low."));
+ return DATACHECKSUMSWORKER_RETRYDB;
+ }
+
+ status = WaitForBackgroundWorkerStartup(bgw_handle, &pid);
+ if (status == BGWH_STOPPED)
+ {
+ ereport(WARNING,
+ errmsg("could not start background worker for enabling data checksums in database \"%s\"",
+ db->dbname),
+ errhint("More details on the error might be found in the server log."));
+ return DATACHECKSUMSWORKER_FAILED;
+ }
+
+ /*
+ * If the postmaster crashed we cannot end up with a processed database so
+ * we have no alternative other than exiting. When enabling checksums we
+ * won't at this time have changed the pg_control version to enabled so
+ * when the cluster comes back up processing will have to be restarted.
+ * When disabling, the pg_control version will be set to off before this
+ * so when the cluster comes up checksums will be off as expected.
+ */
+ if (status == BGWH_POSTMASTER_DIED)
+ ereport(FATAL,
+ errmsg("cannot enable data checksums without the postmaster process"),
+ errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
+
+ Assert(status == BGWH_STARTED);
+ ereport(DEBUG1,
+ errmsg("initiating data checksum processing in database \"%s\"",
+ db->dbname));
+
+ /*
+ * snprintf() truncates and NUL-terminates within the given size, so the
+ * full buffer size can be passed here (consistent with other callers).
+ */
+ snprintf(activity, sizeof(activity),
+ "Waiting for worker in database %s (pid %ld)", db->dbname, (long) pid);
+ pgstat_report_activity(STATE_RUNNING, activity);
+
+ status = WaitForBackgroundWorkerShutdown(bgw_handle);
+ if (status == BGWH_POSTMASTER_DIED)
+ ereport(FATAL,
+ errmsg("postmaster exited during data checksum processing in \"%s\"",
+ db->dbname),
+ errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
+
+ if (DataChecksumsWorkerShmem->success == DATACHECKSUMSWORKER_ABORTED)
+ ereport(LOG,
+ errmsg("data checksums processing was aborted in database \"%s\"",
+ db->dbname));
+
+ pgstat_report_activity(STATE_IDLE, NULL);
+
+ return DataChecksumsWorkerShmem->success;
+}
+
+/*
+ * launcher_exit
+ *
+ * Internal routine for cleaning up state when the launcher process exits. We
+ * need to clean up the abort flag to ensure that processing can be restarted
+ * again after it was previously aborted.
+ */
+static void
+launcher_exit(int code, Datum arg)
+{
+ /* Only clear shared state if this process actually owned the launcher */
+ if (launcher_running)
+ {
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ launcher_running = false;
+ DataChecksumsWorkerShmem->launcher_running = false;
+ LWLockRelease(DataChecksumsWorkerLock);
+ }
+}
+
+/*
+ * launcher_cancel_handler
+ *
+ * Internal routine for reacting to SIGINT and flagging the worker to abort.
+ * The worker won't be interrupted immediately but will check for abort flag
+ * between each block in a relation.
+ */
+static void
+launcher_cancel_handler(SIGNAL_ARGS)
+{
+ /* Signal handlers must not clobber errno; save and restore it */
+ int save_errno = errno;
+
+ abort_requested = true;
+
+ /*
+ * There is no sleeping in the main loop, the flag will be checked
+ * periodically in ProcessSingleRelationFork. The worker does however
+ * sleep when waiting for concurrent transactions to end so we still need
+ * to set the latch.
+ */
+ SetLatch(MyLatch);
+
+ errno = save_errno;
+}
+
+/*
+ * WaitForAllTransactionsToFinish
+ * Blocks awaiting all current transactions to finish
+ *
+ * Returns when all transactions which are active at the call of the function
+ * have ended, or if the postmaster dies while waiting. If the postmaster dies
+ * the abort flag will be set to indicate that the caller of this shouldn't
+ * proceed.
+ *
+ * NB: this will return early, if aborted by SIGINT or if the target state
+ * is changed while we're running.
+ */
+static void
+WaitForAllTransactionsToFinish(void)
+{
+ TransactionId waitforxid;
+
+ /* Snapshot the next xid; anything older than this must finish first */
+ LWLockAcquire(XidGenLock, LW_SHARED);
+ waitforxid = XidFromFullTransactionId(TransamVariables->nextXid);
+ LWLockRelease(XidGenLock);
+
+ while (TransactionIdPrecedes(GetOldestActiveTransactionId(false, true), waitforxid))
+ {
+ char activity[64];
+ int rc;
+
+ /* Oldest running xid is older than us, so wait */
+ snprintf(activity,
+ sizeof(activity),
+ "Waiting for current transactions to finish (waiting for %u)",
+ waitforxid);
+ pgstat_report_activity(STATE_RUNNING, activity);
+
+ /* Retry every 3 seconds */
+ ResetLatch(MyLatch);
+ rc = WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+ 3000,
+ WAIT_EVENT_CHECKSUM_ENABLE_STARTCONDITION);
+
+ /*
+ * If the postmaster died we won't be able to enable checksums
+ * cluster-wide so abort and hope to continue when restarted.
+ */
+ if (rc & WL_POSTMASTER_DEATH)
+ ereport(FATAL,
+ errmsg("postmaster exited during data checksum processing"),
+ errhint("Restart the database and restart data checksum processing by calling pg_enable_data_checksums()."));
+
+ /* A changed target state makes our current work obsolete; bail out */
+ LWLockAcquire(DataChecksumsWorkerLock, LW_SHARED);
+ if (DataChecksumsWorkerShmem->launch_operation != operation)
+ abort_requested = true;
+ LWLockRelease(DataChecksumsWorkerLock);
+ if (abort_requested)
+ break;
+ }
+
+ pgstat_report_activity(STATE_IDLE, NULL);
+ return;
+}
+
+/*
+ * DataChecksumsWorkerLauncherMain
+ *
+ * Main function for launching dynamic background workers for processing data
+ * checksums in databases. This function has the bgworker management, with
+ * ProcessAllDatabases being responsible for looping over the databases and
+ * initiating processing.
+ */
+void
+DataChecksumsWorkerLauncherMain(Datum arg)
+{
+ on_shmem_exit(launcher_exit, 0);
+
+ ereport(DEBUG1,
+ errmsg("background worker \"datachecksumsworker\" launcher started"));
+
+ pqsignal(SIGTERM, die);
+ pqsignal(SIGINT, launcher_cancel_handler);
+
+ BackgroundWorkerUnblockSignals();
+
+ MyBackendType = B_DATACHECKSUMSWORKER_LAUNCHER;
+ init_ps_display(NULL);
+
+ /* Serialize against any concurrently running launcher */
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+
+ if (DataChecksumsWorkerShmem->launcher_running)
+ {
+ /* Launcher was already running, let it finish */
+ LWLockRelease(DataChecksumsWorkerLock);
+ return;
+ }
+
+ launcher_running = true;
+
+ /*
+ * Initialize a connection to shared catalogs only.
+ */
+ BackgroundWorkerInitializeConnectionByOid(InvalidOid, InvalidOid, 0);
+
+ /* Copy the requested launch parameters into the active slots */
+ operation = DataChecksumsWorkerShmem->launch_operation;
+ DataChecksumsWorkerShmem->launcher_running = true;
+ DataChecksumsWorkerShmem->operation = operation;
+ DataChecksumsWorkerShmem->cost_delay = DataChecksumsWorkerShmem->launch_cost_delay;
+ DataChecksumsWorkerShmem->cost_limit = DataChecksumsWorkerShmem->launch_cost_limit;
+ DataChecksumsWorkerShmem->immediate_checkpoint = DataChecksumsWorkerShmem->launch_fast;
+ LWLockRelease(DataChecksumsWorkerLock);
+
+ /*
+ * The target state can change while we are busy enabling/disabling
+ * checksums, if the user calls pg_disable/enable_data_checksums() before
+ * we are finished with the previous request. In that case, we will loop
+ * back here, to process the new request.
+ */
+again:
+
+ pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS,
+ InvalidOid);
+
+ if (operation == ENABLE_DATACHECKSUMS)
+ {
+ /*
+ * If we are asked to enable checksums in a cluster which already has
+ * checksums enabled, exit immediately as there is nothing more to do.
+ * Hold interrupts to make sure state doesn't change during checking.
+ */
+ HOLD_INTERRUPTS();
+ if (DataChecksumsNeedVerify())
+ {
+ RESUME_INTERRUPTS();
+ goto done;
+ }
+ RESUME_INTERRUPTS();
+
+ /*
+ * Set the state to inprogress-on and wait on the procsignal barrier.
+ */
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
+ PROGRESS_DATACHECKSUMS_PHASE_ENABLING);
+ SetDataChecksumsOnInProgress();
+
+ /*
+ * All backends are now in inprogress-on state and are writing data
+ * checksums. Start processing all data at rest.
+ */
+ if (!ProcessAllDatabases(DataChecksumsWorkerShmem->immediate_checkpoint))
+ {
+ /*
+ * If the target state changed during processing then it's not a
+ * failure, so restart processing instead.
+ */
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ if (DataChecksumsWorkerShmem->launch_operation != operation)
+ {
+ LWLockRelease(DataChecksumsWorkerLock);
+ goto done;
+ }
+ LWLockRelease(DataChecksumsWorkerLock);
+ ereport(ERROR,
+ errmsg("unable to enable data checksums in cluster"));
+ }
+
+ /*
+ * Data checksums have been set on all pages, set the state to on in
+ * order to instruct backends to validate checksums on reading.
+ */
+ SetDataChecksumsOn();
+ }
+ else
+ {
+ int flags;
+
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
+ PROGRESS_DATACHECKSUMS_PHASE_DISABLING);
+ SetDataChecksumsOff();
+
+ flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT;
+ if (DataChecksumsWorkerShmem->immediate_checkpoint)
+ flags = flags | CHECKPOINT_FAST;
+ RequestCheckpoint(flags);
+ }
+
+done:
+
+ /*
+ * All done. But before we exit, check if the target state was changed
+ * while we were running. In that case we will have to start all over
+ * again.
+ */
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ if (DataChecksumsWorkerShmem->launch_operation != operation)
+ {
+ DataChecksumsWorkerShmem->operation = DataChecksumsWorkerShmem->launch_operation;
+ operation = DataChecksumsWorkerShmem->launch_operation;
+ DataChecksumsWorkerShmem->cost_delay = DataChecksumsWorkerShmem->launch_cost_delay;
+ DataChecksumsWorkerShmem->cost_limit = DataChecksumsWorkerShmem->launch_cost_limit;
+ LWLockRelease(DataChecksumsWorkerLock);
+ goto again;
+ }
+
+ /* Shut down progress reporting as we are done */
+ pgstat_progress_end_command();
+
+ /* Note: DataChecksumsWorkerLock is still held from the recheck above */
+ launcher_running = false;
+ DataChecksumsWorkerShmem->launcher_running = false;
+ LWLockRelease(DataChecksumsWorkerLock);
+}
+
+/*
+ * ProcessAllDatabases
+ * Compute the list of all databases and process checksums in each
+ *
+ * This will repeatedly generate a list of databases to process for enabling
+ * checksums. Until no new databases are found, this will loop around computing
+ * a new list and comparing it to the already seen ones.
+ *
+ * If immediate_checkpoint is set to true then a CHECKPOINT_FAST will be
+ * issued. This is useful for testing but should be avoided in production use
+ * as it may affect cluster performance drastically.
+ */
+static bool
+ProcessAllDatabases(bool immediate_checkpoint)
+{
+ List *DatabaseList;
+ HTAB *ProcessedDatabases = NULL;
+ HASHCTL hash_ctl;
+ bool found_failed = false;
+ int flags;
+
+ /* Initialize a hash tracking all processed databases */
+ memset(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(Oid);
+ hash_ctl.entrysize = sizeof(DataChecksumsWorkerResultEntry);
+ ProcessedDatabases = hash_create("Processed databases",
+ 64,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS);
+
+ /*
+ * Set up so first run processes shared catalogs, but not once in every
+ * db.
+ */
+ DataChecksumsWorkerShmem->process_shared_catalogs = true;
+
+ /*
+ * Get a list of all databases to process. This may include databases that
+ * were created during our runtime. Since a database can be created as a
+ * copy of any other database (which may not have existed in our last
+ * run), we have to repeat this loop until no new databases show up in the
+ * list.
+ */
+ DatabaseList = BuildDatabaseList();
+
+ /* Allow a test case to modify the initial list of databases */
+ INJECTION_POINT("datachecksumsworker-initial-dblist", DatabaseList);
+
+ /*
+ * Update progress reporting with the total number of databases we need to
+ * process. This number should not be changed during processing, the
+ * columns for processed databases is instead increased such that it can
+ * be compared against the total.
+ */
+ {
+ const int index[] = {
+ PROGRESS_DATACHECKSUMS_DBS_TOTAL,
+ PROGRESS_DATACHECKSUMS_DBS_DONE,
+ PROGRESS_DATACHECKSUMS_RELS_TOTAL,
+ PROGRESS_DATACHECKSUMS_RELS_DONE,
+ PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL,
+ PROGRESS_DATACHECKSUMS_BLOCKS_DONE,
+ };
+
+ int64 vals[6];
+
+ vals[0] = list_length(DatabaseList);
+ vals[1] = 0;
+
+ /* translated to NULL */
+ vals[2] = -1;
+ vals[3] = -1;
+ vals[4] = -1;
+ vals[5] = -1;
+
+ pgstat_progress_update_multi_param(6, index, vals);
+ }
+
+ while (true)
+ {
+ int processed_databases = 0;
+
+ foreach_ptr(DataChecksumsWorkerDatabase, db, DatabaseList)
+ {
+ DataChecksumsWorkerResult result;
+ DataChecksumsWorkerResultEntry *entry;
+ bool found;
+
+ /*
+ * Check if this database has been processed already, and if so
+ * whether it should be retried or skipped.
+ */
+ entry = (DataChecksumsWorkerResultEntry *) hash_search(ProcessedDatabases, &db->dboid,
+ HASH_FIND, NULL);
+
+ if (entry)
+ {
+ if (entry->result == DATACHECKSUMSWORKER_RETRYDB)
+ {
+ /*
+ * Limit the number of retries to avoid infinite looping
+ * in case there simply won't be enough workers in the
+ * cluster to finish this operation.
+ */
+ if (entry->retries > DATACHECKSUMSWORKER_MAX_DB_RETRIES)
+ entry->result = DATACHECKSUMSWORKER_FAILED;
+ }
+
+ /* Skip if this database has been processed already */
+ if (entry->result != DATACHECKSUMSWORKER_RETRYDB)
+ continue;
+ }
+
+ result = ProcessDatabase(db);
+ processed_databases++;
+
+ /*
+ * Update the number of processed databases in the progress
+ * report.
+ */
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_DBS_DONE,
+ processed_databases);
+
+ /* Allow a test process to alter the result of the operation */
+ INJECTION_POINT("datachecksumsworker-fail-db", &result);
+
+ if (result == DATACHECKSUMSWORKER_SUCCESSFUL)
+ {
+ /*
+ * If one database has completed shared catalogs, we don't
+ * have to process them again.
+ */
+ if (DataChecksumsWorkerShmem->process_shared_catalogs)
+ DataChecksumsWorkerShmem->process_shared_catalogs = false;
+ }
+ else if (result == DATACHECKSUMSWORKER_ABORTED)
+ {
+ /* Abort flag set, so exit the whole process */
+
+ /*
+ * NOTE(review): DatabaseList and ProcessedDatabases are not
+ * freed on this path; presumably reclaimed by memory context
+ * reset when the launcher exits -- confirm.
+ */
+ return false;
+ }
+
+ entry = hash_search(ProcessedDatabases, &db->dboid, HASH_ENTER, &found);
+ entry->dboid = db->dboid;
+ entry->result = result;
+ if (!found)
+ entry->retries = 0;
+ else
+ entry->retries++;
+ }
+
+ elog(DEBUG1,
+ "%i databases processed for data checksum enabling, %s",
+ processed_databases,
+ (processed_databases ? "process with restart" : "process completed"));
+
+ FreeDatabaseList(DatabaseList);
+
+ /*
+ * If no databases were processed in this run of the loop, we have now
+ * finished all databases and no concurrently created ones can exist.
+ */
+ if (processed_databases == 0)
+ break;
+
+ /*
+ * Re-generate the list of databases for another pass. Since we wait
+ * for all pre-existing transactions finish, this way we can be
+ * certain that there are no databases left without checksums.
+ */
+ WaitForAllTransactionsToFinish();
+ DatabaseList = BuildDatabaseList();
+ }
+
+ /*
+ * ProcessedDatabases now has all databases and the results of their
+ * processing. Failure to enable checksums for a database can be because
+ * they actually failed for some reason, or because the database was
+ * dropped between us getting the database list and trying to process it.
+ * Get a fresh list of databases to detect the second case where the
+ * database was dropped before we had started processing it. If a database
+ * still exists, but enabling checksums failed then we fail the entire
+ * checksumming process and exit with an error.
+ */
+ WaitForAllTransactionsToFinish();
+ DatabaseList = BuildDatabaseList();
+
+ foreach_ptr(DataChecksumsWorkerDatabase, db, DatabaseList)
+ {
+ DataChecksumsWorkerResultEntry *entry;
+ bool found;
+
+ entry = hash_search(ProcessedDatabases, (void *) &db->dboid,
+ HASH_FIND, &found);
+
+ /*
+ * We are only interested in the processed databases which failed, and
+ * where the failed database still exists. This indicates that
+ * enabling checksums actually failed, and not that the failure was
+ * due to the db being concurrently dropped.
+ */
+ if (found && entry->result == DATACHECKSUMSWORKER_FAILED)
+ {
+ ereport(WARNING,
+ errmsg("failed to enable data checksums in \"%s\"", db->dbname));
+ /* "found" is necessarily true in this branch */
+ found_failed = found;
+ continue;
+ }
+ }
+
+ FreeDatabaseList(DatabaseList);
+
+ if (found_failed)
+ {
+ /* Disable checksums on cluster, because we failed */
+ SetDataChecksumsOff();
+ /* Force a checkpoint to make everything consistent */
+ flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT;
+ if (immediate_checkpoint)
+ flags = flags | CHECKPOINT_FAST;
+ RequestCheckpoint(flags);
+ ereport(ERROR,
+ errmsg("data checksums failed to get enabled in all databases, aborting"),
+ errhint("The server log might have more information on the cause of the error."));
+ }
+
+ /*
+ * When enabling checksums, we have to wait for a checkpoint for the
+ * checksums to change from in-progress to on.
+ */
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
+ PROGRESS_DATACHECKSUMS_PHASE_WAITING_CHECKPOINT);
+
+ /*
+ * Force a checkpoint to get everything out to disk. The use of immediate
+ * checkpoints is for running tests, as they would otherwise not execute
+ * in such a way that they can reliably be placed under timeout control.
+ */
+ flags = CHECKPOINT_FORCE | CHECKPOINT_WAIT;
+ if (immediate_checkpoint)
+ flags = flags | CHECKPOINT_FAST;
+ RequestCheckpoint(flags);
+
+ return true;
+}
+
+/*
+ * DataChecksumsWorkerShmemSize
+ * Compute required space for datachecksumsworker-related shared memory
+ */
+Size
+DataChecksumsWorkerShmemSize(void)
+{
+ Size size;
+
+ /* A single fixed-size struct; no dynamically sized parts */
+ size = sizeof(DataChecksumsWorkerShmemStruct);
+ size = MAXALIGN(size);
+
+ return size;
+}
+
+/*
+ * DataChecksumsWorkerShmemInit
+ * Allocate and initialize datachecksumsworker-related shared memory
+ */
+void
+DataChecksumsWorkerShmemInit(void)
+{
+ bool found;
+
+ DataChecksumsWorkerShmem = (DataChecksumsWorkerShmemStruct *)
+ ShmemInitStruct("DataChecksumsWorker Data",
+ DataChecksumsWorkerShmemSize(),
+ &found);
+
+ /* Only the first process to attach performs the initialization */
+ if (!found)
+ {
+ MemSet(DataChecksumsWorkerShmem, 0, DataChecksumsWorkerShmemSize());
+
+ /*
+ * Even if this is a redundant assignment, we want to be explicit
+ * about our intent for readability, since we want to be able to query
+ * this state in case of restartability.
+ */
+
+ /*
+ * NOTE(review): launch_operation appears to be an operation enum
+ * elsewhere (compared against ENABLE_DATACHECKSUMS), yet is assigned
+ * "false" here; presumably the zero value is the intended initial
+ * operation -- confirm against the enum definition.
+ */
+ DataChecksumsWorkerShmem->launch_operation = false;
+ DataChecksumsWorkerShmem->launcher_running = false;
+ DataChecksumsWorkerShmem->launch_fast = false;
+ }
+}
+
+/*
+ * BuildDatabaseList
+ * Compile a list of all currently available databases in the cluster
+ *
+ * This creates the list of databases for the datachecksumsworker workers to
+ * add checksums to. If the caller wants to ensure that no concurrently
+ * running CREATE DATABASE calls exist, this needs to be preceded by a call
+ * to WaitForAllTransactionsToFinish().
+ */
+static List *
+BuildDatabaseList(void)
+{
+ List *DatabaseList = NIL;
+ Relation rel;
+ TableScanDesc scan;
+ HeapTuple tup;
+ MemoryContext ctx = CurrentMemoryContext;
+ MemoryContext oldctx;
+
+ StartTransactionCommand();
+
+ rel = table_open(DatabaseRelationId, AccessShareLock);
+ scan = table_beginscan_catalog(rel, 0, NULL);
+
+ while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
+ {
+ Form_pg_database pgdb = (Form_pg_database) GETSTRUCT(tup);
+ DataChecksumsWorkerDatabase *db;
+
+ /*
+ * Allocate list entries in the caller's context so that the list
+ * survives the transaction we opened above.
+ */
+ oldctx = MemoryContextSwitchTo(ctx);
+
+ db = (DataChecksumsWorkerDatabase *) palloc0(sizeof(DataChecksumsWorkerDatabase));
+
+ db->dboid = pgdb->oid;
+ db->dbname = pstrdup(NameStr(pgdb->datname));
+
+ DatabaseList = lappend(DatabaseList, db);
+
+ MemoryContextSwitchTo(oldctx);
+ }
+
+ table_endscan(scan);
+ table_close(rel, AccessShareLock);
+
+ CommitTransactionCommand();
+
+ return DatabaseList;
+}
+
+/*
+ * FreeDatabaseList
+ * Free a database list built by BuildDatabaseList
+ *
+ * Frees the separately allocated dbname strings before deep-freeing the
+ * list itself. A NIL/NULL list is accepted and ignored.
+ */
+static void
+FreeDatabaseList(List *dblist)
+{
+ if (!dblist)
+ return;
+
+ foreach_ptr(DataChecksumsWorkerDatabase, db, dblist)
+ {
+ if (db->dbname != NULL)
+ pfree(db->dbname);
+ }
+
+ list_free_deep(dblist);
+}
+
+/*
+ * BuildRelationList
+ * Compile a list of relations in the database
+ *
+ * Returns a list of OIDs for the request relation types. If temp_relations
+ * is True then only temporary relations are returned. If temp_relations is
+ * False then non-temporary relations which have data checksums are returned.
+ * If include_shared is True then shared relations are included as well in a
+ * non-temporary list. include_shared has no relevance when building a list of
+ * temporary relations.
+ */
+static List *
+BuildRelationList(bool temp_relations, bool include_shared)
+{
+ List *RelationList = NIL;
+ Relation rel;
+ TableScanDesc scan;
+ HeapTuple tup;
+ MemoryContext ctx = CurrentMemoryContext;
+ MemoryContext oldctx;
+
+ StartTransactionCommand();
+
+ rel = table_open(RelationRelationId, AccessShareLock);
+ scan = table_beginscan_catalog(rel, 0, NULL);
+
+ while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
+ {
+ Form_pg_class pgc = (Form_pg_class) GETSTRUCT(tup);
+
+ /*
+ * Only include temporary relations when asked for a temp relation
+ * list.
+ */
+ if (pgc->relpersistence == RELPERSISTENCE_TEMP)
+ {
+ if (!temp_relations)
+ continue;
+ }
+ else
+ {
+ /*
+ * If we are only interested in temp relations then continue
+ * immediately as the current relation isn't a temp relation.
+ */
+ if (temp_relations)
+ continue;
+
+ /* Relations without storage have no pages to checksum */
+ if (!RELKIND_HAS_STORAGE(pgc->relkind))
+ continue;
+
+ if (pgc->relisshared && !include_shared)
+ continue;
+ }
+
+ /* Allocate list cells in the caller's context, see BuildDatabaseList */
+ oldctx = MemoryContextSwitchTo(ctx);
+ RelationList = lappend_oid(RelationList, pgc->oid);
+ MemoryContextSwitchTo(oldctx);
+ }
+
+ table_endscan(scan);
+ table_close(rel, AccessShareLock);
+
+ CommitTransactionCommand();
+
+ return RelationList;
+}
+
+/*
+ * DataChecksumsWorkerMain
+ *
+ * Main function for enabling checksums in a single database, This is the
+ * function set as the bgw_function_name in the dynamic background worker
+ * process initiated for each database by the worker launcher. After enabling
+ * data checksums in each applicable relation in the database, it will wait for
+ * all temporary relations that were present when the function started to
+ * disappear before returning. This is required since we cannot rewrite
+ * existing temporary relations with data checksums.
+ */
+void
+DataChecksumsWorkerMain(Datum arg)
+{
+ Oid dboid = DatumGetObjectId(arg);
+ List *RelationList = NIL;
+ List *InitialTempTableList = NIL;
+ BufferAccessStrategy strategy;
+ bool aborted = false;
+ int64 rels_done;
+
+ operation = ENABLE_DATACHECKSUMS;
+
+ pqsignal(SIGTERM, die);
+
+ BackgroundWorkerUnblockSignals();
+
+ MyBackendType = B_DATACHECKSUMSWORKER_WORKER;
+ init_ps_display(NULL);
+
+ BackgroundWorkerInitializeConnectionByOid(dboid, InvalidOid,
+ BGWORKER_BYPASS_ALLOWCONN);
+
+ /* worker will have a separate entry in pg_stat_progress_data_checksums */
+ pgstat_progress_start_command(PROGRESS_COMMAND_DATACHECKSUMS,
+ InvalidOid);
+
+ /*
+ * Get a list of all temp tables present as we start in this database. We
+ * need to wait until they are all gone until we are done, since we cannot
+ * access these relations and modify them.
+ */
+ InitialTempTableList = BuildRelationList(true, false);
+
+ /*
+ * Enable vacuum cost delay, if any.
+ */
+ Assert(DataChecksumsWorkerShmem->operation == ENABLE_DATACHECKSUMS);
+ VacuumCostDelay = DataChecksumsWorkerShmem->cost_delay;
+ VacuumCostLimit = DataChecksumsWorkerShmem->cost_limit;
+ VacuumCostActive = (VacuumCostDelay > 0);
+ VacuumCostBalance = 0;
+ VacuumCostPageHit = 0;
+ VacuumCostPageMiss = 0;
+ VacuumCostPageDirty = 0;
+
+ /*
+ * Create and set the vacuum strategy as our buffer strategy.
+ */
+ strategy = GetAccessStrategy(BAS_VACUUM);
+
+ RelationList = BuildRelationList(false,
+ DataChecksumsWorkerShmem->process_shared_catalogs);
+
+ /* Update the total number of relations to be processed in this DB. */
+ {
+ const int index[] = {
+ PROGRESS_DATACHECKSUMS_RELS_TOTAL,
+ PROGRESS_DATACHECKSUMS_RELS_DONE
+ };
+
+ int64 vals[2];
+
+ vals[0] = list_length(RelationList);
+ vals[1] = 0;
+
+ pgstat_progress_update_multi_param(2, index, vals);
+ }
+
+ /* Process the relations */
+ rels_done = 0;
+ foreach_oid(reloid, RelationList)
+ {
+ if (!ProcessSingleRelationByOid(reloid, strategy))
+ {
+ aborted = true;
+ break;
+ }
+
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_RELS_DONE,
+ ++rels_done);
+ }
+ list_free(RelationList);
+
+ if (aborted)
+ {
+ /* The launcher reads this result from shared memory after we exit */
+ DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_ABORTED;
+ ereport(DEBUG1,
+ errmsg("data checksum processing aborted in database OID %u",
+ dboid));
+
+ /*
+ * NOTE(review): this early return skips pgstat_progress_end_command();
+ * presumably cleaned up by backend exit -- confirm.
+ */
+ return;
+ }
+
+ /* The worker is about to wait for temporary tables to go away. */
+ pgstat_progress_update_param(PROGRESS_DATACHECKSUMS_PHASE,
+ PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL);
+
+ /*
+ * Wait for all temp tables that existed when we started to go away. This
+ * is necessary since we cannot "reach" them to enable checksums. Any temp
+ * tables created after we started will already have checksums in them
+ * (due to the "inprogress-on" state), so no need to wait for those.
+ */
+ for (;;)
+ {
+ List *CurrentTempTables;
+ int numleft;
+ char activity[64];
+
+ CurrentTempTables = BuildRelationList(true, false);
+ numleft = 0;
+ foreach_oid(tmptbloid, InitialTempTableList)
+ {
+ if (list_member_oid(CurrentTempTables, tmptbloid))
+ numleft++;
+ }
+ list_free(CurrentTempTables);
+
+ INJECTION_POINT("datachecksumsworker-fake-temptable-wait", &numleft);
+
+ if (numleft == 0)
+ break;
+
+ /*
+ * At least one temp table is left to wait for, indicate in pgstat
+ * activity and progress reporting.
+ */
+ snprintf(activity,
+ sizeof(activity),
+ "Waiting for %d temp tables to be removed", numleft);
+ pgstat_report_activity(STATE_RUNNING, activity);
+
+ /* Retry every 3 seconds */
+ ResetLatch(MyLatch);
+ (void) WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ 3000,
+ WAIT_EVENT_CHECKSUM_ENABLE_TEMPTABLE_WAIT);
+
+ /* A changed target state aborts the wait, as does a SIGINT request */
+ LWLockAcquire(DataChecksumsWorkerLock, LW_EXCLUSIVE);
+ aborted = DataChecksumsWorkerShmem->launch_operation != operation;
+ LWLockRelease(DataChecksumsWorkerLock);
+
+ if (aborted || abort_requested)
+ {
+ DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_ABORTED;
+ ereport(DEBUG1,
+ errmsg("data checksum processing aborted in database OID %u",
+ dboid));
+ return;
+ }
+ }
+
+ list_free(InitialTempTableList);
+
+ /* worker done */
+ pgstat_progress_end_command();
+
+ DataChecksumsWorkerShmem->success = DATACHECKSUMSWORKER_SUCCESSFUL;
+}
diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c
index bf6b55ee8304..955df32be5d0 100644
--- a/src/backend/postmaster/launch_backend.c
+++ b/src/backend/postmaster/launch_backend.c
@@ -204,6 +204,9 @@ static child_process_kind child_process_kinds[] = {
[B_WAL_SUMMARIZER] = {"wal_summarizer", WalSummarizerMain, true},
[B_WAL_WRITER] = {"wal_writer", WalWriterMain, true},
+ [B_DATACHECKSUMSWORKER_LAUNCHER] = {"datachecksum launcher", NULL, false},
+ [B_DATACHECKSUMSWORKER_WORKER] = {"datachecksum worker", NULL, false},
+
[B_LOGGER] = {"syslogger", SysLoggerMain, false},
};
diff --git a/src/backend/postmaster/meson.build b/src/backend/postmaster/meson.build
index 0008603cfee9..ce10ef1059a8 100644
--- a/src/backend/postmaster/meson.build
+++ b/src/backend/postmaster/meson.build
@@ -6,6 +6,7 @@ backend_sources += files(
'bgworker.c',
'bgwriter.c',
'checkpointer.c',
+ 'datachecksumsworker.c',
'fork_process.c',
'interrupt.c',
'launch_backend.c',
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index e1d643b013d7..3d15a894c3a4 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2983,6 +2983,11 @@ PostmasterStateMachine(void)
B_INVALID,
B_STANDALONE_BACKEND);
+ /* also add checksumming processes */
+ remainMask = btmask_add(remainMask,
+ B_DATACHECKSUMSWORKER_LAUNCHER,
+ B_DATACHECKSUMSWORKER_WORKER);
+
/* All types should be included in targetMask or remainMask */
Assert((remainMask.mask | targetMask.mask) == BTYPE_MASK_ALL.mask);
}
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index cc03f0706e9c..f9f06821a8f9 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -186,6 +186,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
case XLOG_FPW_CHANGE:
case XLOG_FPI_FOR_HINT:
case XLOG_FPI:
+ case XLOG_CHECKSUMS:
case XLOG_OVERWRITE_CONTRECORD:
case XLOG_CHECKPOINT_REDO:
break;
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 2fa045e6b0f6..44213d140aee 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -30,6 +30,8 @@
#include "postmaster/autovacuum.h"
#include "postmaster/bgworker_internals.h"
#include "postmaster/bgwriter.h"
+#include "postmaster/datachecksumsworker.h"
+#include "postmaster/postmaster.h"
#include "postmaster/walsummarizer.h"
#include "replication/logicallauncher.h"
#include "replication/origin.h"
@@ -150,6 +152,7 @@ CalculateShmemSize(int *num_semaphores)
size = add_size(size, InjectionPointShmemSize());
size = add_size(size, SlotSyncShmemSize());
size = add_size(size, AioShmemSize());
+ size = add_size(size, DataChecksumsWorkerShmemSize());
/* include additional requested shmem from preload libraries */
size = add_size(size, total_addin_request);
@@ -332,6 +335,7 @@ CreateOrAttachShmemStructs(void)
PgArchShmemInit();
ApplyLauncherShmemInit();
SlotSyncShmemInit();
+ DataChecksumsWorkerShmemInit();
/*
* Set up other modules that need some shared memory space
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index 087821311cce..6881c6f4069a 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -18,6 +18,7 @@
#include
#include "access/parallel.h"
+#include "access/xlog.h"
#include "commands/async.h"
#include "miscadmin.h"
#include "pgstat.h"
@@ -576,6 +577,18 @@ ProcessProcSignalBarrier(void)
case PROCSIGNAL_BARRIER_SMGRRELEASE:
processed = ProcessBarrierSmgrRelease();
break;
+ case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON:
+ processed = AbsorbChecksumsOnInProgressBarrier();
+ break;
+ case PROCSIGNAL_BARRIER_CHECKSUM_ON:
+ processed = AbsorbChecksumsOnBarrier();
+ break;
+ case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF:
+ processed = AbsorbChecksumsOffInProgressBarrier();
+ break;
+ case PROCSIGNAL_BARRIER_CHECKSUM_OFF:
+ processed = AbsorbChecksumsOffBarrier();
+ break;
}
/*
diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README
index e30d7ac59adc..73c36a639086 100644
--- a/src/backend/storage/page/README
+++ b/src/backend/storage/page/README
@@ -10,7 +10,9 @@ http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed
2010/12/22 on -hackers list.
Current implementation requires this be enabled system-wide at initdb time, or
-by using the pg_checksums tool on an offline cluster.
+by using the pg_checksums tool on an offline cluster. Checksums can also be
+enabled at runtime using pg_enable_data_checksums(), and disabled by using
+pg_disable_data_checksums().
The checksum is not valid at all times on a data page!!
The checksum is valid when the page leaves the shared pool and is checked
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index dbb49ed9197d..19cf6512e520 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -107,7 +107,7 @@ PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_fail
*/
if (!PageIsNew(page))
{
- if (DataChecksumsEnabled())
+ if (DataChecksumsNeedVerify())
{
checksum = pg_checksum_page(page, blkno);
@@ -1511,7 +1511,7 @@ PageSetChecksumCopy(Page page, BlockNumber blkno)
static char *pageCopy = NULL;
/* If we don't need a checksum, just return the passed-in data */
- if (PageIsNew(page) || !DataChecksumsEnabled())
+ if (PageIsNew(page) || !DataChecksumsNeedWrite())
return page;
/*
@@ -1541,7 +1541,7 @@ void
PageSetChecksumInplace(Page page, BlockNumber blkno)
{
/* If we don't need a checksum, just return */
- if (PageIsNew(page) || !DataChecksumsEnabled())
+ if (PageIsNew(page) || !DataChecksumsNeedWrite())
return;
((PageHeader) page)->pd_checksum = pg_checksum_page(page, blkno);
diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c
index 8714a85e2d93..edc2512d79f7 100644
--- a/src/backend/utils/activity/pgstat_backend.c
+++ b/src/backend/utils/activity/pgstat_backend.c
@@ -378,6 +378,8 @@ pgstat_tracks_backend_bktype(BackendType bktype)
case B_CHECKPOINTER:
case B_IO_WORKER:
case B_STARTUP:
+ case B_DATACHECKSUMSWORKER_LAUNCHER:
+ case B_DATACHECKSUMSWORKER_WORKER:
return false;
case B_AUTOVAC_WORKER:
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index 13ae57ed6498..a290d56f4096 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -362,6 +362,8 @@ pgstat_tracks_io_bktype(BackendType bktype)
case B_LOGGER:
return false;
+ case B_DATACHECKSUMSWORKER_LAUNCHER:
+ case B_DATACHECKSUMSWORKER_WORKER:
case B_AUTOVAC_LAUNCHER:
case B_AUTOVAC_WORKER:
case B_BACKEND:
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 5427da5bc1b1..7f26d78cb77c 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -116,6 +116,9 @@ CHECKPOINT_DELAY_COMPLETE "Waiting for a backend that blocks a checkpoint from c
CHECKPOINT_DELAY_START "Waiting for a backend that blocks a checkpoint from starting."
CHECKPOINT_DONE "Waiting for a checkpoint to complete."
CHECKPOINT_START "Waiting for a checkpoint to start."
+CHECKSUM_ENABLE_FINISHCONDITION	"Waiting for data checksums to be enabled."
+CHECKSUM_ENABLE_STARTCONDITION	"Waiting for data checksums enabling to start."
+CHECKSUM_ENABLE_TEMPTABLE_WAIT	"Waiting for temporary tables to be dropped for data checksums to be enabled."
EXECUTE_GATHER "Waiting for activity from a child process while executing a Gather plan node."
HASH_BATCH_ALLOCATE "Waiting for an elected Parallel Hash participant to allocate a hash table."
HASH_BATCH_ELECT "Waiting to elect a Parallel Hash participant to allocate a hash table."
@@ -352,6 +355,7 @@ DSMRegistry "Waiting to read or update the dynamic shared memory registry."
InjectionPoint "Waiting to read or update information related to injection points."
SerialControl "Waiting to read or update shared pg_serial state."
AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue."
+DataChecksumsWorker	"Waiting to read or update the state of the data checksums worker."
#
# END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE)
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index c756c2bebaaa..f4e264ebf33c 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -274,6 +274,8 @@ pg_stat_get_progress_info(PG_FUNCTION_ARGS)
cmdtype = PROGRESS_COMMAND_BASEBACKUP;
else if (pg_strcasecmp(cmd, "COPY") == 0)
cmdtype = PROGRESS_COMMAND_COPY;
+ else if (pg_strcasecmp(cmd, "DATACHECKSUMS") == 0)
+ cmdtype = PROGRESS_COMMAND_DATACHECKSUMS;
else
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -1146,9 +1148,6 @@ pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS)
int64 result;
PgStat_StatDBEntry *dbentry;
- if (!DataChecksumsEnabled())
- PG_RETURN_NULL();
-
if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
result = 0;
else
@@ -1164,9 +1163,6 @@ pg_stat_get_db_checksum_last_failure(PG_FUNCTION_ARGS)
TimestampTz result;
PgStat_StatDBEntry *dbentry;
- if (!DataChecksumsEnabled())
- PG_RETURN_NULL();
-
if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
result = 0;
else
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 545d1e90fbd4..34cce2ce0bed 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -293,9 +293,18 @@ GetBackendTypeDesc(BackendType backendType)
case B_CHECKPOINTER:
backendDesc = gettext_noop("checkpointer");
break;
+
case B_IO_WORKER:
backendDesc = gettext_noop("io worker");
break;
+
+ case B_DATACHECKSUMSWORKER_LAUNCHER:
+ backendDesc = gettext_noop("datachecksumsworker launcher");
+ break;
+ case B_DATACHECKSUMSWORKER_WORKER:
+ backendDesc = gettext_noop("datachecksumsworker worker");
+ break;
+
case B_LOGGER:
backendDesc = gettext_noop("logger");
break;
@@ -895,7 +904,8 @@ InitializeSessionUserIdStandalone(void)
* workers, in slot sync worker and in background workers.
*/
Assert(!IsUnderPostmaster || AmAutoVacuumWorkerProcess() ||
- AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess());
+ AmLogicalSlotSyncWorkerProcess() || AmBackgroundWorkerProcess() ||
+ AmDataChecksumsWorkerProcess());
/* call only once */
Assert(!OidIsValid(AuthenticatedUserId));
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 641e535a73c7..589e7eab9e84 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -750,6 +750,24 @@ InitPostgres(const char *in_dbname, Oid dboid,
ProcSignalInit(MyCancelKey, MyCancelKeyLength);
+ /*
+ * Initialize a local cache of the data_checksum_version, to be updated by
+ * the procsignal-based barriers.
+ *
+ * This intentionally happens after initializing the procsignal, otherwise
+ * we might miss a state change. This means we can get a barrier for the
+ * state we've just initialized - but it can happen only once.
+ *
+ * The postmaster (which is what gets forked into the new child process)
+ * does not handle barriers, therefore it may not have the current value
+ * of LocalDataChecksumVersion value (it'll have the value read from the
+ * control file, which may be arbitrarily old).
+ *
+ * NB: Even if the postmaster handled barriers, the value might still be
+ * stale, as it might have changed after this process forked.
+ */
+ InitLocalDataChecksumVersion();
+
/*
* Also set up timeout handlers needed for backend operation. We need
* these in every case except bootstrap.
@@ -878,7 +896,7 @@ InitPostgres(const char *in_dbname, Oid dboid,
errhint("You should immediately run CREATE USER \"%s\" SUPERUSER;.",
username != NULL ? username : "postgres")));
}
- else if (AmBackgroundWorkerProcess())
+ else if (AmBackgroundWorkerProcess() || AmDataChecksumsWorkerProcess())
{
if (username == NULL && !OidIsValid(useroid))
{
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index f137129209f6..36fba8496df2 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -491,6 +491,14 @@ static const struct config_enum_entry file_copy_method_options[] = {
{NULL, 0, false}
};
+static const struct config_enum_entry data_checksums_options[] = {
+ {"on", PG_DATA_CHECKSUM_VERSION, true},
+ {"off", PG_DATA_CHECKSUM_OFF, true},
+ {"inprogress-on", PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION, true},
+ {"inprogress-off", PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION, true},
+ {NULL, 0, false}
+};
+
/*
* Options for enum values stored in other modules
*/
@@ -616,7 +624,6 @@ static int shared_memory_size_mb;
static int shared_memory_size_in_huge_pages;
static int wal_block_size;
static int num_os_semaphores;
-static bool data_checksums;
static bool integer_datetimes;
#ifdef USE_ASSERT_CHECKING
@@ -2043,17 +2050,6 @@ struct config_bool ConfigureNamesBool[] =
NULL, NULL, NULL
},
- {
- {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS,
- gettext_noop("Shows whether data checksums are turned on for this cluster."),
- NULL,
- GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED
- },
- &data_checksums,
- false,
- NULL, NULL, NULL
- },
-
{
{"syslog_sequence_numbers", PGC_SIGHUP, LOGGING_WHERE,
gettext_noop("Add sequence number to syslog messages to avoid duplicate suppression."),
@@ -5489,6 +5485,16 @@ struct config_enum ConfigureNamesEnum[] =
DEFAULT_IO_METHOD, io_method_options,
NULL, assign_io_method, NULL
},
+ {
+ {"data_checksums", PGC_INTERNAL, PRESET_OPTIONS,
+ gettext_noop("Shows whether data checksums are turned on for this cluster."),
+ NULL,
+ GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_RUNTIME_COMPUTED
+ },
+ &data_checksums,
+ PG_DATA_CHECKSUM_OFF, data_checksums_options,
+ NULL, NULL, show_data_checksums
+ },
/* End-of-list marker */
{
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index f20be82862a2..8411cecf3ffb 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -568,7 +568,7 @@ main(int argc, char *argv[])
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
pg_fatal("cluster must be shut down");
- if (ControlFile->data_checksum_version == 0 &&
+ if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_VERSION &&
mode == PG_MODE_CHECK)
pg_fatal("data checksums are not enabled in cluster");
@@ -576,7 +576,7 @@ main(int argc, char *argv[])
mode == PG_MODE_DISABLE)
pg_fatal("data checksums are already disabled in cluster");
- if (ControlFile->data_checksum_version > 0 &&
+ if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION &&
mode == PG_MODE_ENABLE)
pg_fatal("data checksums are already enabled in cluster");
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index 10de058ce91f..acf5c7b026e7 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -280,6 +280,8 @@ main(int argc, char *argv[])
ControlFile->checkPointCopy.oldestCommitTsXid);
printf(_("Latest checkpoint's newestCommitTsXid:%u\n"),
ControlFile->checkPointCopy.newestCommitTsXid);
+ printf(_("Latest checkpoint's data_checksum_version:%u\n"),
+ ControlFile->checkPointCopy.data_checksum_version);
printf(_("Time of latest checkpoint: %s\n"),
ckpttime_str);
printf(_("Fake LSN counter for unlogged rels: %X/%08X\n"),
diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c
index 90cef0864de7..29684e824401 100644
--- a/src/bin/pg_upgrade/controldata.c
+++ b/src/bin/pg_upgrade/controldata.c
@@ -15,6 +15,7 @@
#include "access/xlog_internal.h"
#include "common/string.h"
#include "pg_upgrade.h"
+#include "storage/bufpage.h"
/*
@@ -736,6 +737,14 @@ check_control_data(ControlData *oldctrl,
* check_for_isn_and_int8_passing_mismatch().
*/
+ /*
+ * If data checksums are in any in-progress state then disallow the
+ * upgrade. The user should either let the process finish, or turn off
+ * data checksums, before retrying.
+ */
+ if (oldctrl->data_checksum_version > PG_DATA_CHECKSUM_VERSION)
+ pg_fatal("data checksums are being enabled or disabled in the old cluster");
+
/*
* We might eventually allow upgrades from checksum to no-checksum
* clusters.
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index d12798be3d80..8bcc5aa8a63e 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -56,6 +56,7 @@ extern PGDLLIMPORT int CommitDelay;
extern PGDLLIMPORT int CommitSiblings;
extern PGDLLIMPORT bool track_wal_io_timing;
extern PGDLLIMPORT int wal_decode_buffer_size;
+extern PGDLLIMPORT int data_checksums;
extern PGDLLIMPORT int CheckPointSegments;
@@ -117,7 +118,7 @@ extern PGDLLIMPORT int wal_level;
* of the bits make it to disk, but the checksum wouldn't match. Also WAL-log
* them if forced by wal_log_hints=on.
*/
-#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints)
+#define XLogHintBitIsNeeded() (wal_log_hints || DataChecksumsNeedWrite())
/* Do we need to WAL-log information required only for Hot Standby and logical replication? */
#define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA)
@@ -229,7 +230,19 @@ extern XLogRecPtr GetXLogWriteRecPtr(void);
extern uint64 GetSystemIdentifier(void);
extern char *GetMockAuthenticationNonce(void);
-extern bool DataChecksumsEnabled(void);
+extern bool DataChecksumsNeedWrite(void);
+extern bool DataChecksumsNeedVerify(void);
+extern bool DataChecksumsOnInProgress(void);
+extern bool DataChecksumsOffInProgress(void);
+extern void SetDataChecksumsOnInProgress(void);
+extern void SetDataChecksumsOn(void);
+extern void SetDataChecksumsOff(void);
+extern bool AbsorbChecksumsOnInProgressBarrier(void);
+extern bool AbsorbChecksumsOffInProgressBarrier(void);
+extern bool AbsorbChecksumsOnBarrier(void);
+extern bool AbsorbChecksumsOffBarrier(void);
+extern const char *show_data_checksums(void);
+extern void InitLocalDataChecksumVersion(void);
extern bool GetDefaultCharSignedness(void);
extern XLogRecPtr GetFakeLSNForUnloggedRel(void);
extern Size XLOGShmemSize(void);
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index cc06fc29ab2b..cc78b00fe4cc 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -25,6 +25,7 @@
#include "lib/stringinfo.h"
#include "pgtime.h"
#include "storage/block.h"
+#include "storage/checksum.h"
#include "storage/relfilelocator.h"
@@ -289,6 +290,12 @@ typedef struct xl_restore_point
char rp_name[MAXFNAMELEN];
} xl_restore_point;
+/* Information logged when data checksum level is changed */
+typedef struct xl_checksum_state
+{
+ uint32 new_checksumtype;
+} xl_checksum_state;
+
/* Overwrite of prior contrecord */
typedef struct xl_overwrite_contrecord
{
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 63e834a6ce47..a8877fb87d1a 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -62,6 +62,9 @@ typedef struct CheckPoint
* set to InvalidTransactionId.
*/
TransactionId oldestActiveXid;
+
+ /* data checksums at the time of the checkpoint */
+ uint32 data_checksum_version;
} CheckPoint;
/* XLOG info values for XLOG rmgr */
@@ -80,6 +83,7 @@ typedef struct CheckPoint
/* 0xC0 is used in Postgres 9.5-11 */
#define XLOG_OVERWRITE_CONTRECORD 0xD0
#define XLOG_CHECKPOINT_REDO 0xE0
+#define XLOG_CHECKSUMS 0xF0
/*
@@ -219,7 +223,7 @@ typedef struct ControlFileData
bool float8ByVal; /* float8, int8, etc pass-by-value? */
/* Are data pages protected by checksums? Zero if no checksum version */
- uint32 data_checksum_version;
+ uint32 data_checksum_version; /* persistent */
/*
* True if the default signedness of char is "signed" on a platform where
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 118d6da1ace0..c6f4e31a12fe 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -12356,6 +12356,25 @@
proname => 'jsonb_subscript_handler', prorettype => 'internal',
proargtypes => 'internal', prosrc => 'jsonb_subscript_handler' },
+# data checksum management functions
+{ oid => '9258',
+ descr => 'disable data checksums',
+ proname => 'pg_disable_data_checksums', provolatile => 'v', prorettype => 'void',
+ proparallel => 'r',
+ proargtypes => 'bool', proallargtypes => '{bool}',
+ proargmodes => '{i}',
+ proargnames => '{fast}',
+ prosrc => 'disable_data_checksums' },
+
+{ oid => '9257',
+ descr => 'enable data checksums',
+ proname => 'pg_enable_data_checksums', provolatile => 'v', prorettype => 'void',
+ proparallel => 'r',
+ proargtypes => 'int4 int4 bool', proallargtypes => '{int4,int4,bool}',
+ proargmodes => '{i,i,i}',
+ proargnames => '{cost_delay,cost_limit,fast}',
+ prosrc => 'enable_data_checksums' },
+
# collation management functions
{ oid => '3445', descr => 'import collations from operating system',
proname => 'pg_import_system_collations', procost => '100',
diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h
index 1cde4bd9bcf1..cf6de4ef12d6 100644
--- a/src/include/commands/progress.h
+++ b/src/include/commands/progress.h
@@ -162,4 +162,20 @@
#define PROGRESS_COPY_TYPE_PIPE 3
#define PROGRESS_COPY_TYPE_CALLBACK 4
+/* Progress parameters for PROGRESS_DATACHECKSUMS */
+#define PROGRESS_DATACHECKSUMS_PHASE 0
+#define PROGRESS_DATACHECKSUMS_DBS_TOTAL 1
+#define PROGRESS_DATACHECKSUMS_DBS_DONE 2
+#define PROGRESS_DATACHECKSUMS_RELS_TOTAL 3
+#define PROGRESS_DATACHECKSUMS_RELS_DONE 4
+#define PROGRESS_DATACHECKSUMS_BLOCKS_TOTAL 5
+#define PROGRESS_DATACHECKSUMS_BLOCKS_DONE 6
+
+/* Phases of datachecksumsworker operation */
+#define PROGRESS_DATACHECKSUMS_PHASE_ENABLING 0
+#define PROGRESS_DATACHECKSUMS_PHASE_DISABLING 1
+#define PROGRESS_DATACHECKSUMS_PHASE_WAITING 2
+#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_TEMPREL 3
+#define PROGRESS_DATACHECKSUMS_PHASE_WAITING_CHECKPOINT 4
+
#endif
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 1bef98471c36..2a0d7b6de420 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -366,6 +366,9 @@ typedef enum BackendType
B_WAL_SUMMARIZER,
B_WAL_WRITER,
+ B_DATACHECKSUMSWORKER_LAUNCHER,
+ B_DATACHECKSUMSWORKER_WORKER,
+
/*
* Logger is not connected to shared memory and does not have a PGPROC
* entry.
@@ -391,6 +394,9 @@ extern PGDLLIMPORT BackendType MyBackendType;
#define AmWalSummarizerProcess() (MyBackendType == B_WAL_SUMMARIZER)
#define AmWalWriterProcess() (MyBackendType == B_WAL_WRITER)
#define AmIoWorkerProcess() (MyBackendType == B_IO_WORKER)
+#define AmDataChecksumsWorkerProcess() \
+ (MyBackendType == B_DATACHECKSUMSWORKER_LAUNCHER || \
+ MyBackendType == B_DATACHECKSUMSWORKER_WORKER)
#define AmSpecialWorkerProcess() \
(AmAutoVacuumLauncherProcess() || \
diff --git a/src/include/postmaster/datachecksumsworker.h b/src/include/postmaster/datachecksumsworker.h
new file mode 100644
index 000000000000..2cd066fd0feb
--- /dev/null
+++ b/src/include/postmaster/datachecksumsworker.h
@@ -0,0 +1,51 @@
+/*-------------------------------------------------------------------------
+ *
+ * datachecksumsworker.h
+ * header file for data checksum helper background worker
+ *
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/postmaster/datachecksumsworker.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef DATACHECKSUMSWORKER_H
+#define DATACHECKSUMSWORKER_H
+
+/* Shared memory */
+extern Size DataChecksumsWorkerShmemSize(void);
+extern void DataChecksumsWorkerShmemInit(void);
+
+/* Possible operations the Datachecksumsworker can perform */
+typedef enum DataChecksumsWorkerOperation
+{
+ ENABLE_DATACHECKSUMS,
+ DISABLE_DATACHECKSUMS,
+ /* TODO: VERIFY_DATACHECKSUMS, */
+} DataChecksumsWorkerOperation;
+
+/*
+ * Possible states for a database entry which has been processed. Exported
+ * here since we want to be able to reference this from injection point tests.
+ */
+typedef enum
+{
+ DATACHECKSUMSWORKER_SUCCESSFUL = 0,
+ DATACHECKSUMSWORKER_ABORTED,
+ DATACHECKSUMSWORKER_FAILED,
+ DATACHECKSUMSWORKER_RETRYDB,
+} DataChecksumsWorkerResult;
+
+/* Start the background processes for enabling or disabling checksums */
+extern void StartDataChecksumsWorkerLauncher(DataChecksumsWorkerOperation op,
+                                             int cost_delay,
+                                             int cost_limit,
+                                             bool fast);
+
+/* Background worker entrypoints */
+extern void DataChecksumsWorkerLauncherMain(Datum arg);
+extern void DataChecksumsWorkerMain(Datum arg);
+
+#endif /* DATACHECKSUMSWORKER_H */
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index aeb67c498c59..30fb0f62d4c0 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -16,6 +16,7 @@
#include "access/xlogdefs.h"
#include "storage/block.h"
+#include "storage/checksum.h"
#include "storage/item.h"
#include "storage/off.h"
@@ -205,7 +206,6 @@ typedef PageHeaderData *PageHeader;
* handling pages.
*/
#define PG_PAGE_LAYOUT_VERSION 4
-#define PG_DATA_CHECKSUM_VERSION 1
/* ----------------------------------------------------------------
* page support functions
diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h
index 25d13a798d10..b3f368a15b52 100644
--- a/src/include/storage/checksum.h
+++ b/src/include/storage/checksum.h
@@ -15,6 +15,20 @@
#include "storage/block.h"
+/*
+ * Checksum version 0 is used for when data checksums are disabled (OFF).
+ * PG_DATA_CHECKSUM_VERSION defines that data checksums are enabled in the
+ * cluster and PG_DATA_CHECKSUM_INPROGRESS_{ON|OFF}_VERSION defines that data
+ * checksums are either currently being enabled or disabled.
+ */
+typedef enum ChecksumType
+{
+ PG_DATA_CHECKSUM_OFF = 0,
+ PG_DATA_CHECKSUM_VERSION,
+ PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION,
+ PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION
+} ChecksumType;
+
/*
* Compute the checksum for a Postgres page. The page must be aligned on a
* 4-byte boundary.
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index 06a1ffd4b08b..b8f7ba0be517 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -85,6 +85,7 @@ PG_LWLOCK(50, DSMRegistry)
PG_LWLOCK(51, InjectionPoint)
PG_LWLOCK(52, SerialControl)
PG_LWLOCK(53, AioWorkerSubmissionQueue)
+PG_LWLOCK(54, DataChecksumsWorker)
/*
* There also exist several built-in LWLock tranches. As with the predefined
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index c6f5ebceefdd..d90d35b1d6fa 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -463,11 +463,11 @@ extern PGDLLIMPORT PGPROC *PreparedXactProcs;
* Background writer, checkpointer, WAL writer, WAL summarizer, and archiver
* run during normal operation. Startup process and WAL receiver also consume
* 2 slots, but WAL writer is launched only after startup has exited, so we
- * only need 6 slots.
+ * only need 6 slots to cover these. The DataChecksums worker and launcher
+ * can consume 2 slots when data checksums are enabled or disabled.
*/
#define MAX_IO_WORKERS 32
-#define NUM_AUXILIARY_PROCS (6 + MAX_IO_WORKERS)
-
+#define NUM_AUXILIARY_PROCS (8 + MAX_IO_WORKERS)
/* configurable options */
extern PGDLLIMPORT int DeadlockTimeout;
diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h
index afeeb1ca019f..c54c61e2cd8a 100644
--- a/src/include/storage/procsignal.h
+++ b/src/include/storage/procsignal.h
@@ -54,6 +54,11 @@ typedef enum
typedef enum
{
PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */
+
+ PROCSIGNAL_BARRIER_CHECKSUM_OFF,
+ PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON,
+ PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF,
+ PROCSIGNAL_BARRIER_CHECKSUM_ON,
} ProcSignalBarrierType;
/*
diff --git a/src/include/utils/backend_progress.h b/src/include/utils/backend_progress.h
index dda813ab4076..c664e92dbfe7 100644
--- a/src/include/utils/backend_progress.h
+++ b/src/include/utils/backend_progress.h
@@ -28,6 +28,7 @@ typedef enum ProgressCommandType
PROGRESS_COMMAND_CREATE_INDEX,
PROGRESS_COMMAND_BASEBACKUP,
PROGRESS_COMMAND_COPY,
+ PROGRESS_COMMAND_DATACHECKSUMS,
} ProgressCommandType;
#define PGSTAT_NUM_PROGRESS_PARAM 20
diff --git a/src/test/Makefile b/src/test/Makefile
index 511a72e6238a..278ce3e8a86e 100644
--- a/src/test/Makefile
+++ b/src/test/Makefile
@@ -12,7 +12,16 @@ subdir = src/test
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
-SUBDIRS = perl postmaster regress isolation modules authentication recovery subscription
+SUBDIRS = \
+ perl \
+ postmaster \
+ regress \
+ isolation \
+ modules \
+ authentication \
+ recovery \
+ subscription \
+ checksum
ifeq ($(with_icu),yes)
SUBDIRS += icu
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 903a8ac151aa..c8f2747b2612 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -17,6 +17,7 @@ SUBDIRS = \
test_aio \
test_binaryheap \
test_bloomfilter \
+ test_checksums \
test_copy_callbacks \
test_custom_rmgrs \
test_ddl_deparse \
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index 93be0f57289a..6b4450eb4733 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -16,6 +16,7 @@ subdir('ssl_passphrase_callback')
subdir('test_aio')
subdir('test_binaryheap')
subdir('test_bloomfilter')
+subdir('test_checksums')
subdir('test_copy_callbacks')
subdir('test_custom_rmgrs')
subdir('test_ddl_deparse')
diff --git a/src/test/modules/test_checksums/.gitignore b/src/test/modules/test_checksums/.gitignore
new file mode 100644
index 000000000000..871e943d50e1
--- /dev/null
+++ b/src/test/modules/test_checksums/.gitignore
@@ -0,0 +1,2 @@
+# Generated by test suite
+/tmp_check/
diff --git a/src/test/modules/test_checksums/Makefile b/src/test/modules/test_checksums/Makefile
new file mode 100644
index 000000000000..a5b6259a7288
--- /dev/null
+++ b/src/test/modules/test_checksums/Makefile
@@ -0,0 +1,40 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for src/test/modules/test_checksums
+#
+# Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# src/test/modules/test_checksums/Makefile
+#
+#-------------------------------------------------------------------------
+
+EXTRA_INSTALL = src/test/modules/injection_points
+
+export enable_injection_points
+
+MODULE_big = test_checksums
+OBJS = \
+ $(WIN32RES) \
+ test_checksums.o
+PGFILEDESC = "test_checksums - test code for data checksums"
+
+EXTENSION = test_checksums
+DATA = test_checksums--1.0.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_checksums
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+check:
+ $(prove_check)
+
+installcheck:
+ $(prove_installcheck)
diff --git a/src/test/modules/test_checksums/README b/src/test/modules/test_checksums/README
new file mode 100644
index 000000000000..0f0317060b38
--- /dev/null
+++ b/src/test/modules/test_checksums/README
@@ -0,0 +1,22 @@
+src/test/modules/test_checksums/README
+
+Regression tests for data checksums
+===================================
+
+This directory contains a test suite for enabling data checksums
+in a running cluster.
+
+Running the tests
+=================
+
+ make check
+
+or
+
+ make installcheck
+
+NOTE: This creates a temporary installation (in the case of "check"),
+with multiple nodes, be they primary or standby(s) for the purpose of
+the tests.
+
+NOTE: This requires the --enable-tap-tests argument to configure.
diff --git a/src/test/modules/test_checksums/meson.build b/src/test/modules/test_checksums/meson.build
new file mode 100644
index 000000000000..57156b63599b
--- /dev/null
+++ b/src/test/modules/test_checksums/meson.build
@@ -0,0 +1,35 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+test_checksums_sources = files(
+ 'test_checksums.c',
+)
+
+test_checksums = shared_module('test_checksums',
+ test_checksums_sources,
+ kwargs: pg_test_mod_args,
+)
+test_install_libs += test_checksums
+
+test_install_data += files(
+ 'test_checksums.control',
+ 'test_checksums--1.0.sql',
+)
+
+tests += {
+ 'name': 'test_checksums',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'tap': {
+ 'env': {
+ 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no',
+ },
+ 'tests': [
+ 't/001_basic.pl',
+ 't/002_restarts.pl',
+ 't/003_standby_restarts.pl',
+ 't/004_offline.pl',
+ 't/005_injection.pl',
+ 't/006_concurrent_pgbench.pl',
+ ],
+ },
+}
diff --git a/src/test/modules/test_checksums/t/001_basic.pl b/src/test/modules/test_checksums/t/001_basic.pl
new file mode 100644
index 000000000000..728a5c4510c3
--- /dev/null
+++ b/src/test/modules/test_checksums/t/001_basic.pl
@@ -0,0 +1,63 @@
+
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums in an online cluster
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# Initialize node with checksums disabled.
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init(no_data_checksums => 1);
+$node->start;
+
+# Create some content to have un-checksummed data in the cluster
+$node->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
+
+# Ensure that checksums are turned off
+test_checksum_state($node, 'off');
+
+# Enable data checksums and wait for the state transition to 'on'
+enable_data_checksums($node, wait => 'on');
+
+# Run a dummy query just to make sure we can read back data
+my $result =
+ $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1 ");
+is($result, '9999', 'ensure checksummed pages can be read back');
+
+# Enable data checksums again which should be a no-op so we explicitly don't
+# wait for any state transition as none should happen here
+enable_data_checksums($node);
+test_checksum_state($node, 'on');
+# ..and make sure we can still read/write data
+$node->safe_psql('postgres', "UPDATE t SET a = a + 1;");
+$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '10000', 'ensure checksummed pages can be read back');
+
+# Disable checksums again and wait for the state transition to 'off'
+disable_data_checksums($node, wait => 'off');
+
+# Test reading data again
+$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '10000', 'ensure previously checksummed pages can be read back');
+
+# Re-enable checksums and make sure that the underlying data has changed to
+# ensure that checksums will be different.
+$node->safe_psql('postgres', "UPDATE t SET a = a + 1;");
+enable_data_checksums($node, wait => 'on');
+
+# Run a dummy query just to make sure we can read back the data
+$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '10000', 'ensure checksummed pages can be read back');
+
+$node->stop;
+done_testing();
diff --git a/src/test/modules/test_checksums/t/002_restarts.pl b/src/test/modules/test_checksums/t/002_restarts.pl
new file mode 100644
index 000000000000..75599cf41f25
--- /dev/null
+++ b/src/test/modules/test_checksums/t/002_restarts.pl
@@ -0,0 +1,110 @@
+
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums in an online cluster with a
+# restart which breaks processing.
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# Initialize node with checksums disabled.
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init(no_data_checksums => 1);
+$node->start;
+
+# Initialize result storage for queries
+my $result;
+
+# Create some content to have un-checksummed data in the cluster
+$node->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
+
+# Ensure that checksums are disabled
+test_checksum_state($node, 'off');
+
+SKIP:
+{
+ skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 6
+ if (!$ENV{PG_TEST_EXTRA}
+ || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/);
+
+ # Create a barrier for checksumming to block on, in this case a pre-
+ # existing temporary table which is kept open while processing is started.
+ # We can accomplish this by setting up an interactive psql process which
+ # keeps the temporary table created as we enable checksums in another psql
+ # process.
+ #
+ # This is a similar test to the synthetic variant in 005_injection.pl
+ # which fakes this scenario.
+ my $bsession = $node->background_psql('postgres');
+ $bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);');
+
+ # In another session, make sure we can see the blocking temp table but
+ # start processing anyways and check that we are blocked with a proper
+ # wait event.
+ $result = $node->safe_psql('postgres',
+ "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';"
+ );
+ is($result, 't', 'ensure we can see the temporary table');
+
+ # Enabling data checksums shouldn't work as the process is blocked on the
+ # temporary table held open by $bsession. Ensure that we reach inprogress-
+ # on before we do more tests.
+ enable_data_checksums($node, wait => 'inprogress-on');
+
+ # Wait for processing to finish and the worker waiting for leftover temp
+ # relations to be able to actually finish
+ $result = $node->poll_query_until(
+ 'postgres',
+ "SELECT wait_event FROM pg_catalog.pg_stat_activity "
+ . "WHERE backend_type = 'datachecksumsworker worker';",
+ 'ChecksumEnableTemptableWait');
+
+ # The datachecksumsworker waits for temporary tables to disappear for 3
+ # seconds before retrying, so sleep for 4 seconds to be guaranteed to see
+ # a retry cycle
+ sleep(4);
+
+ # Re-check the wait event to ensure we are blocked on the right thing.
+ $result = $node->safe_psql('postgres',
+ "SELECT wait_event FROM pg_catalog.pg_stat_activity "
+ . "WHERE backend_type = 'datachecksumsworker worker';");
+ is($result, 'ChecksumEnableTemptableWait',
+ 'ensure the correct wait condition is set');
+ test_checksum_state($node, 'inprogress-on');
+
+ # Stop the cluster while bsession is still attached. We can't close the
+ # session first since the brief period between closing and stopping might
+ # be enough for checksums to get enabled.
+ $node->stop;
+ $bsession->quit;
+ $node->start;
+
+ # Ensure the checksums aren't enabled across the restart. This leaves the
+ # cluster in the same state as before we entered the SKIP block.
+ test_checksum_state($node, 'off');
+}
+
+enable_data_checksums($node, wait => 'on');
+
+$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '9999', 'ensure checksummed pages can be read back');
+
+$result = $node->poll_query_until(
+ 'postgres',
+ "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksumsworker%';",
+ '0');
+is($result, 1, 'await datachecksums worker/launcher termination');
+
+disable_data_checksums($node, wait => 'off');
+
+$node->stop;
+done_testing();
diff --git a/src/test/modules/test_checksums/t/003_standby_restarts.pl b/src/test/modules/test_checksums/t/003_standby_restarts.pl
new file mode 100644
index 000000000000..fe34b4d7d05c
--- /dev/null
+++ b/src/test/modules/test_checksums/t/003_standby_restarts.pl
@@ -0,0 +1,114 @@
+
+# Copyright (c) 2024, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums in an online cluster with
+# streaming replication
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# Initialize primary node
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1, no_data_checksums => 1);
+$node_primary->start;
+
+my $slotname = 'physical_slot';
+$node_primary->safe_psql('postgres',
+ "SELECT pg_create_physical_replication_slot('$slotname')");
+
+# Take backup
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# Create streaming standby linking to primary
+my $node_standby_1 = PostgreSQL::Test::Cluster->new('standby_1');
+$node_standby_1->init_from_backup($node_primary, $backup_name,
+ has_streaming => 1);
+$node_standby_1->append_conf(
+ 'postgresql.conf', qq[
+primary_slot_name = '$slotname'
+]);
+$node_standby_1->start;
+
+# Create some content on the primary to have un-checksummed data in the cluster
+$node_primary->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
+
+# Wait for standbys to catch up
+$node_primary->wait_for_catchup($node_standby_1, 'replay',
+ $node_primary->lsn('insert'));
+
+# Check that checksums are turned off on all nodes
+test_checksum_state($node_primary, 'off');
+test_checksum_state($node_standby_1, 'off');
+
+# ---------------------------------------------------------------------------
+# Enable checksums for the cluster, and make sure that both the primary and
+# standby change state.
+#
+
+# Ensure that the primary switches to "inprogress-on"
+enable_data_checksums($node_primary, wait => 'inprogress-on');
+# Wait for checksum enable to be replayed
+$node_primary->wait_for_catchup($node_standby_1, 'replay');
+
+# Ensure that the standby has switched to "inprogress-on" or "on". Normally it
+# would be "inprogress-on", but it is theoretically possible for the primary to
+# complete the checksum enabling *and* have the standby replay that record
+# before we reach the check below.
+my $result = $node_standby_1->poll_query_until(
+ 'postgres',
+ "SELECT setting = 'off' FROM pg_catalog.pg_settings WHERE name = 'data_checksums';",
+ 'f');
+is($result, 1, 'ensure standby has absorbed the inprogress-on barrier');
+$result = $node_standby_1->safe_psql('postgres',
+ "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"
+);
+
+is(($result eq 'inprogress-on' || $result eq 'on'),
+ 1, 'ensure checksums are on, or in progress, on standby_1');
+
+# Insert some more data which should be checksummed on INSERT
+$node_primary->safe_psql('postgres',
+ "INSERT INTO t VALUES (generate_series(1, 10000));");
+
+# Wait for checksums enabled on the primary and standby
+wait_for_checksum_state($node_primary, 'on');
+wait_for_checksum_state($node_standby_1, 'on');
+
+$result =
+ $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1");
+is($result, '19998', 'ensure we can safely read all data with checksums');
+
+$result = $node_primary->poll_query_until(
+ 'postgres',
+ "SELECT count(*) FROM pg_stat_activity WHERE backend_type LIKE 'datachecksumsworker%';",
+ '0');
+is($result, 1, 'await datachecksums worker/launcher termination');
+
+#
+# Disable checksums and ensure it's propagated to standby and that we can
+# still read all data
+#
+
+# Disable checksums and wait for the operation to be replayed
+disable_data_checksums($node_primary);
+$node_primary->wait_for_catchup($node_standby_1, 'replay');
+# Ensure that the primary and standby have switched to off
+wait_for_checksum_state($node_primary, 'off');
+wait_for_checksum_state($node_standby_1, 'off');
+# Double-check reading data without errors
+$result =
+ $node_primary->safe_psql('postgres', "SELECT count(a) FROM t WHERE a > 1");
+is($result, "19998", 'ensure we can safely read all data without checksums');
+
+$node_standby_1->stop;
+$node_primary->stop;
+done_testing();
diff --git a/src/test/modules/test_checksums/t/004_offline.pl b/src/test/modules/test_checksums/t/004_offline.pl
new file mode 100644
index 000000000000..e9fbcf77eab5
--- /dev/null
+++ b/src/test/modules/test_checksums/t/004_offline.pl
@@ -0,0 +1,82 @@
+
+# Copyright (c) 2024, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums offline from various states
+# of checksum processing
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+# Initialize node with checksums disabled.
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init(no_data_checksums => 1);
+$node->start;
+
+# Create some content to have un-checksummed data in the cluster
+$node->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1,10000) AS a;");
+
+# Ensure that checksums are disabled
+test_checksum_state($node, 'off');
+
+# Enable checksums offline using pg_checksums
+$node->stop;
+$node->checksum_enable_offline;
+$node->start;
+
+# Ensure that checksums are enabled
+test_checksum_state($node, 'on');
+
+# Run a dummy query just to make sure we can read back some data
+my $result =
+ $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '9999', 'ensure checksummed pages can be read back');
+
+# Disable checksums offline again using pg_checksums
+$node->stop;
+$node->checksum_disable_offline;
+$node->start;
+
+# Ensure that checksums are disabled
+test_checksum_state($node, 'off');
+
+# Create a barrier for checksumming to block on, in this case a pre-existing
+# temporary table which is kept open while processing is started. We can
+# accomplish this by setting up an interactive psql process which keeps the
+# temporary table created as we enable checksums in another psql process.
+
+my $bsession = $node->background_psql('postgres');
+$bsession->query_safe('CREATE TEMPORARY TABLE tt (a integer);');
+
+# In another session, make sure we can see the blocking temp table but start
+# processing anyways and check that we are blocked with a proper wait event.
+$result = $node->safe_psql('postgres',
+ "SELECT relpersistence FROM pg_catalog.pg_class WHERE relname = 'tt';");
+is($result, 't', 'ensure we can see the temporary table');
+
+enable_data_checksums($node, wait => 'inprogress-on');
+
+# Turn the cluster off and enable checksums offline, then start back up
+$bsession->quit;
+$node->stop;
+$node->checksum_enable_offline;
+$node->start;
+
+# Ensure that checksums are now enabled even though processing wasn't
+# restarted
+test_checksum_state($node, 'on');
+
+# Run a dummy query just to make sure we can read back some data
+$result = $node->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '9999', 'ensure checksummed pages can be read back');
+
+$node->stop;
+done_testing();
diff --git a/src/test/modules/test_checksums/t/005_injection.pl b/src/test/modules/test_checksums/t/005_injection.pl
new file mode 100644
index 000000000000..f4459e0e6363
--- /dev/null
+++ b/src/test/modules/test_checksums/t/005_injection.pl
@@ -0,0 +1,76 @@
+
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums in an online cluster with
+# injection point tests injecting failures into the processing
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+ plan skip_all => 'Injection points not supported by this build';
+}
+
+# ---------------------------------------------------------------------------
+# Test cluster setup
+#
+
+# Initiate testcluster
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init(no_data_checksums => 1);
+$node->start;
+
+# Set up test environment
+$node->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
+
+# ---------------------------------------------------------------------------
+# Inducing failures in processing
+
+# Force enabling checksums to fail by marking one of the databases as having
+# failed in processing.
+disable_data_checksums($node, wait => 1);
+$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(true);');
+enable_data_checksums($node, wait => 'off');
+$node->safe_psql('postgres', 'SELECT dcw_inject_fail_database(false);');
+
+# Force the enable checksums processing to make multiple passes by removing
+# one database from the list in the first pass. This will simulate a CREATE
+# DATABASE during processing. Doing this via fault injection makes the test
+# not be subject to exact timing.
+$node->safe_psql('postgres', 'SELECT dcw_prune_dblist(true);');
+enable_data_checksums($node, wait => 'on');
+
+# ---------------------------------------------------------------------------
+# Timing and retry related tests
+#
+SKIP:
+{
+ skip 'Data checksum delay tests not enabled in PG_TEST_EXTRA', 4
+ if (!$ENV{PG_TEST_EXTRA}
+ || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/);
+
+ # Inject a delay in the barrier for enabling checksums
+ disable_data_checksums($node, wait => 1);
+ $node->safe_psql('postgres', 'SELECT dcw_inject_delay_barrier();');
+ enable_data_checksums($node, wait => 'on');
+
+ # Fake the existence of a temporary table at the start of processing, which
+ # will force the processing to wait and retry in order to wait for it to
+ # disappear.
+ disable_data_checksums($node, wait => 1);
+ $node->safe_psql('postgres', 'SELECT dcw_fake_temptable(true);');
+ enable_data_checksums($node, wait => 'on');
+}
+
+$node->stop;
+done_testing();
diff --git a/src/test/modules/test_checksums/t/006_concurrent_pgbench.pl b/src/test/modules/test_checksums/t/006_concurrent_pgbench.pl
new file mode 100644
index 000000000000..b33ca6e0c260
--- /dev/null
+++ b/src/test/modules/test_checksums/t/006_concurrent_pgbench.pl
@@ -0,0 +1,326 @@
+
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Test suite for testing enabling data checksums in an online cluster with
+# concurrent activity via pgbench runs
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use FindBin;
+use lib $FindBin::RealBin;
+
+use DataChecksums::Utils;
+
+my $node_primary_slot = 'physical_slot';
+my $node_primary_backup = 'primary_backup';
+my $node_primary;
+my $node_primary_loglocation = 0;
+my $node_standby_1;
+my $node_standby_1_loglocation = 0;
+
+# The number of full test iterations which will be performed. The exact number
+# of tests performed and the wall time taken is non-deterministic as the test
+# performs a lot of randomized actions, but 50 iterations will be a long test
+# run regardless.
+my $TEST_ITERATIONS = 50;
+
+# Variables which record the current state of the cluster
+my $data_checksum_state = 'off';
+my $pgbench_running = 0;
+
+# Variables holding state for managing the cluster and aux processes in
+# various ways
+my @stop_modes = ();
+my ($pgb_primary_stdin, $pgb_primary_stdout, $pgb_primary_stderr) =
+ ('', '', '');
+my ($pgb_standby_1_stdin, $pgb_standby_1_stdout, $pgb_standby_1_stderr) =
+ ('', '', '');
+
+if (!$ENV{PG_TEST_EXTRA} || $ENV{PG_TEST_EXTRA} !~ /\bchecksum_extended\b/)
+{
+ plan skip_all => 'Extended tests not enabled';
+}
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+ plan skip_all => 'Injection points not supported by this build';
+}
+
+# Helper for retrieving a binary value with random distribution for deciding
+# whether to turn things off during testing.
+sub cointoss
+{
+	# rand(2) yields a float in [0, 2) and int() truncates, so this is a
+	# fair coin toss returning 0 or 1.  Note the parenthesis placement: the
+	# previous int(rand(2) == 1) compared the raw float to 1, which is
+	# (almost) never true, making the toss effectively always return false.
+	return int(rand(2)) == 1;
+}
+
+# Helper for injecting random sleeps here and there in the test run.  The
+# sleep duration won't be predictable in order to avoid sleep patterns that
+# manage to avoid race conditions and timing bugs.
+sub random_sleep
+{
+	# Skip sleeping entirely about half the time.
+	return if cointoss;
+	sleep(int(rand(3)));
+}
+
+# Start a read-only pgbench run in the background against the server specified
+# via the port passed as parameter
+sub background_ro_pgbench
+{
+	my ($port, $stdin, $stdout, $stderr) = @_;
+
+	# NOTE(review): the IPC::Run harness is held only in a lexical that goes
+	# out of scope when this sub returns, at which point IPC::Run's
+	# destructor finishes the harness -- confirm pgbench actually keeps
+	# running in the background, and consider returning the harness.  Also
+	# note $stdin/$stdout/$stderr are by-value copies of the caller's
+	# scalars, so output captured here never reaches the caller's buffers.
+	my $pgbench_primary = IPC::Run::start(
+		[ 'pgbench', '-p', $port, '-S', '-T', '600', '-c', '10', 'postgres' ],
+		'<' => \$stdin,
+		'>' => \$stdout,
+		'2>' => \$stderr,
+		IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default));
+}
+
+# Start a pgbench run in the background against the server specified via the
+# port passed as parameter
+sub background_rw_pgbench
+{
+	my ($port, $stdin, $stdout, $stderr) = @_;
+
+	# NOTE(review): same caveats as background_ro_pgbench -- the IPC::Run
+	# harness is discarded on return (its destructor finishes the harness),
+	# and the stdio parameters are by-value copies of the caller's scalars.
+	# Confirm the pgbench run really survives in the background.
+	my $pgbench_primary = IPC::Run::start(
+		[ 'pgbench', '-p', $port, '-T', '600', '-c', '10', 'postgres' ],
+		'<' => \$stdin,
+		'>' => \$stdout,
+		'2>' => \$stderr,
+		IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default));
+}
+
+# Invert the state of data checksums in the cluster, if data checksums are on
+# then disable them and vice versa. Also performs proper validation of the
+# before and after state.
+sub flip_data_checksums
+{
+ test_checksum_state($node_primary, $data_checksum_state);
+ test_checksum_state($node_standby_1, $data_checksum_state);
+
+ if ($data_checksum_state eq 'off')
+ {
+ # Coin-toss to see if we are injecting a retry due to a temptable
+ $node_primary->safe_psql('postgres',
+ 'SELECT dcw_fake_temptable(true);')
+ if cointoss();
+
+ # Ensure that the primary switches to "inprogress-on"
+ enable_data_checksums($node_primary, wait => 'inprogress-on');
+ random_sleep();
+ # Wait for checksum enable to be replayed
+ $node_primary->wait_for_catchup($node_standby_1, 'replay');
+
+ # Ensure that the standby has switched to "inprogress-on" or "on".
+ # Normally it would be "inprogress-on", but it is theoretically
+ # possible for the primary to complete the checksum enabling *and* have
+ # the standby replay that record before we reach the check below.
+ my $result = $node_standby_1->poll_query_until(
+ 'postgres',
+ "SELECT setting = 'off' "
+ . "FROM pg_catalog.pg_settings "
+ . "WHERE name = 'data_checksums';",
+ 'f');
+ is($result, 1,
+ 'ensure standby has absorbed the inprogress-on barrier');
+ random_sleep();
+ $result = $node_standby_1->safe_psql('postgres',
+ "SELECT setting "
+ . "FROM pg_catalog.pg_settings "
+ . "WHERE name = 'data_checksums';");
+
+ is(($result eq 'inprogress-on' || $result eq 'on'),
+ 1, 'ensure checksums are on, or in progress, on standby_1');
+
+ # Wait for checksums enabled on the primary and standby
+ wait_for_checksum_state($node_primary, 'on');
+ random_sleep();
+ wait_for_checksum_state($node_standby_1, 'on');
+
+ $node_primary->safe_psql('postgres',
+ 'SELECT dcw_fake_temptable(false);');
+ $data_checksum_state = 'on';
+ }
+ elsif ($data_checksum_state eq 'on')
+ {
+ random_sleep();
+ disable_data_checksums($node_primary);
+ $node_primary->wait_for_catchup($node_standby_1, 'replay');
+
+ # Wait for checksums disabled on the primary and standby
+ wait_for_checksum_state($node_primary, 'off');
+ random_sleep();
+ wait_for_checksum_state($node_standby_1, 'off');
+
+ $data_checksum_state = 'off';
+ }
+ else
+ {
+ # This should only happen due to programmer error when hacking on the
+ # test code, but since that might pass subtly by let's ensure it gets
+ # caught with a test error if so.
+ is(1, 0, 'data_checksum_state variable has invalid state');
+ }
+}
+
+# Prepare an array with pg_ctl stop modes which we later can randomly select
+# from in order to stop the cluster in some way.
+for (my $i = 1; $i <= 100; $i++)
+{
+ if (int(rand($i * 2)) > $i)
+ {
+ push(@stop_modes, "immediate");
+ }
+ else
+ {
+ push(@stop_modes, "fast");
+ }
+}
+
+# Create and start a cluster with one primary and one standby node, and ensure
+# they are caught up and in sync.
+$node_primary = PostgreSQL::Test::Cluster->new('main');
+$node_primary->init(allows_streaming => 1, no_data_checksums => 1);
+# max_connections needs to be bumped in order to accommodate the pgbench
+# clients, and log_statement is dialled down since it otherwise will generate
+# enormous amounts of logging. Page verification failures are still logged.
+$node_primary->append_conf(
+ 'postgresql.conf',
+ qq[
+max_connections = 30
+log_statement = none
+]);
+$node_primary->start;
+$node_primary->safe_psql('postgres', 'CREATE EXTENSION test_checksums;');
+# Create some content to have un-checksummed data in the cluster
+$node_primary->safe_psql('postgres',
+ "CREATE TABLE t AS SELECT generate_series(1, 100000) AS a;");
+$node_primary->safe_psql('postgres',
+ "SELECT pg_create_physical_replication_slot('$node_primary_slot');");
+$node_primary->backup($node_primary_backup);
+
+$node_standby_1 = PostgreSQL::Test::Cluster->new('standby_1');
+$node_standby_1->init_from_backup($node_primary, $node_primary_backup,
+ has_streaming => 1);
+$node_standby_1->append_conf(
+ 'postgresql.conf', qq[
+primary_slot_name = '$node_primary_slot'
+]);
+$node_standby_1->start;
+
+$node_primary->command_ok([ 'pgbench', '-i', '-s', '100', '-q', 'postgres' ]);
+$node_primary->wait_for_catchup($node_standby_1, 'replay');
+
+# Start the test suite with pgbench running.
+background_ro_pgbench(
+ $node_standby_1->port, $pgb_standby_1_stdin,
+ $pgb_standby_1_stdout, $pgb_standby_1_stderr);
+background_rw_pgbench(
+ $node_primary->port, $pgb_primary_stdin,
+ $pgb_primary_stdout, $pgb_primary_stderr);
+
+# Main test suite. This loop will start a pgbench run on the cluster and while
+# that's running flip the state of data checksums concurrently. It will then
+# randomly restart the cluster (in fast or immediate mode) and then check for
+# the desired state. The idea behind doing things randomly is to stress out
+# any timing related issues by subjecting the cluster to varied workloads.
+# A TODO is to generate a trace such that any test failure can be traced to
+# its order of operations for debugging.
+for (my $i = 0; $i < $TEST_ITERATIONS; $i++)
+{
+	if (!$node_primary->is_alive)
+	{
+		# Since the log isn't being written to now, parse the log and check
+		# for instances of checksum verification failures.
+		my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile,
+			$node_primary_loglocation);
+		unlike(
+			$log,
+			qr/page verification failed/,
+			"no checksum validation errors in primary log");
+		$node_primary_loglocation = -s $node_primary->logfile;
+
+		# If data checksums are enabled, take the opportunity to verify them
+		# while the cluster is offline
+		$node_primary->checksum_verify_offline()
+		  unless $data_checksum_state eq 'off';
+		random_sleep();
+		$node_primary->start;
+		# Start a pgbench in the background against the primary.  Use the
+		# same argument list as the initial invocation; passing an extra
+		# literal argument here would shift the stdin/stdout/stderr buffers
+		# by one position in background_rw_pgbench().
+		background_rw_pgbench($node_primary->port, $pgb_primary_stdin,
+			$pgb_primary_stdout, $pgb_primary_stderr);
+	}
+
+	if (!$node_standby_1->is_alive)
+	{
+		# Since the log isn't being written to now, parse the log and check
+		# for instances of checksum verification failures.
+		my $log =
+		  PostgreSQL::Test::Utils::slurp_file($node_standby_1->logfile,
+			$node_standby_1_loglocation);
+		unlike(
+			$log,
+			qr/page verification failed/,
+			"no checksum validation errors in standby_1 log");
+		$node_standby_1_loglocation = -s $node_standby_1->logfile;
+
+		# If data checksums are enabled, take the opportunity to verify them
+		# while the cluster is offline
+		$node_standby_1->checksum_verify_offline()
+		  unless $data_checksum_state eq 'off';
+		random_sleep();
+		$node_standby_1->start;
+		# Start a select-only pgbench in the background on the standby,
+		# again matching the argument list of the initial invocation.
+		background_ro_pgbench($node_standby_1->port, $pgb_standby_1_stdin,
+			$pgb_standby_1_stdout, $pgb_standby_1_stderr);
+	}
+
+	$node_primary->safe_psql('postgres', "UPDATE t SET a = a + 1;");
+
+	flip_data_checksums();
+	random_sleep();
+	my $result = $node_primary->safe_psql('postgres',
+		"SELECT count(*) FROM t WHERE a > 1");
+	is($result, '100000', 'ensure data pages can be read back on primary');
+	random_sleep();
+	$node_primary->wait_for_catchup($node_standby_1, 'write');
+
+	# Potentially powercycle the cluster
+	$node_primary->stop($stop_modes[ int(rand(100)) ]) if cointoss();
+	random_sleep();
+	$node_standby_1->stop($stop_modes[ int(rand(100)) ]) if cointoss();
+}
+
+# Testrun is over, ensure that data reads back as expected and perform a final
+# verification of the data checksum state.
+my $result =
+ $node_primary->safe_psql('postgres', "SELECT count(*) FROM t WHERE a > 1");
+is($result, '100000', 'ensure data pages can be read back on primary');
+test_checksum_state($node_primary, $data_checksum_state);
+test_checksum_state($node_standby_1, $data_checksum_state);
+
+# Perform one final pass over the logs and hunt for unexpected errors
+my $log = PostgreSQL::Test::Utils::slurp_file($node_primary->logfile,
+ $node_primary_loglocation);
+unlike(
+ $log,
+ qr/page verification failed/,
+ "no checksum validation errors in primary log");
+$node_primary_loglocation = -s $node_primary->logfile;
+$log = PostgreSQL::Test::Utils::slurp_file($node_standby_1->logfile,
+ $node_standby_1_loglocation);
+unlike(
+ $log,
+ qr/page verification failed/,
+ "no checksum validation errors in standby_1 log");
+$node_standby_1_loglocation = -s $node_standby_1->logfile;
+
+$node_standby_1->teardown_node;
+$node_primary->teardown_node;
+
+done_testing();
diff --git a/src/test/modules/test_checksums/t/DataChecksums/Utils.pm b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm
new file mode 100644
index 000000000000..ee2f2a1428fd
--- /dev/null
+++ b/src/test/modules/test_checksums/t/DataChecksums/Utils.pm
@@ -0,0 +1,185 @@
+
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+=pod
+
+=head1 NAME
+
+DataChecksums::Utils - Utility functions for testing data checksums in a running cluster
+
+=head1 SYNOPSIS
+
+ use PostgreSQL::Test::Cluster;
+ use DataChecksums::Utils qw( .. );
+
+ # Create, and start, a new cluster
+ my $node = PostgreSQL::Test::Cluster->new('primary');
+ $node->init;
+ $node->start;
+
+ test_checksum_state($node, 'off');
+
+ enable_data_checksums($node);
+
+ wait_for_checksum_state($node, 'on');
+
+
+=cut
+
+package DataChecksums::Utils;
+
+use strict;
+use warnings FATAL => 'all';
+use Exporter 'import';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+our @EXPORT = qw(
+ test_checksum_state
+ wait_for_checksum_state
+ enable_data_checksums
+ disable_data_checksums
+);
+
+=pod
+
+=head1 METHODS
+
+=over
+
+=item test_checksum_state(node, state)
+
+Test that the current value of the data checksum GUC in the server running
+at B matches B. If the values differ, a test failure is logged.
+Returns True if the values match, otherwise False.
+
+=cut
+
+sub test_checksum_state
+{
+ my ($postgresnode, $state) = @_;
+
+ my $result = $postgresnode->safe_psql('postgres',
+ "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';"
+ );
+ is($result, $state, 'ensure checksums are set to ' . $state);
+ return $result eq $state;
+}
+
+=item wait_for_checksum_state(node, state)
+
+Test the value of the data checksum GUC in the server running at B
+repeatedly until it matches B or times out. Processing will run for
+$PostgreSQL::Test::Utils::timeout_default seconds before timing out. If the
+values differ when the process times out, False is returned and a test failure
+is logged, otherwise True.
+
+=cut
+
+sub wait_for_checksum_state
+{
+ my ($postgresnode, $state) = @_;
+
+ my $res = $postgresnode->poll_query_until(
+ 'postgres',
+ "SELECT setting FROM pg_catalog.pg_settings WHERE name = 'data_checksums';",
+ $state);
+ is($res, 1, 'ensure data checksums are transitioned to ' . $state);
+ return $res == 1;
+}
+
+=item enable_data_checksums($node, %params)
+
+Function for enabling data checksums in the cluster running at B.
+
+=over
+
+=item cost_delay
+
+The C to use when enabling data checksums, default is 0.
+
+=item cost_limit
+
+The C to use when enabling data checksums, default is 100.
+
+=item fast
+
+If set to C an immediate checkpoint will be issued after data
+checksums are enabled. Setting this to false will lead to slower tests.
+The default is true.
+
+=item wait
+
+If defined, the function will wait until the state defined in this parameter
+is reached, or waiting times out, before returning. The function will wait for
+$PostgreSQL::Test::Utils::timeout_default seconds before timing out.
+
+=back
+
+=cut
+
+sub enable_data_checksums
+{
+ my $postgresnode = shift;
+ my %params = @_;
+
+ # Set sane defaults for the parameters
+ $params{cost_delay} = 0 unless (defined($params{cost_delay}));
+ $params{cost_limit} = 100 unless (defined($params{cost_limit}));
+ $params{fast} = 'true' unless (defined($params{fast}));
+
+ my $query = <<'EOQ';
+SELECT pg_enable_data_checksums(%s, %s, %s);
+EOQ
+
+ $postgresnode->safe_psql(
+ 'postgres',
+ sprintf($query,
+ $params{cost_delay}, $params{cost_limit}, $params{fast}));
+
+ wait_for_checksum_state($postgresnode, $params{wait})
+ if (defined($params{wait}));
+}
+
+=item disable_data_checksums($node, %params)
+
+Function for disabling data checksums in the cluster running at B.
+
+=over
+
+=item wait
+
+If defined, the function will wait until the state turns to B<off>, or
+waiting times out, before returning. The function will wait for
+$PostgreSQL::Test::Utils::timeout_default seconds before timing out.
+Unlike in C<enable_data_checksums> the value of the parameter is discarded.
+
+=back
+
+=cut
+
+sub disable_data_checksums
+{
+ my $postgresnode = shift;
+ my %params = @_;
+
+ # Set sane defaults for the parameters
+ $params{fast} = 'true' unless (defined($params{fast}));
+
+ my $query = <<'EOQ';
+SELECT pg_disable_data_checksums(%s);
+EOQ
+
+ $postgresnode->safe_psql('postgres', sprintf($query, $params{fast}));
+
+ wait_for_checksum_state($postgresnode, 'off') if (defined($params{wait}));
+}
+
+=pod
+
+=back
+
+=cut
+
+1;
diff --git a/src/test/modules/test_checksums/test_checksums--1.0.sql b/src/test/modules/test_checksums/test_checksums--1.0.sql
new file mode 100644
index 000000000000..704b45a31866
--- /dev/null
+++ b/src/test/modules/test_checksums/test_checksums--1.0.sql
@@ -0,0 +1,20 @@
+/* src/test/modules/test_checksums/test_checksums--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_checksums" to load this file. \quit
+
+CREATE FUNCTION dcw_inject_delay_barrier(attach boolean DEFAULT true)
+ RETURNS pg_catalog.void
+ AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION dcw_inject_fail_database(attach boolean DEFAULT true)
+ RETURNS pg_catalog.void
+ AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION dcw_prune_dblist(attach boolean DEFAULT true)
+ RETURNS pg_catalog.void
+ AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION dcw_fake_temptable(attach boolean DEFAULT true)
+ RETURNS pg_catalog.void
+ AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_checksums/test_checksums.c b/src/test/modules/test_checksums/test_checksums.c
new file mode 100644
index 000000000000..26897bff960d
--- /dev/null
+++ b/src/test/modules/test_checksums/test_checksums.c
@@ -0,0 +1,173 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_checksums.c
+ * Test data checksums
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/test/modules/test_checksums/test_checksums.c
+ *
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "postmaster/datachecksumsworker.h"
+#include "storage/latch.h"
+#include "utils/injection_point.h"
+#include "utils/wait_event.h"
+
+#define USEC_PER_SEC 1000000
+
+
+PG_MODULE_MAGIC;
+
+extern PGDLLEXPORT void dc_delay_barrier(const char *name, const void *private_data, void *arg);
+extern PGDLLEXPORT void dc_fail_database(const char *name, const void *private_data, void *arg);
+extern PGDLLEXPORT void dc_dblist(const char *name, const void *private_data, void *arg);
+extern PGDLLEXPORT void dc_fake_temptable(const char *name, const void *private_data, void *arg);
+
+/*
+ * Test for delaying emission of procsignalbarriers.
+ */
+void
+dc_delay_barrier(const char *name, const void *private_data, void *arg)
+{
+ (void) name;
+ (void) private_data;
+
+ (void) WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ (3 * 1000),
+ WAIT_EVENT_PG_SLEEP);
+}
+
+PG_FUNCTION_INFO_V1(dcw_inject_delay_barrier);
+Datum
+dcw_inject_delay_barrier(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+ bool attach = PG_GETARG_BOOL(0);
+
+ if (attach)
+ InjectionPointAttach("datachecksums-enable-checksums-delay",
+ "test_checksums",
+ "dc_delay_barrier",
+ NULL,
+ 0);
+ else
+ InjectionPointDetach("datachecksums-enable-checksums-delay");
+#else
+ elog(ERROR,
+ "test is not working as intended when injection points are disabled");
+#endif
+ PG_RETURN_VOID();
+}
+
+/*
+ * Injection point callback which overwrites the worker result with
+ * DATACHECKSUMSWORKER_FAILED on the first invocation only, simulating a
+ * database failing processing once and then succeeding on a later retry.
+ */
+void
+dc_fail_database(const char *name, const void *private_data, void *arg)
+{
+	/* static: persists across calls within this worker process */
+	static bool first_pass = true;
+	DataChecksumsWorkerResult *res = (DataChecksumsWorkerResult *) arg;
+
+	if (first_pass)
+		*res = DATACHECKSUMSWORKER_FAILED;
+	first_pass = false;
+}
+
+PG_FUNCTION_INFO_V1(dcw_inject_fail_database);
+Datum
+dcw_inject_fail_database(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+ bool attach = PG_GETARG_BOOL(0);
+
+ if (attach)
+ InjectionPointAttach("datachecksumsworker-fail-db",
+ "test_checksums",
+ "dc_fail_database",
+ NULL,
+ 0);
+ else
+ InjectionPointDetach("datachecksumsworker-fail-db");
+#else
+ elog(ERROR,
+ "test is not working as intended when injection points are disabled");
+#endif
+ PG_RETURN_VOID();
+}
+
+/*
+ * Test to remove an entry from the Databaselist to force re-processing since
+ * not all databases could be processed in the first iteration of the loop.
+ */
+void
+dc_dblist(const char *name, const void *private_data, void *arg)
+{
+	/* static: only prune the list on the first invocation */
+	static bool first_pass = true;
+	List	   *DatabaseList = (List *) arg;
+
+	/*
+	 * NOTE(review): list_delete_last() shortens the List in place, which is
+	 * presumably what the caller observes; the reassignment to the local
+	 * pointer is discarded.  If the list could ever hold a single element
+	 * this would return NIL and leave the caller's pointer dangling --
+	 * confirm against the datachecksumsworker call site.
+	 */
+	if (first_pass)
+		DatabaseList = list_delete_last(DatabaseList);
+	first_pass = false;
+}
+
+PG_FUNCTION_INFO_V1(dcw_prune_dblist);
+Datum
+dcw_prune_dblist(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+ bool attach = PG_GETARG_BOOL(0);
+
+ if (attach)
+ InjectionPointAttach("datachecksumsworker-initial-dblist",
+ "test_checksums",
+ "dc_dblist",
+ NULL,
+ 0);
+ else
+ InjectionPointDetach("datachecksumsworker-initial-dblist");
+#else
+ elog(ERROR,
+ "test is not working as intended when injection points are disabled");
+#endif
+ PG_RETURN_VOID();
+}
+
+/*
+ * Test to force waiting for existing temptables.
+ *
+ * On the first invocation only, report that one temporary table is still
+ * left, which pushes the worker into its wait-and-retry path; subsequent
+ * invocations leave the count untouched so processing can complete.
+ */
+void
+dc_fake_temptable(const char *name, const void *private_data, void *arg)
+{
+	/* static: only fake the leftover temp table once */
+	static bool first_pass = true;
+	int		   *numleft = (int *) arg;
+
+	if (first_pass)
+		*numleft = 1;
+	first_pass = false;
+}
+
+PG_FUNCTION_INFO_V1(dcw_fake_temptable);
+Datum
+dcw_fake_temptable(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+ bool attach = PG_GETARG_BOOL(0);
+
+ if (attach)
+ InjectionPointAttach("datachecksumsworker-fake-temptable-wait",
+ "test_checksums",
+ "dc_fake_temptable",
+ NULL,
+ 0);
+ else
+ InjectionPointDetach("datachecksumsworker-fake-temptable-wait");
+#else
+ elog(ERROR,
+ "test is not working as intended when injection points are disabled");
+#endif
+ PG_RETURN_VOID();
+}
diff --git a/src/test/modules/test_checksums/test_checksums.control b/src/test/modules/test_checksums/test_checksums.control
new file mode 100644
index 000000000000..84b4cc035a78
--- /dev/null
+++ b/src/test/modules/test_checksums/test_checksums.control
@@ -0,0 +1,4 @@
+comment = 'Test code for data checksums'
+default_version = '1.0'
+module_pathname = '$libdir/test_checksums'
+relocatable = true
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 35413f140198..3af7944aceac 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -3872,6 +3872,51 @@ sub advance_wal
}
}
+=item $node->checksum_enable_offline()
+
+Enable data page checksums in an offline cluster with B<pg_checksums>. The
+caller is responsible for ensuring that the cluster is in the right state for
+this operation.
+
+=cut
+
+# Enable data page checksums with pg_checksums; the server must be shut down.
+sub checksum_enable_offline
+{
+	my ($self) = @_;
+
+	# Note: a method call such as $self->data_dir is not interpolated inside
+	# a double-quoted string (it would print HASH(0x...)->data_dir), so build
+	# the message with explicit concatenation.
+	print "# Enabling checksums in \"" . $self->data_dir . "\"\n";
+	PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D',
+		$self->data_dir, '-e');
+	return;
+}
+
+=item $node->checksum_disable_offline()
+
+Disable data page checksums in an offline cluster with B<pg_checksums>. The
+caller is responsible for ensuring that the cluster is in the right state for
+this operation.
+
+=cut
+
+# Disable data page checksums with pg_checksums; the server must be shut down.
+sub checksum_disable_offline
+{
+	my ($self) = @_;
+
+	# Note: a method call such as $self->data_dir is not interpolated inside
+	# a double-quoted string (it would print HASH(0x...)->data_dir), so build
+	# the message with explicit concatenation.
+	print "# Disabling checksums in \"" . $self->data_dir . "\"\n";
+	PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D',
+		$self->data_dir, '-d');
+	return;
+}
+
+=item $node->checksum_verify_offline()
+
+Verify data page checksums in an offline cluster with B<pg_checksums>. The
+caller is responsible for ensuring that the cluster is in the right state for
+this operation.  system_or_bail will abort the test if verification fails.
+
+=cut
+
+sub checksum_verify_offline
+{
+	my ($self) = @_;
+
+	PostgreSQL::Test::Utils::system_or_bail('pg_checksums', '-D',
+		$self->data_dir, '-c');
+	return;
+}
+
=pod
=back
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 35e8aad7701b..4b9c5526e50c 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2071,6 +2071,42 @@ pg_stat_progress_create_index| SELECT s.pid,
s.param15 AS partitions_done
FROM (pg_stat_get_progress_info('CREATE INDEX'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20)
LEFT JOIN pg_database d ON ((s.datid = d.oid)));
+pg_stat_progress_data_checksums| SELECT s.pid,
+ s.datid,
+ d.datname,
+ CASE s.param1
+ WHEN 0 THEN 'enabling'::text
+ WHEN 1 THEN 'disabling'::text
+ WHEN 2 THEN 'waiting'::text
+ WHEN 3 THEN 'waiting on temporary tables'::text
+ WHEN 4 THEN 'waiting on checkpoint'::text
+ WHEN 5 THEN 'done'::text
+ ELSE NULL::text
+ END AS phase,
+ CASE s.param2
+ WHEN '-1'::integer THEN NULL::bigint
+ ELSE s.param2
+ END AS databases_total,
+ s.param3 AS databases_done,
+ CASE s.param4
+ WHEN '-1'::integer THEN NULL::bigint
+ ELSE s.param4
+ END AS relations_total,
+ CASE s.param5
+ WHEN '-1'::integer THEN NULL::bigint
+ ELSE s.param5
+ END AS relations_done,
+ CASE s.param6
+ WHEN '-1'::integer THEN NULL::bigint
+ ELSE s.param6
+ END AS blocks_total,
+ CASE s.param7
+ WHEN '-1'::integer THEN NULL::bigint
+ ELSE s.param7
+ END AS blocks_done
+ FROM (pg_stat_get_progress_info('DATACHECKSUMS'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20)
+ LEFT JOIN pg_database d ON ((s.datid = d.oid)))
+ ORDER BY s.datid;
pg_stat_progress_vacuum| SELECT s.pid,
s.datid,
d.datname,
diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out
index 605f50703769..9042e4d38e30 100644
--- a/src/test/regress/expected/stats.out
+++ b/src/test/regress/expected/stats.out
@@ -59,6 +59,22 @@ io worker|relation|vacuum
io worker|temp relation|normal
io worker|wal|init
io worker|wal|normal
+datachecksumsworker launcher|relation|bulkread
+datachecksumsworker launcher|relation|bulkwrite
+datachecksumsworker launcher|relation|init
+datachecksumsworker launcher|relation|normal
+datachecksumsworker launcher|relation|vacuum
+datachecksumsworker launcher|temp relation|normal
+datachecksumsworker launcher|wal|init
+datachecksumsworker launcher|wal|normal
+datachecksumsworker worker|relation|bulkread
+datachecksumsworker worker|relation|bulkwrite
+datachecksumsworker worker|relation|init
+datachecksumsworker worker|relation|normal
+datachecksumsworker worker|relation|vacuum
+datachecksumsworker worker|temp relation|normal
+datachecksumsworker worker|wal|init
+datachecksumsworker worker|wal|normal
slotsync worker|relation|bulkread
slotsync worker|relation|bulkwrite
slotsync worker|relation|init
@@ -95,7 +111,7 @@ walsummarizer|wal|init
walsummarizer|wal|normal
walwriter|wal|init
walwriter|wal|normal
-(79 rows)
+(87 rows)
\a
-- ensure that both seqscan and indexscan plans are allowed
SET enable_seqscan TO on;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index a13e81628902..df0f49ea2aab 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -416,6 +416,7 @@ CheckPointStmt
CheckpointStatsData
CheckpointerRequest
CheckpointerShmemStruct
+ChecksumType
Chromosome
CkptSortItem
CkptTsStatus
@@ -608,6 +609,10 @@ DataPageDeleteStack
+DataChecksumsWorkerDatabase
+DataChecksumsWorkerResult
+DataChecksumsWorkerResultEntry
+DataChecksumsWorkerShmemStruct
DataTypesUsageChecks
DataTypesUsageVersionCheck
DatabaseInfo
DateADT
DateTimeErrorExtra
Datum
@@ -4243,6 +4248,7 @@ xl_btree_split
xl_btree_unlink_page
xl_btree_update
xl_btree_vacuum
+xl_checksum_state
xl_clog_truncate
xl_commit_ts_truncate
xl_dbase_create_file_copy_rec