Make pg_stat_io count IOs as bytes instead of blocks for some operations
authorMichael Paquier <michael@paquier.xyz>
Tue, 14 Jan 2025 03:14:29 +0000 (12:14 +0900)
committerMichael Paquier <michael@paquier.xyz>
Tue, 14 Jan 2025 03:14:29 +0000 (12:14 +0900)
Currently in pg_stat_io view, IOs are counted as blocks of size
BLCKSZ.  There are two limitations with this design:
* The actual number of I/O requests sent to the kernel is lower because
I/O requests may be merged before being sent.  Additionally, it gives
the impression that all I/Os are done in block size, which shadows the
benefits of merging I/O requests.
* Some patches are under work to extend pg_stat_io for the tracking of
operations that may not be linked to the block size.  For example, WAL
read IOs are done in variable bytes and it is not possible to correctly
show these IOs in pg_stat_io view, and we want to keep all this data in
a single system view rather than spread it across multiple relations to
ease monitoring.

WaitReadBuffers() can now be tracked as a single read operation
worth N blocks.  Same for ExtendBufferedRelShared() and
ExtendBufferedRelLocal() for extensions.

Three columns are added to pg_stat_io for reads, writes and extensions
for the byte calculations.  op_bytes, which was always hardcoded to
BLCKSZ, is removed.  IO backend statistics are updated to reflect these
changes.

Bump catalog version.

Author: Nazir Bilal Yavuz
Reviewed-by: Bertrand Drouvot, Melanie Plageman
Discussion: https://postgr.es/m/CAN55FZ0oqxBaaHAEsj=xFqkzE3n5P=3RA1V_igXwL-RV7QRzyw@mail.gmail.com

12 files changed:
doc/src/sgml/monitoring.sgml
src/backend/catalog/system_views.sql
src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/localbuf.c
src/backend/storage/smgr/md.c
src/backend/utils/activity/pgstat_backend.c
src/backend/utils/activity/pgstat_io.c
src/backend/utils/adt/pgstatfuncs.c
src/include/catalog/catversion.h
src/include/catalog/pg_proc.dat
src/include/pgstat.h
src/test/regress/expected/rules.out

index d0d176cc54f1147632c8e33dbfa5cec6ae7c8898..e5888fae2b52fa733820cdad7a2a24df412dd13d 100644 (file)
@@ -2692,8 +2692,18 @@ description | Waiting for a newly initialized WAL file to reach durable storage
         <structfield>reads</structfield> <type>bigint</type>
        </para>
        <para>
-        Number of read operations, each of the size specified in
-        <varname>op_bytes</varname>.
+        Number of read operations.
+       </para>
+      </entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry">
+       <para role="column_definition">
+        <structfield>read_bytes</structfield> <type>numeric</type>
+       </para>
+       <para>
+        The total size of read operations in bytes.
        </para>
       </entry>
      </row>
@@ -2716,8 +2726,18 @@ description | Waiting for a newly initialized WAL file to reach durable storage
         <structfield>writes</structfield> <type>bigint</type>
        </para>
        <para>
-        Number of write operations, each of the size specified in
-        <varname>op_bytes</varname>.
+        Number of write operations.
+       </para>
+      </entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry">
+       <para role="column_definition">
+        <structfield>write_bytes</structfield> <type>numeric</type>
+       </para>
+       <para>
+        The total size of write operations in bytes.
        </para>
       </entry>
      </row>
@@ -2740,8 +2760,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage
         <structfield>writebacks</structfield> <type>bigint</type>
        </para>
        <para>
-        Number of units of size <varname>op_bytes</varname> which the process
-        requested the kernel write out to permanent storage.
+        Number of units of size <symbol>BLCKSZ</symbol> (typically 8kB) which
+        the process requested the kernel write out to permanent storage.
        </para>
       </entry>
      </row>
@@ -2766,8 +2786,7 @@ description | Waiting for a newly initialized WAL file to reach durable storage
         <structfield>extends</structfield> <type>bigint</type>
        </para>
        <para>
-        Number of relation extend operations, each of the size specified in
-        <varname>op_bytes</varname>.
+        Number of relation extend operations.
        </para>
       </entry>
      </row>
@@ -2775,11 +2794,10 @@ description | Waiting for a newly initialized WAL file to reach durable storage
      <row>
       <entry role="catalog_table_entry">
        <para role="column_definition">
-        <structfield>extend_time</structfield> <type>double precision</type>
+        <structfield>extend_bytes</structfield> <type>numeric</type>
        </para>
        <para>
-        Time spent in extend operations in milliseconds (if
-        <xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
+        The total size of relation extend operations in bytes.
        </para>
       </entry>
      </row>
@@ -2787,16 +2805,11 @@ description | Waiting for a newly initialized WAL file to reach durable storage
      <row>
       <entry role="catalog_table_entry">
        <para role="column_definition">
-        <structfield>op_bytes</structfield> <type>bigint</type>
-       </para>
-       <para>
-        The number of bytes per unit of I/O read, written, or extended.
+        <structfield>extend_time</structfield> <type>double precision</type>
        </para>
        <para>
-        Relation data reads, writes, and extends are done in
-        <varname>block_size</varname> units, derived from the build-time
-        parameter <symbol>BLCKSZ</symbol>, which is <literal>8192</literal> by
-        default.
+        Time spent in extend operations in milliseconds (if
+        <xref linkend="guc-track-io-timing"/> is enabled, otherwise zero)
        </para>
       </entry>
      </row>
index 7a595c84db9dcaf29e493dd5c0a51d21711953b2..64a873a16e3cdb565e25334cce651143b5375e7d 100644 (file)
@@ -1156,14 +1156,16 @@ SELECT
        b.object,
        b.context,
        b.reads,
+       b.read_bytes,
        b.read_time,
        b.writes,
+       b.write_bytes,
        b.write_time,
        b.writebacks,
        b.writeback_time,
        b.extends,
+       b.extend_bytes,
        b.extend_time,
-       b.op_bytes,
        b.hits,
        b.evictions,
        b.reuses,
index 7a07c9c1ebd8936e713262b1767e5c1db6fafec8..739daa1153a64080203df26849a99bf53f6eac86 100644 (file)
@@ -1165,7 +1165,7 @@ PinBufferForBlock(Relation rel,
    }
    if (*foundPtr)
    {
-       pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1);
+       pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
        if (VacuumCostActive)
            VacuumCostBalance += VacuumCostPageHit;
 
@@ -1515,7 +1515,7 @@ WaitReadBuffers(ReadBuffersOperation *operation)
        io_start = pgstat_prepare_io_time(track_io_timing);
        smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
        pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
-                               io_buffers_len);
+                               1, io_buffers_len * BLCKSZ);
 
        /* Verify each block we read, and terminate the I/O. */
        for (int j = 0; j < io_buffers_len; ++j)
@@ -2073,7 +2073,7 @@ again:
         * pinners or erroring out.
         */
        pgstat_count_io_op(IOOBJECT_RELATION, io_context,
-                          from_ring ? IOOP_REUSE : IOOP_EVICT, 1);
+                          from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
    }
 
    /*
@@ -2429,7 +2429,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
        UnlockRelationForExtension(bmr.rel, ExclusiveLock);
 
    pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
-                           io_start, extend_by);
+                           io_start, 1, extend_by * BLCKSZ);
 
    /* Set BM_VALID, terminate IO, and wake up any waiters */
    for (uint32 i = 0; i < extend_by; i++)
@@ -3891,7 +3891,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
     * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
     */
    pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
-                           IOOP_WRITE, io_start, 1);
+                           IOOP_WRITE, io_start, 1, BLCKSZ);
 
    pgBufferUsage.shared_blks_written++;
 
@@ -4530,7 +4530,7 @@ FlushRelationBuffers(Relation rel)
 
                pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
                                        IOCONTEXT_NORMAL, IOOP_WRITE,
-                                       io_start, 1);
+                                       io_start, 1, BLCKSZ);
 
                buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
                pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
@@ -6037,7 +6037,7 @@ IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
     * blocks of permanent relations.
     */
    pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
-                           IOOP_WRITEBACK, io_start, wb_context->nr_pending);
+                           IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
 
    wb_context->nr_pending = 0;
 }
index cdb9da7e65108e78dabd8e01740bd5fac8eae988..8f81428970b88ea05fbe93bab69dc6bb351b6e3b 100644 (file)
@@ -255,7 +255,7 @@ GetLocalVictimBuffer(void)
 
        /* Temporary table I/O does not use Buffer Access Strategies */
        pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL,
-                               IOOP_WRITE, io_start, 1);
+                               IOOP_WRITE, io_start, 1, BLCKSZ);
 
        /* Mark not-dirty now in case we error out below */
        buf_state &= ~BM_DIRTY;
@@ -279,7 +279,8 @@ GetLocalVictimBuffer(void)
        ClearBufferTag(&bufHdr->tag);
        buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
        pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
-       pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EVICT, 1);
+
+       pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EVICT, 1, 0);
    }
 
    return BufferDescriptorGetBuffer(bufHdr);
@@ -419,7 +420,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr,
    smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
 
    pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EXTEND,
-                           io_start, extend_by);
+                           io_start, 1, extend_by * BLCKSZ);
 
    for (uint32 i = 0; i < extend_by; i++)
    {
index b852a448a08f318b76cf6450c8163d3c69e2ff62..7bf0b45e2c34f4b1605616c4a5ead413ac03d4c5 100644 (file)
@@ -1401,7 +1401,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
         * backend fsyncs.
         */
        pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
-                               IOOP_FSYNC, io_start, 1);
+                               IOOP_FSYNC, io_start, 1, 0);
    }
 }
 
@@ -1796,7 +1796,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
        FileClose(file);
 
    pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
-                           IOOP_FSYNC, io_start, 1);
+                           IOOP_FSYNC, io_start, 1, 0);
 
    errno = save_errno;
    return result;
index 207bfa3c2769ca01c41f3640880a787bb052ae64..79e4d0a3053c6525ba6f72e936ad7d0187b2070d 100644 (file)
@@ -65,6 +65,8 @@ pgstat_flush_backend_entry_io(PgStat_EntryRef *entry_ref)
 
                bktype_shstats->counts[io_object][io_context][io_op] +=
                    pending_io->counts[io_object][io_context][io_op];
+               bktype_shstats->bytes[io_object][io_context][io_op] +=
+                   pending_io->bytes[io_object][io_context][io_op];
 
                time = pending_io->pending_times[io_object][io_context][io_op];
 
index b16f57ebeabd58db93fc5d9c3191da16723c57df..78b01310d8b4527412d27876977b9ad0045382c2 100644 (file)
 static PgStat_PendingIO PendingIOStats;
 static bool have_iostats = false;
 
+/*
+ * Check if an IOOp is tracked in bytes.  This relies on the ordering of IOOp
+ * defined in pgstat.h, so make sure to update this check when changing its
+ * elements.
+ */
+#define pgstat_is_ioop_tracked_in_bytes(io_op) \
+   ((io_op) < IOOP_NUM_TYPES && (io_op) >= IOOP_EXTEND)
 
 /*
  * Check that stats have not been counted for any combination of IOObject,
@@ -66,11 +73,13 @@ pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
 }
 
 void
-pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt)
+pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
+                  uint32 cnt, uint64 bytes)
 {
    Assert((unsigned int) io_object < IOOBJECT_NUM_TYPES);
    Assert((unsigned int) io_context < IOCONTEXT_NUM_TYPES);
    Assert((unsigned int) io_op < IOOP_NUM_TYPES);
+   Assert(pgstat_is_ioop_tracked_in_bytes(io_op) || bytes == 0);
    Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
 
    if (pgstat_tracks_backend_bktype(MyBackendType))
@@ -79,9 +88,11 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32
 
        entry_ref = pgstat_prep_backend_pending(MyProcNumber);
        entry_ref->pending_io.counts[io_object][io_context][io_op] += cnt;
+       entry_ref->pending_io.bytes[io_object][io_context][io_op] += bytes;
    }
 
    PendingIOStats.counts[io_object][io_context][io_op] += cnt;
+   PendingIOStats.bytes[io_object][io_context][io_op] += bytes;
 
    have_iostats = true;
 }
@@ -114,7 +125,7 @@ pgstat_prepare_io_time(bool track_io_guc)
  */
 void
 pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
-                       instr_time start_time, uint32 cnt)
+                       instr_time start_time, uint32 cnt, uint64 bytes)
 {
    if (track_io_timing)
    {
@@ -153,7 +164,7 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
        }
    }
 
-   pgstat_count_io_op(io_object, io_context, io_op, cnt);
+   pgstat_count_io_op(io_object, io_context, io_op, cnt, bytes);
 }
 
 PgStat_IO *
@@ -219,6 +230,9 @@ pgstat_io_flush_cb(bool nowait)
                bktype_shstats->counts[io_object][io_context][io_op] +=
                    PendingIOStats.counts[io_object][io_context][io_op];
 
+               bktype_shstats->bytes[io_object][io_context][io_op] +=
+                   PendingIOStats.bytes[io_object][io_context][io_op];
+
                time = PendingIOStats.pending_times[io_object][io_context][io_op];
 
                bktype_shstats->times[io_object][io_context][io_op] +=
index 5f8d20a406d429b44c870cc1442e7e73eb3aad8c..0f5e0a9778dc8a94be86bbcd717986591afba610 100644 (file)
@@ -1285,14 +1285,16 @@ typedef enum io_stat_col
    IO_COL_OBJECT,
    IO_COL_CONTEXT,
    IO_COL_READS,
+   IO_COL_READ_BYTES,
    IO_COL_READ_TIME,
    IO_COL_WRITES,
+   IO_COL_WRITE_BYTES,
    IO_COL_WRITE_TIME,
    IO_COL_WRITEBACKS,
    IO_COL_WRITEBACK_TIME,
    IO_COL_EXTENDS,
+   IO_COL_EXTEND_BYTES,
    IO_COL_EXTEND_TIME,
-   IO_COL_CONVERSION,
    IO_COL_HITS,
    IO_COL_EVICTIONS,
    IO_COL_REUSES,
@@ -1333,11 +1335,36 @@ pgstat_get_io_op_index(IOOp io_op)
    pg_unreachable();
 }
 
+/*
+ * Get the number of the column containing IO bytes for the specified IOOp.
+ * If an IOOp is not tracked in bytes, IO_COL_INVALID is returned.
+ */
+static io_stat_col
+pgstat_get_io_byte_index(IOOp io_op)
+{
+   switch (io_op)
+   {
+       case IOOP_EXTEND:
+           return IO_COL_EXTEND_BYTES;
+       case IOOP_READ:
+           return IO_COL_READ_BYTES;
+       case IOOP_WRITE:
+           return IO_COL_WRITE_BYTES;
+       case IOOP_EVICT:
+       case IOOP_FSYNC:
+       case IOOP_HIT:
+       case IOOP_REUSE:
+       case IOOP_WRITEBACK:
+           return IO_COL_INVALID;
+   }
+
+   elog(ERROR, "unrecognized IOOp value: %d", io_op);
+   pg_unreachable();
+}
+
 /*
  * Get the number of the column containing IO times for the specified IOOp.
- * This function encodes our assumption that IO time for an IOOp is displayed
- * in the view in the column directly after the IOOp counts. If an op has no
- * associated time, IO_COL_INVALID is returned.
+ * If an op has no associated time, IO_COL_INVALID is returned.
  */
 static io_stat_col
 pgstat_get_io_time_index(IOOp io_op)
@@ -1345,11 +1372,15 @@ pgstat_get_io_time_index(IOOp io_op)
    switch (io_op)
    {
        case IOOP_READ:
+           return IO_COL_READ_TIME;
        case IOOP_WRITE:
+           return IO_COL_WRITE_TIME;
        case IOOP_WRITEBACK:
+           return IO_COL_WRITEBACK_TIME;
        case IOOP_EXTEND:
+           return IO_COL_EXTEND_TIME;
        case IOOP_FSYNC:
-           return pgstat_get_io_op_index(io_op) + 1;
+           return IO_COL_FSYNC_TIME;
        case IOOP_EVICT:
        case IOOP_HIT:
        case IOOP_REUSE:
@@ -1408,18 +1439,11 @@ pg_stat_io_build_tuples(ReturnSetInfo *rsinfo,
            else
                nulls[IO_COL_RESET_TIME] = true;
 
-           /*
-            * Hard-code this to the value of BLCKSZ for now. Future values
-            * could include XLOG_BLCKSZ, once WAL IO is tracked, and constant
-            * multipliers, once non-block-oriented IO (e.g. temporary file
-            * IO) is tracked.
-            */
-           values[IO_COL_CONVERSION] = Int64GetDatum(BLCKSZ);
-
            for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
            {
                int         op_idx = pgstat_get_io_op_index(io_op);
                int         time_idx = pgstat_get_io_time_index(io_op);
+               int         byte_idx = pgstat_get_io_byte_index(io_op);
 
                /*
                 * Some combinations of BackendType and IOOp, of IOContext and
@@ -1436,19 +1460,39 @@ pg_stat_io_build_tuples(ReturnSetInfo *rsinfo,
                else
                    nulls[op_idx] = true;
 
-               /* not every operation is timed */
-               if (time_idx == IO_COL_INVALID)
-                   continue;
-
                if (!nulls[op_idx])
                {
-                   PgStat_Counter time =
-                       bktype_stats->times[io_obj][io_context][io_op];
+                   /* not every operation is timed */
+                   if (time_idx != IO_COL_INVALID)
+                   {
+                       PgStat_Counter time =
+                           bktype_stats->times[io_obj][io_context][io_op];
+
+                       values[time_idx] = Float8GetDatum(pg_stat_us_to_ms(time));
+                   }
 
-                   values[time_idx] = Float8GetDatum(pg_stat_us_to_ms(time));
+                   /* not every IO is tracked in bytes */
+                   if (byte_idx != IO_COL_INVALID)
+                   {
+                       char        buf[256];
+                       PgStat_Counter byte =
+                           bktype_stats->bytes[io_obj][io_context][io_op];
+
+                       /* Convert to numeric */
+                       snprintf(buf, sizeof buf, UINT64_FORMAT, byte);
+                       values[byte_idx] = DirectFunctionCall3(numeric_in,
+                                                              CStringGetDatum(buf),
+                                                              ObjectIdGetDatum(0),
+                                                              Int32GetDatum(-1));
+                   }
                }
                else
-                   nulls[time_idx] = true;
+               {
+                   if (time_idx != IO_COL_INVALID)
+                       nulls[time_idx] = true;
+                   if (byte_idx != IO_COL_INVALID)
+                       nulls[byte_idx] = true;
+               }
            }
 
            tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
index fa300eaa2d2141ade4eb1dd0c5ee470883c758a1..7c7133cd88565c127b499e4e7574a2159aae39b5 100644 (file)
@@ -57,6 +57,6 @@
  */
 
 /*                         yyyymmddN */
-#define CATALOG_VERSION_NO 202501101
+#define CATALOG_VERSION_NO 202501401
 
 #endif
index b37e8a6f8824fb95fa8f70052e4806f5aa353ac7..872cd6e01a369d6c0e2bfed04be2882e55579c5e 100644 (file)
   proname => 'pg_stat_get_io', prorows => '30', proretset => 't',
   provolatile => 'v', proparallel => 'r', prorettype => 'record',
   proargtypes => '',
-  proallargtypes => '{text,text,text,int8,float8,int8,float8,int8,float8,int8,float8,int8,int8,int8,int8,int8,float8,timestamptz}',
-  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
-  proargnames => '{backend_type,object,context,reads,read_time,writes,write_time,writebacks,writeback_time,extends,extend_time,op_bytes,hits,evictions,reuses,fsyncs,fsync_time,stats_reset}',
+  proallargtypes => '{text,text,text,int8,numeric,float8,int8,numeric,float8,int8,float8,int8,numeric,float8,int8,int8,int8,int8,float8,timestamptz}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{backend_type,object,context,reads,read_bytes,read_time,writes,write_bytes,write_time,writebacks,writeback_time,extends,extend_bytes,extend_time,hits,evictions,reuses,fsyncs,fsync_time,stats_reset}',
   prosrc => 'pg_stat_get_io' },
 
 { oid => '8806', descr => 'statistics: backend IO statistics',
   proname => 'pg_stat_get_backend_io', prorows => '5', proretset => 't',
   provolatile => 'v', proparallel => 'r', prorettype => 'record',
   proargtypes => 'int4',
-  proallargtypes => '{int4,text,text,text,int8,float8,int8,float8,int8,float8,int8,float8,int8,int8,int8,int8,int8,float8,timestamptz}',
-  proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
-  proargnames => '{backend_pid,backend_type,object,context,reads,read_time,writes,write_time,writebacks,writeback_time,extends,extend_time,op_bytes,hits,evictions,reuses,fsyncs,fsync_time,stats_reset}',
+  proallargtypes => '{int4,text,text,text,int8,numeric,float8,int8,numeric,float8,int8,float8,int8,numeric,float8,int8,int8,int8,int8,float8,timestamptz}',
+  proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{backend_pid,backend_type,object,context,reads,read_bytes,read_time,writes,write_bytes,write_time,writebacks,writeback_time,extends,extend_bytes,extend_time,hits,evictions,reuses,fsyncs,fsync_time,stats_reset}',
   prosrc => 'pg_stat_get_backend_io' },
 
 { oid => '1136', descr => 'statistics: information about WAL activity',
index 6475889c58c3b54a8bd3ed55bc00a9467ddd97ac..7cd9396aced1d9e976dee772cd692da4347ecf3c 100644 (file)
@@ -343,28 +343,42 @@ typedef enum IOContext
 
 #define IOCONTEXT_NUM_TYPES (IOCONTEXT_VACUUM + 1)
 
+/*
+ * Enumeration of IO operations.
+ *
+ * This enum categorizes IO operations into two groups, depending on if
+ * byte operations are supported.
+ *
+ * Ensure IOOP_EXTEND is the first and IOOP_WRITE is the last ones in the
+ * tracked in bytes group and that the groups stay in that order.
+ */
 typedef enum IOOp
 {
+   /* IOs not tracked in bytes */
    IOOP_EVICT,
-   IOOP_EXTEND,
    IOOP_FSYNC,
    IOOP_HIT,
-   IOOP_READ,
    IOOP_REUSE,
-   IOOP_WRITE,
    IOOP_WRITEBACK,
+
+   /* IOs tracked in bytes */
+   IOOP_EXTEND,
+   IOOP_READ,
+   IOOP_WRITE,
 } IOOp;
 
-#define IOOP_NUM_TYPES (IOOP_WRITEBACK + 1)
+#define IOOP_NUM_TYPES (IOOP_WRITE + 1)
 
 typedef struct PgStat_BktypeIO
 {
+   uint64      bytes[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
    PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
    PgStat_Counter times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
 } PgStat_BktypeIO;
 
 typedef struct PgStat_PendingIO
 {
+   uint64      bytes[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
    PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
    instr_time  pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
 } PgStat_PendingIO;
@@ -604,10 +618,11 @@ extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void);
 extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
                                         BackendType bktype);
 extern void pgstat_count_io_op(IOObject io_object, IOContext io_context,
-                              IOOp io_op, uint32 cnt);
+                              IOOp io_op, uint32 cnt, uint64 bytes);
 extern instr_time pgstat_prepare_io_time(bool track_io_guc);
 extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context,
-                                   IOOp io_op, instr_time start_time, uint32 cnt);
+                                   IOOp io_op, instr_time start_time,
+                                   uint32 cnt, uint64 bytes);
 
 extern PgStat_IO *pgstat_fetch_stat_io(void);
 extern const char *pgstat_get_io_context_name(IOContext io_context);
index 3014d047fefc6488e60fa63d4908ee013cda5093..29580c9071036a96983bb7c25f60f71634ea524b 100644 (file)
@@ -1892,21 +1892,23 @@ pg_stat_io| SELECT backend_type,
     object,
     context,
     reads,
+    read_bytes,
     read_time,
     writes,
+    write_bytes,
     write_time,
     writebacks,
     writeback_time,
     extends,
+    extend_bytes,
     extend_time,
-    op_bytes,
     hits,
     evictions,
     reuses,
     fsyncs,
     fsync_time,
     stats_reset
-   FROM pg_stat_get_io() b(backend_type, object, context, reads, read_time, writes, write_time, writebacks, writeback_time, extends, extend_time, op_bytes, hits, evictions, reuses, fsyncs, fsync_time, stats_reset);
+   FROM pg_stat_get_io() b(backend_type, object, context, reads, read_bytes, read_time, writes, write_bytes, write_time, writebacks, writeback_time, extends, extend_bytes, extend_time, hits, evictions, reuses, fsyncs, fsync_time, stats_reset);
 pg_stat_progress_analyze| SELECT s.pid,
     s.datid,
     d.datname,