Database Size - Table - Column size
Tables
Table size
Returns table size, compression rates, and row and part counts, by table
SELECT
database,
table,
formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed,
formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed,
round(usize / size, 2) AS compr_rate,
sum(rows) AS rows,
count() AS part_count
FROM system.parts
WHERE (active = 1) AND (database LIKE '%') AND (table LIKE '%')
GROUP BY
database,
table
ORDER BY size DESC;
Table size + inner MatView (Atomic)
As above, but resolves Materialized View inner table names (for Materialized Views created using implicit inner table)
SELECT
p.database,
if(t.name = '', p.table, p.table||' ('||t.name||')') tbl,
formatReadableSize(sum(p.data_compressed_bytes) AS size) AS compressed,
formatReadableSize(sum(p.data_uncompressed_bytes) AS usize) AS uncompressed,
round(usize / size, 2) AS compr_rate,
sum(p.rows) AS rows,
count() AS part_count
FROM system.parts p left join system.tables t on p.database = t.database and p.table = '.inner_id.'||toString(t.uuid)
WHERE (active = 1) AND (tbl LIKE '%') AND (database LIKE '%')
GROUP BY
p.database,
tbl
ORDER BY size DESC;
Column size
Returns size, compression rate, row counts, and average row size for each column (by db and table)
SELECT
database,
table,
column,
formatReadableSize(sum(column_data_compressed_bytes) AS size) AS compressed,
formatReadableSize(sum(column_data_uncompressed_bytes) AS usize) AS uncompressed,
round(usize / size, 2) AS compr_ratio,
sum(rows) rows_cnt,
round(usize / rows_cnt, 2) avg_row_size
FROM system.parts_columns
WHERE (active = 1) AND (database LIKE '%') AND (table LIKE '%')
GROUP BY
database,
table,
column
ORDER BY size DESC;
Projections
Projection size
Returns size, compression rate, row counts, and average row size for each projection (“name”), by db and table
SELECT
database,
table,
name,
formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed,
formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed,
round(usize / size, 2) AS compr_rate,
sum(rows) AS rows,
count() AS part_count
FROM system.projection_parts
WHERE (table = 'ptest') AND active
GROUP BY
database,
table,
name
ORDER BY size DESC;
Projection column size
Returns size, compression rate, row counts, and average row size for each projection (“name”), by db and table, and column
SELECT
database,
table,
column,
formatReadableSize(sum(column_data_compressed_bytes) AS size) AS compressed,
formatReadableSize(sum(column_data_uncompressed_bytes) AS usize) AS uncompressed,
round(usize / size, 2) AS compr_rate
FROM system.projection_parts_columns
WHERE (active = 1) AND (table LIKE 'ptest')
GROUP BY
database,
table,
column
ORDER BY size DESC;
Understanding the columns data properties:
For each column in a table, unique value counts, min/max, and top 5 most frequent values
SELECT
count(),
* APPLY (uniq),
* APPLY (max),
* APPLY (min),
* APPLY(topK(5))
FROM table_name
FORMAT Vertical;
-- also you can add * APPLY (entropy) to show entropy (i.e. 'randomness' of the column).
-- if the table is huge add some WHERE condition to slice some 'representative' data range, for example single month / week / day of data.
Understanding the ingest pattern:
For parts which are recently created and are unmerged, returns row, size, and count information by db and table.
- High count, low rows: lots of small parts
- High countif(NOT active) relative to count(): merges are keeping up
- Low countIf(NOT active) relative to count(): merges may be falling behind
- uniqExact(partition): how many partitions are being written to
SELECT
database,
table,
median(rows),
median(bytes_on_disk),
sum(rows),
max(bytes_on_disk),
min(bytes_on_disk),
round(quantile(0.95)(bytes_on_disk), 0),
sum(bytes_on_disk),
count(),
countIf(NOT active),
uniqExact(partition)
FROM system.parts
WHERE (modification_time > (now() - 480)) AND (level = 0)
GROUP BY
database,
table
ORDER BY count() DESC
part_log
For the past day, returns per-second part lifecycle metrics over 30 minute buckets
WITH 30 * 60 AS frame_size
SELECT
toStartOfInterval(event_time, toIntervalSecond(frame_size)) AS m,
database,
table,
ROUND(countIf(event_type = 'NewPart') / frame_size, 2) AS new,
ROUND(countIf(event_type = 'MergeParts') / frame_size, 2) AS merge,
ROUND(countIf(event_type = 'DownloadPart') / frame_size, 2) AS dl,
ROUND(countIf(event_type = 'RemovePart') / frame_size, 2) AS rm,
ROUND(countIf(event_type = 'MutatePart') / frame_size, 2) AS mut,
ROUND(countIf(event_type = 'MovePart') / frame_size, 2) AS mv
FROM system.part_log
WHERE event_time > (now() - toIntervalHour(24))
GROUP BY
m,
database,
table
ORDER BY
database ASC,
table ASC,
m ASC
For the past day, returns per-second insert throughput metrics, by db and table, over 30 minute buckets
WITH 30 * 60 AS frame_size
SELECT
toStartOfInterval(event_time, toIntervalSecond(frame_size)) AS m,
database,
table,
ROUND(countIf(event_type = 'NewPart') / frame_size, 2) AS inserts_per_sec,
ROUND(sumIf(rows, event_type = 'NewPart') / frame_size, 2) AS rows_per_sec,
ROUND(sumIf(size_in_bytes, event_type = 'NewPart') / frame_size, 2) AS bytes_per_sec
FROM system.part_log
WHERE event_time > (now() - toIntervalHour(24))
GROUP BY
m,
database,
table
ORDER BY
database ASC,
table ASC,
m ASC
Understanding the partitioning
Partition distribution analysis, aggregating system.parts metrics by partition. The quantiles results can indicate whether there is skewed distribution of data between partitions.
SELECT
database,
table,
count(),
topK(5)(partition),
COLUMNS('metric.*') APPLY(quantiles(0.005, 0.05, 0.10, 0.25, 0.5, 0.75, 0.9, 0.95, 0.995))
FROM
(
SELECT
database,
table,
partition,
sum(bytes_on_disk) AS metric_bytes,
sum(data_uncompressed_bytes) AS metric_uncompressed_bytes,
sum(rows) AS metric_rows,
sum(primary_key_bytes_in_memory) AS metric_pk_size,
count() AS metric_count,
countIf(part_type = 'Wide') AS metric_wide_count,
countIf(part_type = 'Compact') AS metric_compact_count,
countIf(part_type = 'Memory') AS metric_memory_count
FROM system.parts
GROUP BY
database,
table,
partition
)
GROUP BY
database,
table
FORMAT Vertical
Subcolumns sizes
Returns column-level storage metricsk, including subcolumns (JSON, tuples, maps, etc - if present)
WITH
if(
length(subcolumns.names) > 0,
arrayMap( (sc_n,sc_t,sc_s, sc_bod, sc_dcb, sc_dub) -> tuple(sc_n,sc_t,sc_s, sc_bod, sc_dcb, sc_dub), subcolumns.names, subcolumns.types, subcolumns.serializations, subcolumns.bytes_on_disk, subcolumns.data_compressed_bytes, subcolumns.data_uncompressed_bytes),
[tuple('',type,serialization_kind,column_bytes_on_disk,column_data_compressed_bytes,column_data_uncompressed_bytes)]) as _subcolumns_data,
arrayJoin(_subcolumns_data) as _subcolumn,
_subcolumn.1 as _sc_name,
_subcolumn.2 as _sc_type,
_subcolumn.3 as _sc_serialization,
_subcolumn.4 as _sc_bytes_on_disk,
_subcolumn.5 as _sc_data_compressed_bytes,
_subcolumn.6 as _sc_uncompressed_bytes
SELECT
database || '.' || table as table_,
column as colunm_,
_sc_name as subcolumn_,
any(_sc_type),
formatReadableSize(sum(_sc_data_compressed_bytes) AS size) AS compressed,
formatReadableSize(sum(_sc_uncompressed_bytes) AS usize) AS uncompressed,
round(usize / size, 2) AS compr_ratio,
sum(rows) AS rows_cnt,
round(usize / rows_cnt, 2) AS avg_row_size
FROM system.parts_columns
WHERE (active = 1) AND (database LIKE '%') AND (`table` LIKE '%')
GROUP BY
table_,
colunm_,
subcolumn_
ORDER BY size DESC ;