1 - Check table metadata in zookeeper

Check table metadata in zookeeper.

Compare table metadata of different replicas in zookeeper

Metadata on replica is not up to date with common metadata in Zookeeper

SELECT *, if( neighbor(name, -1) == name and name != 'is_active', neighbor(value, -1) == value , 1) as looks_good
FROM (
SELECT
    name,
    path,
    ctime,
    mtime,
    value
FROM system.zookeeper
WHERE (path IN (
    SELECT arrayJoin(groupUniqArray(if(path LIKE '%/replicas', concat(path, '/', name), path)))
    FROM system.zookeeper
    WHERE path IN (
        SELECT arrayJoin([zookeeper_path, concat(zookeeper_path, '/replicas')])
        FROM system.replicas
        WHERE table = 'test_repl'
    )
)) AND (name IN ('metadata', 'columns', 'is_active'))
ORDER BY
    name = 'is_active',
    name ASC,
    path ASC
)

vs.

SELECT metadata_modification_time, create_table_query FROM system.tables WHERE name = 'test_repl'

2 - Compare query_log for 2 intervals

WITH 
    toStartOfInterval(event_time, INTERVAL 5 MINUTE) = '2023-06-30 13:00:00' as before,
    toStartOfInterval(event_time, INTERVAL 5 MINUTE) = '2023-06-30 15:00:00' as after
SELECT
    normalized_query_hash,
    anyIf(query, before) AS QueryBefore,
    anyIf(query, after) AS QueryAfter,
    countIf(before) as CountBefore,
    sumIf(query_duration_ms, before) / 1000 AS QueriesDurationBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'RealTimeMicroseconds')], before) / 1000000 AS RealTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'UserTimeMicroseconds')], before) / 1000000 AS UserTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SystemTimeMicroseconds')], before) / 1000000 AS SystemTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskReadElapsedMicroseconds')], before) / 1000000 AS DiskReadTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskWriteElapsedMicroseconds')], before) / 1000000 AS DiskWriteTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkSendElapsedMicroseconds')], before) / 1000000 AS NetworkSendTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkReceiveElapsedMicroseconds')], before) / 1000000 AS NetworkReceiveTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'ZooKeeperWaitMicroseconds')], before) / 1000000 AS ZooKeeperWaitTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSIOWaitMicroseconds')], before) / 1000000 AS OSIOWaitTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUWaitMicroseconds')], before) / 1000000 AS OSCPUWaitTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUVirtualTimeMicroseconds')], before) / 1000000 AS OSCPUVirtualTimeBefore,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SelectedBytes')], before)  AS SelectedBytesBefore, 
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SelectedRanges')], before)  AS SelectedRangesBefore,
    sumIf(read_rows, before) AS ReadRowsBefore,
    formatReadableSize(sumIf(read_bytes, before) AS ReadBytesBefore),
    sumIf(written_rows, before) AS WrittenTowsBefore,
    formatReadableSize(sumIf(written_bytes, before)) AS WrittenBytesBefore,
    sumIf(result_rows, before) AS ResultRowsBefore,
    formatReadableSize(sumIf(result_bytes, before)) AS ResultBytesBefore,

    countIf(after) as CountAfter,
    sumIf(query_duration_ms, after) / 1000 AS QueriesDurationAfter,
   sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'RealTimeMicroseconds')], after) / 1000000 AS RealTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'UserTimeMicroseconds')], after) / 1000000 AS UserTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SystemTimeMicroseconds')], after) / 1000000 AS SystemTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskReadElapsedMicroseconds')], after) / 1000000 AS DiskReadTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskWriteElapsedMicroseconds')], after) / 1000000 AS DiskWriteTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkSendElapsedMicroseconds')], after) / 1000000 AS NetworkSendTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkReceiveElapsedMicroseconds')], after) / 1000000 AS NetworkReceiveTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'ZooKeeperWaitMicroseconds')], after) / 1000000 AS ZooKeeperWaitTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSIOWaitMicroseconds')], after) / 1000000 AS OSIOWaitTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUWaitMicroseconds')], after) / 1000000 AS OSCPUWaitTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUVirtualTimeMicroseconds')], after) / 1000000 AS OSCPUVirtualTimeAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SelectedBytes')], after)  AS SelectedBytesAfter,
    sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SelectedRanges')], after)  AS SelectedRangesAfter,

    sumIf(read_rows, after) AS ReadRowsAfter,
    formatReadableSize(sumIf(read_bytes, after) AS ReadBytesAfter),
    sumIf(written_rows, after) AS WrittenTowsAfter,
    formatReadableSize(sumIf(written_bytes, after)) AS WrittenBytesAfter,
    sumIf(result_rows, after) AS ResultRowsAfter,
    formatReadableSize(sumIf(result_bytes, after)) AS ResultBytesAfter

FROM system.query_log
WHERE (before OR after) AND type in (2,4) -- QueryFinish, ExceptionWhileProcessing
GROUP BY normalized_query_hash
    WITH TOTALS
ORDER BY SelectedRangesAfter- SelectedRangesBefore DESC
LIMIT 10
FORMAT Vertical
WITH 
    toDateTime('2024-02-09 00:00:00') as timestamp_of_issue,
    event_time < timestamp_of_issue as before,
    event_time >= timestamp_of_issue as after
select
    normalized_query_hash as h,
    any(query) as query_sample,
    round(quantileIf(0.9)(query_duration_ms, before)) as duration_q90_before,
    round(quantileIf(0.9)(query_duration_ms, after))  as duration_q90_after,
    countIf(before) as cnt_before,
    countIf(after) as cnt_after,
    sumIf(query_duration_ms,before) as duration_sum_before,
    sumIf(query_duration_ms,after) as duration_sum_after,
    sumIf(ProfileEvents['UserTimeMicroseconds'], before) as usertime_sum_before,
    sumIf(ProfileEvents['UserTimeMicroseconds'], after) as usertime_sum_after,
    sumIf(read_bytes,before) as sum_read_bytes_before,
    sumIf(read_bytes,after) as sum_read_bytes_after
from system.query_log
where event_time between timestamp_of_issue - INTERVAL 3 DAY and timestamp_of_issue + INTERVAL 3 DAY
group by h
HAVING cnt_after > 1.1 * cnt_before OR sum_read_bytes_after > 1.2 * sum_read_bytes_before OR usertime_sum_after > 1.2 * usertime_sum_before
ORDER BY sum_read_bytes_after - sum_read_bytes_before 
FORMAT Vertical

3 - Debug hanging thing

Debug hanging / freezing things

Debug hanging / freezing things

If ClickHouse® is busy with something and you don’t know what’s happening, you can easily check the stacktraces of all the thread which are working

SELECT
 arrayStringConcat(arrayMap(x -> demangle(addressToSymbol(x)), trace), '\n') AS trace_functions,
 count()
FROM system.stack_trace
GROUP BY trace_functions
ORDER BY count()
DESC
SETTINGS allow_introspection_functions=1
FORMAT Vertical;

If you can’t start any queries, but you have access to the node, you can sent a signal

# older versions
for i in $(ls -1 /proc/$(pidof clickhouse-server)/task/); do kill -TSTP $i; done
# even older versions
for i in $(ls -1 /proc/$(pidof clickhouse-server)/task/); do kill -SIGPROF $i; done

4 - Handy queries for system.query_log

Handy queries for a system.query_log.

The most cpu / write / read-intensive queries from query_log

SELECT
    normalized_query_hash,
    any(query),
    count(),
    sum(query_duration_ms) / 1000 AS QueriesDuration,
    sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'RealTimeMicroseconds')]) / 1000000 AS RealTime,
    sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'UserTimeMicroseconds')]) / 1000000 AS UserTime,
    sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SystemTimeMicroseconds')]) / 1000000 AS SystemTime,
    sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskReadElapsedMicroseconds')]) / 1000000 AS DiskReadTime,
    sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskWriteElapsedMicroseconds')]) / 1000000 AS DiskWriteTime,
    sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkSendElapsedMicroseconds')]) / 1000000 AS NetworkSendTime,
    sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkReceiveElapsedMicroseconds')]) / 1000000 AS NetworkReceiveTime,
    sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'ZooKeeperWaitMicroseconds')]) / 1000000 AS ZooKeeperWaitTime,
    sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSIOWaitMicroseconds')]) / 1000000 AS OSIOWaitTime,
    sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUWaitMicroseconds')]) / 1000000 AS OSCPUWaitTime,
    sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUVirtualTimeMicroseconds')]) / 1000000 AS OSCPUVirtualTime,
    sum(read_rows) AS ReadRows,
    formatReadableSize(sum(read_bytes)) AS ReadBytes,
    sum(written_rows) AS WrittenTows,
    formatReadableSize(sum(written_bytes)) AS WrittenBytes,
    sum(result_rows) AS ResultRows,
    formatReadableSize(sum(result_bytes)) AS ResultBytes
FROM system.query_log
WHERE (event_date >= today()) AND (event_time > (now() - 3600)) AND type in (2,4) -- QueryFinish, ExceptionWhileProcessing
GROUP BY normalized_query_hash
    WITH TOTALS
ORDER BY UserTime DESC
LIMIT 30
FORMAT Vertical

– modern ClickHouse®

SELECT 
    hostName() as host,
    normalized_query_hash,
    min(event_time),
    max(event_time),
    replace(substr(argMax(query, utime), 1, 80), '\n', ' ') AS query,
    argMax(query_id, utime) AS sample_query_id,
    count(),
    sum(query_duration_ms) / 1000 AS QueriesDuration, /* wall clock */
    sum(ProfileEvents['RealTimeMicroseconds']) / 1000000 AS RealTime,  /* same as above but x number of thread */
    sum(ProfileEvents['UserTimeMicroseconds'] as utime) / 1000000 AS UserTime,  /* time when our query was doin some cpu-insense work, creating cpu load */
    sum(ProfileEvents['SystemTimeMicroseconds']) / 1000000 AS SystemTime, /* time spend on waiting for some system operations */
    sum(ProfileEvents['DiskReadElapsedMicroseconds']) / 1000000 AS DiskReadTime,
    sum(ProfileEvents['DiskWriteElapsedMicroseconds']) / 1000000 AS DiskWriteTime,
    sum(ProfileEvents['NetworkSendElapsedMicroseconds']) / 1000000 AS NetworkSendTime, /* check the other side of the network! */
    sum(ProfileEvents['NetworkReceiveElapsedMicroseconds']) / 1000000 AS NetworkReceiveTime, /* check the other side of the network! */
    sum(ProfileEvents['ZooKeeperWaitMicroseconds']) / 1000000 AS ZooKeeperWaitTime,
    sum(ProfileEvents['OSIOWaitMicroseconds']) / 1000000 AS OSIOWaitTime, /* IO waits, usually disks - that metric is 'orthogonal' to other */ 
    sum(ProfileEvents['OSCPUWaitMicroseconds']) / 1000000 AS OSCPUWaitTime, /* waiting for a 'free' CPU - usually high when the other load on the server creates a lot of contention for cpu */ 
    sum(ProfileEvents['OSCPUVirtualTimeMicroseconds']) / 1000000 AS OSCPUVirtualTime, /* similar to usertime + system time */
    formatReadableSize(sum(ProfileEvents['NetworkReceiveBytes']) as network_receive_bytes) AS NetworkReceiveBytes,
    formatReadableSize(sum(ProfileEvents['NetworkSendBytes']) as network_send_bytes) AS NetworkSendBytes,
    sum(ProfileEvents['SelectedParts']) as SelectedParts,
    sum(ProfileEvents['SelectedRanges']) as SelectedRanges,
    sum(ProfileEvents['SelectedMarks']) as SelectedMarks,
    sum(ProfileEvents['SelectedRows']) as SelectedRows,  /* those may different from read_rows - here the number or rows potentially matching the where conditions, not neccessary all will be read */
    sum(ProfileEvents['SelectedBytes']) as SelectedBytes,
    sum(ProfileEvents['FileOpen']) as FileOpen,
    sum(ProfileEvents['ZooKeeperTransactions']) as ZooKeeperTransactions,
    formatReadableSize(sum(ProfileEvents['OSReadBytes'] ) as os_read_bytes ) as OSReadBytesExcludePageCache,
    formatReadableSize(sum(ProfileEvents['OSWriteBytes'] ) as os_write_bytes ) as OSWriteBytesExcludePageCache,
    formatReadableSize(sum(ProfileEvents['OSReadChars'] ) as os_read_chars ) as OSReadBytesIncludePageCache,
    formatReadableSize(sum(ProfileEvents['OSWriteChars'] ) as os_write_chars ) as OSWriteCharsIncludePageCache,
    formatReadableSize(quantile(0.97)(memory_usage) as memory_usage_q97) as MemoryUsageQ97 ,
    sum(read_rows) AS ReadRows,
    formatReadableSize(sum(read_bytes) as read_bytes_sum) AS ReadBytes,
    sum(written_rows) AS WrittenRows,
    formatReadableSize(sum(written_bytes) as written_bytes_sum) AS WrittenBytes, /* */
    sum(result_rows) AS ResultRows,
    formatReadableSize(sum(result_bytes) as result_bytes_sum) AS ResultBytes
FROM clusterAllReplicas('{cluster}', system.query_log)
WHERE event_date >= today() AND type in (2,4)-- QueryFinish, ExceptionWhileProcessing
GROUP BY
    GROUPING SETS (
        (normalized_query_hash, host),
        (host),
        ())
ORDER BY OSCPUVirtualTime DESC
LIMIT 30
FORMAT Vertical;

Find queries which were started but not finished at some moment in time

SELECT
  query_id,
  min(event_time) t,
  any(query)
FROM system.query_log
where event_date = today() and event_time > '2021-11-25 02:29:12'
GROUP BY query_id
HAVING countIf(type='QueryFinish') = 0 OR sum(query_duration_ms) > 100000
order by t;

select
     query_id,
     any(query)
from system.query_log
where event_time between '2021-09-24 07:00:00' and '2021-09-24 09:00:00'
group by query_id HAVING countIf(type=1) <> countIf(type!=1)

5 - Ingestion metrics from system.part_log

Query to gather information about ingestion rate from system.part_log.
-- Insert rate
select database, table, time_bucket,
       max(number_of_parts_per_insert) max_parts_pi,
       median(number_of_parts_per_insert) median_parts_pi,
       min(min_rows_per_part) min_rows_pp, 
       max(max_rows_per_part) max_rows_pp, 
       median(median_rows_per_part) median_rows_pp,
       min(rows_per_insert) min_rows_pi, 
       median(rows_per_insert) median_rows_pi, 
       max(rows_per_insert) max_rows_pi, 
       sum(rows_per_insert) rows_inserted,
       sum(seconds_per_insert) parts_creation_seconds,
       count() inserts,
       sum(number_of_parts_per_insert) new_parts,
       max(last_part_pi) - min(first_part_pi) as insert_period,
       inserts*60/insert_period as inserts_per_minute
from
(SELECT 
	database, 
	table,
	toStartOfDay(event_time) AS time_bucket, 
	count() AS number_of_parts_per_insert,
	min(rows) AS min_rows_per_part,
	max(rows) AS max_rows_per_part, 
	median(rows) AS median_rows_per_part,
	sum(rows) AS rows_per_insert, 
	min(size_in_bytes) AS min_bytes_per_part, 
	max(size_in_bytes) AS max_bytes_per_part,
	median(size_in_bytes) AS median_bytes_per_part, 
	sum(size_in_bytes) AS bytes_per_insert, 
	median_bytes_per_part / median_rows_per_part AS avg_row_size,
	sum(duration_ms)/1000 as seconds_per_insert,
	max(event_time) as last_part_pi,  min(event_time) as first_part_pi
FROM 
	system.part_log
WHERE 
  -- Enum8('NewPart' = 1, 'MergeParts' = 2, 'DownloadPart' = 3, 'RemovePart' = 4, 'MutatePart' = 5, 'MovePart' = 6)
	event_type = 1 
	AND 
  -- change if another time period is desired
	event_date >= today()
GROUP BY query_id, database, table, time_bucket
)
GROUP BY database, table, time_bucket
ORDER BY time_bucket, database, table ASC

-- New parts per partition 
select database, table, event_type, partition_id, count() c, round(avg(rows)) 
from system.part_log where event_date >= today() and event_type = 'NewPart'
group by database, table, event_type, partition_id
order by c desc

6 - Remove block numbers from zookeeper for removed partitions

Remove block numbers from zookeeper for removed partitions

SELECT distinct concat('delete ', zk.block_numbers_path, zk.partition_id) FROM
(
    SELECT r.database, r.table, zk.block_numbers_path, zk.partition_id, p.partition_id
    FROM 
    (
        SELECT path as block_numbers_path, name as partition_id
        FROM system.zookeeper
        WHERE path IN (
            SELECT concat(zookeeper_path, '/block_numbers/') as block_numbers_path
            FROM clusterAllReplicas('{cluster}',system.replicas)
        )
    ) as zk 
    LEFT JOIN 
    (
            SELECT database, table, concat(zookeeper_path, '/block_numbers/') as block_numbers_path
            FROM clusterAllReplicas('{cluster}',system.replicas)
    )
    as r ON (r.block_numbers_path = zk.block_numbers_path) 
    LEFT JOIN 
    (
        SELECT DISTINCT partition_id, database, table
        FROM clusterAllReplicas('{cluster}',system.parts)
    )
    as p ON (p.partition_id = zk.partition_id AND p.database = r.database AND p.table = r.table)
    WHERE p.partition_id = ''  AND zk.partition_id <> 'all'
    ORDER BY r.database, r.table, zk.block_numbers_path, zk.partition_id, p.partition_id
) t
FORMAT TSVRaw;

After 24.3

WITH 
  now() - INTERVAL 120 DAY as retain_old_partitions,
  replicas AS (SELECT DISTINCT database, table, zookeeper_path || '/block_numbers' AS block_numbers_path FROM system.replicas),
  zk_data AS (SELECT DISTINCT name as partition_id, path as block_numbers_path FROM system.zookeeper WHERE path IN (SELECT block_numbers_path FROM replicas) AND mtime < retain_old_partitions AND partition_id <> 'all'),
  zk_partitions AS (SELECT DISTINCT database, table, partition_id FROM replicas JOIN zk_data USING block_numbers_path),
  partitions AS (SELECT DISTINCT database, table, partition_id FROM system.parts)
SELECT 
  format('ALTER TABLE `{}`.`{}` {};',database, table, arrayStringConcat( arraySort(groupArray('FORGET PARTITION ID \'' || partition_id || '\'')), ', ')) AS query
FROM zk_partitions
WHERE (database, table, partition_id) NOT IN (SELECT * FROM partitions) 
GROUP BY database, table
ORDER BY database, table
FORMAT TSVRaw;

After fixing https://github.com/ClickHouse/ClickHouse/issues/72807

WITH 
  now() - INTERVAL 120 DAY as retain_old_partitions,
  replicas AS (SELECT DISTINCT database, table, zookeeper_path || '/block_numbers' AS block_numbers_path FROM clusterAllReplicas('{cluster}',system.replicas)),
  zk_data AS (SELECT DISTINCT name as partition_id, path as block_numbers_path FROM system.zookeeper WHERE path IN (SELECT block_numbers_path FROM replicas) AND mtime < retain_old_partitions AND partition_id <> 'all'),
  zk_partitions AS (SELECT DISTINCT database, table, partition_id FROM replicas JOIN zk_data USING block_numbers_path),
  partitions AS (SELECT DISTINCT database, table, partition_id FROM clusterAllReplicas('{cluster}',system.parts))
SELECT 
  format('ALTER TABLE `{}`.`{}` ON CLUSTER \'{{cluster}}\' {};',database, table, arrayStringConcat( arraySort(groupArray('FORGET PARTITION ID \'' || partition_id || '\'')), ', ')) AS query
FROM zk_partitions
WHERE (database, table, partition_id) NOT IN (SELECT * FROM partitions) 
GROUP BY database, table
ORDER BY database, table
FORMAT TSVRaw;

7 - Removing tasks in the replication queue related to empty partitions

Removing tasks in the replication queue related to empty partitions
SELECT 'ALTER TABLE ' || database || '.' || table || ' DROP PARTITION ID \''|| partition_id || '\';'  FROM 
(SELECT DISTINCT database, table, extract(new_part_name, '^[^_]+')  as partition_id FROM clusterAllReplicas('{cluster}', system.replication_queue) ) as rq
LEFT JOIN 
(SELECT database, table, partition_id, sum(rows) as rows_count, count() as part_count 
FROM clusterAllReplicas('{cluster}', system.parts)
WHERE active GROUP BY database, table, partition_id
)  as p
USING (database, table, partition_id)
WHERE p.rows_count = 0 AND p.part_count = 0
FORMAT TSVRaw;

8 - Can detached parts be dropped?

Can detached parts be dropped?

Here is what different statuses mean:

  1. Parts are renamed to ‘ignored’ if they were found during ATTACH together with other, bigger parts that cover the same blocks of data, i.e. they were already merged into something else.
  2. Parts are renamed to ‘broken’ if ClickHouse® was not able to load data from the parts. There could be different reasons: some files are lost, checksums are not correct, etc.
  3. Parts are renamed to ‘unexpected’ if they are present locally, but are not found in ZooKeeper, in case when an insert was not completed properly. The part is detached only if it’s old enough (5 minutes), otherwise CH registers this part in ZooKeeper as a new part.
  4. Parts are renamed to ‘cloned’ if ClickHouse have had some parts on local disk while repairing lost replica so already existed parts being renamed and put in detached directory. Controlled by setting detach_old_local_parts_when_cloning_replica.

‘Ignored’ parts are safe to delete. ‘Unexpected’ and ‘broken’ should be investigated, but it might not be an easy thing to do, especially for older parts. If the system.part_log table is enabled you can find some information there. Otherwise you will need to look in clickhouse-server.log for what happened when the parts were detached. If there is another way you could confirm that there is no data loss in the affected tables, you could simply delete all detached parts.

It is important to monitor for detached parts and act quickly when they appear. If clickhouse-server.log is lost it might be impossible to figure out what happened and why the parts were detached. You can use system.asynchronous_metrics or system.detached_parts for monitoring.

select metric from system.asynchronous_metrics where metric ilike '%detach%'

NumberOfDetachedByUserParts
NumberOfDetachedParts

Here is a quick way to find out if you have detached parts along with the reason why.

SELECT database, table, reason, count()
FROM system.detached_parts
GROUP BY database, table, reason
ORDER BY database ASC, table ASC, reason ASC

Here is a query that can help with investigations. It looks for active parts containing the same data blocks as the detached parts. It generates commands to drop the detached parts.

with ['broken','unexpected','noquorum','ignored','broken-on-start','clone','attaching','deleting','tmp-fetch',
      'covered-by-broken','merge-not-byte-identical','mutate-not-byte-identical','broken-from-backup'] as DETACH_REASONS
select a.*,
  concat('alter table ',database,'.',table,' drop detached part ''',a.name,''' settings allow_drop_detached=1;') as drop,
  concat('sudo rm -r ',a.path) as rm
from (select * replace(part[1] as partition_id, toInt64(part[2]) as min_block_number, toInt64(part[3]) as max_block_number),
  arrayFilter(x -> x not in DETACH_REASONS, splitByChar('_',name)) as part
from system.detached_parts) a
left join (select database, table, partition_id, name, active, min_block_number, max_block_number from system.parts where active) b 
on a.database=b.database and a.table=b.table and a.partition_id=b.partition_id
where a.min_block_number >= b.min_block_number
  and a.max_block_number <= b.max_block_number
order by table, min_block_number, max_block_number

Other reasons

# rg forgetPartAndMoveToDetached --type cpp
# rg renameToDetached --type cpp
# rg makeCloneInDetached --type cpp
broken
unexpected
ignored
noquorum
merge-not-byte-identical
mutate-not-byte-identical - 
broken-on-start
clone
covered-by-broken  - that means that ClickHouse during initialization of replicated table detected that some part is not ok, and decided to refetch it from healthy replicas. So the part itself will be detached as 'broken' and if that part was a result of merge / mutation all the previuos generations of that will be marked as covered-by-broken. If clickhouse was able to download the final part you don't need those covered-by-broken.

The list of DETACH_REASONS: https://github.com/ClickHouse/ClickHouse/blob/master/src/Storages/MergeTree/MergeTreePartInfo.h#L163

9 - Database Size - Table - Column size

Database Size - Table - Column size

Tables

Table size

SELECT
    database,
    table,
    formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed,
    formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed,
    round(usize / size, 2) AS compr_rate,
    sum(rows) AS rows,
    count() AS part_count
FROM system.parts
WHERE (active = 1) AND (database LIKE '%') AND (table LIKE '%')
GROUP BY
    database,
    table
ORDER BY size DESC;

Table size + inner MatView (Atomic)

SELECT
      p.database,
      if(t.name = '', p.table, p.table||' ('||t.name||')') tbl,
      formatReadableSize(sum(p.data_compressed_bytes) AS size) AS compressed,
      formatReadableSize(sum(p.data_uncompressed_bytes) AS usize) AS uncompressed,
      round(usize / size, 2) AS compr_rate,
      sum(p.rows) AS rows,
      count() AS part_count
FROM system.parts p left join system.tables t on p.database = t.database and p.table = '.inner_id.'||toString(t.uuid)
WHERE (active = 1) AND (tbl LIKE '%') AND (database LIKE '%')
GROUP BY
    p.database,
    tbl
ORDER BY size DESC;

Column size

SELECT
    database,
    table,
    column,
    formatReadableSize(sum(column_data_compressed_bytes) AS size) AS compressed,
    formatReadableSize(sum(column_data_uncompressed_bytes) AS usize) AS uncompressed,
    round(usize / size, 2) AS compr_ratio,
    sum(rows) rows_cnt,
    round(usize / rows_cnt, 2) avg_row_size
FROM system.parts_columns
WHERE (active = 1) AND (database LIKE '%') AND (table LIKE '%')
GROUP BY
    database,
    table,
    column
ORDER BY size DESC;

Projections

Projection size

SELECT
    database,
    table,
    name,
    formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed,
    formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed,
    round(usize / size, 2) AS compr_rate,
    sum(rows) AS rows,
    count() AS part_count
FROM system.projection_parts
WHERE (table = 'ptest') AND active
GROUP BY
    database,
    table,
    name
ORDER BY size DESC;

Projection column size

SELECT
    database,
    table,
    column,
    formatReadableSize(sum(column_data_compressed_bytes) AS size) AS compressed,
    formatReadableSize(sum(column_data_uncompressed_bytes) AS usize) AS uncompressed,
    round(usize / size, 2) AS compr_rate
FROM system.projection_parts_columns
WHERE (active = 1) AND (table LIKE 'ptest')
GROUP BY
    database,
    table,
    column
ORDER BY size DESC;

Understanding the columns data properties:

SELECT
   count(),
   * APPLY (uniq),
   * APPLY (max),
   * APPLY (min),
   * APPLY(topK(5))
FROM table_name 
FORMAT Vertical;

-- also you can add * APPLY (entropy) to show entropy (i.e. 'randomness' of the column).
-- if the table is huge add some WHERE condition to slice some 'representative' data range, for example single month / week / day of data.

Understanding the ingest pattern:

SELECT
    database,
    table,
    median(rows),
    median(bytes_on_disk),
    sum(rows),
    max(bytes_on_disk),
    min(bytes_on_disk),
    round(quantile(0.95)(bytes_on_disk), 0),
    sum(bytes_on_disk),
    count(),
    countIf(NOT active),
    uniqExact(partition)
FROM system.parts
WHERE (modification_time > (now() - 480)) AND (level = 0)
GROUP BY
    database,
    table
ORDER BY count() DESC

part_log

WITH 30 * 60 AS frame_size
SELECT
    toStartOfInterval(event_time, toIntervalSecond(frame_size)) AS m,
    database,
    table,
    ROUND(countIf(event_type = 'NewPart') / frame_size, 2) AS new,
    ROUND(countIf(event_type = 'MergeParts') / frame_size, 2) AS merge,
    ROUND(countIf(event_type = 'DownloadPart') / frame_size, 2) AS dl,
    ROUND(countIf(event_type = 'RemovePart') / frame_size, 2) AS rm,
    ROUND(countIf(event_type = 'MutatePart') / frame_size, 2) AS mut,
    ROUND(countIf(event_type = 'MovePart') / frame_size, 2) AS mv
FROM system.part_log
WHERE event_time > (now() - toIntervalHour(24))
GROUP BY
    m,
    database,
    table
ORDER BY
    database ASC,
    table ASC,
    m ASC
    
WITH 30 * 60 AS frame_size
SELECT
    toStartOfInterval(event_time, toIntervalSecond(frame_size)) AS m,
    database,
    table,
    ROUND(countIf(event_type = 'NewPart') / frame_size, 2) AS inserts_per_sec,
    ROUND(sumIf(rows, event_type = 'NewPart') / frame_size, 2) AS rows_per_sec,
    ROUND(sumIf(size_in_bytes, event_type = 'NewPart') / frame_size, 2) AS bytes_per_sec
FROM system.part_log
WHERE event_time > (now() - toIntervalHour(24))
GROUP BY
    m,
    database,
    table
ORDER BY
    database ASC,
    table ASC,
    m ASC

Understanding the partitioning

SELECT
    database,
    table,
    count(),
    topK(5)(partition),
    COLUMNS('metric.*') APPLY(quantiles(0.005, 0.05, 0.10, 0.25, 0.5, 0.75, 0.9, 0.95, 0.995))
FROM
(
    SELECT
        database,
        table,
        partition,
        sum(bytes_on_disk) AS metric_bytes,
        sum(data_uncompressed_bytes) AS metric_uncompressed_bytes,
        sum(rows) AS metric_rows,
        sum(primary_key_bytes_in_memory) AS metric_pk_size,
        count() AS metric_count,
        countIf(part_type = 'Wide') AS metric_wide_count,
        countIf(part_type = 'Compact') AS metric_compact_count,
        countIf(part_type = 'Memory') AS metric_memory_count
    FROM system.parts
    GROUP BY
        database,
        table,
        partition
)
GROUP BY
    database,
    table
FORMAT Vertical

10 - Number of active parts in a partition

Number of active parts in a partition

Q: Why do I have several active parts in a partition? Why ClickHouse® does not merge them immediately?

A: CH does not merge parts by time

Merge scheduler selects parts by own algorithm based on the current node workload / number of parts / size of parts.

CH merge scheduler balances between a big number of parts and a wasting resources on merges.

Merges are CPU/DISK IO expensive. If CH will merge every new part then all resources will be spend on merges and will no resources remain on queries (selects ).

CH will not merge parts with a combined size greater than 150 GB max_bytes_to_merge_at_max_space_in_pool.

SELECT
    database,
    table,
    partition,
    sum(rows) AS rows,
    count() AS part_count
FROM system.parts
WHERE (active = 1) AND (table LIKE '%') AND (database LIKE '%')
GROUP BY
    database,
    table,
    partition
ORDER BY part_count DESC
limit 20

11 - Parts consistency

Check if there are blocks missing

SELECT
    database,
    table,
    partition_id,
    ranges.1 AS previous_part,
    ranges.2 AS next_part,
    ranges.3 AS previous_block_number,
    ranges.4 AS next_block_number,
    range(toUInt64(previous_block_number + 1), toUInt64(next_block_number)) AS missing_block_numbers
FROM
(
    WITH
        arrayPopFront(groupArray(min_block_number) AS min) AS min_adj,
        arrayPopBack(groupArray(max_block_number) AS max) AS max_adj,
        arrayFilter((x, y, z) -> (y != (z + 1)), arrayZip(arrayPopBack(groupArray(name) AS name_arr), arrayPopFront(name_arr), max_adj, min_adj), min_adj, max_adj) AS missing_ranges
    SELECT
        database,
        table,
        partition_id,
        missing_ranges
    FROM
    (
        SELECT *
        FROM system.parts
        WHERE active AND (table = 'query_thread_log') AND (partition_id = '202108') AND active
        ORDER BY min_block_number ASC
    )
    GROUP BY
        database,
        table,
        partition_id
)
ARRAY JOIN missing_ranges AS ranges

┌─database─┬─table────────────┬─partition_id─┬─previous_part───────┬─next_part──────────┬─previous_block_number─┬─next_block_number─┬─missing_block_numbers─┐
 system    query_thread_log  202108        202108_864_1637_556  202108_1639_1639_0                   1637               1639  [1638]                
└──────────┴──────────────────┴──────────────┴─────────────────────┴────────────────────┴───────────────────────┴───────────────────┴───────────────────────┘

Find the number of blocks in a table

SELECT
    database,
    table,
    partition_id,
    sum(max_block_number - min_block_number) AS blocks_count
FROM system.parts
WHERE active AND (table = 'query_thread_log') AND (partition_id = '202108') AND active
GROUP BY
    database,
    table,
    partition_id

┌─database─┬─table────────────┬─partition_id─┬─blocks_count─┐
 system    query_thread_log  202108                1635 
└──────────┴──────────────────┴──────────────┴──────────────┘

Compare the list of parts in ZooKeeper with the list of parts on disk

select zoo.p_path as part_zoo, zoo.ctime, zoo.mtime, disk.p_path as part_disk
from
(
  select concat(path,'/',name) as p_path, ctime, mtime
  from system.zookeeper where path in (select concat(replica_path,'/parts') from system.replicas)
) zoo
left join 
(
  select concat(replica_path,'/parts/',name) as p_path
  from system.parts inner join system.replicas using (database, table)
) disk on zoo.p_path = disk.p_path
where part_disk='' and zoo.mtime <= now() - interval 1 hour
order by part_zoo;

You can clean that orphan zk records (need to execute using delete in zkCli, rm in zk-shell):

select 'delete '||part_zoo
from (
select zoo.p_path as part_zoo, zoo.ctime, zoo.mtime, disk.p_path as part_disk
from
(
  select concat(path,'/',name) as p_path, ctime, mtime
  from system.zookeeper where path in (select concat(replica_path,'/parts') from system.replicas)
) zoo
left join 
(
  select concat(replica_path,'/parts/',name) as p_path
  from system.parts inner join system.replicas using (database, table)
) disk on zoo.p_path = disk.p_path
where part_disk='' and zoo.mtime <= now() - interval 1 day
order by part_zoo) format TSVRaw;