This is the multi-page printable view of this section. Click here to print.
Useful queries
- 1: Check table metadata in zookeeper
- 2: Compare query_log for 2 intervals
- 3: Debug hanging thing
- 4: Handy queries for system.query_log
- 5: Ingestion metrics from system.part_log
- 6: Remove block numbers from zookeeper for removed partitions
- 7: Removing tasks in the replication queue related to empty partitions
- 8: Can detached parts be dropped?
- 9: Database Size - Table - Column size
- 10: Number of active parts in a partition
- 11: Parts consistency
1 - Check table metadata in zookeeper
Compare table metadata of different replicas in zookeeper
Metadata on replica is not up to date with common metadata in Zookeeper
SELECT *, if( neighbor(name, -1) == name and name != 'is_active', neighbor(value, -1) == value , 1) as looks_good
FROM (
SELECT
name,
path,
ctime,
mtime,
value
FROM system.zookeeper
WHERE (path IN (
SELECT arrayJoin(groupUniqArray(if(path LIKE '%/replicas', concat(path, '/', name), path)))
FROM system.zookeeper
WHERE path IN (
SELECT arrayJoin([zookeeper_path, concat(zookeeper_path, '/replicas')])
FROM system.replicas
WHERE table = 'test_repl'
)
)) AND (name IN ('metadata', 'columns', 'is_active'))
ORDER BY
name = 'is_active',
name ASC,
path ASC
)
vs.
SELECT metadata_modification_time, create_table_query FROM system.tables WHERE name = 'test_repl'
2 - Compare query_log for 2 intervals
WITH
toStartOfInterval(event_time, INTERVAL 5 MINUTE) = '2023-06-30 13:00:00' as before,
toStartOfInterval(event_time, INTERVAL 5 MINUTE) = '2023-06-30 15:00:00' as after
SELECT
normalized_query_hash,
anyIf(query, before) AS QueryBefore,
anyIf(query, after) AS QueryAfter,
countIf(before) as CountBefore,
sumIf(query_duration_ms, before) / 1000 AS QueriesDurationBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'RealTimeMicroseconds')], before) / 1000000 AS RealTimeBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'UserTimeMicroseconds')], before) / 1000000 AS UserTimeBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SystemTimeMicroseconds')], before) / 1000000 AS SystemTimeBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskReadElapsedMicroseconds')], before) / 1000000 AS DiskReadTimeBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskWriteElapsedMicroseconds')], before) / 1000000 AS DiskWriteTimeBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkSendElapsedMicroseconds')], before) / 1000000 AS NetworkSendTimeBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkReceiveElapsedMicroseconds')], before) / 1000000 AS NetworkReceiveTimeBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'ZooKeeperWaitMicroseconds')], before) / 1000000 AS ZooKeeperWaitTimeBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSIOWaitMicroseconds')], before) / 1000000 AS OSIOWaitTimeBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUWaitMicroseconds')], before) / 1000000 AS OSCPUWaitTimeBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUVirtualTimeMicroseconds')], before) / 1000000 AS OSCPUVirtualTimeBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SelectedBytes')], before) AS SelectedBytesBefore,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SelectedRanges')], before) AS SelectedRangesBefore,
sumIf(read_rows, before) AS ReadRowsBefore,
formatReadableSize(sumIf(read_bytes, before) AS ReadBytesBefore),
sumIf(written_rows, before) AS WrittenTowsBefore,
formatReadableSize(sumIf(written_bytes, before)) AS WrittenBytesBefore,
sumIf(result_rows, before) AS ResultRowsBefore,
formatReadableSize(sumIf(result_bytes, before)) AS ResultBytesBefore,
countIf(after) as CountAfter,
sumIf(query_duration_ms, after) / 1000 AS QueriesDurationAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'RealTimeMicroseconds')], after) / 1000000 AS RealTimeAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'UserTimeMicroseconds')], after) / 1000000 AS UserTimeAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SystemTimeMicroseconds')], after) / 1000000 AS SystemTimeAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskReadElapsedMicroseconds')], after) / 1000000 AS DiskReadTimeAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskWriteElapsedMicroseconds')], after) / 1000000 AS DiskWriteTimeAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkSendElapsedMicroseconds')], after) / 1000000 AS NetworkSendTimeAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkReceiveElapsedMicroseconds')], after) / 1000000 AS NetworkReceiveTimeAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'ZooKeeperWaitMicroseconds')], after) / 1000000 AS ZooKeeperWaitTimeAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSIOWaitMicroseconds')], after) / 1000000 AS OSIOWaitTimeAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUWaitMicroseconds')], after) / 1000000 AS OSCPUWaitTimeAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUVirtualTimeMicroseconds')], after) / 1000000 AS OSCPUVirtualTimeAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SelectedBytes')], after) AS SelectedBytesAfter,
sumIf(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SelectedRanges')], after) AS SelectedRangesAfter,
sumIf(read_rows, after) AS ReadRowsAfter,
formatReadableSize(sumIf(read_bytes, after) AS ReadBytesAfter),
sumIf(written_rows, after) AS WrittenTowsAfter,
formatReadableSize(sumIf(written_bytes, after)) AS WrittenBytesAfter,
sumIf(result_rows, after) AS ResultRowsAfter,
formatReadableSize(sumIf(result_bytes, after)) AS ResultBytesAfter
FROM system.query_log
WHERE (before OR after) AND type in (2,4) -- QueryFinish, ExceptionWhileProcessing
GROUP BY normalized_query_hash
WITH TOTALS
ORDER BY SelectedRangesAfter- SelectedRangesBefore DESC
LIMIT 10
FORMAT Vertical
WITH
toDateTime('2024-02-09 00:00:00') as timestamp_of_issue,
event_time < timestamp_of_issue as before,
event_time >= timestamp_of_issue as after
select
normalized_query_hash as h,
any(query) as query_sample,
round(quantileIf(0.9)(query_duration_ms, before)) as duration_q90_before,
round(quantileIf(0.9)(query_duration_ms, after)) as duration_q90_after,
countIf(before) as cnt_before,
countIf(after) as cnt_after,
sumIf(query_duration_ms,before) as duration_sum_before,
sumIf(query_duration_ms,after) as duration_sum_after,
sumIf(ProfileEvents['UserTimeMicroseconds'], before) as usertime_sum_before,
sumIf(ProfileEvents['UserTimeMicroseconds'], after) as usertime_sum_after,
sumIf(read_bytes,before) as sum_read_bytes_before,
sumIf(read_bytes,after) as sum_read_bytes_after
from system.query_log
where event_time between timestamp_of_issue - INTERVAL 3 DAY and timestamp_of_issue + INTERVAL 3 DAY
group by h
HAVING cnt_after > 1.1 * cnt_before OR sum_read_bytes_after > 1.2 * sum_read_bytes_before OR usertime_sum_after > 1.2 * usertime_sum_before
ORDER BY sum_read_bytes_after - sum_read_bytes_before
FORMAT Vertical
3 - Debug hanging thing
Debug hanging / freezing things
If ClickHouse® is busy with something and you don’t know what’s happening, you can easily check the stacktraces of all the thread which are working
SELECT
arrayStringConcat(arrayMap(x -> demangle(addressToSymbol(x)), trace), '\n') AS trace_functions,
count()
FROM system.stack_trace
GROUP BY trace_functions
ORDER BY count()
DESC
SETTINGS allow_introspection_functions=1
FORMAT Vertical;
If you can’t start any queries, but you have access to the node, you can sent a signal
# older versions
for i in $(ls -1 /proc/$(pidof clickhouse-server)/task/); do kill -TSTP $i; done
# even older versions
for i in $(ls -1 /proc/$(pidof clickhouse-server)/task/); do kill -SIGPROF $i; done
4 - Handy queries for system.query_log
The most cpu / write / read-intensive queries from query_log
SELECT
normalized_query_hash,
any(query),
count(),
sum(query_duration_ms) / 1000 AS QueriesDuration,
sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'RealTimeMicroseconds')]) / 1000000 AS RealTime,
sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'UserTimeMicroseconds')]) / 1000000 AS UserTime,
sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'SystemTimeMicroseconds')]) / 1000000 AS SystemTime,
sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskReadElapsedMicroseconds')]) / 1000000 AS DiskReadTime,
sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'DiskWriteElapsedMicroseconds')]) / 1000000 AS DiskWriteTime,
sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkSendElapsedMicroseconds')]) / 1000000 AS NetworkSendTime,
sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'NetworkReceiveElapsedMicroseconds')]) / 1000000 AS NetworkReceiveTime,
sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'ZooKeeperWaitMicroseconds')]) / 1000000 AS ZooKeeperWaitTime,
sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSIOWaitMicroseconds')]) / 1000000 AS OSIOWaitTime,
sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUWaitMicroseconds')]) / 1000000 AS OSCPUWaitTime,
sum(ProfileEvents.Values[indexOf(ProfileEvents.Names, 'OSCPUVirtualTimeMicroseconds')]) / 1000000 AS OSCPUVirtualTime,
sum(read_rows) AS ReadRows,
formatReadableSize(sum(read_bytes)) AS ReadBytes,
sum(written_rows) AS WrittenTows,
formatReadableSize(sum(written_bytes)) AS WrittenBytes,
sum(result_rows) AS ResultRows,
formatReadableSize(sum(result_bytes)) AS ResultBytes
FROM system.query_log
WHERE (event_date >= today()) AND (event_time > (now() - 3600)) AND type in (2,4) -- QueryFinish, ExceptionWhileProcessing
GROUP BY normalized_query_hash
WITH TOTALS
ORDER BY UserTime DESC
LIMIT 30
FORMAT Vertical
– modern ClickHouse®
SELECT
hostName() as host,
normalized_query_hash,
min(event_time),
max(event_time),
replace(substr(argMax(query, utime), 1, 80), '\n', ' ') AS query,
argMax(query_id, utime) AS sample_query_id,
count(),
sum(query_duration_ms) / 1000 AS QueriesDuration, /* wall clock */
sum(ProfileEvents['RealTimeMicroseconds']) / 1000000 AS RealTime, /* same as above but x number of thread */
sum(ProfileEvents['UserTimeMicroseconds'] as utime) / 1000000 AS UserTime, /* time when our query was doin some cpu-insense work, creating cpu load */
sum(ProfileEvents['SystemTimeMicroseconds']) / 1000000 AS SystemTime, /* time spend on waiting for some system operations */
sum(ProfileEvents['DiskReadElapsedMicroseconds']) / 1000000 AS DiskReadTime,
sum(ProfileEvents['DiskWriteElapsedMicroseconds']) / 1000000 AS DiskWriteTime,
sum(ProfileEvents['NetworkSendElapsedMicroseconds']) / 1000000 AS NetworkSendTime, /* check the other side of the network! */
sum(ProfileEvents['NetworkReceiveElapsedMicroseconds']) / 1000000 AS NetworkReceiveTime, /* check the other side of the network! */
sum(ProfileEvents['ZooKeeperWaitMicroseconds']) / 1000000 AS ZooKeeperWaitTime,
sum(ProfileEvents['OSIOWaitMicroseconds']) / 1000000 AS OSIOWaitTime, /* IO waits, usually disks - that metric is 'orthogonal' to other */
sum(ProfileEvents['OSCPUWaitMicroseconds']) / 1000000 AS OSCPUWaitTime, /* waiting for a 'free' CPU - usually high when the other load on the server creates a lot of contention for cpu */
sum(ProfileEvents['OSCPUVirtualTimeMicroseconds']) / 1000000 AS OSCPUVirtualTime, /* similar to usertime + system time */
formatReadableSize(sum(ProfileEvents['NetworkReceiveBytes']) as network_receive_bytes) AS NetworkReceiveBytes,
formatReadableSize(sum(ProfileEvents['NetworkSendBytes']) as network_send_bytes) AS NetworkSendBytes,
sum(ProfileEvents['SelectedParts']) as SelectedParts,
sum(ProfileEvents['SelectedRanges']) as SelectedRanges,
sum(ProfileEvents['SelectedMarks']) as SelectedMarks,
sum(ProfileEvents['SelectedRows']) as SelectedRows, /* those may different from read_rows - here the number or rows potentially matching the where conditions, not neccessary all will be read */
sum(ProfileEvents['SelectedBytes']) as SelectedBytes,
sum(ProfileEvents['FileOpen']) as FileOpen,
sum(ProfileEvents['ZooKeeperTransactions']) as ZooKeeperTransactions,
formatReadableSize(sum(ProfileEvents['OSReadBytes'] ) as os_read_bytes ) as OSReadBytesExcludePageCache,
formatReadableSize(sum(ProfileEvents['OSWriteBytes'] ) as os_write_bytes ) as OSWriteBytesExcludePageCache,
formatReadableSize(sum(ProfileEvents['OSReadChars'] ) as os_read_chars ) as OSReadBytesIncludePageCache,
formatReadableSize(sum(ProfileEvents['OSWriteChars'] ) as os_write_chars ) as OSWriteCharsIncludePageCache,
formatReadableSize(quantile(0.97)(memory_usage) as memory_usage_q97) as MemoryUsageQ97 ,
sum(read_rows) AS ReadRows,
formatReadableSize(sum(read_bytes) as read_bytes_sum) AS ReadBytes,
sum(written_rows) AS WrittenRows,
formatReadableSize(sum(written_bytes) as written_bytes_sum) AS WrittenBytes, /* */
sum(result_rows) AS ResultRows,
formatReadableSize(sum(result_bytes) as result_bytes_sum) AS ResultBytes
FROM clusterAllReplicas('{cluster}', system.query_log)
WHERE event_date >= today() AND type in (2,4)-- QueryFinish, ExceptionWhileProcessing
GROUP BY
GROUPING SETS (
(normalized_query_hash, host),
(host),
())
ORDER BY OSCPUVirtualTime DESC
LIMIT 30
FORMAT Vertical;
Find queries which were started but not finished at some moment in time
SELECT
query_id,
min(event_time) t,
any(query)
FROM system.query_log
where event_date = today() and event_time > '2021-11-25 02:29:12'
GROUP BY query_id
HAVING countIf(type='QueryFinish') = 0 OR sum(query_duration_ms) > 100000
order by t;
select
query_id,
any(query)
from system.query_log
where event_time between '2021-09-24 07:00:00' and '2021-09-24 09:00:00'
group by query_id HAVING countIf(type=1) <> countIf(type!=1)
5 - Ingestion metrics from system.part_log
-- Insert rate
select database, table, time_bucket,
max(number_of_parts_per_insert) max_parts_pi,
median(number_of_parts_per_insert) median_parts_pi,
min(min_rows_per_part) min_rows_pp,
max(max_rows_per_part) max_rows_pp,
median(median_rows_per_part) median_rows_pp,
min(rows_per_insert) min_rows_pi,
median(rows_per_insert) median_rows_pi,
max(rows_per_insert) max_rows_pi,
sum(rows_per_insert) rows_inserted,
sum(seconds_per_insert) parts_creation_seconds,
count() inserts,
sum(number_of_parts_per_insert) new_parts,
max(last_part_pi) - min(first_part_pi) as insert_period,
inserts*60/insert_period as inserts_per_minute
from
(SELECT
database,
table,
toStartOfDay(event_time) AS time_bucket,
count() AS number_of_parts_per_insert,
min(rows) AS min_rows_per_part,
max(rows) AS max_rows_per_part,
median(rows) AS median_rows_per_part,
sum(rows) AS rows_per_insert,
min(size_in_bytes) AS min_bytes_per_part,
max(size_in_bytes) AS max_bytes_per_part,
median(size_in_bytes) AS median_bytes_per_part,
sum(size_in_bytes) AS bytes_per_insert,
median_bytes_per_part / median_rows_per_part AS avg_row_size,
sum(duration_ms)/1000 as seconds_per_insert,
max(event_time) as last_part_pi, min(event_time) as first_part_pi
FROM
system.part_log
WHERE
-- Enum8('NewPart' = 1, 'MergeParts' = 2, 'DownloadPart' = 3, 'RemovePart' = 4, 'MutatePart' = 5, 'MovePart' = 6)
event_type = 1
AND
-- change if another time period is desired
event_date >= today()
GROUP BY query_id, database, table, time_bucket
)
GROUP BY database, table, time_bucket
ORDER BY time_bucket, database, table ASC
-- New parts per partition
select database, table, event_type, partition_id, count() c, round(avg(rows))
from system.part_log where event_date >= today() and event_type = 'NewPart'
group by database, table, event_type, partition_id
order by c desc
6 - Remove block numbers from zookeeper for removed partitions
Remove block numbers from zookeeper for removed partitions
SELECT distinct concat('delete ', zk.block_numbers_path, zk.partition_id) FROM
(
SELECT r.database, r.table, zk.block_numbers_path, zk.partition_id, p.partition_id
FROM
(
SELECT path as block_numbers_path, name as partition_id
FROM system.zookeeper
WHERE path IN (
SELECT concat(zookeeper_path, '/block_numbers/') as block_numbers_path
FROM clusterAllReplicas('{cluster}',system.replicas)
)
) as zk
LEFT JOIN
(
SELECT database, table, concat(zookeeper_path, '/block_numbers/') as block_numbers_path
FROM clusterAllReplicas('{cluster}',system.replicas)
)
as r ON (r.block_numbers_path = zk.block_numbers_path)
LEFT JOIN
(
SELECT DISTINCT partition_id, database, table
FROM clusterAllReplicas('{cluster}',system.parts)
)
as p ON (p.partition_id = zk.partition_id AND p.database = r.database AND p.table = r.table)
WHERE p.partition_id = '' AND zk.partition_id <> 'all'
ORDER BY r.database, r.table, zk.block_numbers_path, zk.partition_id, p.partition_id
) t
FORMAT TSVRaw;
After 24.3
WITH
now() - INTERVAL 120 DAY as retain_old_partitions,
replicas AS (SELECT DISTINCT database, table, zookeeper_path || '/block_numbers' AS block_numbers_path FROM system.replicas),
zk_data AS (SELECT DISTINCT name as partition_id, path as block_numbers_path FROM system.zookeeper WHERE path IN (SELECT block_numbers_path FROM replicas) AND mtime < retain_old_partitions AND partition_id <> 'all'),
zk_partitions AS (SELECT DISTINCT database, table, partition_id FROM replicas JOIN zk_data USING block_numbers_path),
partitions AS (SELECT DISTINCT database, table, partition_id FROM system.parts)
SELECT
format('ALTER TABLE `{}`.`{}` {};',database, table, arrayStringConcat( arraySort(groupArray('FORGET PARTITION ID \'' || partition_id || '\'')), ', ')) AS query
FROM zk_partitions
WHERE (database, table, partition_id) NOT IN (SELECT * FROM partitions)
GROUP BY database, table
ORDER BY database, table
FORMAT TSVRaw;
After fixing https://github.com/ClickHouse/ClickHouse/issues/72807
WITH
now() - INTERVAL 120 DAY as retain_old_partitions,
replicas AS (SELECT DISTINCT database, table, zookeeper_path || '/block_numbers' AS block_numbers_path FROM clusterAllReplicas('{cluster}',system.replicas)),
zk_data AS (SELECT DISTINCT name as partition_id, path as block_numbers_path FROM system.zookeeper WHERE path IN (SELECT block_numbers_path FROM replicas) AND mtime < retain_old_partitions AND partition_id <> 'all'),
zk_partitions AS (SELECT DISTINCT database, table, partition_id FROM replicas JOIN zk_data USING block_numbers_path),
partitions AS (SELECT DISTINCT database, table, partition_id FROM clusterAllReplicas('{cluster}',system.parts))
SELECT
format('ALTER TABLE `{}`.`{}` ON CLUSTER \'{{cluster}}\' {};',database, table, arrayStringConcat( arraySort(groupArray('FORGET PARTITION ID \'' || partition_id || '\'')), ', ')) AS query
FROM zk_partitions
WHERE (database, table, partition_id) NOT IN (SELECT * FROM partitions)
GROUP BY database, table
ORDER BY database, table
FORMAT TSVRaw;
7 - Removing tasks in the replication queue related to empty partitions
Removing tasks in the replication queue related to empty partitions
SELECT 'ALTER TABLE ' || database || '.' || table || ' DROP PARTITION ID \''|| partition_id || '\';' FROM
(SELECT DISTINCT database, table, extract(new_part_name, '^[^_]+') as partition_id FROM clusterAllReplicas('{cluster}', system.replication_queue) ) as rq
LEFT JOIN
(SELECT database, table, partition_id, sum(rows) as rows_count, count() as part_count
FROM clusterAllReplicas('{cluster}', system.parts)
WHERE active GROUP BY database, table, partition_id
) as p
USING (database, table, partition_id)
WHERE p.rows_count = 0 AND p.part_count = 0
FORMAT TSVRaw;
8 - Can detached parts be dropped?
Here is what different statuses mean:
- Parts are renamed to ‘ignored’ if they were found during ATTACH together with other, bigger parts that cover the same blocks of data, i.e. they were already merged into something else.
- Parts are renamed to ‘broken’ if ClickHouse® was not able to load data from the parts. There could be different reasons: some files are lost, checksums are not correct, etc.
- Parts are renamed to ‘unexpected’ if they are present locally, but are not found in ZooKeeper, in case when an insert was not completed properly. The part is detached only if it’s old enough (5 minutes), otherwise CH registers this part in ZooKeeper as a new part.
- Parts are renamed to ‘cloned’ if ClickHouse have had some parts on local disk while repairing lost replica so already existed parts being renamed and put in detached directory. Controlled by setting
detach_old_local_parts_when_cloning_replica
.
‘Ignored’ parts are safe to delete. ‘Unexpected’ and ‘broken’ should be investigated, but it might not be an easy thing to do, especially for older parts. If the system.part_log table
is enabled you can find some information there. Otherwise you will need to look in clickhouse-server.log
for what happened when the parts were detached.
If there is another way you could confirm that there is no data loss in the affected tables, you could simply delete all detached parts.
It is important to monitor for detached parts and act quickly when they appear. If clickhouse-server.log
is lost it might be impossible to figure out what happened and why the parts were detached.
You can use system.asynchronous_metrics
or system.detached_parts
for monitoring.
select metric from system.asynchronous_metrics where metric ilike '%detach%'
NumberOfDetachedByUserParts
NumberOfDetachedParts
Here is a quick way to find out if you have detached parts along with the reason why.
SELECT database, table, reason, count()
FROM system.detached_parts
GROUP BY database, table, reason
ORDER BY database ASC, table ASC, reason ASC
Here is a query that can help with investigations. It looks for active parts containing the same data blocks as the detached parts. It generates commands to drop the detached parts.
with ['broken','unexpected','noquorum','ignored','broken-on-start','clone','attaching','deleting','tmp-fetch',
'covered-by-broken','merge-not-byte-identical','mutate-not-byte-identical','broken-from-backup'] as DETACH_REASONS
select a.*,
concat('alter table ',database,'.',table,' drop detached part ''',a.name,''' settings allow_drop_detached=1;') as drop,
concat('sudo rm -r ',a.path) as rm
from (select * replace(part[1] as partition_id, toInt64(part[2]) as min_block_number, toInt64(part[3]) as max_block_number),
arrayFilter(x -> x not in DETACH_REASONS, splitByChar('_',name)) as part
from system.detached_parts) a
left join (select database, table, partition_id, name, active, min_block_number, max_block_number from system.parts where active) b
on a.database=b.database and a.table=b.table and a.partition_id=b.partition_id
where a.min_block_number >= b.min_block_number
and a.max_block_number <= b.max_block_number
order by table, min_block_number, max_block_number
Other reasons
# rg forgetPartAndMoveToDetached --type cpp
# rg renameToDetached --type cpp
# rg makeCloneInDetached --type cpp
broken
unexpected
ignored
noquorum
merge-not-byte-identical
mutate-not-byte-identical -
broken-on-start
clone
covered-by-broken - that means that ClickHouse during initialization of replicated table detected that some part is not ok, and decided to refetch it from healthy replicas. So the part itself will be detached as 'broken' and if that part was a result of merge / mutation all the previuos generations of that will be marked as covered-by-broken. If clickhouse was able to download the final part you don't need those covered-by-broken.
The list of DETACH_REASONS: https://github.com/ClickHouse/ClickHouse/blob/master/src/Storages/MergeTree/MergeTreePartInfo.h#L163
9 - Database Size - Table - Column size
Tables
Table size
SELECT
database,
table,
formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed,
formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed,
round(usize / size, 2) AS compr_rate,
sum(rows) AS rows,
count() AS part_count
FROM system.parts
WHERE (active = 1) AND (database LIKE '%') AND (table LIKE '%')
GROUP BY
database,
table
ORDER BY size DESC;
Table size + inner MatView (Atomic)
SELECT
p.database,
if(t.name = '', p.table, p.table||' ('||t.name||')') tbl,
formatReadableSize(sum(p.data_compressed_bytes) AS size) AS compressed,
formatReadableSize(sum(p.data_uncompressed_bytes) AS usize) AS uncompressed,
round(usize / size, 2) AS compr_rate,
sum(p.rows) AS rows,
count() AS part_count
FROM system.parts p left join system.tables t on p.database = t.database and p.table = '.inner_id.'||toString(t.uuid)
WHERE (active = 1) AND (tbl LIKE '%') AND (database LIKE '%')
GROUP BY
p.database,
tbl
ORDER BY size DESC;
Column size
SELECT
database,
table,
column,
formatReadableSize(sum(column_data_compressed_bytes) AS size) AS compressed,
formatReadableSize(sum(column_data_uncompressed_bytes) AS usize) AS uncompressed,
round(usize / size, 2) AS compr_ratio,
sum(rows) rows_cnt,
round(usize / rows_cnt, 2) avg_row_size
FROM system.parts_columns
WHERE (active = 1) AND (database LIKE '%') AND (table LIKE '%')
GROUP BY
database,
table,
column
ORDER BY size DESC;
Projections
Projection size
SELECT
database,
table,
name,
formatReadableSize(sum(data_compressed_bytes) AS size) AS compressed,
formatReadableSize(sum(data_uncompressed_bytes) AS usize) AS uncompressed,
round(usize / size, 2) AS compr_rate,
sum(rows) AS rows,
count() AS part_count
FROM system.projection_parts
WHERE (table = 'ptest') AND active
GROUP BY
database,
table,
name
ORDER BY size DESC;
Projection column size
SELECT
database,
table,
column,
formatReadableSize(sum(column_data_compressed_bytes) AS size) AS compressed,
formatReadableSize(sum(column_data_uncompressed_bytes) AS usize) AS uncompressed,
round(usize / size, 2) AS compr_rate
FROM system.projection_parts_columns
WHERE (active = 1) AND (table LIKE 'ptest')
GROUP BY
database,
table,
column
ORDER BY size DESC;
Understanding the columns data properties:
SELECT
count(),
* APPLY (uniq),
* APPLY (max),
* APPLY (min),
* APPLY(topK(5))
FROM table_name
FORMAT Vertical;
-- also you can add * APPLY (entropy) to show entropy (i.e. 'randomness' of the column).
-- if the table is huge add some WHERE condition to slice some 'representative' data range, for example single month / week / day of data.
Understanding the ingest pattern:
SELECT
database,
table,
median(rows),
median(bytes_on_disk),
sum(rows),
max(bytes_on_disk),
min(bytes_on_disk),
round(quantile(0.95)(bytes_on_disk), 0),
sum(bytes_on_disk),
count(),
countIf(NOT active),
uniqExact(partition)
FROM system.parts
WHERE (modification_time > (now() - 480)) AND (level = 0)
GROUP BY
database,
table
ORDER BY count() DESC
part_log
WITH 30 * 60 AS frame_size
SELECT
toStartOfInterval(event_time, toIntervalSecond(frame_size)) AS m,
database,
table,
ROUND(countIf(event_type = 'NewPart') / frame_size, 2) AS new,
ROUND(countIf(event_type = 'MergeParts') / frame_size, 2) AS merge,
ROUND(countIf(event_type = 'DownloadPart') / frame_size, 2) AS dl,
ROUND(countIf(event_type = 'RemovePart') / frame_size, 2) AS rm,
ROUND(countIf(event_type = 'MutatePart') / frame_size, 2) AS mut,
ROUND(countIf(event_type = 'MovePart') / frame_size, 2) AS mv
FROM system.part_log
WHERE event_time > (now() - toIntervalHour(24))
GROUP BY
m,
database,
table
ORDER BY
database ASC,
table ASC,
m ASC
WITH 30 * 60 AS frame_size
SELECT
toStartOfInterval(event_time, toIntervalSecond(frame_size)) AS m,
database,
table,
ROUND(countIf(event_type = 'NewPart') / frame_size, 2) AS inserts_per_sec,
ROUND(sumIf(rows, event_type = 'NewPart') / frame_size, 2) AS rows_per_sec,
ROUND(sumIf(size_in_bytes, event_type = 'NewPart') / frame_size, 2) AS bytes_per_sec
FROM system.part_log
WHERE event_time > (now() - toIntervalHour(24))
GROUP BY
m,
database,
table
ORDER BY
database ASC,
table ASC,
m ASC
Understanding the partitioning
SELECT
database,
table,
count(),
topK(5)(partition),
COLUMNS('metric.*') APPLY(quantiles(0.005, 0.05, 0.10, 0.25, 0.5, 0.75, 0.9, 0.95, 0.995))
FROM
(
SELECT
database,
table,
partition,
sum(bytes_on_disk) AS metric_bytes,
sum(data_uncompressed_bytes) AS metric_uncompressed_bytes,
sum(rows) AS metric_rows,
sum(primary_key_bytes_in_memory) AS metric_pk_size,
count() AS metric_count,
countIf(part_type = 'Wide') AS metric_wide_count,
countIf(part_type = 'Compact') AS metric_compact_count,
countIf(part_type = 'Memory') AS metric_memory_count
FROM system.parts
GROUP BY
database,
table,
partition
)
GROUP BY
database,
table
FORMAT Vertical
10 - Number of active parts in a partition
Q: Why do I have several active parts in a partition? Why ClickHouse® does not merge them immediately?
A: CH does not merge parts by time
Merge scheduler selects parts by own algorithm based on the current node workload / number of parts / size of parts.
CH merge scheduler balances between a big number of parts and a wasting resources on merges.
Merges are CPU/DISK IO expensive. If CH will merge every new part then all resources will be spend on merges and will no resources remain on queries (selects ).
CH will not merge parts with a combined size greater than 150 GB max_bytes_to_merge_at_max_space_in_pool.
SELECT
database,
table,
partition,
sum(rows) AS rows,
count() AS part_count
FROM system.parts
WHERE (active = 1) AND (table LIKE '%') AND (database LIKE '%')
GROUP BY
database,
table,
partition
ORDER BY part_count DESC
limit 20
11 - Parts consistency
Check if there are blocks missing
SELECT
database,
table,
partition_id,
ranges.1 AS previous_part,
ranges.2 AS next_part,
ranges.3 AS previous_block_number,
ranges.4 AS next_block_number,
range(toUInt64(previous_block_number + 1), toUInt64(next_block_number)) AS missing_block_numbers
FROM
(
WITH
arrayPopFront(groupArray(min_block_number) AS min) AS min_adj,
arrayPopBack(groupArray(max_block_number) AS max) AS max_adj,
arrayFilter((x, y, z) -> (y != (z + 1)), arrayZip(arrayPopBack(groupArray(name) AS name_arr), arrayPopFront(name_arr), max_adj, min_adj), min_adj, max_adj) AS missing_ranges
SELECT
database,
table,
partition_id,
missing_ranges
FROM
(
SELECT *
FROM system.parts
WHERE active AND (table = 'query_thread_log') AND (partition_id = '202108') AND active
ORDER BY min_block_number ASC
)
GROUP BY
database,
table,
partition_id
)
ARRAY JOIN missing_ranges AS ranges
┌─database─┬─table────────────┬─partition_id─┬─previous_part───────┬─next_part──────────┬─previous_block_number─┬─next_block_number─┬─missing_block_numbers─┐
│ system │ query_thread_log │ 202108 │ 202108_864_1637_556 │ 202108_1639_1639_0 │ 1637 │ 1639 │ [1638] │
└──────────┴──────────────────┴──────────────┴─────────────────────┴────────────────────┴───────────────────────┴───────────────────┴───────────────────────┘
Find the number of blocks in a table
SELECT
database,
table,
partition_id,
sum(max_block_number - min_block_number) AS blocks_count
FROM system.parts
WHERE active AND (table = 'query_thread_log') AND (partition_id = '202108') AND active
GROUP BY
database,
table,
partition_id
┌─database─┬─table────────────┬─partition_id─┬─blocks_count─┐
│ system │ query_thread_log │ 202108 │ 1635 │
└──────────┴──────────────────┴──────────────┴──────────────┘
Compare the list of parts in ZooKeeper with the list of parts on disk
select zoo.p_path as part_zoo, zoo.ctime, zoo.mtime, disk.p_path as part_disk
from
(
select concat(path,'/',name) as p_path, ctime, mtime
from system.zookeeper where path in (select concat(replica_path,'/parts') from system.replicas)
) zoo
left join
(
select concat(replica_path,'/parts/',name) as p_path
from system.parts inner join system.replicas using (database, table)
) disk on zoo.p_path = disk.p_path
where part_disk='' and zoo.mtime <= now() - interval 1 hour
order by part_zoo;
You can clean that orphan zk records (need to execute using delete
in zkCli, rm
in zk-shell):
select 'delete '||part_zoo
from (
select zoo.p_path as part_zoo, zoo.ctime, zoo.mtime, disk.p_path as part_disk
from
(
select concat(path,'/',name) as p_path, ctime, mtime
from system.zookeeper where path in (select concat(replica_path,'/parts') from system.replicas)
) zoo
left join
(
select concat(replica_path,'/parts/',name) as p_path
from system.parts inner join system.replicas using (database, table)
) disk on zoo.p_path = disk.p_path
where part_disk='' and zoo.mtime <= now() - interval 1 day
order by part_zoo) format TSVRaw;