Example of a table on S3 with cache

An S3 disk with a local cache in front of it.

Storage configuration

cat /etc/clickhouse-server/config.d/s3.xml
<clickhouse>
    <storage_configuration>
        <disks>
            <s3disk>
                <type>s3</type>
                <endpoint>https://s3.us-east-1.amazonaws.com/mybucket/test/s3cached/</endpoint>
                <use_environment_credentials>1</use_environment_credentials>  <!-- use IAM AWS role -->
                    <!--access_key_id>xxxx</access_key_id>
                    <secret_access_key>xxx</secret_access_key-->
            </s3disk>
            <cache>
                <type>cache</type>
                <disk>s3disk</disk>
                <path>/var/lib/clickhouse/disks/s3_cache/</path>
                <max_size>50Gi</max_size>  <!-- 50 GiB local cache for data read from S3 -->
            </cache>
        </disks>
        <policies>
          <s3tiered>
              <volumes>
                  <default>
                      <disk>default</disk>
                      <max_data_part_size_bytes>50000000000</max_data_part_size_bytes>   <!-- parts bigger than 50GB go to the next volume (s3cached), e.g. when produced by merges -->
                  </default>
                  <s3cached>
                      <disk>cache</disk>  <!-- sandwich cache plus s3disk -->
                      <!-- prefer_not_to_merge>true</prefer_not_to_merge>
                      <perform_ttl_move_on_insert>false</perform_ttl_move_on_insert-->
                  </s3cached>
              </volumes>
          </s3tiered>
        </policies>
    </storage_configuration>
</clickhouse>
select * from system.disks;
┌─name────┬─path──────────────────────────────┬───────────free_space─┬──────────total_space─┐
│ cache   │ /var/lib/clickhouse/disks/s3disk/ │ 18446744073709551615 │ 18446744073709551615 │
│ default │ /var/lib/clickhouse/              │         149113987072 │         207907635200 │
│ s3disk  │ /var/lib/clickhouse/disks/s3disk/ │ 18446744073709551615 │ 18446744073709551615 │
└─────────┴───────────────────────────────────┴──────────────────────┴──────────────────────┘

select * from system.storage_policies;
┌─policy_name─┬─volume_name─┬─volume_priority─┬─disks───────┬─volume_type─┬─max_data_part_size─┬─move_factor─┬─prefer_not_to_merge─┐
│ default     │ default     │               1 │ ['default'] │ JBOD        │                  0 │           0 │                   0 │
│ s3tiered    │ default     │               1 │ ['default'] │ JBOD        │        50000000000 │         0.1 │                   0 │
│ s3tiered    │ s3cached    │               2 │ ['s3disk']  │ JBOD        │                  0 │         0.1 │                   0 │
└─────────────┴─────────────┴─────────────────┴─────────────┴─────────────┴────────────────────┴─────────────┴─────────────────────┘

Example with a new table

CREATE TABLE test_s3
(
    `A` Int64,
    `S` String,
    `D` Date
)
ENGINE = MergeTree
PARTITION BY D
ORDER BY A
SETTINGS storage_policy = 's3tiered';
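A quick way to verify that the table picked up the policy (system.tables exposes a storage_policy column):

select storage_policy from system.tables where name = 'test_s3';

It should return s3tiered.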

insert into test_s3 select number, number, '2023-01-01' from numbers(1e9);

0 rows in set. Elapsed: 270.285 sec. Processed 1.00 billion rows, 8.00 GB (3.70 million rows/s., 29.60 MB/s.)

The table size is 7.65 GiB and it is on the default disk (EBS):

select disk_name, partition, sum(rows), formatReadableSize(sum(bytes_on_disk)) size, count() part_count 
from system.parts where table= 'test_s3' and active 
group by disk_name, partition;
┌─disk_name─┬─partition──┬──sum(rows)─┬─size─────┬─part_count─┐
│ default   │ 2023-01-01 │ 1000000000 │ 7.65 GiB │          8 │
└───────────┴────────────┴────────────┴──────────┴────────────┘

It seems my EBS write speed is slower than S3 write speed:

alter table test_s3 move partition '2023-01-01' to volume 's3cached';
0 rows in set. Elapsed: 98.979 sec.

alter table test_s3 move partition '2023-01-01' to volume 'default';
0 rows in set. Elapsed: 127.741 sec.
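While such a move is running, its progress can be watched from another session; a minimal sketch using system.moves (it lists the parts currently being moved):

select table, part_name, target_disk_name, round(elapsed) elapsed_sec
from system.moves;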

Query performance against EBS:

select * from test_s3 where A = 443;
1 row in set. Elapsed: 0.002 sec. Processed 8.19 thousand rows, 71.64 KB (3.36 million rows/s., 29.40 MB/s.)

select uniq(A) from test_s3;
1 row in set. Elapsed: 11.439 sec. Processed 1.00 billion rows, 8.00 GB (87.42 million rows/s., 699.33 MB/s.)

select count() from test_s3 where S like '%4422%'
1 row in set. Elapsed: 17.484 sec. Processed 1.00 billion rows, 17.89 GB (57.20 million rows/s., 1.02 GB/s.)

Let’s move data to S3

alter table test_s3 move partition '2023-01-01' to volume 's3cached';
0 rows in set. Elapsed: 81.068 sec.

select disk_name, partition, sum(rows), formatReadableSize(sum(bytes_on_disk)) size, count() part_count 
from system.parts where table= 'test_s3' and active 
group by disk_name, partition;
┌─disk_name─┬─partition──┬──sum(rows)─┬─size─────┬─part_count─┐
│ s3disk    │ 2023-01-01 │ 1000000000 │ 7.65 GiB │          8 │
└───────────┴────────────┴────────────┴──────────┴────────────┘

The first query executes against S3, the second against the cache (local EBS):

select * from test_s3 where A = 443;
1 row in set. Elapsed: 0.458 sec. Processed 8.19 thousand rows, 71.64 KB (17.88 thousand rows/s., 156.35 KB/s.)
1 row in set. Elapsed: 0.003 sec. Processed 8.19 thousand rows, 71.64 KB (3.24 million rows/s., 28.32 MB/s.)

select uniq(A) from test_s3;
1 row in set. Elapsed: 26.601 sec. Processed 1.00 billion rows, 8.00 GB (37.59 million rows/s., 300.74 MB/s.)
1 row in set. Elapsed: 8.675 sec. Processed 1.00 billion rows, 8.00 GB (115.27 million rows/s., 922.15 MB/s.)

select count() from test_s3 where S like '%4422%'
1 row in set. Elapsed: 33.586 sec. Processed 1.00 billion rows, 17.89 GB (29.77 million rows/s., 532.63 MB/s.)
1 row in set. Elapsed: 16.551 sec. Processed 1.00 billion rows, 17.89 GB (60.42 million rows/s., 1.08 GB/s.)
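To confirm where a query actually read from, one option is to compare the filesystem-cache counters in query_log. A sketch, assuming a recent release where ProfileEvents is a Map column and these counter names are available:

select
    query_duration_ms,
    formatReadableSize(ProfileEvents['CachedReadBufferReadFromCacheBytes'])  read_from_cache,
    formatReadableSize(ProfileEvents['CachedReadBufferReadFromSourceBytes']) read_from_s3  -- cache misses
from system.query_log
where type = 'QueryFinish' and query like 'select uniq(A) from test_s3%'
order by event_time desc
limit 2;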

Cache introspection

select cache_base_path, formatReadableSize(sum(size)) from system.filesystem_cache group by 1;
┌─cache_base_path─────────────────────┬─formatReadableSize(sum(size))─┐
│ /var/lib/clickhouse/disks/s3_cache/ │ 7.64 GiB                      │
└─────────────────────────────────────┴───────────────────────────────┘

system drop filesystem cache;

select cache_base_path, formatReadableSize(sum(size)) from system.filesystem_cache group by 1;
0 rows in set. Elapsed: 0.005 sec.

select * from test_s3 where A = 443;
1 row in set. Elapsed: 0.221 sec. Processed 8.19 thousand rows, 71.64 KB (37.10 thousand rows/s., 324.47 KB/s.)

select cache_base_path, formatReadableSize(sum(size)) from system.filesystem_cache group by 1;
┌─cache_base_path─────────────────────┬─formatReadableSize(sum(size))─┐
│ /var/lib/clickhouse/disks/s3_cache/ │ 105.95 KiB                    │
└─────────────────────────────────────┴───────────────────────────────┘

No data is stored locally (except system log tables).
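A sketch to double-check that from system.parts (active parts still residing on the local default disk, grouped by database):

select database, formatReadableSize(sum(bytes_on_disk)) size
from system.parts
where active and disk_name = 'default'
group by database;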

select name, formatReadableSize(free_space) free_space, formatReadableSize(total_space) total_space from system.disks;
┌─name────┬─free_space─┬─total_space─┐
│ cache   │ 16.00 EiB  │ 16.00 EiB   │
│ default │ 48.97 GiB  │ 49.09 GiB   │
│ s3disk  │ 16.00 EiB  │ 16.00 EiB   │
└─────────┴────────────┴─────────────┘

Example with an existing table

The mydata table was created without an explicitly defined storage_policy, which means it implicitly has storage_policy=default / volume=default / disk=default.

select disk_name, partition, sum(rows), formatReadableSize(sum(bytes_on_disk)) size, count() part_count 
from system.parts where table='mydata' and active 
group by disk_name, partition
order by partition;
┌─disk_name─┬─partition─┬─sum(rows)─┬─size───────┬─part_count─┐
│ default   │ 202201    │  51666667 │ 74.01 GiB  │         13 │
│ default   │ 202202    │  46666665 │ 73.64 GiB  │         13 │
│ default   │ 202203    │  16666666 │ 138.36 MiB │         10 │
│ default   │ 202301    │  51666667 │ 74.01 GiB  │         10 │
│ default   │ 202302    │  46666665 │ 73.64 GiB  │         10 │
│ default   │ 202303    │  16666666 │ 138.36 MiB │         10 │
└───────────┴───────────┴───────────┴────────────┴────────────┘

-- Let's change the storage policy. This command is instant because it changes only the table's metadata; it is possible because the new storage policy and the old one both have the volume `default`.

alter table mydata modify setting storage_policy = 's3tiered';

0 rows in set. Elapsed: 0.057 sec.
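This prerequisite can be checked up front; a sketch listing the volumes of both policies:

select policy_name, groupArray(volume_name) volumes
from system.storage_policies
where policy_name in ('default', 's3tiered')
group by policy_name;

Both must contain the volume default for the metadata-only switch to work.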

Straightforward (heavy) approach

-- Let's add a TTL. This is a heavy command: it takes a long time and impacts performance, because it reads the `D` column and moves parts to s3.
alter table mydata modify TTL D + interval 1 year to volume 's3cached';

0 rows in set. Elapsed: 140.661 sec.

┌─disk_name─┬─partition─┬─sum(rows)─┬─size───────┬─part_count─┐
│ s3disk    │ 202201    │  51666667 │ 74.01 GiB  │         13 │
│ s3disk    │ 202202    │  46666665 │ 73.64 GiB  │         13 │
│ s3disk    │ 202203    │  16666666 │ 138.36 MiB │         10 │
│ default   │ 202301    │  51666667 │ 74.01 GiB  │         10 │
│ default   │ 202302    │  46666665 │ 73.64 GiB  │         10 │
│ default   │ 202303    │  16666666 │ 138.36 MiB │         10 │
└───────────┴───────────┴───────────┴────────────┴────────────┘

Gentle (manual) approach

-- With materialize_ttl_after_modify = 0, alter ... modify TTL changes only the table's metadata and applies only to newly inserted data.
set materialize_ttl_after_modify=0;
alter table mydata modify TTL D + interval 1 year to volume 's3cached';
0 rows in set. Elapsed: 0.049 sec.

-- move data slowly partition by partition

alter table mydata move partition id '202201' to volume 's3cached';
0 rows in set. Elapsed: 49.410 sec.

alter table mydata move partition id '202202' to volume 's3cached';
0 rows in set. Elapsed: 36.952 sec.

alter table mydata move partition id '202203' to volume 's3cached';
0 rows in set. Elapsed: 4.808 sec.
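Between moves it is easy to check which partitions still reside on local disk; a sketch against system.parts:

select partition, formatReadableSize(sum(bytes_on_disk)) size, count() part_count
from system.parts
where table = 'mydata' and active and disk_name = 'default'
group by partition
order by partition;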

-- data can be optimized to reduce the number of parts before moving it to s3
optimize table mydata partition id '202301' final;
0 rows in set. Elapsed: 66.551 sec.

alter table mydata move partition id '202301' to volume 's3cached';
0 rows in set. Elapsed: 33.332 sec.

┌─disk_name─┬─partition─┬─sum(rows)─┬─size───────┬─part_count─┐
│ s3disk    │ 202201    │  51666667 │ 74.01 GiB  │         13 │
│ s3disk    │ 202202    │  46666665 │ 73.64 GiB  │         13 │
│ s3disk    │ 202203    │  16666666 │ 138.36 MiB │         10 │
│ s3disk    │ 202301    │  51666667 │ 74.01 GiB  │          1 │ -- optimized partition
│ default   │ 202302    │  46666665 │ 73.64 GiB  │         13 │
│ default   │ 202303    │  16666666 │ 138.36 MiB │         10 │
└───────────┴───────────┴───────────┴────────────┴────────────┘

S3 and ClickHouse® start time

Let’s create a table with 1000 parts and move them to s3.

CREATE TABLE test_s3( A Int64, S String, D Date)
ENGINE = MergeTree PARTITION BY D ORDER BY A
SETTINGS storage_policy = 's3tiered';

insert into test_s3 select number, number, toDate('2000-01-01') + intDiv(number,1e6) from numbers(1e9);
optimize table test_s3 final settings optimize_skip_merged_partitions = 1;

select disk_name, sum(rows), formatReadableSize(sum(bytes_on_disk)) size, count() part_count 
from system.parts where table= 'test_s3' and active group by disk_name;
┌─disk_name─┬──sum(rows)─┬─size─────┬─part_count─┐
│ default   │ 1000000000 │ 7.64 GiB │       1000 │
└───────────┴────────────┴──────────┴────────────┘

alter table test_s3 modify ttl D + interval 1 year to disk 's3disk';

select disk_name, sum(rows), formatReadableSize(sum(bytes_on_disk)) size, count() part_count 
from system.parts where table= 'test_s3' and active 
group by disk_name;
┌─disk_name─┬─sum(rows)─┬─size─────┬─part_count─┐
│ default   │ 755000000 │ 5.77 GiB │        755 │
│ s3disk    │ 245000000 │ 1.87 GiB │        245 │
└───────────┴───────────┴──────────┴────────────┘

----  several minutes later ----

┌─disk_name─┬──sum(rows)─┬─size─────┬─part_count─┐
│ s3disk    │ 1000000000 │ 7.64 GiB │       1000 │
└───────────┴────────────┴──────────┴────────────┘

Start time

:) select name, value from system.merge_tree_settings where name = 'max_part_loading_threads';
┌─name─────────────────────┬─value─────┐
│ max_part_loading_threads │ 'auto(4)' │
└──────────────────────────┴───────────┘

# systemctl stop clickhouse-server
# time systemctl start clickhouse-server  / real	4m26.766s
# systemctl stop clickhouse-server
# time systemctl start clickhouse-server  / real	4m24.263s

# cat /etc/clickhouse-server/config.d/max_part_loading_threads.xml
<?xml version="1.0"?>
<clickhouse>
    <merge_tree>
       <max_part_loading_threads>128</max_part_loading_threads>
    </merge_tree>
</clickhouse>

# systemctl stop clickhouse-server
# time systemctl start clickhouse-server / real	0m11.225s
# systemctl stop clickhouse-server
# time systemctl start clickhouse-server / real	0m10.797s
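After a restart the effective value can be confirmed (the changed flag in system.merge_tree_settings shows whether the override was applied):

select name, value, changed from system.merge_tree_settings
where name = 'max_part_loading_threads';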

The same test with 256 threads:

       <max_part_loading_threads>256</max_part_loading_threads>

# systemctl stop clickhouse-server
# time systemctl start clickhouse-server / real	0m8.474s
# systemctl stop clickhouse-server
# time systemctl start clickhouse-server / real	0m8.130s