Idempotent inserts into a materialized view

How to make idempotent inserts into a materialized view".

Why inserts into materialized views are not idempotent?

ClickHouse still does not have transactions. They will be implemented around 2022Q2.

Because of Clickhouse materialized view is a trigger. And an insert into a table and an insert into a subordinate materialized view it’s two different inserts so they are not atomic alltogether.

And insert into a materialized view may fail after the succesful insert into the table. In case of any failure a client gets the error about failed insertion. You may enable insert_deduplication (it’s enabled by default for Replciated engines) and repeate the insert with an idea to achive idempotate insertion, and insertion will be skipped into the source table becase of deduplication but it will be skipped for materialized view as well because by default materialized view inherites deduplication from the source table. It’s controlled by a parameter deduplicate_blocks_in_dependent_materialized_views https://clickhouse.com/docs/en/operations/settings/settings/#settings-deduplicate-blocks-in-dependent-materialized-views

If your materialized view is wide enought and always have enought data for constistent deduplication then you can enable deduplicate_blocks_in_dependent_materialized_views. Or you may add information for deduplication (some unique information / insert identifier).

Example 1. Inconsistency with deduplicate_blocks_in_dependent_materialized_views 0

create table test (A Int64, D Date) 
Engine =  ReplicatedMergeTree('/clickhouse/{cluster}/tables/{table}','{replica}') 
partition by toYYYYMM(D) order by A;

create materialized view test_mv 
Engine = ReplicatedSummingMergeTree('/clickhouse/{cluster}/tables/{table}','{replica}') 
partition by D order by D as 
select D, count() CNT from test group by D;

set max_partitions_per_insert_block=1; -- trick to fail insert into MV.

insert into test select number, today()+number%3 from numbers(100);
   DB::Exception: Received from localhost:9000. DB::Exception: Too many partitions 

select count() from test;
┌─count()─┐
     100   -- Insert was successful into the test table
└─────────┘

select sum(CNT) from test_mv;
0 rows in set. Elapsed: 0.001 sec.   -- Insert was unsuccessful into the test_mv table (DB::Exception)

-- Let's try to retry insertion
set max_partitions_per_insert_block=100; -- disable trick
insert into test select number, today()+number%3 from numbers(100); -- insert retry / No error
 
select count() from test;
┌─count()─┐
     100   -- insert was deduplicated
└─────────┘
 
select sum(CNT) from test_mv;
0 rows in set. Elapsed: 0.001 sec.    -- Inconsistency! Unfortunatly insert into MV was deduplicated as well

Example 2. Inconsistency with deduplicate_blocks_in_dependent_materialized_views 1

create table test (A Int64, D Date) 
Engine =  ReplicatedMergeTree('/clickhouse/{cluster}/tables/{table}','{replica}') 
partition by toYYYYMM(D) order by A;

create materialized view test_mv 
Engine = ReplicatedSummingMergeTree('/clickhouse/{cluster}/tables/{table}','{replica}') 
partition by D order by D as 
select D, count() CNT from test group by D;

set deduplicate_blocks_in_dependent_materialized_views=1;

insert into test select number, today() from numbers(100);      -- insert 100 rows
insert into test select number, today() from numbers(100,100);  -- insert another 100 rows


select count() from test;
┌─count()─┐
     200   -- 200 rows in the source test table
└─────────┘

select sum(CNT) from test_mv;
┌─sum(CNT)─┐
      100  -- Inconsistency! The second insert was falsely deduplicated because count() was = 100 both times 
└──────────┘

Example 3. Solution: no inconsistency with deduplicate_blocks_in_dependent_materialized_views 1

Let’s add some artificial insert_id generated by the source of inserts:

create table test (A Int64, D Date, insert_id Int64) 
Engine =  ReplicatedMergeTree('/clickhouse/{cluster}/tables/{table}','{replica}') 
partition by toYYYYMM(D) order by A;

create materialized view test_mv 
Engine = ReplicatedSummingMergeTree('/clickhouse/{cluster}/tables/{table}','{replica}') 
partition by D order by D as 
select D, count() CNT, any(insert_id) insert_id from test group by D;

set deduplicate_blocks_in_dependent_materialized_views=1;

insert into test select number, today(), 333 from numbers(100);
insert into test select number, today(), 444 from numbers(100,100);


select count() from test;
┌─count()─┐
│     200 │
└─────────┘

select sum(CNT) from test_mv;
┌─sum(CNT)─┐
│      200 │ -- no inconsistency, the second (100) was not deduplicated because 333<>444
└──────────┘

set max_partitions_per_insert_block=1;   -- trick to fail insert into MV.
insert into test select number, today()+number%3, 555 from numbers(100);  
    DB::Exception: Too many partitions for single INSERT block (more than 1)


select count() from test;
┌─count()─┐
│     300 │  -- insert is successful into the test table
└─────────┘

select sum(CNT) from test_mv;
┌─sum(CNT)─┐
│      200 │  -- insert was unsuccessful into the test_mv table
└──────────┘

set max_partitions_per_insert_block=100;
insert into test select number, today()+number%3, 555 from numbers(100);   -- insert retry


select count() from test;
┌─count()─┐
│     300 │  -- insert was deduplicated
└─────────┘

select sum(CNT) from test_mv;
┌─sum(CNT)─┐
│      300 │  -- No inconsistency! Insert was not deduplicated.
└──────────┘

Idea how to fix it in Clickhouse source code https://github.com/ClickHouse/ClickHouse/issues/30240

Fake (unused) metric to add uniqueness.

create materialized view test_mv
   Engine = ReplicatedSummingMergeTree('/clickhouse/{cluster}/tables/{table}','{replica}') 
   partition by D
   order by D
  as
   select
     D,
     count() CNT,
     sum( cityHash(*) ) insert_id
  from test group by D;