1 - Dictionaries & arrays

Dictionaries & arrays

Dictionary with Clickhouse table as a source

Test data

DROP TABLE IF EXISTS arr_src;
CREATE TABLE arr_src
(
    key UInt64,
    array_int Array(Int64),
    array_str Array(String)
) ENGINE = MergeTree order by key;

INSERT INTO arr_src SELECT
    number,
    arrayMap(i -> (number * i), range(5)),
    arrayMap(i -> concat('str', toString(number * i)), range(5))
FROM numbers(1000);

Dictionary

DROP DICTIONARY IF EXISTS arr_dict;
CREATE DICTIONARY arr_dict
(
    key UInt64,
    array_int Array(Int64) DEFAULT [1,2,3],
    array_str Array(String) DEFAULT ['1','2','3']
)
PRIMARY KEY key
SOURCE(CLICKHOUSE(DATABASE 'default' TABLE 'arr_src'))
LIFETIME(120)
LAYOUT(HASHED());

SELECT
    dictGet('arr_dict', 'array_int', toUInt64(42)) AS res_int,
    dictGetOrDefault('arr_dict', 'array_str', toUInt64(424242), ['none']) AS res_str

┌─res_int───────────┬─res_str──┐
 [0,42,84,126,168]  ['none'] 
└───────────────────┴──────────┘

Dictionary with PostgreSQL as a source

Test data in PG

create user ch;
create database ch;
GRANT ALL PRIVILEGES ON DATABASE ch TO ch;
ALTER USER ch WITH PASSWORD 'chch';

CREATE TABLE arr_src (
    key int,
    array_int   integer[],
    array_str   text[]
);

INSERT INTO arr_src VALUES
  (42, '{0,42,84,126,168}','{"str0","str42","str84","str126","str168"}'),
  (66, '{0,66,132,198,264}','{"str0","str66","str132","str198","str264"}');

Dictionary Example

CREATE DICTIONARY pg_arr_dict
(
    key UInt64,
    array_int Array(Int64) DEFAULT [1,2,3],
    array_str Array(String) DEFAULT ['1','2','3']
)
PRIMARY KEY key
SOURCE(POSTGRESQL(PORT 5432 HOST 'pg-host'
         user 'ch' password 'chch'  DATABASE  'ch' TABLE 'arr_src'))
LIFETIME(120)
LAYOUT(HASHED());

select * from pg_arr_dict;
┌─key─┬─array_int──────────┬─array_str───────────────────────────────────┐
  66  [0,66,132,198,264]  ['str0','str66','str132','str198','str264'] 
  42  [0,42,84,126,168]   ['str0','str42','str84','str126','str168']  
└─────┴────────────────────┴─────────────────────────────────────────────┘

SELECT
    dictGet('pg_arr_dict', 'array_int', toUInt64(42)) AS res_int,
    dictGetOrDefault('pg_arr_dict', 'array_str', toUInt64(424242), ['none']) AS res_str

┌─res_int───────────┬─res_str──┐
 [0,42,84,126,168]  ['none'] 
└───────────────────┴──────────┘

Dictionary with MySQL as a source

Test data in MySQL

-- casted into CH Arrays

create table arr_src(
   _key bigint(20) NOT NULL, 
   _array_int text, 
   _array_str text, 
   PRIMARY KEY(_key)
);

INSERT INTO arr_src VALUES 
  (42, '[0,42,84,126,168]','[''str0'',''str42'',''str84'',''str126'',''str168'']'),
  (66, '[0,66,132,198,264]','[''str0'',''str66'',''str132'',''str198'',''str264'']');

Dictionary in MySQL

-- supporting table to cast data
CREATE TABLE arr_src
(
    `_key` UInt8,
    `_array_int` String,
    `array_int` Array(Int32) ALIAS cast(_array_int, 'Array(Int32)'),
    `_array_str` String,
    `array_str` Array(String) ALIAS cast(_array_str, 'Array(String)')
)
ENGINE = MySQL('mysql_host', 'ch', 'arr_src', 'ch', 'pass');

-- dictionary fetches data from the supporting table
CREATE DICTIONARY mysql_arr_dict
(
    _key UInt64,
    array_int Array(Int64) DEFAULT [1,2,3],
    array_str Array(String) DEFAULT ['1','2','3']
)
PRIMARY KEY _key
SOURCE(CLICKHOUSE(DATABASE 'default' TABLE 'arr_src'))
LIFETIME(120)
LAYOUT(HASHED());


select * from mysql_arr_dict;
┌─_key─┬─array_int──────────┬─array_str───────────────────────────────────┐
   66  [0,66,132,198,264]  ['str0','str66','str132','str198','str264'] 
   42  [0,42,84,126,168]   ['str0','str42','str84','str126','str168']  
└──────┴────────────────────┴─────────────────────────────────────────────┘


SELECT
    dictGet('mysql_arr_dict', 'array_int', toUInt64(42)) AS res_int,
    dictGetOrDefault('mysql_arr_dict', 'array_str', toUInt64(424242), ['none']) AS res_str

┌─res_int───────────┬─res_str──┐
 [0,42,84,126,168]  ['none'] 
└───────────────────┴──────────┘

SELECT
    dictGet('mysql_arr_dict', 'array_int', toUInt64(66)) AS res_int,
    dictGetOrDefault('mysql_arr_dict', 'array_str', toUInt64(66), ['none']) AS res_str

┌─res_int────────────┬─res_str─────────────────────────────────────┐
 [0,66,132,198,264]  ['str0','str66','str132','str198','str264'] 
└────────────────────┴─────────────────────────────────────────────┘

2 - Dictionary on the top of the several tables using VIEW

Dictionary on the top of the several tables using VIEW

DROP TABLE IF EXISTS dictionary_source_en;
DROP TABLE IF EXISTS dictionary_source_ru;
DROP TABLE IF EXISTS dictionary_source_view;
DROP DICTIONARY IF EXISTS flat_dictionary;

CREATE TABLE dictionary_source_en
(
    id UInt64,
    value String
) ENGINE = TinyLog;

INSERT INTO dictionary_source_en VALUES (1, 'One'), (2,'Two'), (3, 'Three');

CREATE TABLE dictionary_source_ru
(
    id UInt64,
    value String
) ENGINE = TinyLog;

INSERT INTO dictionary_source_ru VALUES (1, 'Один'), (2,'Два'), (3, 'Три');

CREATE VIEW dictionary_source_view AS  SELECT id, dictionary_source_en.value as value_en, dictionary_source_ru.value as value_ru  FROM  dictionary_source_en LEFT JOIN dictionary_source_ru USING (id);

select * from dictionary_source_view;

CREATE DICTIONARY flat_dictionary
(
    id UInt64,
    value_en String,
    value_ru String
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' PASSWORD '' TABLE 'dictionary_source_view'))
LIFETIME(MIN 1 MAX 1000)
LAYOUT(FLAT());

SELECT
    dictGet(concat(currentDatabase(), '.flat_dictionary'), 'value_en', number + 1),
    dictGet(concat(currentDatabase(), '.flat_dictionary'), 'value_ru', number + 1)
FROM numbers(3);

3 - Example of PostgreSQL dictionary

Example of PostgreSQL dictionary
CREATE DICTIONARY postgres_dict
(
    id UInt32,
    value String
)
PRIMARY KEY id
SOURCE(
    POSTGRESQL(
        port 5432
        host 'postgres1'
        user  'postgres'
        password 'mysecretpassword'
        db 'clickhouse'
        table 'test_schema.test_table'
    )
)
LIFETIME(MIN 300 MAX 600)
LAYOUT(HASHED());

and later do

SELECT dictGetString(postgres_dict, 'value', toUInt64(1))

4 - MySQL8 source for dictionaries

MySQL8 source for dictionaries

Authorization

MySQL8 used default authorization plugin caching_sha2_password. Unfortunately, libmysql which currently used (21.4-) in clickhouse is not.

You can fix it during create custom user with mysql_native_password authentication plugin.

CREATE USER IF NOT EXISTS 'clickhouse'@'%'
IDENTIFIED WITH mysql_native_password BY 'clickhouse_user_password';

CREATE DATABASE IF NOT EXISTS test;

GRANT ALL PRIVILEGES ON test.* TO 'clickhouse'@'%';

Table schema changes

As an example, in ClickHouse, run SHOW TABLE STATUS LIKE 'table_name' and try to figure out was table schema changed or not from MySQL response field Update_time.

By default, to properly data loading from MySQL8 source to dictionaries, please turn off the information_schema cache.

You can change default behavior with create /etc/mysql/conf.d/information_schema_cache.cnfwith following content:

[mysqld]
information_schema_stats_expiry=0

Or setup it via SQL query:

SET GLOBAL information_schema_stats_expiry=0;

5 - Partial updates

Partial updates

Clickhouse is able to fetch from a source only updated rows. You need to define update_field section.

As an example, We have a table in an external source MySQL, PG, HTTP, … defined with the following code sample:

CREATE TABLE cities
(
    `polygon` Array(Tuple(Float64, Float64)),
    `city` String,
    `updated_at` DateTime DEFAULT now()
)
ENGINE = MergeTree ORDER BY city

When you add new row and update some rows in this table you should update updated_at with the new timestamp.

-- fetch updated rows every 30 seconds

CREATE DICTIONARY cities_dict (
    polygon Array(Tuple(Float64, Float64)),
    city String
)
PRIMARY KEY polygon
SOURCE(CLICKHOUSE( TABLE cities DB 'default'
                    update_field 'updated_at'))
LAYOUT(POLYGON())
LIFETIME(MIN 30 MAX 30)

A dictionary with update_field updated_at will fetch only updated rows. A dictionary saves the current time (now) time of the last successful update and queries the source where updated_at >= previous_update - 1 (shift = 1 sec.).

In case of HTTP source Clickhouse will send get requests with update_field as an URL parameter &updated_at=2020-01-01%2000:01:01

6 - range_hashed example - open intervals

range_hashed example - open intervals

The following example shows a range_hashed example at open intervals.

DROP TABLE IF EXISTS rates;

DROP DICTIONARY IF EXISTS rates_dict;

CREATE TABLE rates (
  id UInt64,
  date_start Nullable(Date),
  date_end Nullable(Date),
  rate Decimal64(4)
) engine=Log;

INSERT INTO rates VALUES (1, Null, '2021-03-13',99), (1, '2021-03-14','2021-03-16',100), (1, '2021-03-17', Null, 101), (2, '2021-03-14', Null, 200), (3, Null, '2021-03-14', 300), (4, '2021-03-14', '2021-03-14', 400);

CREATE DICTIONARY rates_dict
(
  id UInt64,
  date_start Date,
  date_end Date,
  rate Decimal64(4)
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rates'))
LIFETIME(MIN 1 MAX 1000)
LAYOUT(RANGE_HASHED())
RANGE(MIN date_start MAX date_end);

SELECT * FROM rates_dict order by id, date_start;

┌─id─┬─date_start─┬───date_end─┬─────rate─┐
  1  1970-01-01  2021-03-13   99.0000 
  1  2021-03-14  2021-03-16  100.0000 
  1  2021-03-17  1970-01-01  101.0000 
  2  2021-03-14  1970-01-01  200.0000 
  3  1970-01-01  2021-03-14  300.0000 
  4  2021-03-14  2021-03-14  400.0000 
└────┴────────────┴────────────┴──────────┘

WITH
  toDate('2021-03-10') + INTERVAL number DAY as date
select
  date,
  dictGet(currentDatabase() || '.rates_dict', 'rate', toUInt64(1), date) as rate1,
  dictGet(currentDatabase() || '.rates_dict', 'rate', toUInt64(2), date) as rate2,
  dictGet(currentDatabase() || '.rates_dict', 'rate', toUInt64(3), date) as rate3,
  dictGet(currentDatabase() || '.rates_dict', 'rate', toUInt64(4), date) as rate4
FROM numbers(10);

┌───────date─┬────rate1─┬────rate2─┬────rate3─┬────rate4─┐
 2021-03-10   99.0000    0.0000  300.0000    0.0000 
 2021-03-11   99.0000    0.0000  300.0000    0.0000 
 2021-03-12   99.0000    0.0000  300.0000    0.0000 
 2021-03-13   99.0000    0.0000  300.0000    0.0000 
 2021-03-14  100.0000  200.0000  300.0000  400.0000 
 2021-03-15  100.0000  200.0000    0.0000    0.0000 
 2021-03-16  100.0000  200.0000    0.0000    0.0000 
 2021-03-17  101.0000  200.0000    0.0000    0.0000 
 2021-03-18  101.0000  200.0000    0.0000    0.0000 
 2021-03-19  101.0000  200.0000    0.0000    0.0000 
└────────────┴──────────┴──────────┴──────────┴──────────┘

7 - SPARSE_HASHED VS HASHED

SPARSE_HASHED VS HASHED

Sparse_hashed layout is supposed to save memory but has some downsides. We can test how much slower SPARSE_HASHED than HASHED is with the following:

create table orders(id UInt64, price Float64)
Engine = MergeTree() order by id;

insert into orders select number, 0 from numbers(5000000);

CREATE DICTIONARY orders_hashed (id UInt64, price Float64)
PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000
TABLE orders DB 'default' USER 'default'))
LIFETIME(MIN 0 MAX 0) LAYOUT(HASHED());

CREATE DICTIONARY orders_sparse (id UInt64, price Float64)
PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000
TABLE orders DB 'default' USER 'default'))
LIFETIME(MIN 0 MAX 0) LAYOUT(SPARSE_HASHED());

SELECT
    name,
    type,
    status,
    element_count,
    formatReadableSize(bytes_allocated) AS RAM
FROM system.dictionaries
WHERE name LIKE 'orders%'
┌─name──────────┬─type─────────┬─status─┬─element_count─┬─RAM────────┐
 orders_sparse  SparseHashed  LOADED        5000000  84.29 MiB  
 orders_hashed  Hashed        LOADED        5000000  256.00 MiB 
└───────────────┴──────────────┴────────┴───────────────┴────────────┘

SELECT sum(dictGet('default.orders_hashed', 'price', toUInt64(number))) AS res
FROM numbers(10000000)
┌─res─┐
   0 
└─────┘
1 rows in set. Elapsed: 0.279 sec. Processed 10.02 million rows ...

SELECT sum(dictGet('default.orders_sparse', 'price', toUInt64(number))) AS res
FROM numbers(10000000)
┌─res─┐
   0 
└─────┘
1 rows in set. Elapsed: 1.085 sec. Processed 10.02 million rows ...

As you can see SPARSE_HASHED is memory efficient and use about 3 times less memory (!!!) but is almost 4 times slower. But this is the ultimate case because this test does not read data from the disk (no MergeTree table involved).

We encourage you to test SPARSE_HASHED against your real queries, because it able to save a lot of memory and have larger (in rows) external dictionaries.