1 - Dictionaries & arrays
Dictionaries & arrays
Dictionary with Clickhouse table as a source
Test data
-- ClickHouse source table for the dictionary example.
DROP TABLE IF EXISTS arr_src;
CREATE TABLE arr_src
(
key UInt64,
array_int Array(Int64),
array_str Array(String)
) ENGINE = MergeTree ORDER BY key;
-- 1000 rows; each key gets 5-element arrays derived from the key.
-- Explicit column list keeps the INSERT robust against schema changes.
INSERT INTO arr_src (key, array_int, array_str) SELECT
number,
arrayMap(i -> (number * i), range(5)),
arrayMap(i -> concat('str', toString(number * i)), range(5))
FROM numbers(1000);
Dictionary
-- Dictionary reading Array columns directly from a ClickHouse table.
DROP DICTIONARY IF EXISTS arr_dict;
CREATE DICTIONARY arr_dict
(
key UInt64,
-- DEFAULT values are returned by dictGetOrDefault-style lookups
-- when an attribute has no value for a key
array_int Array(Int64) DEFAULT [1,2,3],
array_str Array(String) DEFAULT ['1','2','3']
)
PRIMARY KEY key
SOURCE(CLICKHOUSE(DATABASE 'default' TABLE 'arr_src'))
LIFETIME(120)
LAYOUT(HASHED());
-- Key 42 exists, so dictGet returns its arrays;
-- key 424242 does not, so dictGetOrDefault returns the supplied ['none'].
SELECT
dictGet('arr_dict', 'array_int', toUInt64(42)) AS res_int,
dictGetOrDefault('arr_dict', 'array_str', toUInt64(424242), ['none']) AS res_str
┌─res_int───────────┬─res_str──┐
│ [0,42,84,126,168] │ ['none'] │
└───────────────────┴──────────┘
Dictionary with PostgreSQL as a source
Test data in PG
-- The following statements run on the PostgreSQL server (psql), not in ClickHouse.
create user ch;
create database ch;
GRANT ALL PRIVILEGES ON DATABASE ch TO ch;
ALTER USER ch WITH PASSWORD 'chch';
-- Native PostgreSQL array columns; the ClickHouse POSTGRESQL dictionary
-- source maps them to Array types directly.
CREATE TABLE arr_src (
key int,
array_int integer[],
array_str text[]
);
INSERT INTO arr_src VALUES
(42, '{0,42,84,126,168}','{"str0","str42","str84","str126","str168"}'),
(66, '{0,66,132,198,264}','{"str0","str66","str132","str198","str264"}');
Dictionary Example
-- Dictionary reading PostgreSQL array columns; integer[]/text[] map to
-- Array(Int64)/Array(String) automatically.
DROP DICTIONARY IF EXISTS pg_arr_dict;
CREATE DICTIONARY pg_arr_dict
(
key UInt64,
array_int Array(Int64) DEFAULT [1,2,3],
array_str Array(String) DEFAULT ['1','2','3']
)
PRIMARY KEY key
-- Source parameter keys are written in one consistent case
-- (the original mixed PORT/HOST with lowercase user/password).
SOURCE(POSTGRESQL(PORT 5432 HOST 'pg-host'
USER 'ch' PASSWORD 'chch' DATABASE 'ch' TABLE 'arr_src'))
LIFETIME(120)
LAYOUT(HASHED());
SELECT * FROM pg_arr_dict;
┌─key─┬─array_int──────────┬─array_str───────────────────────────────────┐
│ 66 │ [0,66,132,198,264] │ ['str0','str66','str132','str198','str264'] │
│ 42 │ [0,42,84,126,168] │ ['str0','str42','str84','str126','str168'] │
└─────┴────────────────────┴─────────────────────────────────────────────┘
-- Key 42 exists; key 424242 does not, so the supplied default ['none'] is returned.
SELECT
dictGet('pg_arr_dict', 'array_int', toUInt64(42)) AS res_int,
dictGetOrDefault('pg_arr_dict', 'array_str', toUInt64(424242), ['none']) AS res_str
┌─res_int───────────┬─res_str──┐
│ [0,42,84,126,168] │ ['none'] │
└───────────────────┴──────────┘
Dictionary with MySQL as a source
Test data in MySQL
-- The following statements run on the MySQL server, not in ClickHouse.
-- MySQL has no native array type, so arrays are stored as text written in
-- ClickHouse Array-literal syntax; they are casted into CH Arrays later
-- by a supporting table on the ClickHouse side.
create table arr_src(
_key bigint(20) NOT NULL,
_array_int text,
_array_str text,
PRIMARY KEY(_key)
);
-- Doubled single quotes escape quotes inside the SQL string literal,
-- producing e.g. ['str0','str42',...] in the stored text.
INSERT INTO arr_src VALUES
(42, '[0,42,84,126,168]','[''str0'',''str42'',''str84'',''str126'',''str168'']'),
(66, '[0,66,132,198,264]','[''str0'',''str66'',''str132'',''str198'',''str264'']');
Dictionary in MySQL
-- Supporting table over the MySQL table engine; ALIAS columns cast the
-- text payload into real Array types for the dictionary to consume.
CREATE TABLE arr_src
(
    -- UInt64 matches the dictionary key type and the MySQL BIGINT column;
    -- the original UInt8 would overflow for keys greater than 255.
    `_key` UInt64,
    `_array_int` String,
    -- Array(Int64) matches the dictionary attribute type
    -- (the original cast to Array(Int32) forced an extra conversion).
    `array_int` Array(Int64) ALIAS cast(_array_int, 'Array(Int64)'),
    `_array_str` String,
    `array_str` Array(String) ALIAS cast(_array_str, 'Array(String)')
)
-- MySQL('host:port', database, table, user, password)
ENGINE = MySQL('mysql_host', 'ch', 'arr_src', 'ch', 'pass');
-- The dictionary fetches data from the supporting ClickHouse-side table
-- (CLICKHOUSE source, not MYSQL), so the ALIAS casts above are applied.
CREATE DICTIONARY mysql_arr_dict
(
_key UInt64,
array_int Array(Int64) DEFAULT [1,2,3],
array_str Array(String) DEFAULT ['1','2','3']
)
PRIMARY KEY _key
SOURCE(CLICKHOUSE(DATABASE 'default' TABLE 'arr_src'))
LIFETIME(120)
LAYOUT(HASHED());
select * from mysql_arr_dict;
┌─_key─┬─array_int──────────┬─array_str───────────────────────────────────┐
│ 66 │ [0,66,132,198,264] │ ['str0','str66','str132','str198','str264'] │
│ 42 │ [0,42,84,126,168] │ ['str0','str42','str84','str126','str168'] │
└──────┴────────────────────┴─────────────────────────────────────────────┘
SELECT
dictGet('mysql_arr_dict', 'array_int', toUInt64(42)) AS res_int,
dictGetOrDefault('mysql_arr_dict', 'array_str', toUInt64(424242), ['none']) AS res_str
┌─res_int───────────┬─res_str──┐
│ [0,42,84,126,168] │ ['none'] │
└───────────────────┴──────────┘
SELECT
dictGet('mysql_arr_dict', 'array_int', toUInt64(66)) AS res_int,
dictGetOrDefault('mysql_arr_dict', 'array_str', toUInt64(66), ['none']) AS res_str
┌─res_int────────────┬─res_str─────────────────────────────────────┐
│ [0,66,132,198,264] │ ['str0','str66','str132','str198','str264'] │
└────────────────────┴─────────────────────────────────────────────┘
2 - Dictionary on the top of several tables using VIEW
Dictionary on the top of several tables using VIEW
DROP TABLE IF EXISTS dictionary_source_en;
DROP TABLE IF EXISTS dictionary_source_ru;
DROP TABLE IF EXISTS dictionary_source_view;
DROP DICTIONARY IF EXISTS flat_dictionary;
-- English values, one table per language
CREATE TABLE dictionary_source_en
(
id UInt64,
value String
) ENGINE = TinyLog;
INSERT INTO dictionary_source_en VALUES (1, 'One'), (2,'Two'), (3, 'Three');
-- Russian values, same ids
CREATE TABLE dictionary_source_ru
(
id UInt64,
value String
) ENGINE = TinyLog;
INSERT INTO dictionary_source_ru VALUES (1, 'Один'), (2,'Два'), (3, 'Три');
-- The view joins both tables into a single row per id, so one dictionary
-- can serve attributes that live in several underlying tables.
CREATE VIEW dictionary_source_view AS SELECT id, dictionary_source_en.value as value_en, dictionary_source_ru.value as value_ru FROM dictionary_source_en LEFT JOIN dictionary_source_ru USING (id);
select * from dictionary_source_view;
-- Dictionary built on top of the view rather than on any single table.
CREATE DICTIONARY flat_dictionary
(
id UInt64,
value_en String,
value_ru String
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' PASSWORD '' TABLE 'dictionary_source_view'))
LIFETIME(MIN 1 MAX 1000)
-- NOTE(review): FLAT layout expects small, dense numeric keys — fine for
-- ids 1..3 here; verify key range before reusing with larger data.
LAYOUT(FLAT());
-- numbers(3) yields 0,1,2, so number + 1 probes keys 1..3
SELECT
dictGet(concat(currentDatabase(), '.flat_dictionary'), 'value_en', number + 1),
dictGet(concat(currentDatabase(), '.flat_dictionary'), 'value_ru', number + 1)
FROM numbers(3);
3 - Example of PostgreSQL dictionary
Example of PostgreSQL dictionary
-- Minimal PostgreSQL-sourced dictionary.
CREATE DICTIONARY postgres_dict
(
id UInt32,
value String
)
PRIMARY KEY id
SOURCE(
POSTGRESQL(
port 5432
host 'postgres1'
user 'postgres'
password 'mysecretpassword'
db 'clickhouse'
-- schema-qualified table name is passed as a single string
table 'test_schema.test_table'
)
)
-- refresh the dictionary at a random interval between 300 and 600 seconds
LIFETIME(MIN 300 MAX 600)
LAYOUT(HASHED());
and later do
-- The dictionary name must be passed as a string literal; the key for a
-- simple-layout dictionary is always UInt64.
SELECT dictGetString('postgres_dict', 'value', toUInt64(1));
4 - MySQL8 source for dictionaries
MySQL8 source for dictionaries
Authorization
MySQL 8 uses caching_sha2_password as its default authentication plugin.
Unfortunately, the libmysql version currently used by ClickHouse (21.4 and earlier) does not support it.
You can work around this by creating a dedicated user with the mysql_native_password
authentication plugin.
-- On the MySQL server: create a dedicated user that authenticates with
-- mysql_native_password, which the ClickHouse MySQL client library supports.
CREATE USER IF NOT EXISTS 'clickhouse'@'%'
IDENTIFIED WITH mysql_native_password BY 'clickhouse_user_password';
CREATE DATABASE IF NOT EXISTS test;
GRANT ALL PRIVILEGES ON test.* TO 'clickhouse'@'%';
Table schema changes
As an example, ClickHouse runs SHOW TABLE STATUS LIKE 'table_name'
and tries to figure out from the Update_time field of the MySQL response whether the table schema has changed.
By default, for data to load properly from a MySQL 8 source into dictionaries, please turn off the information_schema
cache.
You can change the default behavior by creating /etc/mysql/conf.d/information_schema_cache.cnf
with the following content:
[mysqld]
information_schema_stats_expiry=0
Or set it via an SQL query:
SET GLOBAL information_schema_stats_expiry=0;
5 - Partial updates
Partial updates
ClickHouse is able to fetch only updated rows from a source. To enable this, you need to define the update_field
setting.
As an example, we have a table in an external source (MySQL, PG, HTTP, …) defined with the following code sample:
-- Example source table (lives in the external source: MySQL, PG, HTTP, …).
CREATE TABLE cities
(
`polygon` Array(Tuple(Float64, Float64)),
`city` String,
-- must be refreshed on every insert/update; the dictionary uses this
-- column to fetch only changed rows
`updated_at` DateTime DEFAULT now()
)
ENGINE = MergeTree ORDER BY city
When you add new rows or update existing rows in this table, you should update updated_at
with the new timestamp.
-- fetch updated rows every 30 seconds (LIFETIME MIN 30 MAX 30 below)
CREATE DICTIONARY cities_dict (
polygon Array(Tuple(Float64, Float64)),
city String
)
PRIMARY KEY polygon
SOURCE(CLICKHOUSE( TABLE cities DB 'default'
-- update_field makes reloads incremental: only rows whose updated_at is
-- at or after the last successful update (minus a shift) are fetched
update_field 'updated_at'))
LAYOUT(POLYGON())
LIFETIME(MIN 30 MAX 30)
A dictionary with update_field updated_at
will fetch only updated rows. The dictionary saves the current time (now()) as the time of the last successful update, and then queries the source for rows where updated_at >= previous_update - 1
(a shift of 1 sec.).
In the case of an HTTP source, ClickHouse will send GET requests with update_field as a URL parameter, e.g. &updated_at=2020-01-01%2000:01:01
6 - range_hashed example - open intervals
range_hashed example - open intervals
The following example shows a range_hashed
example at open intervals.
DROP TABLE IF EXISTS rates;
DROP DICTIONARY IF EXISTS rates_dict;
-- Nullable boundaries model open intervals:
-- NULL date_start = "from the beginning", NULL date_end = "until forever".
CREATE TABLE rates (
id UInt64,
date_start Nullable(Date),
date_end Nullable(Date),
rate Decimal64(4)
) engine=Log;
INSERT INTO rates VALUES (1, Null, '2021-03-13',99), (1, '2021-03-14','2021-03-16',100), (1, '2021-03-17', Null, 101), (2, '2021-03-14', Null, 200), (3, Null, '2021-03-14', 300), (4, '2021-03-14', '2021-03-14', 400);
CREATE DICTIONARY rates_dict
(
id UInt64,
-- non-Nullable Date in the dictionary: source NULLs load as 1970-01-01,
-- which the RANGE_HASHED lookup treats as an open boundary
-- (see the query results below)
date_start Date,
date_end Date,
rate Decimal64(4)
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rates'))
LIFETIME(MIN 1 MAX 1000)
LAYOUT(RANGE_HASHED())
RANGE(MIN date_start MAX date_end);
SELECT * FROM rates_dict order by id, date_start;
┌─id─┬─date_start─┬───date_end─┬─────rate─┐
│ 1 │ 1970-01-01 │ 2021-03-13 │ 99.0000 │
│ 1 │ 2021-03-14 │ 2021-03-16 │ 100.0000 │
│ 1 │ 2021-03-17 │ 1970-01-01 │ 101.0000 │
│ 2 │ 2021-03-14 │ 1970-01-01 │ 200.0000 │
│ 3 │ 1970-01-01 │ 2021-03-14 │ 300.0000 │
│ 4 │ 2021-03-14 │ 2021-03-14 │ 400.0000 │
└────┴────────────┴────────────┴──────────┘
-- Probe every id across 10 consecutive dates; 0.0000 in the output means
-- "no range matched this date for this id".
WITH
toDate('2021-03-10') + INTERVAL number DAY as date
select
date,
dictGet(currentDatabase() || '.rates_dict', 'rate', toUInt64(1), date) as rate1,
dictGet(currentDatabase() || '.rates_dict', 'rate', toUInt64(2), date) as rate2,
dictGet(currentDatabase() || '.rates_dict', 'rate', toUInt64(3), date) as rate3,
dictGet(currentDatabase() || '.rates_dict', 'rate', toUInt64(4), date) as rate4
FROM numbers(10);
┌───────date─┬────rate1─┬────rate2─┬────rate3─┬────rate4─┐
│ 2021-03-10 │ 99.0000 │ 0.0000 │ 300.0000 │ 0.0000 │
│ 2021-03-11 │ 99.0000 │ 0.0000 │ 300.0000 │ 0.0000 │
│ 2021-03-12 │ 99.0000 │ 0.0000 │ 300.0000 │ 0.0000 │
│ 2021-03-13 │ 99.0000 │ 0.0000 │ 300.0000 │ 0.0000 │
│ 2021-03-14 │ 100.0000 │ 200.0000 │ 300.0000 │ 400.0000 │
│ 2021-03-15 │ 100.0000 │ 200.0000 │ 0.0000 │ 0.0000 │
│ 2021-03-16 │ 100.0000 │ 200.0000 │ 0.0000 │ 0.0000 │
│ 2021-03-17 │ 101.0000 │ 200.0000 │ 0.0000 │ 0.0000 │
│ 2021-03-18 │ 101.0000 │ 200.0000 │ 0.0000 │ 0.0000 │
│ 2021-03-19 │ 101.0000 │ 200.0000 │ 0.0000 │ 0.0000 │
└────────────┴──────────┴──────────┴──────────┴──────────┘
7 - Security named collections
Security named collections
Dictionary with Clickhouse table as a source with named collections
Data for connecting to external sources can be stored in named collections
<clickhouse>
<named_collections>
<local_host>
<host>localhost</host>
<port>9000</port>
<database>default</database>
<user>ch_dict</user>
<password>mypass</password>
</local_host>
</named_collections>
</clickhouse>
Dictionary
DROP DICTIONARY IF EXISTS named_coll_dict;
-- Connection credentials come from the `local_host` named collection in the
-- server config, so the DDL contains no plaintext host/user/password.
CREATE DICTIONARY named_coll_dict
(
key UInt64,
val String
)
PRIMARY KEY key
SOURCE(CLICKHOUSE(NAME local_host TABLE my_table DB default))
LIFETIME(MIN 1 MAX 2)
LAYOUT(HASHED());
-- my_table must already exist in `default` with columns (key UInt64, val String)
INSERT INTO my_table(key, val) VALUES(1, 'first row');
-- The attribute is named `val`; the original example queried a
-- nonexistent attribute 'b', which would fail.
SELECT dictGet('named_coll_dict', 'val', 1);
┌─dictGet('named_coll_dict', 'val', 1)─┐
│ first row                            │
└──────────────────────────────────────┘
8 - SPARSE_HASHED VS HASHED vs HASHED_ARRAY
SPARSE_HASHED VS HASHED VS HASHED_ARRAY
The SPARSE_HASHED and HASHED_ARRAY layouts are supposed to save memory but have some downsides. We can test this with the following:
-- 5M-row source table; price is all zeros because only memory footprint
-- and lookup speed are being compared, not the values themselves.
CREATE TABLE orders (id UInt64, price Float64)
ENGINE = MergeTree() ORDER BY id;

INSERT INTO orders SELECT number, 0 FROM numbers(5000000);

-- Three dictionaries identical except for the layout under test.
CREATE DICTIONARY orders_hashed (id UInt64, price Float64)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 TABLE orders DB 'default' USER 'default'))
LIFETIME(MIN 0 MAX 0)
LAYOUT(HASHED());

CREATE DICTIONARY orders_sparse (id UInt64, price Float64)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 TABLE orders DB 'default' USER 'default'))
LIFETIME(MIN 0 MAX 0)
LAYOUT(SPARSE_HASHED());

CREATE DICTIONARY orders_hashed_array (id UInt64, price Float64)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 TABLE orders DB 'default' USER 'default'))
LIFETIME(MIN 0 MAX 0)
LAYOUT(HASHED_ARRAY());
-- Compare load status and memory footprint of the three layouts.
SELECT
name,
type,
status,
element_count,
formatReadableSize(bytes_allocated) AS RAM
FROM system.dictionaries
WHERE name LIKE 'orders%'
┌─name────────────────┬─type─────────┬─status─┬─element_count─┬─RAM────────┐
│ orders_hashed_array │ HashedArray │ LOADED │ 5000000 │ 68.77 MiB │
│ orders_sparse │ SparseHashed │ LOADED │ 5000000 │ 76.30 MiB │
│ orders_hashed │ Hashed │ LOADED │ 5000000 │ 256.00 MiB │
└─────────────────────┴──────────────┴────────┴───────────────┴────────────┘
SELECT sum(dictGet('default.orders_hashed', 'price', toUInt64(number))) AS res
FROM numbers(10000000)
┌─res─┐
│ 0 │
└─────┘
1 rows in set. Elapsed: 0.546 sec. Processed 10.01 million rows ...
SELECT sum(dictGet('default.orders_sparse', 'price', toUInt64(number))) AS res
FROM numbers(10000000)
┌─res─┐
│ 0 │
└─────┘
1 rows in set. Elapsed: 1.422 sec. Processed 10.01 million rows ...
SELECT sum(dictGet('default.orders_hashed_array', 'price', toUInt64(number))) AS res
FROM numbers(10000000)
┌─res─┐
│ 0 │
└─────┘
1 rows in set. Elapsed: 0.558 sec. Processed 10.01 million rows ...
As you can see, SPARSE_HASHED is memory efficient and uses about 3 times less memory (!!!) but is almost 3 times slower as well. On the other hand, HASHED_ARRAY is even more efficient in terms of memory usage and maintains almost the same performance as the HASHED layout.