跳过至主要内容

改进 Map 性能

·阅读时间:3 分钟

问题

Map 查找(例如 a['key'])使用线性复杂度(在这里提到)并且可能效率低下。这是因为从表中选择具有特定键的值需要遍历 Map 列中所有行(N)上的所有键(~M),从而导致 ~MxN 查找。

使用 Map 进行查找可能比使用 String 列慢 10 倍。下面的实验还显示冷查询慢了约 10 倍,并且处理的数据量级不同(7.21 MB vs 5.65 GB)。

-- create table with SpanNAme as String and ResourceAttributes as Map
DROP TABLE IF EXISTS tbl;
CREATE TABLE tbl (
`Timestamp` DateTime64(9) CODEC (Delta(8), ZSTD(1)),
`TraceId` String CODEC (ZSTD(1)),
`ServiceName` LowCardinality(String) CODEC (ZSTD(1)),
`Duration` UInt8 CODEC (ZSTD(1)), -- Int64
`SpanName` LowCardinality(String) CODEC (ZSTD(1)),
`ResourceAttributes` Map(LowCardinality(String), String) CODEC (ZSTD(1))
)
ENGINE = MergeTree
PARTITION BY toDate(Timestamp)
ORDER BY (ServiceName, SpanName, toUnixTimestamp(Timestamp), TraceId);

-- create UDF to generate random Map data for ResourceAttributes
DROP FUNCTION IF EXISTS genmap;
CREATE FUNCTION genmap AS (n) -> arrayMap (x-> (x::String, (x*rand32())::String), range(1, n));

-- check that genmap is working as intended
SELECT genmap(10)::Map(String, String);

-- insert 1M rows
INSERT INTO tbl
SELECT
now() - randUniform(1, 1000000.) as Timestamp,
randomPrintableASCII(2) as TraceId,
randomPrintableASCII(2) as ServiceName,
rand32() as Duration,
randomPrintableASCII(2) as SpanName,
genmap(rand64()%500)::Map(String, String) as ResourceAttributes
FROM numbers(1_000_000);

-- querying for SpanName is faster
-- [cold] 0 rows in set. Elapsed: 0.642 sec. Processed 1.00 million rows, 7.21 MB (1.56 million rows/s., 11.22 MB/s.)
-- [warm] 0 rows in set. Elapsed: 0.164 sec. Processed 1.00 million rows, 7.21 MB (6.10 million rows/s., 43.99 MB/s.)
SELECT
COUNT(*),
avg(Duration/1E6) as average,
quantile(0.95)(Duration/1E6) as p95,
quantile(0.99)(Duration/1E6) as p99,
SpanName
FROM tbl
GROUP BY SpanName ORDER BY 1 DESC LIMIT 50 FORMAT Null;

-- query for ResourceAttributes is slower
-- [cold] 0 rows in set. Elapsed: 6.432 sec. Processed 1.00 million rows, 5.65 GB (155.46 thousand rows/s., 879.07 MB/s.)
-- [warm] 0 rows in set. Elapsed: 5.935 sec. Processed 1.00 million rows, 5.65 GB (168.50 thousand rows/s., 952.81 MB/s.)
SELECT
COUNT(*),
avg(Duration/1E6) as average,
quantile(0.95)(Duration/1E6) as p95,
quantile(0.99)(Duration/1E6) as p99,
ResourceAttributes['1'] as hostname
FROM tbl
GROUP BY hostname ORDER BY 1 DESC LIMIT 50 FORMAT Null;

解决方案为了改进查询,我们可以添加另一列,其值为默认值,表示 Map 列中的特定键,然后对其进行物化以填充现有行的值。这样,我们在插入时提取并存储所需的值,从而在查询时加快查找速度。

-- solution is to add a column with value defaulting to a particular key in Map
ALTER TABLE tbl ADD COLUMN hostname LowCardinality(String) DEFAULT ResourceAttributes['1'];
ALTER TABLE tbl MATERIALIZE COLUMN hostname;

-- query for hostname (new column) is now faster
-- [cold] 0 rows in set. Elapsed: 2.215 sec. Processed 1.00 million rows, 21.67 MB (451.52 thousand rows/s., 9.78 MB/s.)
-- [warm] 0 rows in set. Elapsed: 0.541 sec. Processed 1.00 million rows, 21.67 MB (1.85 million rows/s., 40.04 MB/s.)
SELECT
COUNT(*),
avg(Duration/1E6) as average,
quantile(0.95)(Duration/1E6) as p95,
quantile(0.99)(Duration/1E6) as p99,
hostname
FROM tbl
GROUP BY hostname ORDER BY 1 DESC LIMIT 50 FORMAT Null;

-- drop cache to run query cold
SYSTEM DROP FILESYSTEM CACHE;