跳至主要内容

故障排除的实用查询

·阅读时长 4 分钟

以下是一些用于故障排除 ClickHouse 和找出问题所在的有用查询,但顺序不限。我们还提供了一个很棒的博客,其中包含一些 用于监控 ClickHouse 的基本查询

查看哪些设置已从默认设置更改

SELECT
name,
value
FROM system.settings
WHERE changed

获取所有表的尺寸

SELECT table,
formatReadableSize(sum(bytes)) as size
FROM system.parts
WHERE active
GROUP BY table

响应看起来像

┌─table───────────┬─size──────┐
│ stat │ 38.89 MiB │
│ customers │ 525.00 B │
│ my_sparse_table │ 40.73 MiB │
│ crypto_prices │ 32.18 MiB │
│ hackernews │ 6.23 GiB │
└─────────────────┴───────────┘

表的行数和平均日尺寸

SELECT
table,
formatReadableSize(size) AS size,
rows,
days,
formatReadableSize(avgDaySize) AS avgDaySize
FROM
(
SELECT
table,
sum(bytes) AS size,
sum(rows) AS rows,
min(min_date) AS min_date,
max(max_date) AS max_date,
max_date - min_date AS days,
size / (max_date - min_date) AS avgDaySize
FROM system.parts
WHERE active
GROUP BY table
ORDER BY rows DESC
)

压缩列的百分比以及主索引在内存中的尺寸

您可以查看数据的压缩程度,按列区分。此查询还返回主索引在内存中的尺寸,因为主索引必须适合内存。这是需要了解的。

SELECT
parts.*,
columns.compressed_size,
columns.uncompressed_size,
columns.compression_ratio,
columns.compression_percentage
FROM
(
SELECT
table,
formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed_size,
formatReadableSize(sum(data_compressed_bytes)) AS compressed_size,
round(sum(data_compressed_bytes) / sum(data_uncompressed_bytes), 3) AS compression_ratio,
round(100 - ((sum(data_compressed_bytes) * 100) / sum(data_uncompressed_bytes)), 3) AS compression_percentage
FROM system.columns
GROUP BY table
) AS columns
RIGHT JOIN
(
SELECT
table,
sum(rows) AS rows,
max(modification_time) AS latest_modification,
formatReadableSize(sum(bytes)) AS disk_size,
formatReadableSize(sum(primary_key_bytes_in_memory)) AS primary_keys_size,
any(engine) AS engine,
sum(bytes) AS bytes_size
FROM system.parts
WHERE active
GROUP BY
database,
table
) AS parts ON columns.table = parts.table
ORDER BY parts.bytes_size DESC

客户端在过去 10 分钟内发送的查询数量

您可以随意增加或减少 toIntervalMinute(10) 函数中的时间间隔

SELECT
client_name,
count(),
query_kind,
toStartOfMinute(event_time) AS event_time_m
FROM system.query_log
WHERE (type = 'QueryStart') AND (event_time > (now() - toIntervalMinute(10)))
GROUP BY
event_time_m,
client_name,
query_kind
ORDER BY
event_time_m DESC,
count() ASC

每个分区中的部分数量

SELECT
concat(database, '.', table),
partition_id,
count()
FROM system.parts
WHERE active
GROUP BY
database,
table,
partition_id

查找长时间运行的查询

这有助于查找卡住的查询

SELECT
elapsed,
initial_user,
client_name,
hostname(),
query_id,
query
FROM clusterAllReplicas(default, system.processes)
ORDER BY elapsed DESC

使用运行最差的查询的查询 ID,我们可以获取一个堆栈跟踪,这在调试时很有用。

SET allow_introspection_functions=1;

SELECT
arrayStringConcat(
arrayMap(
x,
y -> concat(x, ': ', y),
arrayMap(x -> addressToLine(x), trace),
arrayMap(x -> demangle(addressToSymbol(x)), trace)
),
'\n'
) as trace
FROM
system.stack_trace
WHERE
query_id = '0bb6e88b-9b9a-4ffc-b612-5746c859e360';

查看最近的错误

SELECT *
FROM system.errors
ORDER BY last_error_time DESC

响应看起来像

┌─name──────────────────┬─code─┬─value─┬─────last_error_time─┬─last_error_message──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─last_error_trace─┬─remote─┐
│ UNKNOWN_TABLE │ 60 │ 3 │ 2023-03-14 01:02:35 │ Table system.stack_trace doesn't exist │ [] │ 0 │
│ BAD_GET │ 170 │ 1 │ 2023-03-14 00:58:55 │ Requested cluster 'default' not found │ [] │ 0 │
│ UNKNOWN_IDENTIFIER │ 47 │ 1 │ 2023-03-14 00:49:12 │ Missing columns: 'parts.table' 'table' while processing query: 'table = parts.table', required columns: 'table' 'parts.table' 'table' 'parts.table' │ [] │ 0 │
│ NO_ELEMENTS_IN_CONFIG │ 139 │ 2 │ 2023-03-14 00:42:11 │ Certificate file is not set. │ [] │ 0 │
└───────────────────────┴──────┴───────┴─────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────────┴────────┘

使用最多 CPU 和内存的前 10 个查询

SELECT
type,
event_time,
initial_query_id,
formatReadableSize(memory_usage) AS memory,
`ProfileEvents.Values`[indexOf(`ProfileEvents.Names`, 'UserTimeMicroseconds')] AS userCPU,
`ProfileEvents.Values`[indexOf(`ProfileEvents.Names`, 'SystemTimeMicroseconds')] AS systemCPU,
normalizedQueryHash(query) AS normalized_query_hash
FROM system.query_log
ORDER BY memory_usage DESC
LIMIT 10

我的投影使用了多少磁盘空间

SELECT
name,
parent_name,
formatReadableSize(bytes_on_disk) AS bytes,
formatReadableSize(parent_bytes_on_disk) AS parent_bytes,
bytes_on_disk / parent_bytes_on_disk AS ratio
FROM system.projection_parts

显示系统中所有数据库的磁盘存储空间、分区数量、system.parts 表中的行数以及标记

SELECT
database,
table,
partition,
count() AS parts,
formatReadableSize(sum(bytes_on_disk)) AS bytes_on_disk,
formatReadableQuantity(sum(rows)) AS rows,
sum(marks) AS marks
FROM system.parts
WHERE (database != 'system') AND active
GROUP BY
database,
table,
partition
ORDER BY database ASC

列出最近写入的新分区的详细信息

详细信息包括创建时间、大小、行数等

SELECT
modification_time,
rows,
formatReadableSize(bytes_on_disk),
*
FROM clusterAllReplicas(default, system.parts)
WHERE (database = 'default') AND active AND (level = 0)
ORDER BY modification_time DESC
LIMIT 100