跳到主要内容

故障排除的实用查询

·4 分钟阅读
一系列用于故障排除 ClickHouse 的实用查询,包括监控表大小、长时间运行的查询和错误。

故障排除的实用查询

这里有一些用于故障排除 ClickHouse 并找出正在发生的事情的实用查询,排名不分先后。

我们还有一个很棒的博客,其中包含一些用于监控 ClickHouse 的基本查询

查看哪些设置已从默认值更改

SELECT
name,
value
FROM system.settings
WHERE changed

获取所有表的大小

SELECT table,
formatReadableSize(sum(bytes)) as size
FROM system.parts
WHERE active
GROUP BY table

响应如下所示

┌─table───────────┬─size──────┐
│ stat │ 38.89 MiB │
│ customers │ 525.00 B │
│ my_sparse_table │ 40.73 MiB │
│ crypto_prices │ 32.18 MiB │
│ hackernews │ 6.23 GiB │
└─────────────────┴───────────┘

您的表的行数和平均每日大小

SELECT
table,
formatReadableSize(size) AS size,
rows,
days,
formatReadableSize(avgDaySize) AS avgDaySize
FROM
(
SELECT
table,
sum(bytes) AS size,
sum(rows) AS rows,
min(min_date) AS min_date,
max(max_date) AS max_date,
max_date - min_date AS days,
size / (max_date - min_date) AS avgDaySize
FROM system.parts
WHERE active
GROUP BY table
ORDER BY rows DESC
)

压缩列百分比以及内存中主索引的大小

您可以查看数据的按列压缩程度。此查询还返回内存中主索引的大小 - 了解这一点很有用,因为主索引必须适合内存。

SELECT
parts.*,
columns.compressed_size,
columns.uncompressed_size,
columns.compression_ratio,
columns.compression_percentage
FROM
(
SELECT
table,
formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed_size,
formatReadableSize(sum(data_compressed_bytes)) AS compressed_size,
round(sum(data_compressed_bytes) / sum(data_uncompressed_bytes), 3) AS compression_ratio,
round(100 - ((sum(data_compressed_bytes) * 100) / sum(data_uncompressed_bytes)), 3) AS compression_percentage
FROM system.columns
GROUP BY table
) AS columns
RIGHT JOIN
(
SELECT
table,
sum(rows) AS rows,
max(modification_time) AS latest_modification,
formatReadableSize(sum(bytes)) AS disk_size,
formatReadableSize(sum(primary_key_bytes_in_memory)) AS primary_keys_size,
any(engine) AS engine,
sum(bytes) AS bytes_size
FROM system.parts
WHERE active
GROUP BY
database,
table
) AS parts ON columns.table = parts.table
ORDER BY parts.bytes_size DESC

过去 10 分钟内客户端发送的查询数

可以随意增加或减少 toIntervalMinute(10) 函数中的时间间隔

SELECT
client_name,
count(),
query_kind,
toStartOfMinute(event_time) AS event_time_m
FROM system.query_log
WHERE (type = 'QueryStart') AND (event_time > (now() - toIntervalMinute(10)))
GROUP BY
event_time_m,
client_name,
query_kind
ORDER BY
event_time_m DESC,
count() ASC

每个分区中的 parts 数量

SELECT
concat(database, '.', table),
partition_id,
count()
FROM system.parts
WHERE active
GROUP BY
database,
table,
partition_id

查找长时间运行的查询

这可以帮助找到卡住的查询

SELECT
elapsed,
initial_user,
client_name,
hostname(),
query_id,
query
FROM clusterAllReplicas(default, system.processes)
ORDER BY elapsed DESC

使用运行最差的查询的查询 ID,我们可以获得一个堆栈跟踪,这在调试时很有帮助。

SET allow_introspection_functions=1;

SELECT
arrayStringConcat(
arrayMap(
x,
y -> concat(x, ': ', y),
arrayMap(x -> addressToLine(x), trace),
arrayMap(x -> demangle(addressToSymbol(x)), trace)
),
'\n'
) as trace
FROM
system.stack_trace
WHERE
query_id = '0bb6e88b-9b9a-4ffc-b612-5746c859e360';

查看最近的错误

SELECT *
FROM system.errors
ORDER BY last_error_time DESC

响应如下所示

┌─name──────────────────┬─code─┬─value─┬─────last_error_time─┬─last_error_message──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─last_error_trace─┬─remote─┐
│ UNKNOWN_TABLE │ 60 │ 3 │ 2023-03-14 01:02:35 │ Table system.stack_trace doesn't exist │ [] │ 0 │
│ BAD_GET │ 170 │ 1 │ 2023-03-14 00:58:55 │ Requested cluster 'default' not found │ [] │ 0 │
│ UNKNOWN_IDENTIFIER │ 47 │ 1 │ 2023-03-14 00:49:12 │ Missing columns: 'parts.table' 'table' while processing query: 'table = parts.table', required columns: 'table' 'parts.table' 'table' 'parts.table' │ [] │ 0 │
│ NO_ELEMENTS_IN_CONFIG │ 139 │ 2 │ 2023-03-14 00:42:11 │ Certificate file is not set. │ [] │ 0 │
└───────────────────────┴──────┴───────┴─────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────────┴────────┘

使用最多 CPU 和内存的前 10 个查询

SELECT
type,
event_time,
initial_query_id,
formatReadableSize(memory_usage) AS memory,
`ProfileEvents.Values`[indexOf(`ProfileEvents.Names`, 'UserTimeMicroseconds')] AS userCPU,
`ProfileEvents.Values`[indexOf(`ProfileEvents.Names`, 'SystemTimeMicroseconds')] AS systemCPU,
normalizedQueryHash(query) AS normalized_query_hash
FROM system.query_log
ORDER BY memory_usage DESC
LIMIT 10

我的 projection 使用了多少磁盘空间

SELECT
name,
parent_name,
formatReadableSize(bytes_on_disk) AS bytes,
formatReadableSize(parent_bytes_on_disk) AS parent_bytes,
bytes_on_disk / parent_bytes_on_disk AS ratio
FROM system.projection_parts

显示跨数据库的磁盘存储、parts 数量、system.parts 中的行数和 marks

SELECT
database,
table,
partition,
count() AS parts,
formatReadableSize(sum(bytes_on_disk)) AS bytes_on_disk,
formatReadableQuantity(sum(rows)) AS rows,
sum(marks) AS marks
FROM system.parts
WHERE (database != 'system') AND active
GROUP BY
database,
table,
partition
ORDER BY database ASC

列出最近写入的新 parts 的详细信息

详细信息包括它们的创建时间、大小、行数等等

SELECT
modification_time,
rows,
formatReadableSize(bytes_on_disk),
*
FROM clusterAllReplicas(default, system.parts)
WHERE (database = 'default') AND active AND (level = 0)
ORDER BY modification_time DESC
LIMIT 100