TPC-H (1999)
一个流行的基准测试,模拟批发供应商的内部数据仓库。数据存储在第三范式表示中,需要在查询运行时进行大量连接。尽管其年代久远且假设数据均匀且独立分布不切实际,但TPC-H 仍然是迄今为止最流行的OLAP基准测试。
参考文献
- TPC-H
- 用于决策支持和网络商务的新型TPC基准测试 (Poess 等人,2000)
- TPC-H 分析:从有影响力的基准测试中获得的隐藏信息和经验教训 (Boncz 等人),2013
- 量化 TPC-H 阻塞点及其优化 (Dresseler 等人),2020
首先,检出 TPC-H 存储库并编译数据生成器
git clone https://github.com/gregrahn/tpch-kit.git
cd tpch-kit/dbgen
make
然后,生成数据。参数-s
指定比例因子。例如,使用-s 100
,将为表'lineitem'生成6亿行。
./dbgen -s 100
现在在 ClickHouse 中创建表
CREATE TABLE nation (
n_nationkey Int32,
n_name String,
n_regionkey Int32,
n_comment String)
ORDER BY (n_regionkey, n_name);
CREATE TABLE region (
r_regionkey Int32,
r_name String,
r_comment String)
ORDER BY (r_name);
CREATE TABLE part (
p_partkey Int32,
p_name String,
p_mfgr String,
p_brand String,
p_type String,
p_size Int32,
p_container String,
p_retailprice Decimal(15,2),
p_comment String)
ORDER BY (p_mfgr, p_brand, p_type, p_name);
CREATE TABLE supplier (
s_suppkey Int32,
s_name String,
s_address String,
s_nationkey Int32,
s_phone String,
s_acctbal Decimal(15,2),
s_comment String)
ORDER BY (s_nationkey, s_address, s_name);
CREATE TABLE partsupp (
ps_partkey Int32,
ps_suppkey Int32,
ps_availqty Int32,
ps_supplycost Decimal(15,2),
ps_comment String)
ORDER BY (ps_suppkey, ps_availqty, ps_supplycost, ps_partkey);
CREATE TABLE customer (
c_custkey Int32,
c_name String,
c_address String,
c_nationkey Int32,
c_phone String,
c_acctbal Decimal(15,2),
c_mktsegment String,
c_comment String)
ORDER BY (c_nationkey, c_mktsegment, c_address, c_name, c_custkey);
CREATE TABLE orders (
o_orderkey Int32,
o_custkey Int32,
o_orderstatus String,
o_totalprice Decimal(15,2),
o_orderdate Date,
o_orderpriority String,
o_clerk String,
o_shippriority Int32,
o_comment String)
ORDER BY (o_orderdate, o_orderstatus, o_custkey);
CREATE TABLE lineitem (
l_orderkey Int32,
l_partkey Int32,
l_suppkey Int32,
l_linenumber Int32,
l_quantity Decimal(15,2),
l_extendedprice Decimal(15,2),
l_discount Decimal(15,2),
l_tax Decimal(15,2),
l_returnflag String,
l_linestatus String,
l_shipdate Date,
l_commitdate Date,
l_receiptdate Date,
l_shipinstruct String,
l_shipmode String,
l_comment String)
ORDER BY (l_suppkey, l_partkey, l_shipdate, l_commitdate, l_receiptdate);
数据可以按如下方式导入
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO nation FORMAT CSV" < nation.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO region FORMAT CSV" < region.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO part FORMAT CSV" < part.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO partsupp FORMAT CSV" < partsupp.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO customers FORMAT CSV" < customers.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO orders FORMAT CSV" < orders.tbl
clickhouse-client --format_csv_delimiter '|' --query "INSERT INTO lineitem FORMAT CSV" < lineitem.tbl
查询由./qgen -s <scaling_factor>
生成。s = 100
的示例查询
危险
TPC-H 大量使用相关子查询,截至撰写本文时(2024 年 9 月),ClickHouse 不支持这些子查询(问题 #6697)。因此,许多下面的基准查询将因错误而失败。
Q1
SELECT
l_returnflag,
l_linestatus,
sum(l_quantity) AS sum_qty,
sum(l_extendedprice) AS sum_base_price,
sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
avg(l_quantity) AS avg_qty,
avg(l_extendedprice) AS avg_price,
avg(l_discount) AS avg_disc,
count(*) AS count_order
FROM
lineitem
WHERE
l_shipdate <= date '1998-12-01' - interval '100' day
GROUP BY
l_returnflag,
l_linestatus
ORDER BY
l_returnflag,
l_linestatus;
Q2
SELECT
s_acctbal,
s_name,
n_name,
p_partkey,
p_mfgr,
s_address,
s_phone,
s_comment
FROM
part,
supplier,
partsupp,
nation,
region
WHERE
p_partkey = ps_partkey
AND s_suppkey = ps_suppkey
AND p_size = 21
AND p_type LIKE '%COPPER'
AND s_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND r_name = 'AMERICA'
AND ps_supplycost = (
SELECT
min(ps_supplycost)
FROM
partsupp,
supplier,
nation,
region
WHERE
p_partkey = ps_partkey
AND s_suppkey = ps_suppkey
AND s_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND r_name = 'AMERICA'
)
ORDER BY
s_acctbal desc,
n_name,
s_name,
p_partkey
LIMIT 100;
Q3
SELECT
l_orderkey,
sum(l_extendedprice * (1 - l_discount)) AS revenue,
o_orderdate,
o_shippriority
FROM
customer,
orders,
lineitem
WHERE
c_mktsegment = 'BUILDING'
AND c_custkey = o_custkey
AND l_orderkey = o_orderkey
AND o_orderdate < date '1995-03-10'
AND l_shipdate > date '1995-03-10'
GROUP BY
l_orderkey,
o_orderdate,
o_shippriority
ORDER BY
revenue desc,
o_orderdate
LIMIT 10;
Q4
SELECT
o_orderpriority,
count(*) AS order_count
FROM
orders
WHERE
o_orderdate >= date '1994-07-01'
AND o_orderdate < date '1994-07-01' + interval '3' month
AND EXISTS (
SELECT
*
FROM
lineitem
WHERE
l_orderkey = o_orderkey
AND l_commitdate < l_receiptdate
)
GROUP BY
o_orderpriority
ORDER BY
o_orderpriority;
Q5
SELECT
n_name,
sum(l_extendedprice * (1 - l_discount)) AS revenue
FROM
customer,
orders,
lineitem,
supplier,
nation,
region
WHERE
c_custkey = o_custkey
AND l_orderkey = o_orderkey
AND l_suppkey = s_suppkey
AND c_nationkey = s_nationkey
AND s_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND r_name = 'MIDDLE EAST'
AND o_orderdate >= date '1994-01-01'
AND o_orderdate < date '1994-01-01' + interval '1' year
GROUP BY
n_name
ORDER BY
revenue desc;
Q6
SELECT
sum(l_extendedprice * l_discount) AS revenue
FROM
lineitem
WHERE
l_shipdate >= date '1994-01-01'
AND l_shipdate < date '1994-01-01' + interval '1' year
AND l_discount between 0.09 - 0.01 AND 0.09 + 0.01
AND l_quantity < 24;
Q7
SELECT
supp_nation,
cust_nation,
l_year,
sum(volume) AS revenue
FROM
(
SELECT
n1.n_name AS supp_nation,
n2.n_name AS cust_nation,
extract(year FROM l_shipdate) AS l_year,
l_extendedprice * (1 - l_discount) AS volume
FROM
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2
WHERE
s_suppkey = l_suppkey
AND o_orderkey = l_orderkey
AND c_custkey = o_custkey
AND s_nationkey = n1.n_nationkey
AND c_nationkey = n2.n_nationkey
AND (
(n1.n_name = 'UNITED KINGDOM' AND n2.n_name = 'ETHIOPIA')
OR (n1.n_name = 'ETHIOPIA' AND n2.n_name = 'UNITED KINGDOM')
)
AND l_shipdate between date '1995-01-01' AND date '1996-12-31'
) AS shipping
GROUP BY
supp_nation,
cust_nation,
l_year
ORDER BY
supp_nation,
cust_nation,
l_year;
Q8
SELECT
o_year,
sum(CASE
WHEN nation = 'ETHIOPIA' THEN volume
ELSE 0
END) / sum(volume) AS mkt_share
FROM
(
SELECT
extract(year FROM o_orderdate) AS o_year,
l_extendedprice * (1 - l_discount) AS volume,
n2.n_name AS nation
FROM
part,
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2,
region
WHERE
p_partkey = l_partkey
AND s_suppkey = l_suppkey
AND l_orderkey = o_orderkey
AND o_custkey = c_custkey
AND c_nationkey = n1.n_nationkey
AND n1.n_regionkey = r_regionkey
AND r_name = 'AFRICA'
AND s_nationkey = n2.n_nationkey
AND o_orderdate between date '1995-01-01' AND date '1996-12-31'
AND p_type = 'SMALL POLISHED TIN'
) AS all_nations
GROUP BY
o_year
ORDER BY
o_year;
Q9
SELECT
nation,
o_year,
sum(amount) AS sum_profit
FROM
(
SELECT
n_name AS nation,
extract(year FROM o_orderdate) AS o_year,
l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity AS amount
FROM
part,
supplier,
lineitem,
partsupp,
orders,
nation
WHERE
s_suppkey = l_suppkey
AND ps_suppkey = l_suppkey
AND ps_partkey = l_partkey
AND p_partkey = l_partkey
AND o_orderkey = l_orderkey
AND s_nationkey = n_nationkey
AND p_name LIKE '%drab%'
) AS profit
GROUP BY
nation,
o_year
ORDER BY
nation,
o_year desc;
Q10
SELECT
c_custkey,
c_name,
sum(l_extendedprice * (1 - l_discount)) AS revenue,
c_acctbal,
n_name,
c_address,
c_phone,
c_comment
FROM
customer,
orders,
lineitem,
nation
WHERE
c_custkey = o_custkey
AND l_orderkey = o_orderkey
AND o_orderdate >= date '1993-06-01'
AND o_orderdate < date '1993-06-01' + interval '3' month
AND l_returnflag = 'R'
AND c_nationkey = n_nationkey
GROUP BY
c_custkey,
c_name,
c_acctbal,
c_phone,
n_name,
c_address,
c_comment
ORDER BY
revenue desc
LIMIT 20;
Q11
SELECT
ps_partkey,
sum(ps_supplycost * ps_availqty) AS value
FROM
partsupp,
supplier,
nation
WHERE
ps_suppkey = s_suppkey
AND s_nationkey = n_nationkey
AND n_name = 'MOZAMBIQUE'
GROUP BY
ps_partkey having
sum(ps_supplycost * ps_availqty) > (
SELECT
sum(ps_supplycost * ps_availqty) * 0.0000010000
FROM
partsupp,
supplier,
nation
WHERE
ps_suppkey = s_suppkey
AND s_nationkey = n_nationkey
AND n_name = 'MOZAMBIQUE'
)
ORDER BY
value desc;
Q12
SELECT
l_shipmode,
sum(CASE
WHEN o_orderpriority = '1-URGENT'
OR o_orderpriority = '2-HIGH'
THEN 1
ELSE 0
END) AS high_line_count,
sum(CASE
WHEN o_orderpriority <> '1-URGENT'
AND o_orderpriority <> '2-HIGH'
THEN 1
ELSE 0
END) AS low_line_count
FROM
orders,
lineitem
WHERE
o_orderkey = l_orderkey
AND l_shipmode in ('MAIL', 'AIR')
AND l_commitdate < l_receiptdate
AND l_shipdate < l_commitdate
AND l_receiptdate >= date '1996-01-01'
AND l_receiptdate < date '1996-01-01' + interval '1' year
GROUP BY
l_shipmode
ORDER BY
l_shipmode;
Q13
SELECT
c_count,
count(*) AS custdist
FROM
(
SELECT
c_custkey,
count(o_orderkey)
FROM
customer LEFT OUTER JOIN orders ON
c_custkey = o_custkey
AND o_comment NOT LIKE '%special%deposits%'
GROUP BY
c_custkey
) AS c_orders
GROUP BY
c_count
ORDER BY
custdist desc,
c_count desc;
Q14
SELECT
100.00 * sum(CASE
WHEN p_type LIKE 'PROMO%'
THEN l_extendedprice * (1 - l_discount)
ELSE 0
END) / sum(l_extendedprice * (1 - l_discount)) AS promo_revenue
FROM
lineitem,
part
WHERE
l_partkey = p_partkey
AND l_shipdate >= date '1996-10-01'
AND l_shipdate < date '1996-10-01' + interval '1' month;
Q15
CREATE VIEW revenue0 (supplier_no, total_revenue) AS
SELECT
l_suppkey,
sum(l_extendedprice * (1 - l_discount))
FROM
lineitem
WHERE
l_shipdate >= date '1997-06-01'
AND l_shipdate < date '1997-06-01' + interval '3' month
GROUP BY
l_suppkey;
SELECT
s_suppkey,
s_name,
s_address,
s_phone,
total_revenue
FROM
supplier,
revenue0
WHERE
s_suppkey = supplier_no
AND total_revenue = (
SELECT
max(total_revenue)
FROM
revenue0
)
ORDER BY
s_suppkey;
DROP VIEW revenue0;
Q16
SELECT
p_brand,
p_type,
p_size,
count(distinct ps_suppkey) AS supplier_cnt
FROM
partsupp,
part
WHERE
p_partkey = ps_partkey
AND p_brand <> 'Brand#15'
AND p_type NOT LIKE 'SMALL POLISHED%'
AND p_size in (21, 9, 46, 34, 50, 33, 17, 36)
AND ps_suppkey NOT in (
SELECT
s_suppkey
FROM
supplier
WHERE
s_comment LIKE '%Customer%Complaints%'
)
GROUP BY
p_brand,
p_type,
p_size
ORDER BY
supplier_cnt desc,
p_brand,
p_type,
p_size;
Q17
SELECT
sum(l_extendedprice) / 7.0 AS avg_yearly
FROM
lineitem,
part
WHERE
p_partkey = l_partkey
AND p_brand = 'Brand#52'
AND p_container = 'MED CASE'
AND l_quantity < (
SELECT
0.2 * avg(l_quantity)
FROM
lineitem
WHERE
l_partkey = p_partkey
);
Q18
SELECT
c_name,
c_custkey,
o_orderkey,
o_orderdate,
o_totalprice,
sum(l_quantity)
FROM
customer,
orders,
lineitem
WHERE
o_orderkey in (
SELECT
l_orderkey
FROM
lineitem
GROUP BY
l_orderkey having
sum(l_quantity) > 313
)
AND c_custkey = o_custkey
AND o_orderkey = l_orderkey
GROUP BY
c_name,
c_custkey,
o_orderkey,
o_orderdate,
o_totalprice
ORDER BY
o_totalprice desc,
o_orderdate
LIMIT 100;
Q19
SELECT
sum(l_extendedprice* (1 - l_discount)) AS revenue
FROM
lineitem,
part
WHERE
(
p_partkey = l_partkey
AND p_brand = 'Brand#31'
AND p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
AND l_quantity >= 3 AND l_quantity <= 3 + 10
AND p_size between 1 AND 5
AND l_shipmode in ('AIR', 'AIR REG')
AND l_shipinstruct = 'DELIVER IN PERSON'
)
OR
(
p_partkey = l_partkey
AND p_brand = 'Brand#54'
AND p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
AND l_quantity >= 17 AND l_quantity <= 17 + 10
AND p_size between 1 AND 10
AND l_shipmode in ('AIR', 'AIR REG')
AND l_shipinstruct = 'DELIVER IN PERSON'
)
OR
(
p_partkey = l_partkey
AND p_brand = 'Brand#54'
AND p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
AND l_quantity >= 26 AND l_quantity <= 26 + 10
AND p_size between 1 AND 15
AND l_shipmode in ('AIR', 'AIR REG')
AND l_shipinstruct = 'DELIVER IN PERSON'
);
Q20
SELECT
s_name,
s_address
FROM
supplier,
nation
WHERE
s_suppkey in (
SELECT
ps_suppkey
FROM
partsupp
WHERE
ps_partkey in (
SELECT
p_partkey
FROM
part
WHERE
p_name LIKE 'chiffon%'
)
AND ps_availqty > (
SELECT
0.5 * sum(l_quantity)
FROM
lineitem
WHERE
l_partkey = ps_partkey
AND l_suppkey = ps_suppkey
AND l_shipdate >= date '1997-01-01'
AND l_shipdate < date '1997-01-01' + interval '1' year
)
)
AND s_nationkey = n_nationkey
AND n_name = 'MOZAMBIQUE'
ORDER BY
s_name;
Q21
SELECT
s_name,
count(*) AS numwait
FROM
supplier,
lineitem l1,
orders,
nation
WHERE
s_suppkey = l1.l_suppkey
AND o_orderkey = l1.l_orderkey
AND o_orderstatus = 'F'
AND l1.l_receiptdate > l1.l_commitdate
AND EXISTS (
SELECT
*
FROM
lineitem l2
WHERE
l2.l_orderkey = l1.l_orderkey
AND l2.l_suppkey <> l1.l_suppkey
)
AND NOT EXISTS (
SELECT
*
FROM
lineitem l3
WHERE
l3.l_orderkey = l1.l_orderkey
AND l3.l_suppkey <> l1.l_suppkey
AND l3.l_receiptdate > l3.l_commitdate
)
AND s_nationkey = n_nationkey
AND n_name = 'RUSSIA'
GROUP BY
s_name
ORDER BY
numwait desc,
s_name
LIMIT 100;
Q22
SELECT
cntrycode,
count(*) AS numcust,
sum(c_acctbal) AS totacctbal
FROM
(
SELECT
substring(c_phone FROM 1 for 2) AS cntrycode,
c_acctbal
FROM
customer
WHERE
substring(c_phone FROM 1 for 2) in
('26', '34', '10', '18', '27', '12', '11')
AND c_acctbal > (
SELECT
avg(c_acctbal)
FROM
customer
WHERE
c_acctbal > 0.00
AND substring(c_phone FROM 1 for 2) in
('26', '34', '10', '18', '27', '12', '11')
)
AND NOT EXISTS (
SELECT
*
FROM
orders
WHERE
o_custkey = c_custkey
)
) AS custsale
GROUP BY
cntrycode
ORDER BY
cntrycode;