In this project, you'll extend your SQL runtime to support indexing.
The testing server uses a spinning HDD; Reading from this is slow. On top of that, the cost of splitting lines on the pipe character, and the cost of parsing integers, floats, dates, etc... all add up very quickly. The server will have enough memory to read in all the data and store everything pre-parsed.
CREATE TABLE statements include three new options:
CREATE TABLE LINEITEM( ORDERKEY INT, PARTKEY INT, SUPPKEY INT, LINENUMBER INT, QUANTITY FLOAT, EXTENDEDPRICE FLOAT, DISCOUNT FLOAT, TAX FLOAT, RETURNFLAG STRING, LINESTATUS STRING, SHIPDATE DATE, COMMITDATE DATE, RECEIPTDATE DATE, SHIPINSTRUCT STRING, SHIPMODE STRING, COMMENT STRING ) USING csv OPTIONS( path 'data/LINEITEM.data', delimiter = '|', hash_index 'returnflag', tree_index 'shipdate|receiptdate', primary_key 'orderkey,linenumber' )
Specifically, the following indexes will be suggested.
Other optimizations to consider include:
TPC-H is a standard database benchmark. The benchmark consists of a dataset generator and 22 standard query templates. This checkpoint uses three queries based on TPC-H Queries 1, 3, 5, 6, 10, 11, 12, and 14. The dataset generator and template values can be found at the TPC-H website, and is run at scaling factor (SF) 0.1. Minor variations in the queries may be made. The queries have been rewritten slightly to make them easier to Analyze.
SELECT LINEITEM.RETURNFLAG, LINEITEM.LINESTATUS, SUM(LINEITEM.QUANTITY) AS SUM_QTY, SUM(LINEITEM.EXTENDEDPRICE) AS SUM_BASE_PRICE, SUM(LINEITEM.EXTENDEDPRICE*(CAST(1.0 as float)-LINEITEM.DISCOUNT)) AS SUM_DISC_PRICE, SUM(LINEITEM.EXTENDEDPRICE*(CAST(1.0 as float)-LINEITEM.DISCOUNT)*(CAST(1.0 as float)+LINEITEM.TAX)) AS SUM_CHARGE, AVG(LINEITEM.QUANTITY) AS AVG_QTY, AVG(LINEITEM.EXTENDEDPRICE) AS AVG_PRICE, AVG(LINEITEM.DISCOUNT) AS AVG_DISC, COUNT(*) AS COUNT_ORDER FROM LINEITEM WHERE LINEITEM.SHIPDATE <= DATE '1998-10-01' GROUP BY LINEITEM.RETURNFLAG, LINEITEM.LINESTATUS ORDER BY LINEITEM.RETURNFLAG, LINEITEM.LINESTATUS
SELECT LINEITEM.ORDERKEY, SUM(LINEITEM.EXTENDEDPRICE*(CAST(1.0 as float)-LINEITEM.DISCOUNT)) AS REVENUE, ORDERS.ORDERDATE, ORDERS.SHIPPRIORITY FROM CUSTOMER, ORDERS, LINEITEM WHERE CUSTOMER.MKTSEGMENT = 'BUILDING' AND CUSTOMER.CUSTKEY = ORDERS.CUSTKEY AND LINEITEM.ORDERKEY = ORDERS.ORDERKEY AND ORDERS.ORDERDATE < DATE '1995-03-15' AND LINEITEM.SHIPDATE > DATE '1995-03-15' GROUP BY LINEITEM.ORDERKEY, ORDERS.ORDERDATE, ORDERS.SHIPPRIORITY ORDER BY REVENUE DESC, ORDERDATE LIMIT 10
SELECT NATION.NAME, SUM(LINEITEM.EXTENDEDPRICE * (CAST(1.0 as float) - LINEITEM.DISCOUNT)) AS REVENUE FROM REGION, NATION, CUSTOMER, ORDERS, LINEITEM, SUPPLIER WHERE CUSTOMER.CUSTKEY = ORDERS.CUSTKEY AND LINEITEM.ORDERKEY = ORDERS.ORDERKEY AND LINEITEM.SUPPKEY = SUPPLIER.SUPPKEY AND CUSTOMER.NATIONKEY = NATION.NATIONKEY AND SUPPLIER.NATIONKEY = NATION.NATIONKEY AND NATION.REGIONKEY = REGION.REGIONKEY AND REGION.NAME = 'ASIA' AND ORDERS.ORDERDATE >= DATE '1994-01-01' AND ORDERS.ORDERDATE < DATE '1995-01-01' GROUP BY NATION.NAME ORDER BY REVENUE DESC
SELECT SUM(LINEITEM.EXTENDEDPRICE*LINEITEM.DISCOUNT) AS REVENUE FROM LINEITEM WHERE LINEITEM.SHIPDATE >= DATE '1994-01-01' AND LINEITEM.SHIPDATE < DATE '1995-01-01' AND LINEITEM.DISCOUNT > CAST(0.05 AS float) AND LINEITEM.DISCOUNT < CAST(0.07 as float) AND LINEITEM.QUANTITY < CAST(24 AS float)
SELECT CUSTOMER.CUSTKEY, SUM(LINEITEM.EXTENDEDPRICE * (CAST(1.0 as float) - LINEITEM.DISCOUNT)) AS REVENUE, CUSTOMER.ACCTBAL, NATION.NAME, CUSTOMER.ADDRESS, CUSTOMER.PHONE, CUSTOMER.COMMENT FROM CUSTOMER, ORDERS, LINEITEM, NATION WHERE CUSTOMER.CUSTKEY = ORDERS.CUSTKEY AND LINEITEM.ORDERKEY = ORDERS.ORDERKEY AND ORDERS.ORDERDATE >= DATE '1993-10-01' AND ORDERS.ORDERDATE < DATE '1994-01-01' AND LINEITEM.RETURNFLAG = 'R' AND CUSTOMER.NATIONKEY = NATION.NATIONKEY GROUP BY CUSTOMER.CUSTKEY, CUSTOMER.ACCTBAL, CUSTOMER.PHONE, NATION.NAME, CUSTOMER.ADDRESS, CUSTOMER.COMMENT ORDER BY REVENUE ASC LIMIT 20
SELECT PK_V.PARTKEY, PK_V.VALUE FROM ( SELECT PS.PARTKEY, SUM(PS.SUPPLYCOST * CAST(PS.AVAILQTY AS float)) AS VALUE FROM PARTSUPP PS, SUPPLIER S, NATION N WHERE PS.SUPPKEY = S.SUPPKEY AND S.NATIONKEY = N.NATIONKEY AND N.NAME = 'GERMANY' GROUP BY PS.PARTKEY ) PK_V, ( SELECT SUM(PS.SUPPLYCOST * CAST(PS.AVAILQTY AS float)) AS VALUE FROM PARTSUPP PS, SUPPLIER S, NATION N WHERE PS.SUPPKEY = S.SUPPKEY AND S.NATIONKEY = N.NATIONKEY AND N.NAME = 'GERMANY' ) CUTOFF_V WHERE PK_V.VALUE > (CUTOFF_V.VALUE * CAST(0.0001 AS double) / CAST(100.0 AS double)) ORDER BY PK_V.VALUE DESC
SELECT LINEITEM.SHIPMODE, SUM(CASE WHEN ORDERS.ORDERPRIORITY = '1-URGENT' OR ORDERS.ORDERPRIORITY = '2-HIGH' THEN 1 ELSE 0 END) AS HIGH_LINE_COUNT, SUM(CASE WHEN ORDERS.ORDERPRIORITY <> '1-URGENT' AND ORDERS.ORDERPRIORITY <> '2-HIGH' THEN 1 ELSE 0 END) AS LOW_LINE_COUNT FROM LINEITEM, ORDERS WHERE ORDERS.ORDERKEY = LINEITEM.ORDERKEY AND (LINEITEM.SHIPMODE='MAIL' OR LINEITEM.SHIPMODE='SHIP') AND LINEITEM.COMMITDATE < LINEITEM.RECEIPTDATE AND LINEITEM.SHIPDATE < LINEITEM.COMMITDATE AND LINEITEM.RECEIPTDATE >= DATE '1994-01-01' AND LINEITEM.RECEIPTDATE < DATE '1995-01-01' GROUP BY LINEITEM.SHIPMODE ORDER BY LINEITEM.SHIPMODE
SELECT CAST(100.00 AS double) * PROMO_ONLY / ALL_REVENUE AS PROMO_REVENUE FROM ( SELECT SUM( CASE WHEN PART.TYPE LIKE 'PROMO%' THEN LINEITEM.EXTENDEDPRICE * (CAST(1.0 as float) - LINEITEM.DISCOUNT) ELSE cast(0 as float) END ) AS PROMO_ONLY, SUM( LINEITEM.EXTENDEDPRICE * (CAST(1.0 as float) - LINEITEM.DISCOUNT) ) AS ALL_REVENUE FROM LINEITEM, PART WHERE LINEITEM.PARTKEY = PART.PARTKEY AND LINEITEM.SHIPDATE >= DATE '1995-09-01' AND LINEITEM.SHIPDATE < DATE '1995-10-01' ) AGGREGATE
This page last updated 2024-09-19 13:18:43 -0400