Skip to content

Commit

Permalink
Merge pull request #204 from ClickHouse/pchhetri/update-chdb
Browse files Browse the repository at this point in the history
chdb: run clickbench with v1.4.1 chdb
  • Loading branch information
chhetripradeep authored Jun 21, 2024
2 parents 4c4a438 + 69adece commit 743d924
Show file tree
Hide file tree
Showing 15 changed files with 496 additions and 142 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
*.bak
.idea
.clickbench
hits.csv
14 changes: 14 additions & 0 deletions chdb-parquet/benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Benchmark driver for the chdb-parquet setup: installs the Python
# dependencies, downloads the partitioned Parquet dataset into the current
# directory, then runs the query suite and records its output.

# Install pip through apt, then the chdb and psutil Python packages

sudo apt-get update
sudo apt-get install -y python3-pip
pip install chdb psutil

# Load the data: fetch hits_0.parquet .. hits_99.parquet, with up to 100 wget
# processes running in parallel (xargs -P100). --continue lets wget resume a
# file that an earlier, interrupted run left partially downloaded.
seq 0 99 | xargs -P100 -I{} bash -c 'wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'

# Run the queries; stdout and stderr are shown on the terminal and also
# saved to log.txt

./run.sh 2>&1 | tee log.txt
43 changes: 43 additions & 0 deletions chdb-parquet/queries.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
SELECT COUNT(*) FROM file("hits_*.parquet", Parquet);
SELECT COUNT(*) FROM file("hits_*.parquet", Parquet) WHERE AdvEngineID <> 0;
SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet);
SELECT AVG(UserID) FROM file("hits_*.parquet", Parquet);
SELECT COUNT(DISTINCT UserID) FROM file("hits_*.parquet", Parquet);
SELECT COUNT(DISTINCT SearchPhrase) FROM file("hits_*.parquet", Parquet);
SELECT MIN(EventDate), MAX(EventDate) FROM file("hits_*.parquet", Parquet);
SELECT AdvEngineID, COUNT(*) FROM file("hits_*.parquet", Parquet) WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;
SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) GROUP BY RegionID ORDER BY u DESC LIMIT 10;
SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM file("hits_*.parquet", Parquet) GROUP BY RegionID ORDER BY c DESC LIMIT 10;
SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT SearchPhrase, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT UserID, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, extract(minute FROM toDateTime(EventTime)) AS m, SearchPhrase, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID FROM file("hits_*.parquet", Parquet) WHERE UserID = 435090932899640449;
SELECT COUNT(*) FROM file("hits_*.parquet", Parquet) WHERE URL LIKE '%google%';
SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM file("hits_*.parquet", Parquet) WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT * FROM file("hits_*.parquet", Parquet) WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;
SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM file("hits_*.parquet", Parquet) WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM file("hits_*.parquet", Parquet);
SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet) GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT URL, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) GROUP BY URL ORDER BY c DESC LIMIT 10;
SELECT 1, URL, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
SELECT URL, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
SELECT DATE_TRUNC('minute', toDateTime(EventTime)) AS M, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-14' AND toDate(EventDate) <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', toDateTime(EventTime)) ORDER BY DATE_TRUNC('minute', toDateTime(EventTime)) LIMIT 10 OFFSET 1000;
14 changes: 14 additions & 0 deletions chdb-parquet/query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env python3

import chdb
import timeit
import sys

# Read the complete query text from standard input and echo it, so the
# timings printed below can be matched to their query in the captured log.
query = sys.stdin.read()
print(query)

# Execute the same query three times and print the wall-clock duration of
# each run in seconds, one value per line.
for try_num in range(3):
    start = timeit.default_timer()
    # "Null" is passed as the output format; presumably this makes chDB
    # discard the result set so only execution time is measured -- confirm
    # against the chDB / ClickHouse format documentation.
    chdb.query(query, "Null")
    end = timeit.default_timer()
    print(end - start)
55 changes: 55 additions & 0 deletions chdb-parquet/results/c6a.4xlarge.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{
"system": "chDB",
"date": "2023-12-03",
"machine": "c6a.4xlarge, 500gb gp2",
"cluster_size": 1,
"comment": "",
"tags": ["C++", "column-oriented", "ClickHouse derivative", "embedded", "stateless", "serverless"],
"load_time": 0,
"data_size": 14737670832,
"result": [
[0.14784930900009385,0.01985562499999105,0.021213920000036524],
[0.23510076200000185,0.06982731300013256,0.06856002199992872],
[0.2806472989998383,0.10366240700000162,0.10361096500014355],
[0.4541343600001255,0.10390106999989257,0.10059215999990556],
[0.6548878610001339,0.45662324699992496,0.44750324799997543],
[0.9521819659998982,0.6404522270001962,0.6559423100000004],
[0.19805788999997276,0.08715895099999216,0.0858150569999907],
[0.19715853299999253,0.07409634200007531,0.07549481599994579],
[0.8598950299999615,0.6492269000000306,0.6434426030000395],
[1.4808940620000612,0.7829620970001088,0.7904872159999741],
[0.7163226630000281,0.29504979300008927,0.29545888900020145],
[0.7511556739998468,0.34864025900014894,0.3431280479999259],
[1.0435813250001047,0.7161780030000955,0.7318749700000353],
[2.4113232699999116,1.0243360720000965,1.0260252030000174],
[1.2562044089997926,0.8427601549999508,0.8326057299998411],
[0.8358988709999267,0.5704252309999447,0.5721753729999364],
[2.9971273280000332,1.9693855880000228,2.0168051440000454],
[2.2065102920000754,1.2419878719999815,1.206855657999995],
[5.954841275999797,3.93939489100012,3.761597123999991],
[0.3187466500000937,0.09396397000000434,0.09140732100013338],
[9.446635591000131,1.2302154330000121,1.2444675809999808],
[11.13492346199996,1.7419908340000347,1.7516095379999115],
[21.622155369999973,4.072843675000058,4.131993770000008],
[55.58301193300008,15.83693858699985,15.83159159999991],
[2.6420749609999348,0.4748529260000396,0.4266739149998102],
[0.9177854079998724,0.3910574720000568,0.38483357899986004],
[2.641131261000055,0.40119044900006884,0.40792182199993476],
[9.619309540000131,1.49244403900002,1.5195660050001152],
[10.307241965000003,9.973063463000017,9.892692844999829],
[1.4131518259998757,1.308299055000134,1.295807866999894],
[2.608687121999992,0.7656866700001501,0.7761720030000561],
[6.227150900999959,1.0895756959998835,1.0740852459998678],
[7.3493140729999595,5.250387106000062,5.316255329999876],
[10.698823475999916,4.070322148999821,4.003926844000034],
[10.69004674499979,4.052407979999998,3.9830662209999446],
[1.2015532409998286,0.9601744899998721,0.9185318380000353],
[0.29470233899996856,0.1431545279999682,0.14107450999995308],
[0.2372298170000704,0.09401625999998942,0.09287700000004406],
[0.18732362600007946,0.08822406300009789,0.09083071099985318],
[0.4340662390000034,0.26780105800003184,0.2632765340001697],
[0.1738416400000915,0.06409417200006828,0.06505390999996052],
[0.18030453799997304,0.057552695999902426,0.05883049899989601],
[0.1795836840001357,0.1313909179998518,0.06348699399995894]
]
}
55 changes: 55 additions & 0 deletions chdb-parquet/results/c6a.metal.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{
"system": "chDB",
"date": "2023-12-03",
"machine": "c6a.metal, 500gb gp2",
"cluster_size": 1,
"comment": "",
"tags": ["C++", "column-oriented", "ClickHouse derivative", "embedded", "stateless", "serverless"],
"load_time": 0,
"data_size": 14737670832,
"result": [
[0.18159093600002052,0.03465310399997179,0.03535941499990258],
[0.2831564299999627,0.07290863800017178,0.07088584300004186],
[0.4106436740000845,0.11962774199992054,0.11457044600001609],
[0.47319400800006406,0.11146962500015434,0.11445495399993888],
[1.6001970140000594,1.1832852380000531,1.2053231469999446],
[1.9747222290000082,1.3993265449998944,1.4632436899998993],
[0.293016236000085,0.10769365200007996,0.10403609099989808],
[0.30888328800006093,0.08704660199987302,0.08690985099997306],
[0.9676274400001148,0.4142185770001561,0.4095282320001843],
[1.508716829999912,0.4386639290000858,0.42993967000006705],
[0.7211182660000759,0.22335372600014125,0.22127814999998918],
[0.8024834689999807,0.24581066700011434,0.2369266250000237],
[0.9941888450000533,0.34986996500015266,0.36299695699995027],
[2.2883914159999676,0.5018040939999082,0.39797729199995047],
[1.0599523600001248,0.39205417400012266,0.3990406109999185],
[0.614120850999825,0.26384819800000514,0.27866645900007825],
[2.468679433000034,0.7421945380001489,0.707125589000043],
[2.390665940999952,0.6761940030000915,0.6212907079998331],
[4.531502312999919,1.2391869979999228,1.2929706920001536],
[0.41148081200003617,0.08244017399988479,0.07748579300005076],
[9.470291036999924,0.4142645150000135,0.42491283499998644],
[11.166745869999886,0.4193407879999995,0.47778978899987123],
[21.716601035000167,0.8906946930001141,0.9057610759998624],
[55.87572047799995,4.693745457999967,4.408192557999882],
[2.6822018080001726,0.2497673709999617,0.18029253800000333],
[0.9117738509999072,0.19944890400006443,0.19231710099984411],
[2.6458359000000655,0.19551512300017748,0.17991257200014843],
[9.665634246000081,0.644791166999994,0.5677360409999892],
[8.823865798999805,1.8234754839998004,1.911617729999989],
[0.887859853000009,0.7314614570000231,0.7472309880001831],
[2.609201996000138,0.3776117290001366,0.42668743700005507],
[6.085526299000094,0.5356784809998771,0.5014943220000987],
[5.348554426000192,1.4791098619998593,1.6584785480001756],
[9.919410929999913,1.198197147999963,1.3908020020001004],
[9.918529573000114,1.4790268179999657,1.2929598839998562],
[0.5918198439999287,0.36387553300005493,0.3893138729999919],
[0.353354839000076,0.15782711399992877,0.17798462199993992],
[0.3100515420001102,0.16545474899999135,0.10927909599990926],
[0.3464544030000525,0.19426173600004404,0.10704828600000837],
[0.5001845439999215,0.28064504200006013,0.28905982800006313],
[0.3150378160000855,0.12564524300000812,0.08628042499981348],
[0.24802713799999765,0.1437892629999169,0.0803308869999455],
[0.2594364960000348,0.2181738829999631,0.08233647700012625]
]
}
8 changes: 8 additions & 0 deletions chdb-parquet/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Runs every query in queries.sql (one query per line) through ./query.py.
# Before each query the page cache is flushed and dropped, so the first of
# the runs performed by query.py starts with a cold cache.

# IFS= and -r make `read` return each line verbatim. Without -r, bash treats
# backslashes as escape characters and strips them, which corrupts queries
# containing regex escapes such as 'www\.' or the '\1' replacement.
# The `|| [[ -n ... ]]` clause still processes a final line that has no
# trailing newline.
while IFS= read -r query || [[ -n "${query}" ]]; do
    # Write dirty pages to disk, then drop page cache, dentries and inodes.
    sync
    echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null

    # Pass the query text to query.py on its standard input.
    ./query.py <<< "${query}"
done < queries.sql
10 changes: 7 additions & 3 deletions chdb/benchmark.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
#!/bin/bash

# Install

sudo apt-get update
sudo apt-get install -y python3-pip
pip install chdb psutil

# Load the data
seq 0 99 | xargs -P100 -I{} bash -c 'wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'
wget --no-verbose --continue 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz'
gzip -d hits.csv.gz
./load.py

# Run the queries

./run.sh 2>&1 | tee log.txt

# Process the log.txt
cat log.txt | grep -P '^\d|Killed|Segmentation' | sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/' |
awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }'
Loading

0 comments on commit 743d924

Please sign in to comment.