From 7fde6593458e7ff7de46d520daf339827a0b572c Mon Sep 17 00:00:00 2001 From: Pradeep Chhetri Date: Thu, 20 Jun 2024 15:45:36 +0800 Subject: [PATCH 1/2] chdb: run clickbench with v1.4.1 chdb --- .gitignore | 2 + chdb/benchmark.sh | 10 ++- chdb/create.sql | 114 ++++++++++++++++++++++++++++++++++ chdb/insert.sql | 3 + chdb/load.py | 21 +++++++ chdb/queries.sql | 86 ++++++++++++------------- chdb/query.py | 37 ++++++++--- chdb/results/c6a.4xlarge.json | 86 ++++++++++++------------- chdb/results/c6a.metal.json | 86 ++++++++++++------------- 9 files changed, 305 insertions(+), 140 deletions(-) create mode 100644 chdb/create.sql create mode 100644 chdb/insert.sql create mode 100755 chdb/load.py diff --git a/.gitignore b/.gitignore index a0c518825..6a84e2b81 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ *.bak .idea +.clickbench +hits.csv diff --git a/chdb/benchmark.sh b/chdb/benchmark.sh index 3c70defd6..e83cf3c2e 100755 --- a/chdb/benchmark.sh +++ b/chdb/benchmark.sh @@ -1,14 +1,18 @@ #!/bin/bash # Install - sudo apt-get update sudo apt-get install -y python3-pip pip install chdb psutil # Load the data -seq 0 99 | xargs -P100 -I{} bash -c 'wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' +wget --no-verbose --continue 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz' +gzip -d hits.csv.gz +./load.py # Run the queries - ./run.sh 2>&1 | tee log.txt + +# Process the log.txt +cat log.txt | grep -P '^\d|Killed|Segmentation' | sed -r -e 's/^.*(Killed|Segmentation).*$/null\nnull\nnull/' | + awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }' diff --git a/chdb/create.sql b/chdb/create.sql new file mode 100644 index 000000000..84edb5c35 --- /dev/null +++ b/chdb/create.sql @@ -0,0 +1,114 @@ +CREATE DATABASE IF NOT EXISTS clickbench ENGINE = Atomic; + +USE clickbench; + +CREATE TABLE IF NOT EXISTS hits +( + WatchID BIGINT NOT NULL, + JavaEnable SMALLINT NOT NULL, + Title TEXT NOT NULL, + GoodEvent SMALLINT NOT NULL, + EventTime TIMESTAMP NOT NULL, + EventDate Date NOT NULL, + CounterID INTEGER NOT NULL, + ClientIP INTEGER NOT NULL, + RegionID INTEGER NOT NULL, + UserID BIGINT NOT NULL, + CounterClass SMALLINT NOT NULL, + OS SMALLINT NOT NULL, + UserAgent SMALLINT NOT NULL, + URL TEXT NOT NULL, + Referer TEXT NOT NULL, + IsRefresh SMALLINT NOT NULL, + RefererCategoryID SMALLINT NOT NULL, + RefererRegionID INTEGER NOT NULL, + URLCategoryID SMALLINT NOT NULL, + URLRegionID INTEGER NOT NULL, + ResolutionWidth SMALLINT NOT NULL, + ResolutionHeight SMALLINT NOT NULL, + ResolutionDepth SMALLINT NOT NULL, + FlashMajor SMALLINT NOT NULL, + FlashMinor SMALLINT NOT NULL, + FlashMinor2 TEXT NOT NULL, + NetMajor SMALLINT NOT NULL, + NetMinor SMALLINT NOT NULL, + UserAgentMajor SMALLINT NOT NULL, + UserAgentMinor VARCHAR(255) NOT NULL, + CookieEnable SMALLINT NOT NULL, + JavascriptEnable SMALLINT NOT NULL, + IsMobile SMALLINT NOT NULL, + MobilePhone SMALLINT NOT NULL, + MobilePhoneModel TEXT NOT NULL, + Params TEXT NOT NULL, + IPNetworkID INTEGER NOT NULL, + TraficSourceID SMALLINT NOT NULL, + SearchEngineID SMALLINT NOT NULL, + SearchPhrase TEXT NOT NULL, + AdvEngineID SMALLINT NOT NULL, + IsArtifical SMALLINT NOT NULL, + WindowClientWidth SMALLINT NOT NULL, + WindowClientHeight SMALLINT NOT NULL, + ClientTimeZone SMALLINT NOT NULL, + ClientEventTime TIMESTAMP NOT NULL, + SilverlightVersion1 SMALLINT NOT NULL, + SilverlightVersion2 SMALLINT NOT NULL, + SilverlightVersion3 INTEGER NOT NULL, + SilverlightVersion4 SMALLINT NOT NULL, + PageCharset TEXT NOT NULL, + CodeVersion INTEGER NOT NULL, + IsLink SMALLINT NOT NULL, + IsDownload SMALLINT NOT NULL, + IsNotBounce SMALLINT NOT NULL, + FUniqID BIGINT NOT NULL, + OriginalURL TEXT NOT NULL, + HID INTEGER NOT NULL, + IsOldCounter SMALLINT NOT NULL, + IsEvent SMALLINT NOT NULL, + IsParameter SMALLINT NOT NULL, + DontCountHits SMALLINT NOT NULL, + WithHash SMALLINT NOT NULL, + HitColor CHAR NOT NULL, + LocalEventTime TIMESTAMP NOT NULL, + Age SMALLINT NOT NULL, + Sex SMALLINT NOT NULL, + Income SMALLINT NOT NULL, + Interests SMALLINT NOT NULL, + Robotness SMALLINT NOT NULL, + RemoteIP INTEGER NOT NULL, + WindowName INTEGER NOT NULL, + OpenerName INTEGER NOT NULL, + HistoryLength SMALLINT NOT NULL, + BrowserLanguage TEXT NOT NULL, + BrowserCountry TEXT NOT NULL, + SocialNetwork TEXT NOT NULL, + SocialAction TEXT NOT NULL, + HTTPError SMALLINT NOT NULL, + SendTiming INTEGER NOT NULL, + DNSTiming INTEGER NOT NULL, + ConnectTiming INTEGER NOT NULL, + ResponseStartTiming INTEGER NOT NULL, + ResponseEndTiming INTEGER NOT NULL, + FetchTiming INTEGER NOT NULL, + SocialSourceNetworkID SMALLINT NOT NULL, + SocialSourcePage TEXT NOT NULL, + ParamPrice BIGINT NOT NULL, + ParamOrderID TEXT NOT NULL, + ParamCurrency TEXT NOT NULL, + ParamCurrencyID SMALLINT NOT NULL, + OpenstatServiceName TEXT NOT NULL, + OpenstatCampaignID TEXT NOT NULL, + OpenstatAdID TEXT NOT NULL, + OpenstatSourceID TEXT NOT NULL, + UTMSource TEXT NOT NULL, + UTMMedium TEXT NOT NULL, + UTMCampaign TEXT NOT NULL, + UTMContent TEXT NOT NULL, + UTMTerm TEXT NOT NULL, + FromTag TEXT NOT NULL, + HasGCLID SMALLINT NOT NULL, + RefererHash BIGINT NOT NULL, + URLHash BIGINT NOT NULL, + CLID INTEGER NOT NULL, + PRIMARY KEY (CounterID, EventDate, UserID, EventTime, WatchID) +) +ENGINE = MergeTree; diff --git a/chdb/insert.sql b/chdb/insert.sql new file mode 100644 index 000000000..c9cbb59a1 --- /dev/null +++ b/chdb/insert.sql @@ -0,0 +1,3 @@ +INSERT INTO clickbench.hits +SELECT * +FROM file('hits.csv'); diff --git a/chdb/load.py b/chdb/load.py new file mode 100755 index 000000000..3a351a953 --- /dev/null +++ b/chdb/load.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +import timeit +import psutil +from chdb import dbapi + +con = dbapi.connect(path=".clickbench") +cur = con.cursor() + + +print("Loading the data") +start = timeit.default_timer() +cur.execute(open("create.sql").read()) +cur.execute(open("insert.sql").read()) +end = timeit.default_timer() + +print("Total time to load") +print(end - start) + +cur.close() +con.close() diff --git a/chdb/queries.sql b/chdb/queries.sql index 941a9f21a..4b05870ab 100644 --- a/chdb/queries.sql +++ b/chdb/queries.sql @@ -1,43 +1,43 @@ -SELECT COUNT(*) FROM file("hits_*.parquet", Parquet); -SELECT COUNT(*) FROM file("hits_*.parquet", Parquet) WHERE AdvEngineID <> 0; -SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet); -SELECT AVG(UserID) FROM file("hits_*.parquet", Parquet); -SELECT COUNT(DISTINCT UserID) FROM file("hits_*.parquet", Parquet); -SELECT COUNT(DISTINCT SearchPhrase) FROM file("hits_*.parquet", Parquet); -SELECT MIN(EventDate), MAX(EventDate) FROM file("hits_*.parquet", Parquet); -SELECT AdvEngineID, COUNT(*) FROM file("hits_*.parquet", Parquet) WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; -SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) GROUP BY RegionID ORDER BY u DESC LIMIT 10; -SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM file("hits_*.parquet", Parquet) GROUP BY RegionID ORDER BY c DESC LIMIT 10; -SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; -SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; -SELECT SearchPhrase, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; -SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT UserID, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; -SELECT UserID, SearchPhrase, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; -SELECT UserID, SearchPhrase, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID, SearchPhrase LIMIT 10; -SELECT UserID, extract(minute FROM toDateTime(EventTime)) AS m, SearchPhrase, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; -SELECT UserID FROM file("hits_*.parquet", Parquet) WHERE UserID = 435090932899640449; -SELECT COUNT(*) FROM file("hits_*.parquet", Parquet) WHERE URL LIKE '%google%'; -SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM file("hits_*.parquet", Parquet) WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT * FROM file("hits_*.parquet", Parquet) WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; -SELECT SearchPhrase FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; -SELECT SearchPhrase FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; -SELECT SearchPhrase FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; -SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM file("hits_*.parquet", Parquet) WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM file("hits_*.parquet", Parquet); -SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; -SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; -SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet) GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; -SELECT URL, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) GROUP BY URL ORDER BY c DESC LIMIT 10; -SELECT 1, URL, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) GROUP BY 1, URL ORDER BY c DESC LIMIT 10; -SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; -SELECT URL, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; -SELECT Title, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; -SELECT URL, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; -SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; -SELECT DATE_TRUNC('minute', toDateTime(EventTime)) AS M, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-14' AND toDate(EventDate) <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', toDateTime(EventTime)) ORDER BY DATE_TRUNC('minute', toDateTime(EventTime)) LIMIT 10 OFFSET 1000; +SELECT COUNT(*) FROM clickbench.hits; +SELECT COUNT(*) FROM clickbench.hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM clickbench.hits; +SELECT AVG(UserID) FROM clickbench.hits; +SELECT COUNT(DISTINCT UserID) FROM clickbench.hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM clickbench.hits; +SELECT MIN(EventDate), MAX(EventDate) FROM clickbench.hits; +SELECT AdvEngineID, COUNT(*) FROM clickbench.hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM clickbench.hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM clickbench.hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM clickbench.hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM clickbench.hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM clickbench.hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM clickbench.hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM clickbench.hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM clickbench.hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM clickbench.hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM clickbench.hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM clickbench.hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM clickbench.hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM clickbench.hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM clickbench.hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM clickbench.hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM clickbench.hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM clickbench.hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM clickbench.hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM clickbench.hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM clickbench.hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM clickbench.hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM clickbench.hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM clickbench.hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM clickbench.hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM clickbench.hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM clickbench.hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM clickbench.hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM clickbench.hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/chdb/query.py b/chdb/query.py index de4afc012..f00239b12 100755 --- a/chdb/query.py +++ b/chdb/query.py @@ -1,14 +1,35 @@ #!/usr/bin/env python3 -import chdb import timeit import sys +import os +import glob +from chdb import dbapi -query = sys.stdin.read() -print(query) +def delete_table(pattern): + matching_files = glob.glob(pattern) + if matching_files: + first_file = matching_files[0] + os.remove(first_file) + +def main(): + query = sys.stdin.read() + print(query) + + delete_table('table.sql') + con = dbapi.connect(path=".clickbench") + cur = con.cursor() + + for try_num in range(3): + delete_table('table.sql') + start = timeit.default_timer() + cur.execute(query) + end = timeit.default_timer() + print(end - start) + + cur.close() + con.close() + +if __name__ == "__main__": + main() -for try_num in range(3): - start = timeit.default_timer() - chdb.query(query, "Null") - end = timeit.default_timer() - print(end - start) diff --git a/chdb/results/c6a.4xlarge.json b/chdb/results/c6a.4xlarge.json index 982925fd6..31b170ae1 100644 --- a/chdb/results/c6a.4xlarge.json +++ b/chdb/results/c6a.4xlarge.json @@ -8,48 +8,48 @@ "load_time": 0, "data_size": 14737670832, "result": [ -[0.14784930900009385,0.01985562499999105,0.021213920000036524], -[0.23510076200000185,0.06982731300013256,0.06856002199992872], -[0.2806472989998383,0.10366240700000162,0.10361096500014355], -[0.4541343600001255,0.10390106999989257,0.10059215999990556], -[0.6548878610001339,0.45662324699992496,0.44750324799997543], -[0.9521819659998982,0.6404522270001962,0.6559423100000004], -[0.19805788999997276,0.08715895099999216,0.0858150569999907], -[0.19715853299999253,0.07409634200007531,0.07549481599994579], -[0.8598950299999615,0.6492269000000306,0.6434426030000395], -[1.4808940620000612,0.7829620970001088,0.7904872159999741], -[0.7163226630000281,0.29504979300008927,0.29545888900020145], -[0.7511556739998468,0.34864025900014894,0.3431280479999259], -[1.0435813250001047,0.7161780030000955,0.7318749700000353], -[2.4113232699999116,1.0243360720000965,1.0260252030000174], -[1.2562044089997926,0.8427601549999508,0.8326057299998411], -[0.8358988709999267,0.5704252309999447,0.5721753729999364], -[2.9971273280000332,1.9693855880000228,2.0168051440000454], -[2.2065102920000754,1.2419878719999815,1.206855657999995], -[5.954841275999797,3.93939489100012,3.761597123999991], -[0.3187466500000937,0.09396397000000434,0.09140732100013338], -[9.446635591000131,1.2302154330000121,1.2444675809999808], -[11.13492346199996,1.7419908340000347,1.7516095379999115], -[21.622155369999973,4.072843675000058,4.131993770000008], -[55.58301193300008,15.83693858699985,15.83159159999991], -[2.6420749609999348,0.4748529260000396,0.4266739149998102], -[0.9177854079998724,0.3910574720000568,0.38483357899986004], -[2.641131261000055,0.40119044900006884,0.40792182199993476], -[9.619309540000131,1.49244403900002,1.5195660050001152], -[10.307241965000003,9.973063463000017,9.892692844999829], -[1.4131518259998757,1.308299055000134,1.295807866999894], -[2.608687121999992,0.7656866700001501,0.7761720030000561], -[6.227150900999959,1.0895756959998835,1.0740852459998678], -[7.3493140729999595,5.250387106000062,5.316255329999876], -[10.698823475999916,4.070322148999821,4.003926844000034], -[10.69004674499979,4.052407979999998,3.9830662209999446], -[1.2015532409998286,0.9601744899998721,0.9185318380000353], -[0.29470233899996856,0.1431545279999682,0.14107450999995308], -[0.2372298170000704,0.09401625999998942,0.09287700000004406], -[0.18732362600007946,0.08822406300009789,0.09083071099985318], -[0.4340662390000034,0.26780105800003184,0.2632765340001697], -[0.1738416400000915,0.06409417200006828,0.06505390999996052], -[0.18030453799997304,0.057552695999902426,0.05883049899989601], -[0.1795836840001357,0.1313909179998518,0.06348699399995894] +[0.14022313399982522,0.07355542799996329,0.07344153099984396], +[0.3531703529997685,0.07925550599975395,0.07806606800022564], +[0.26351363100002345,0.09193721700012247,0.1917902549998871], +[0.38363148100006583,0.10103324399960911,0.09784656199963138], +[0.9900537669996083,0.5613570460000119,0.5433893519998492], +[1.2167202540003927,0.7107847610000135,0.663691573000051], +[0.2699314280002909,0.2127854670002307,0.09261755900024582], +[0.267586774999927,0.08097015600014856,0.1945969809999042], +[1.232095701000162,0.7143670550003662,0.7134953560002941], +[1.5054283530002976,0.7469580990000395,0.7727119840001251], +[0.7220912419998058,0.23418598399985058,0.22906905100035146], +[0.7720030830000724,0.2630191090001972,0.2604900350002026], +[1.5689540900002612,0.7868708519999927,0.8063358419999531], +[2.3660360549997677,1.2540504009998585,1.192564518999916], +[1.9073773019999862,0.9206097809997118,0.9538239580001573], +[1.1452982619998693,0.6703060720001304,0.6640104650000467], +[3.2011426869999013,2.391980210999918,2.273702514999968], +[2.4969228180002574,1.4384517379999124,1.3977177189999566], +[6.500014078999811,4.478574523000134,4.411389963000147], +[0.4156887280000774,0.08146456199983731,0.07878659600009996], +[10.376145662999988,0.695910000999902,0.7210108459998992], +[11.976330335000057,0.953245059999972,0.8184917880003013], +[13.305014652999944,1.2751543530002891,1.2672313510001914], +[36.8242805750001,3.0744901809998737,3.0243863149999015], +[2.475535126000068,0.2707847709998532,0.2684428269999444], +[0.949858046999907,0.2340432820001297,0.22373586100002285], +[2.516790250000213,0.26695283199978803,0.25893610200000694], +[10.64464905199975,0.8958689270002651,0.8061138169996411], +[13.404723361999913,11.371307846000036,11.63507005700012], +[2.7593700480001644,2.7037539289999586,2.7058590489996277], +[2.3854017950002344,0.6277965969998149,0.654272507999849], +[5.341114754000046,0.9585087829996155,0.8893770189997667], +[7.1543934960000115,5.363972212000135,5.548733007999999], +[12.195872321000024,4.450395718999971,4.3755110330002935], +[12.317952603999856,4.24272665799981,4.316219603999798], +[1.7115063109999937,1.2473451579999164,1.326466134000384], +[0.2809372130000156,0.23459212699981435,0.11978739599999244], +[0.4466324979998717,0.09181159999980082,0.09241002299995671], +[0.3804757940001764,0.08911344600028315,0.08342132999996466], +[0.307677596999838,0.16184386700024334,0.1668336709999494], +[0.3277544040001885,0.0886090859999058,0.08678907300009087], +[0.28715330100021674,0.07937189900030717,0.0784708450000835], +[0.204657553999823,0.07808052500013218,0.07627645000002303], ] } diff --git a/chdb/results/c6a.metal.json b/chdb/results/c6a.metal.json index f0fea8a33..81481cc4b 100644 --- a/chdb/results/c6a.metal.json +++ b/chdb/results/c6a.metal.json @@ -8,48 +8,48 @@ "load_time": 0, "data_size": 14737670832, "result": [ -[0.18159093600002052,0.03465310399997179,0.03535941499990258], -[0.2831564299999627,0.07290863800017178,0.07088584300004186], -[0.4106436740000845,0.11962774199992054,0.11457044600001609], -[0.47319400800006406,0.11146962500015434,0.11445495399993888], -[1.6001970140000594,1.1832852380000531,1.2053231469999446], -[1.9747222290000082,1.3993265449998944,1.4632436899998993], -[0.293016236000085,0.10769365200007996,0.10403609099989808], -[0.30888328800006093,0.08704660199987302,0.08690985099997306], -[0.9676274400001148,0.4142185770001561,0.4095282320001843], -[1.508716829999912,0.4386639290000858,0.42993967000006705], -[0.7211182660000759,0.22335372600014125,0.22127814999998918], -[0.8024834689999807,0.24581066700011434,0.2369266250000237], -[0.9941888450000533,0.34986996500015266,0.36299695699995027], -[2.2883914159999676,0.5018040939999082,0.39797729199995047], -[1.0599523600001248,0.39205417400012266,0.3990406109999185], -[0.614120850999825,0.26384819800000514,0.27866645900007825], -[2.468679433000034,0.7421945380001489,0.707125589000043], -[2.390665940999952,0.6761940030000915,0.6212907079998331], -[4.531502312999919,1.2391869979999228,1.2929706920001536], -[0.41148081200003617,0.08244017399988479,0.07748579300005076], -[9.470291036999924,0.4142645150000135,0.42491283499998644], -[11.166745869999886,0.4193407879999995,0.47778978899987123], -[21.716601035000167,0.8906946930001141,0.9057610759998624], -[55.87572047799995,4.693745457999967,4.408192557999882], -[2.6822018080001726,0.2497673709999617,0.18029253800000333], -[0.9117738509999072,0.19944890400006443,0.19231710099984411], -[2.6458359000000655,0.19551512300017748,0.17991257200014843], -[9.665634246000081,0.644791166999994,0.5677360409999892], -[8.823865798999805,1.8234754839998004,1.911617729999989], -[0.887859853000009,0.7314614570000231,0.7472309880001831], -[2.609201996000138,0.3776117290001366,0.42668743700005507], -[6.085526299000094,0.5356784809998771,0.5014943220000987], -[5.348554426000192,1.4791098619998593,1.6584785480001756], -[9.919410929999913,1.198197147999963,1.3908020020001004], -[9.918529573000114,1.4790268179999657,1.2929598839998562], -[0.5918198439999287,0.36387553300005493,0.3893138729999919], -[0.353354839000076,0.15782711399992877,0.17798462199993992], -[0.3100515420001102,0.16545474899999135,0.10927909599990926], -[0.3464544030000525,0.19426173600004404,0.10704828600000837], -[0.5001845439999215,0.28064504200006013,0.28905982800006313], -[0.3150378160000855,0.12564524300000812,0.08628042499981348], -[0.24802713799999765,0.1437892629999169,0.0803308869999455], -[0.2594364960000348,0.2181738829999631,0.08233647700012625] +[0.1398201020000016,0.06791341200000289,0.07391149699999744], +[0.2597099600000021,0.2069418270000014,0.08758834399999671], +[0.47551589900000124,0.09728062599999987,0.09487701800000536], +[0.4691224370000029,0.1740481620000054,0.10000902900000597], +[2.0630773329999954,1.7024521199999896,1.6643195909999946], +[2.3481535089999994,1.6481386580000077,1.7530449890000028], +[0.357762754999996,0.09266164299999957,0.09053794000000437], +[0.4038841520000034,0.10313489499999662,0.10715072499999678], +[1.0590748890000015,0.47724499000000264,0.45966836799999555], +[1.1300884690000004,0.48051896699999475,0.46993974000000094], +[0.775603744999998,0.19686701099999482,0.1892493929999972], +[0.6906902610000003,0.21718392299999323,0.21430437200000085], +[1.3169523250000026,0.4156535830000081,0.3755140890000064], +[2.036119763000002,0.5071473900000001,0.5511740860000032], +[1.383330911999991,0.3799174090000008,0.3751193639999997], +[1.019473775999998,0.3739846369999924,0.35971603200000857], +[2.4504302189999976,0.877971260999999,0.7683858299999997], +[2.3557413229999895,0.8719610389999986,0.683793068], +[4.092770959999996,1.7129616520000184,1.297148031000006], +[0.35451071500000353,0.09219286999999099,0.08560756100001754], +[9.639714029000004,0.3707775869999921,0.23417643499999485], +[11.064303246999998,0.324648658000001,0.4172894439999766], +[13.232740172999996,0.4640560629999868,0.3877608140000177], +[35.72404554799999,1.0838868939999884,0.9306462309999972], +[2.364133627000001,0.154316730000005,0.14268514899998763], +[0.9073242750000077,0.14350597200001403,0.14364811400000121], +[2.3429148049999924,0.272847009000003,0.14734456700000464], +[9.656295885999981,0.6313464649999787,0.49251017000000274], +[8.370034843999974,1.9595999350000284,1.734505702999968], +[0.904578416999982,0.7361198660000241,0.6462295329999961], +[2.0744172760000197,0.29773619299999154,0.3026944260000164], +[4.899459804999992,0.4551924770000255,0.4221328589999871], +[4.543115612000008,2.2998050609999723,2.2794130400000086], +[10.537130942999966,1.4773695650000036,1.4047146670000075], +[10.415939505999972,1.4964427190000151,1.516837157999987], +[1.1674407470000006,0.5462833650000221,0.5045760180000229], +[0.4713924260000226,0.1254563469999539,0.1255517769999983], +[0.24476967099997182,0.2168840989999694,0.09768120500001487], +[0.3363335870000128,0.08840301899999758,0.09693554400001858], +[0.30560961300000145,0.2762818190000189,0.15135863000000427], +[0.2204463159999932,0.09077218100003392,0.08913131299999577], +[0.31145180899994784,0.0834306520000041,0.08373748599996134], +[0.30414350099999865,0.08266997800001263,0.08533381199998757], ] } From 69adece818de6b731b44e1749e06f746540b63e5 Mon Sep 17 00:00:00 2001 From: Pradeep Chhetri Date: Thu, 20 Jun 2024 17:07:54 +0800 Subject: [PATCH 2/2] Moved previous chdb to chdb-parquet --- chdb-parquet/benchmark.sh | 14 +++++++ chdb-parquet/queries.sql | 43 +++++++++++++++++++++ chdb-parquet/query.py | 14 +++++++ chdb-parquet/results/c6a.4xlarge.json | 55 +++++++++++++++++++++++++++ chdb-parquet/results/c6a.metal.json | 55 +++++++++++++++++++++++++++ chdb-parquet/run.sh | 8 ++++ chdb/results/c6a.4xlarge.json | 2 +- chdb/results/c6a.metal.json | 2 +- 8 files changed, 191 insertions(+), 2 deletions(-) create mode 100755 chdb-parquet/benchmark.sh create mode 100644 chdb-parquet/queries.sql create mode 100755 chdb-parquet/query.py create mode 100644 chdb-parquet/results/c6a.4xlarge.json create mode 100644 chdb-parquet/results/c6a.metal.json create mode 100755 chdb-parquet/run.sh diff --git a/chdb-parquet/benchmark.sh b/chdb-parquet/benchmark.sh new file mode 100755 index 000000000..3c70defd6 --- /dev/null +++ b/chdb-parquet/benchmark.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Install + +sudo apt-get update +sudo apt-get install -y python3-pip +pip install chdb psutil + +# Load the data +seq 0 99 | xargs -P100 -I{} bash -c 'wget --no-verbose --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' + +# Run the queries + +./run.sh 2>&1 | tee log.txt diff --git a/chdb-parquet/queries.sql b/chdb-parquet/queries.sql new file mode 100644 index 000000000..941a9f21a --- /dev/null +++ b/chdb-parquet/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM file("hits_*.parquet", Parquet); +SELECT COUNT(*) FROM file("hits_*.parquet", Parquet) WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet); +SELECT AVG(UserID) FROM file("hits_*.parquet", Parquet); +SELECT COUNT(DISTINCT UserID) FROM file("hits_*.parquet", Parquet); +SELECT COUNT(DISTINCT SearchPhrase) FROM file("hits_*.parquet", Parquet); +SELECT MIN(EventDate), MAX(EventDate) FROM file("hits_*.parquet", Parquet); +SELECT AdvEngineID, COUNT(*) FROM file("hits_*.parquet", Parquet) WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM file("hits_*.parquet", Parquet) GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM toDateTime(EventTime)) AS m, SearchPhrase, COUNT(*) FROM file("hits_*.parquet", Parquet) GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM file("hits_*.parquet", Parquet) WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM file("hits_*.parquet", Parquet) WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM file("hits_*.parquet", Parquet) WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM file("hits_*.parquet", Parquet) WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM file("hits_*.parquet", Parquet) WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM file("hits_*.parquet", Parquet); +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet) WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM file("hits_*.parquet", Parquet) GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM file("hits_*.parquet", Parquet) GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', toDateTime(EventTime)) AS M, COUNT(*) AS PageViews FROM file("hits_*.parquet", Parquet) WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-14' AND toDate(EventDate) <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', toDateTime(EventTime)) ORDER BY DATE_TRUNC('minute', toDateTime(EventTime)) LIMIT 10 OFFSET 1000; diff --git a/chdb-parquet/query.py b/chdb-parquet/query.py new file mode 100755 index 000000000..de4afc012 --- /dev/null +++ b/chdb-parquet/query.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 + +import chdb +import timeit +import sys + +query = sys.stdin.read() +print(query) + +for try_num in range(3): + start = timeit.default_timer() + chdb.query(query, "Null") + end = timeit.default_timer() + print(end - start) diff --git a/chdb-parquet/results/c6a.4xlarge.json b/chdb-parquet/results/c6a.4xlarge.json new file mode 100644 index 000000000..982925fd6 --- /dev/null +++ b/chdb-parquet/results/c6a.4xlarge.json @@ -0,0 +1,55 @@ +{ + "system": "chDB", + "date": "2023-12-03", + "machine": "c6a.4xlarge, 500gb gp2", + "cluster_size": 1, + "comment": "", + "tags": ["C++", "column-oriented", "ClickHouse derivative", "embedded", "stateless", "serverless"], + "load_time": 0, + "data_size": 14737670832, + "result": [ +[0.14784930900009385,0.01985562499999105,0.021213920000036524], +[0.23510076200000185,0.06982731300013256,0.06856002199992872], +[0.2806472989998383,0.10366240700000162,0.10361096500014355], +[0.4541343600001255,0.10390106999989257,0.10059215999990556], +[0.6548878610001339,0.45662324699992496,0.44750324799997543], +[0.9521819659998982,0.6404522270001962,0.6559423100000004], +[0.19805788999997276,0.08715895099999216,0.0858150569999907], +[0.19715853299999253,0.07409634200007531,0.07549481599994579], +[0.8598950299999615,0.6492269000000306,0.6434426030000395], +[1.4808940620000612,0.7829620970001088,0.7904872159999741], +[0.7163226630000281,0.29504979300008927,0.29545888900020145], +[0.7511556739998468,0.34864025900014894,0.3431280479999259], +[1.0435813250001047,0.7161780030000955,0.7318749700000353], +[2.4113232699999116,1.0243360720000965,1.0260252030000174], +[1.2562044089997926,0.8427601549999508,0.8326057299998411], +[0.8358988709999267,0.5704252309999447,0.5721753729999364], +[2.9971273280000332,1.9693855880000228,2.0168051440000454], +[2.2065102920000754,1.2419878719999815,1.206855657999995], +[5.954841275999797,3.93939489100012,3.761597123999991], +[0.3187466500000937,0.09396397000000434,0.09140732100013338], +[9.446635591000131,1.2302154330000121,1.2444675809999808], +[11.13492346199996,1.7419908340000347,1.7516095379999115], +[21.622155369999973,4.072843675000058,4.131993770000008], +[55.58301193300008,15.83693858699985,15.83159159999991], +[2.6420749609999348,0.4748529260000396,0.4266739149998102], +[0.9177854079998724,0.3910574720000568,0.38483357899986004], +[2.641131261000055,0.40119044900006884,0.40792182199993476], +[9.619309540000131,1.49244403900002,1.5195660050001152], +[10.307241965000003,9.973063463000017,9.892692844999829], +[1.4131518259998757,1.308299055000134,1.295807866999894], +[2.608687121999992,0.7656866700001501,0.7761720030000561], +[6.227150900999959,1.0895756959998835,1.0740852459998678], +[7.3493140729999595,5.250387106000062,5.316255329999876], +[10.698823475999916,4.070322148999821,4.003926844000034], +[10.69004674499979,4.052407979999998,3.9830662209999446], +[1.2015532409998286,0.9601744899998721,0.9185318380000353], +[0.29470233899996856,0.1431545279999682,0.14107450999995308], +[0.2372298170000704,0.09401625999998942,0.09287700000004406], +[0.18732362600007946,0.08822406300009789,0.09083071099985318], +[0.4340662390000034,0.26780105800003184,0.2632765340001697], +[0.1738416400000915,0.06409417200006828,0.06505390999996052], +[0.18030453799997304,0.057552695999902426,0.05883049899989601], +[0.1795836840001357,0.1313909179998518,0.06348699399995894] + ] +} diff --git a/chdb-parquet/results/c6a.metal.json b/chdb-parquet/results/c6a.metal.json new file mode 100644 index 000000000..f0fea8a33 --- /dev/null +++ b/chdb-parquet/results/c6a.metal.json @@ -0,0 +1,55 @@ +{ + "system": "chDB", + "date": "2023-12-03", + "machine": "c6a.metal, 500gb gp2", + "cluster_size": 1, + "comment": "", + "tags": ["C++", "column-oriented", "ClickHouse derivative", "embedded", "stateless", "serverless"], + "load_time": 0, + "data_size": 14737670832, + "result": [ +[0.18159093600002052,0.03465310399997179,0.03535941499990258], +[0.2831564299999627,0.07290863800017178,0.07088584300004186], +[0.4106436740000845,0.11962774199992054,0.11457044600001609], +[0.47319400800006406,0.11146962500015434,0.11445495399993888], +[1.6001970140000594,1.1832852380000531,1.2053231469999446], +[1.9747222290000082,1.3993265449998944,1.4632436899998993], +[0.293016236000085,0.10769365200007996,0.10403609099989808], +[0.30888328800006093,0.08704660199987302,0.08690985099997306], +[0.9676274400001148,0.4142185770001561,0.4095282320001843], +[1.508716829999912,0.4386639290000858,0.42993967000006705], +[0.7211182660000759,0.22335372600014125,0.22127814999998918], +[0.8024834689999807,0.24581066700011434,0.2369266250000237], +[0.9941888450000533,0.34986996500015266,0.36299695699995027], +[2.2883914159999676,0.5018040939999082,0.39797729199995047], +[1.0599523600001248,0.39205417400012266,0.3990406109999185], +[0.614120850999825,0.26384819800000514,0.27866645900007825], +[2.468679433000034,0.7421945380001489,0.707125589000043], +[2.390665940999952,0.6761940030000915,0.6212907079998331], +[4.531502312999919,1.2391869979999228,1.2929706920001536], +[0.41148081200003617,0.08244017399988479,0.07748579300005076], +[9.470291036999924,0.4142645150000135,0.42491283499998644], +[11.166745869999886,0.4193407879999995,0.47778978899987123], +[21.716601035000167,0.8906946930001141,0.9057610759998624], +[55.87572047799995,4.693745457999967,4.408192557999882], +[2.6822018080001726,0.2497673709999617,0.18029253800000333], +[0.9117738509999072,0.19944890400006443,0.19231710099984411], +[2.6458359000000655,0.19551512300017748,0.17991257200014843], +[9.665634246000081,0.644791166999994,0.5677360409999892], +[8.823865798999805,1.8234754839998004,1.911617729999989], +[0.887859853000009,0.7314614570000231,0.7472309880001831], +[2.609201996000138,0.3776117290001366,0.42668743700005507], +[6.085526299000094,0.5356784809998771,0.5014943220000987], +[5.348554426000192,1.4791098619998593,1.6584785480001756], +[9.919410929999913,1.198197147999963,1.3908020020001004], +[9.918529573000114,1.4790268179999657,1.2929598839998562], +[0.5918198439999287,0.36387553300005493,0.3893138729999919], +[0.353354839000076,0.15782711399992877,0.17798462199993992], +[0.3100515420001102,0.16545474899999135,0.10927909599990926], +[0.3464544030000525,0.19426173600004404,0.10704828600000837], +[0.5001845439999215,0.28064504200006013,0.28905982800006313], +[0.3150378160000855,0.12564524300000812,0.08628042499981348], +[0.24802713799999765,0.1437892629999169,0.0803308869999455], +[0.2594364960000348,0.2181738829999631,0.08233647700012625] + ] +} diff --git a/chdb-parquet/run.sh b/chdb-parquet/run.sh new file mode 100755 index 000000000..64df8c608 --- /dev/null +++ b/chdb-parquet/run.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +cat queries.sql | while read query; do + sync + echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null + + ./query.py <<< "${query}" +done diff --git a/chdb/results/c6a.4xlarge.json b/chdb/results/c6a.4xlarge.json index 31b170ae1..cf10ee1a4 100644 --- a/chdb/results/c6a.4xlarge.json +++ b/chdb/results/c6a.4xlarge.json @@ -5,7 +5,7 @@ "cluster_size": 1, "comment": "", "tags": ["C++", "column-oriented", "ClickHouse derivative", "embedded", "stateless", "serverless"], - "load_time": 0, + "load_time": 637, "data_size": 14737670832, "result": [ [0.14022313399982522,0.07355542799996329,0.07344153099984396], diff --git a/chdb/results/c6a.metal.json b/chdb/results/c6a.metal.json index 81481cc4b..1f8547da1 100644 --- a/chdb/results/c6a.metal.json +++ b/chdb/results/c6a.metal.json @@ -5,7 +5,7 @@ "cluster_size": 1, "comment": "", "tags": ["C++", "column-oriented", "ClickHouse derivative", "embedded", "stateless", "serverless"], - "load_time": 0, + "load_time": 584, "data_size": 14737670832, "result": [ [0.1398201020000016,0.06791341200000289,0.07391149699999744],