
Commit 5ff9972

Include ClickHouse beta JSON type results in super command doc (#5513)
Run super cmd perf queries with ClickHouse JSON type
1 parent 26269cc commit 5ff9972

12 files changed (+888 −162 lines)

docs/commands/super.md (+311 −150)

Large diff not rendered by default.

scripts/super-cmd-perf/README.md (+5)

@@ -37,6 +37,11 @@ The run proceeds in three phases:
 2. Test data is downloaded and loaded into needed storage formats
 3. Queries are executed on all data platforms
 
+The scripts only run with ClickHouse's [beta JSON type](https://clickhouse.com/blog/a-new-powerful-json-data-type-for-clickhouse)
+on AWS because when we attempted to load data to this type on our Macbooks
+that have 16 GB of RAM it consistently failed with a "too many open files"
+error.
+
 As the benchmarks may take a long time to run, the use of [`screen`](https://www.gnu.org/software/screen/)
 or a similar "detachable" terminal tool is recommended in case your remote
 network connection drops during a run.
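
Since the README recommends `screen` for long-running benchmarks, here is a minimal sketch of starting a run on a remote host. The exact invocation is not shown in this commit, so the assumptions are labeled below: that `benchmark.sh` in this directory is the entry point and takes no arguments.

  # start a detachable session; reattach after a dropped connection with `screen -r perf`
  screen -S perf
  # assumption: the benchmark is launched by running the script directly, with no arguments
  ./scripts/super-cmd-perf/benchmark.sh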

scripts/super-cmd-perf/benchmark.sh (+13 −10)

@@ -22,16 +22,6 @@ if command -v dmidecode && [ "$(sudo dmidecode --string system-uuid | cut -c1-3)
   echo 'export TMPDIR="/mnt/tmpdir"' >> "$HOME"/.profile
   mkdir /mnt/tmpdir
 
-  # Install ClickHouse
-  if ! command -v clickhouse-client > /dev/null 2>&1; then
-    sudo apt-get install -y apt-transport-https ca-certificates curl gnupg
-    curl -fsSL 'https://packages.clickhouse.com/rpm/lts/repodata/repomd.xml.key' | sudo gpg --dearmor -o /usr/share/keyrings/clickhouse-keyring.gpg
-    echo "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb stable main" | sudo tee \
-      /etc/apt/sources.list.d/clickhouse.list
-    sudo apt-get update
-    sudo DEBIAN_FRONTEND=noninteractive apt-get install -y clickhouse-client
-  fi
-
   # Install DuckDB
   if ! command -v duckdb > /dev/null 2>&1; then
     curl -L -O https://github.com/duckdb/duckdb/releases/download/v1.1.3/duckdb_cli-linux-amd64.zip
@@ -69,6 +59,19 @@ if command -v dmidecode && [ "$(sudo dmidecode --string system-uuid | cut -c1-3)
 
   cd scripts/super-cmd-perf
 
+  # Install ClickHouse
+  if ! command -v clickhouse-client > /dev/null 2>&1; then
+    sudo apt-get install -y apt-transport-https ca-certificates curl gnupg
+    curl -fsSL 'https://packages.clickhouse.com/rpm/lts/repodata/repomd.xml.key' | sudo gpg --dearmor -o /usr/share/keyrings/clickhouse-keyring.gpg
+    echo "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb stable main" | sudo tee \
+      /etc/apt/sources.list.d/clickhouse.list
+    sudo apt-get update
+    sudo DEBIAN_FRONTEND=noninteractive apt-get install -y clickhouse-server clickhouse-client
+    sudo cp clickhouse-storage.xml /etc/clickhouse-server/config.d
+    sudo systemctl stop clickhouse-server
+    sudo systemctl disable clickhouse-server.service
+  fi
+
 fi
 
 rundir="$(date +%F_%T)"

scripts/super-cmd-perf/clickhouse-storage.xml (new file, +5)

@@ -0,0 +1,5 @@
+<clickhouse>
+  <custom_cached_disks_base_directory>/mnt/clickhouse/caches/</custom_cached_disks_base_directory>
+  <path>/mnt/clickhouse/</path>
+  <tmp_path>/mnt/clickhouse/tmp/</tmp_path>
+</clickhouse>

scripts/super-cmd-perf/clickhouse-table-create.sql (new file, +3)

@@ -0,0 +1,3 @@
+SET enable_json_type = 1;
+CREATE TABLE gha (v JSON) ENGINE MergeTree() ORDER BY tuple();
+INSERT INTO gha SELECT * FROM file('gharchive_gz/*.json.gz', JSONAsObject);
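
For context, the new prep-data.sh steps further below run this SQL file against a briefly started local server. A condensed sketch of that flow (the verbatim commands appear in the prep-data.sh diff):

  # start the ClickHouse server that benchmark.sh installs but leaves disabled
  sudo systemctl start clickhouse-server
  # create the JSON-typed gha table and ingest the gharchive data
  clickhouse-client < clickhouse-table-create.sql
  # stop the server again once loading is done
  sudo systemctl stop clickhouse-server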

scripts/super-cmd-perf/prep-data.sh (+14)

@@ -11,6 +11,7 @@ mkdir -p "$rundir"
 
 RUNNING_ON_AWS_EC2="${RUNNING_ON_AWS_EC2:-}"
 if [ -n "$RUNNING_ON_AWS_EC2" ]; then
+  cp clickhouse-table-create.sql /mnt
   cd /mnt
 fi
 
@@ -55,4 +56,17 @@ run_cmd \
   "$rundir/super-bsup-create.out" \
   "super -o gha.bsup gharchive_gz/*.json.gz"
 
+if [ -n "$RUNNING_ON_AWS_EC2" ]; then
+  sudo mkdir -p /var/lib/clickhouse/user_files
+  sudo chown clickhouse:clickhouse /var/lib/clickhouse/user_files
+  sudo ln -s /mnt/gharchive_gz /var/lib/clickhouse/user_files/gharchive_gz
+  sudo systemctl start clickhouse-server
+  sleep 5
+  run_cmd \
+    "$rundir/clickhouse-table-create.out" \
+    "clickhouse-client < clickhouse-table-create.sql"
+  sudo systemctl stop clickhouse-server
+  du -h clickhouse/store
+fi
+
 du -h gha.db gha.parquet gha.bsup gharchive_gz

scripts/super-cmd-perf/queries/agg-clickhouse-db.sql (new file, +5)

@@ -0,0 +1,5 @@
+SET allow_suspicious_types_in_group_by = 1;
+SELECT count(),v.type
+FROM '__SOURCE__'
+WHERE v.repo.name='duckdb/duckdb'
+GROUP BY v.type

scripts/super-cmd-perf/queries/count-clickhouse-db.sql (new file, +3)

@@ -0,0 +1,3 @@
+SELECT count()
+FROM '__SOURCE__'
+WHERE v.actor.login='johnbieren'

scripts/super-cmd-perf/queries/search+-clickhouse-db.sql (new file, +489)

Large diff not rendered by default.

scripts/super-cmd-perf/queries/search-clickhouse-db.sql (new file, +3)

@@ -0,0 +1,3 @@
+SELECT count()
+FROM '__SOURCE__'
+WHERE v.payload.pull_request.body LIKE '%in case you have any feedback 😊%'

scripts/super-cmd-perf/queries/union-clickhouse-db.sql (new file, +13)

@@ -0,0 +1,13 @@
+WITH assignees AS (
+  SELECT v.payload.pull_request.assignee.login assignee
+  FROM '__SOURCE__'
+  UNION ALL
+  SELECT arrayJoin(v.payload.pull_request.assignees).login assignee
+  FROM '__SOURCE__'
+)
+SELECT assignee, count(*) count
+FROM assignees
+WHERE assignee IS NOT NULL
+GROUP BY assignee
+ORDER BY count DESC
+LIMIT 5

scripts/super-cmd-perf/run-queries.sh (+24 −2)

@@ -55,8 +55,8 @@ function run_query {
     cmd="$cmd < $final_query"
   elif [ "$cmd" == "datafusion" ]; then
     cmd="datafusion-cli --file $final_query"
-  elif [ "$cmd" == "clickhouse" ]; then
-    cmd="clickhouse --queries-file $final_query"
+  elif [[ "$cmd" == "clickhouse"* ]]; then
+    cmd="$cmd --queries-file $final_query"
   fi
 
   echo -e "About to execute\n================\n$cmd\n\nWith query\n==========" > "$outputfile"
@@ -144,3 +144,25 @@
 done
 echo >> "$report"
 echo >> "$csv_report"
+
+if [ -n "$RUNNING_ON_AWS_EC2" ]; then
+  sudo systemctl start clickhouse-server
+  echo -n "|\`clickhouse\`|\`db\`|" >> "$report"
+  echo -n "clickhouse,db" >> "$csv_report"
+  for queryfile in search-clickhouse-db.sql search+-clickhouse-db.sql count-clickhouse-db.sql agg-clickhouse-db.sql union-clickhouse-db.sql
+  do
+    if [ "$queryfile" == "union-clickhouse-db.sql" ]; then
+      echo -n "N/A|" >> "$report"
+      echo -n ",N/A" >> "$csv_report"
+      continue
+    fi
+    run_query clickhouse-client $queryfile gha
+    result=$(grep Time < "$rundir/clickhouse-client-$queryfile-$source.out" | awk '{ print $4 }')
+    echo -n "$result" >> "$report"
+    echo -n "|" >> "$report"
+    echo -n ",$result" >> "$csv_report"
+  done
+  sudo systemctl stop clickhouse-server
+  echo >> "$report"
+  echo >> "$csv_report"
+fi
