|
| 1 | +## PostgreSQL hll 在留存、UV统计中的通用用法 |
| 2 | + |
| 3 | +### 作者 |
| 4 | +digoal |
| 5 | + |
| 6 | +### 日期 |
| 7 | +2020-06-10 |
| 8 | + |
| 9 | +### 标签 |
| 10 | +PostgreSQL , hll , uv , 估值 |
| 11 | + |
| 12 | +---- |
| 13 | + |
| 14 | +## 背景 |
| 15 | +留存评估, 评估每个留存日的用户数. 抛去其他维度的查询条件, 对比三种方案的效率. |
| 16 | + |
| 17 | +例如第一次登陆为6-1, 那么在6-2, 6-6登陆表示1,5日留存. |
| 18 | + |
| 19 | +目的是统计每个留存日的用户数. |
| 20 | + |
| 21 | +### 方案1 : |
| 22 | + |
| 23 | +``` |
| 24 | +-- 100万用户, 每个用户100个number (52核 pg 12) |
| 25 | +CREATE TABLE user_retain ( |
| 26 | + user_id serial PRIMARY KEY, |
| 27 | + fst_login_date date, |
| 28 | + pay_retained_num int [] -- 数组存储, 表示留存日. |
| 29 | +); |
| 30 | + |
| 31 | +create or replace function gen_rand() returns int[] as $$ |
| 32 | + select array_AGG((ceil(random()*365)::int)) from generate_series(1,100); |
| 33 | +$$ language sql strict; |
| 34 | + |
| 35 | +insert into user_retain select generate_series(1,1000000), now(), gen_rand(); |
| 36 | +``` |
| 37 | + |
| 38 | +``` |
| 39 | +explain (analyze,verbose,timing,costs,buffers) |
| 40 | +select unnest(pay_retained_num),count(*) from user_retain group by 1; |
| 41 | +``` |
| 42 | + |
| 43 | +18秒 |
| 44 | + |
| 45 | +为什么要18秒? |
| 46 | + |
| 47 | +unnest 后没发立即并行, 只是function并行, 并没有并行聚合操作, 反而因为需要gather导致并行比不并行还慢. 当然这个我认为PG是可以在内核层面优化的. |
| 48 | + |
| 49 | +``` |
| 50 | + QUERY PLAN |
| 51 | +------------------------------------------------------------------------------------------------------------------------------------------------------- |
| 52 | + HashAggregate (cost=70326.50..86732.89 rows=1000008 width=12) (actual time=29802.202..29806.633 rows=365 loops=1) |
| 53 | + Output: (unnest(pay_retained_num)), count(*) |
| 54 | + Group Key: (unnest(user_retain.pay_retained_num)) |
| 55 | + Planned Partitions: 8 |
| 56 | + Peak Memory Usage: 6193 kB |
| 57 | + Buffers: shared read=58824 |
| 58 | + -> Gather (cost=0.00..61420.18 rows=1000008 width=4) (actual time=4.392..16521.356 rows=100000000 loops=1) |
| 59 | + Output: (unnest(pay_retained_num)) |
| 60 | + Workers Planned: 26 |
| 61 | + Workers Launched: 26 |
| 62 | + Buffers: shared read=58824 |
| 63 | + -> ProjectSet (cost=0.00..61420.18 rows=384620 width=4) (actual time=0.040..284.310 rows=3846154 loops=26) |
| 64 | + Output: unnest(pay_retained_num) |
| 65 | + Buffers: shared read=58824 |
| 66 | + Worker 0: actual time=0.035..226.005 rows=3107600 loops=1 |
| 67 | + Buffers: shared read=1828 |
| 68 | + Worker 1: actual time=0.037..327.656 rows=4547500 loops=1 |
| 69 | + Buffers: shared read=2675 |
| 70 | + Worker 2: actual time=0.036..512.300 rows=6907100 loops=1 |
| 71 | + Buffers: shared read=4063 |
| 72 | +...... |
| 73 | + Buffers: shared read=1908 |
| 74 | + Worker 22: actual time=0.038..262.740 rows=3689000 loops=1 |
| 75 | + Buffers: shared read=2170 |
| 76 | + Worker 23: actual time=0.040..203.822 rows=2828800 loops=1 |
| 77 | + Buffers: shared read=1664 |
| 78 | + Worker 24: actual time=0.042..161.236 rows=2249100 loops=1 |
| 79 | + Buffers: shared read=1323 |
| 80 | + Worker 25: actual time=0.046..340.726 rows=4622300 loops=1 |
| 81 | + Buffers: shared read=2719 |
| 82 | + -> Parallel Seq Scan on public.user_retain (cost=0.00..59208.62 rows=38462 width=424) (actual time=0.033..25.329 rows=38462 loops=26) |
| 83 | + Output: user_id, fst_login_date, pay_retained_num |
| 84 | + Buffers: shared read=58824 |
| 85 | + Worker 0: actual time=0.029..21.766 rows=31076 loops=1 |
| 86 | + Buffers: shared read=1828 |
| 87 | +...... |
| 88 | + Buffers: shared read=1664 |
| 89 | + Worker 24: actual time=0.035..16.011 rows=22491 loops=1 |
| 90 | + Buffers: shared read=1323 |
| 91 | + Worker 25: actual time=0.036..30.783 rows=46223 loops=1 |
| 92 | + Buffers: shared read=2719 |
| 93 | + Planning Time: 0.265 ms |
| 94 | + Buffers: shared hit=34 read=4 |
| 95 | + Execution Time: 29809.325 ms |
| 96 | +(124 rows) |
| 97 | +``` |
| 98 | + |
| 99 | +``` |
| 100 | +postgres=# explain (analyze,verbose,timing,costs,buffers) |
| 101 | +select unnest(pay_retained_num),count(*) from user_retain group by 1; |
| 102 | + QUERY PLAN |
| 103 | +-------------------------------------------------------------------------------------------------------------------------------------------- |
| 104 | + HashAggregate (cost=215387.75..379451.57 rows=10000080 width=12) (actual time=18522.487..18524.227 rows=365 loops=1) |
| 105 | + Output: (unnest(pay_retained_num)), count(*) |
| 106 | + Group Key: unnest(user_retain.pay_retained_num) |
| 107 | + Planned Partitions: 64 |
| 108 | + Peak Memory Usage: 6193 kB |
| 109 | + Buffers: shared hit=58824 |
| 110 | + -> ProjectSet (cost=0.00..126324.54 rows=10000080 width=4) (actual time=0.012..6471.708 rows=100000000 loops=1) |
| 111 | + Output: unnest(pay_retained_num) |
| 112 | + Buffers: shared hit=58824 |
| 113 | + -> Seq Scan on public.user_retain (cost=0.00..68824.08 rows=1000008 width=424) (actual time=0.009..102.847 rows=1000000 loops=1) |
| 114 | + Output: user_id, fst_login_date, pay_retained_num |
| 115 | + Buffers: shared hit=58824 |
| 116 | + Planning Time: 0.065 ms |
| 117 | + Execution Time: 18525.843 ms |
| 118 | +(14 rows) |
| 119 | +``` |
| 120 | + |
| 121 | +### 方案2 : |
| 122 | + |
| 123 | +展开array存储 |
| 124 | + |
| 125 | +``` |
| 126 | +CREATE TABLE user_retain1 ( |
| 127 | + user_id serial , |
| 128 | + fst_login_date date, |
| 129 | + pay_retained_num int |
| 130 | +); |
| 131 | + |
| 132 | +insert into user_retain1 select user_id,fst_login_date,unnest(pay_retained_num) from user_retain; |
| 133 | + |
| 134 | +alter table user_retain1 set (parallel_workers =26); |
| 135 | +``` |
| 136 | + |
| 137 | +26个并行, 0.8秒 |
| 138 | + |
| 139 | +``` |
| 140 | +max_worker_processes = 32 |
| 141 | +max_parallel_workers_per_gather = 26 |
| 142 | +parallel_leader_participation = off |
| 143 | +max_parallel_workers = 32 |
| 144 | +parallel_tuple_cost = 0 |
| 145 | +parallel_setup_cost = 0 |
| 146 | +min_parallel_table_scan_size = 0 |
| 147 | +min_parallel_index_scan_size = 0 |
| 148 | +
|
| 149 | +
|
| 150 | +explain (analyze,verbose,timing,costs,buffers) |
| 151 | +select pay_retained_num,count(*) from user_retain1 group by 1; |
| 152 | + |
| 153 | + QUERY PLAN |
| 154 | +----------------------------------------------------------------------------------------------------------------------------------------------------- |
| 155 | + Finalize HashAggregate (cost=598284.46..598288.11 rows=365 width=12) (actual time=859.487..859.525 rows=365 loops=1) |
| 156 | + Group Key: pay_retained_num |
| 157 | + Peak Memory Usage: 61 kB |
| 158 | + -> Gather (cost=598233.36..598237.01 rows=9490 width=12) (actual time=856.866..858.653 rows=9490 loops=1) |
| 159 | + Workers Planned: 26 |
| 160 | + Workers Launched: 26 |
| 161 | + -> Partial HashAggregate (cost=598233.36..598237.01 rows=365 width=12) (actual time=851.471..851.508 rows=365 loops=26) |
| 162 | + Group Key: pay_retained_num |
| 163 | + Peak Memory Usage: 0 kB |
| 164 | + -> Parallel Seq Scan on user_retain1 (cost=0.00..579002.57 rows=3846157 width=4) (actual time=0.033..294.534 rows=3846154 loops=26) |
| 165 | + Planning Time: 0.100 ms |
| 166 | + Execution Time: 860.386 ms |
| 167 | +(12 rows) |
| 168 | +``` |
| 169 | + |
| 170 | +### 方案3 : |
| 171 | +估值计算, 每个留存日一条记录, 将留存天数对应的UID写入到hll类型中. |
| 172 | + |
| 173 | +``` |
| 174 | +create extension hll; |
| 175 | + |
| 176 | +create table t_hll ( |
| 177 | + pay_retained_num int primary key, |
| 178 | + u_hll hll |
| 179 | +); |
| 180 | + |
| 181 | +insert into t_hll select pay_retained_num, hll_add_agg(hll_hash_integer(user_id)) from user_retain1 group by pay_retained_num; |
| 182 | + |
| 183 | +select pay_retained_num, # u_hll from t_hll order by 1; |
| 184 | +``` |
| 185 | + |
| 186 | +3毫秒. |
| 187 | + |
| 188 | +``` |
| 189 | + QUERY PLAN |
| 190 | +------------------------------------------------------------------------------------------------------------------------ |
| 191 | + Index Scan using t_hll_pkey on t_hll (cost=0.15..81.53 rows=365 width=12) (actual time=0.020..3.101 rows=365 loops=1) |
| 192 | + Planning Time: 0.050 ms |
| 193 | + Execution Time: 3.121 ms |
| 194 | +(3 rows) |
| 195 | +``` |
| 196 | + |
| 197 | +估值计算的差异 |
| 198 | + |
| 199 | +``` |
| 200 | +with a as ( |
| 201 | +select pay_retained_num, count(*) as cnt from user_retain1 group by 1), |
| 202 | +b as (select pay_retained_num, # u_hll as cnt from t_hll) |
| 203 | +select a.pay_retained_num, a.cnt, b.cnt from a,b where a.pay_retained_num=b.pay_retained_num order by abs(a.cnt-b.cnt); |
| 204 | + |
| 205 | + pay_retained_num | cnt | cnt |
| 206 | +------------------+--------+-------------------- |
| 207 | + 18 | 273741 | 242072.93394329798 |
| 208 | + 226 | 273461 | 241358.471066233 |
| 209 | + 257 | 273317 | 241062.44806733675 |
| 210 | + 202 | 274096 | 241617.25101208987 |
| 211 | + 245 | 273489 | 240817.31849724415 |
| 212 | + 319 | 273263 | 240535.045171427 |
| 213 | +... ... |
| 214 | +``` |
| 215 | + |
| 216 | + |
| 217 | + |
| 218 | +#### [免费领取阿里云RDS PostgreSQL实例、ECS虚拟机](https://www.aliyun.com/database/postgresqlactivity "57258f76c37864c6e6d23383d05714ea") |
| 219 | + |
| 220 | + |
| 221 | +#### [digoal's PostgreSQL文章入口](https://github.com/digoal/blog/blob/master/README.md "22709685feb7cab07d30f30387f0a9ae") |
| 222 | + |
| 223 | + |
| 224 | + |
| 225 | + |
0 commit comments