## PostgreSQL nestloop Materialize cost algorithm

### Author
digoal

### Date
2022-01-26

### Tags
PostgreSQL , Materialize

----

## Background


```
select ... from a join b using (xx) where xx;

nest loop join:
a (xx rows)
  b
    full table scan , cost depends on the number of loops
    index scan , cost depends on the number of loops
    materialize (run cost + rescan cost) , the materialization cost itself does not depend on the number of loops
```

Typically, when the inner table is smaller than work_mem and the join column has no index, materializing the inner table can be the cheaper choice.
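
As a rough sanity check you can compare the inner relation's size against work_mem. Note that the planner actually works from its own tuples × width estimate (relation_byte_size), so the on-disk size is only a convenient proxy; a minimal sketch, assuming the inner table is named b:

```
-- Rough check (assumption: b is the inner relation): will the materialized data fit in work_mem?
-- The planner estimates tuples * width; pg_relation_size is just an approximation of that.
show work_mem;
select pg_relation_size('b')                  as inner_rel_bytes,
       pg_size_pretty(pg_relation_size('b'))  as inner_rel_size;
```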

```
/*
 * cost_material
 *   Determines and returns the cost of materializing a relation, including
 *   the cost of reading the input data.
 *
 * If the total volume of data to materialize exceeds work_mem, we will need
 * to write it to disk, so the cost is much higher in that case.
 *
 * Note that here we are estimating the costs for the first scan of the
 * relation, so the materialization is all overhead --- any savings will
 * occur only on rescan, which is estimated in cost_rescan.
 */
void
cost_material(Path *path,
              Cost input_startup_cost, Cost input_total_cost,
              double tuples, int width)
{
    Cost        startup_cost = input_startup_cost;
    Cost        run_cost = input_total_cost - input_startup_cost;
    double      nbytes = relation_byte_size(tuples, width);
    long        work_mem_bytes = work_mem * 1024L;

    path->rows = tuples;

    /*
     * Whether spilling or not, charge 2x cpu_operator_cost per tuple to
     * reflect bookkeeping overhead. (This rate must be more than what
     * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple;
     * if it is exactly the same then there will be a cost tie between
     * nestloop with A outer, materialized B inner and nestloop with B outer,
     * materialized A inner. The extra cost ensures we'll prefer
     * materializing the smaller rel.) Note that this is normally a good deal
     * less than cpu_tuple_cost; which is OK because a Material plan node
     * doesn't do qual-checking or projection, so it's got less overhead than
     * most plan nodes.
     */
    run_cost += 2 * cpu_operator_cost * tuples;

    /*
     * If we will spill to disk, charge at the rate of seq_page_cost per page.
     * This cost is assumed to be evenly spread through the plan run phase,
     * which isn't exactly accurate but our cost model doesn't allow for
     * nonuniform costs within the run phase.
     */
    if (nbytes > work_mem_bytes)
    {
        double      npages = ceil(nbytes / BLCKSZ);

        run_cost += seq_page_cost * npages;
    }

    path->startup_cost = startup_cost;
    path->total_cost = startup_cost + run_cost;
}
```
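
Plugging the numbers from the demo below into cost_material: the input Seq Scan on the 100-row table b has a total cost of 2.00, the bookkeeping overhead is 2 * cpu_operator_cost * 100 = 0.5, and since 100 rows of width 45 are far below work_mem there is no spill charge, which yields the Materialize node's cost of 0.00..2.50. A small sketch of that arithmetic, assuming the default cost GUCs (cpu_operator_cost = 0.0025, cpu_tuple_cost = 0.01, seq_page_cost = 1):

```
-- Materialize on top of "Seq Scan on b (cost=0.00..2.00 rows=100 width=45)", default GUCs assumed
select 2.00                     as input_total_cost,
       2 * 0.0025 * 100         as material_overhead,    -- 2 * cpu_operator_cost * tuples = 0.5
       2.00 + 2 * 0.0025 * 100  as material_total_cost;  -- matches Materialize cost=0.00..2.50
```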


```
/*
 * cost_rescan
 *      Given a finished Path, estimate the costs of rescanning it after
 *      having done so the first time. For some Path types a rescan is
 *      cheaper than an original scan (if no parameters change), and this
 *      function embodies knowledge about that. The default is to return
 *      the same costs stored in the Path. (Note that the cost estimates
 *      actually stored in Paths are always for first scans.)
 *
 * This function is not currently intended to model effects such as rescans
 * being cheaper due to disk block caching; what we are concerned with is
 * plan types wherein the executor caches results explicitly, or doesn't
 * redo startup calculations, etc.
 */
static void
cost_rescan(PlannerInfo *root, Path *path,
            Cost *rescan_startup_cost,  /* output parameters */
            Cost *rescan_total_cost)
{
.........
        case T_Material:
        case T_Sort:
            {
                /*
                 * These plan types not only materialize their results, but do
                 * not implement qual filtering or projection. So they are
                 * even cheaper to rescan than the ones above. We charge only
                 * cpu_operator_cost per tuple. (Note: keep that in sync with
                 * the run_cost charge in cost_sort, and also see comments in
                 * cost_material before you change it.)
                 */
                Cost        run_cost = cpu_operator_cost * path->rows;
                double      nbytes = relation_byte_size(path->rows,
                                                        path->pathtarget->width);
                long        work_mem_bytes = work_mem * 1024L;

                if (nbytes > work_mem_bytes)
                {
                    /* It will spill, so account for re-read cost */
                    double      npages = ceil(nbytes / BLCKSZ);

                    run_cost += seq_page_cost * npages;
                }
                *rescan_startup_cost = 0;
                *rescan_total_cost = run_cost;
            }
            break;
```
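
So for the same 100-row Material node, every rescan is charged only cpu_operator_cost * 100 = 0.25 (no spill), while rescanning a plain Seq Scan costs the full scan again: 2.00 for b, or 19346.00 for the million-row table a. A sketch of the per-rescan charges, again assuming default cost GUCs and the page counts shown in the demo below (1 page for b, 9346 pages for a):

```
-- Per-rescan cost under default GUCs (assumed): Material node vs. plain seq scans
select 0.0025 * 100               as material_rescan,   -- cpu_operator_cost * rows = 0.25
       1 * 1 + 0.01 * 100         as seqscan_b_rescan,  -- seq_page_cost * 1 page + cpu_tuple_cost * 100 = 2.0
       1 * 9346 + 0.01 * 1000000  as seqscan_a_rescan;  -- 9346 pages + 1,000,000 tuples = 19346.0
```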


When does the planner choose materialization? The example below shows that it ultimately comes down to cost: as long as the materialized plan has the lower cost, it gets picked.

```
With 1,000,000 loops, materialization adds only 0.5 to the cost, which confirms the algorithm above.
Materialization cost (entirely in memory, since the data is smaller than work_mem):
run_cost += 2 * cpu_operator_cost * tuples
2 * 0.0025 * 100 = 0.5
Cost of a full seq scan of the inner table:
cpu_tuple_cost * 100 + seq_page_cost * 1 = 2
```


```
create table a (id int, info text, crt_time timestamp);

create table b (id int, info text, crt_time timestamp);

insert into a select generate_series(1,1000000), md5(random()::text), clock_timestamp();
insert into b select generate_series(1,100), md5(random()::text), clock_timestamp();

explain select a.*,b.* from a join b using (id);

postgres=# explain select a.*,b.* from a join b using (id);
                            QUERY PLAN
------------------------------------------------------------------
 Nested Loop  (cost=0.00..1519348.25 rows=100 width=90)
   Join Filter: (a.id = b.id)
   ->  Seq Scan on a  (cost=0.00..19346.00 rows=1000000 width=45)
   ->  Materialize  (cost=0.00..2.50 rows=100 width=45)
         ->  Seq Scan on b  (cost=0.00..2.00 rows=100 width=45)
(5 rows)

postgres=# set enable_material =off;
SET
postgres=# explain select a.*,b.* from a join b using (id);
                            QUERY PLAN
------------------------------------------------------------------
 Nested Loop  (cost=0.00..3184602.00 rows=100 width=90)
   Join Filter: (a.id = b.id)
   ->  Seq Scan on b  (cost=0.00..2.00 rows=100 width=45)
   ->  Seq Scan on a  (cost=0.00..19346.00 rows=1000000 width=45)
(4 rows)

postgres=# explain (analyze,verbose,timing,costs,buffers) select a.*,b.* from a join b using (id);
                                                         QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------
 Nested Loop  (cost=0.00..1519348.25 rows=100 width=90) (actual time=0.028..15746.781 rows=100 loops=1)
   Output: a.id, a.info, a.crt_time, b.id, b.info, b.crt_time
   Join Filter: (a.id = b.id)
   Rows Removed by Join Filter: 99999900
   Buffers: shared hit=9347
   ->  Seq Scan on public.a  (cost=0.00..19346.00 rows=1000000 width=45) (actual time=0.011..120.138 rows=1000000 loops=1)
         Output: a.id, a.info, a.crt_time
         Buffers: shared hit=9346
   ->  Materialize  (cost=0.00..2.50 rows=100 width=45) (actual time=0.000..0.007 rows=100 loops=1000000)
         Output: b.id, b.info, b.crt_time
         Buffers: shared hit=1
         ->  Seq Scan on public.b  (cost=0.00..2.00 rows=100 width=45) (actual time=0.008..0.025 rows=100 loops=1)
               Output: b.id, b.info, b.crt_time
               Buffers: shared hit=1
 Query Identifier: 5474919903957737384
 Planning:
   Buffers: shared hit=5
 Planning Time: 0.299 ms
 Execution Time: 15746.819 ms
(19 rows)

postgres=# create index idx_b_1 on b (id);
CREATE INDEX


postgres=# explain select a.*,b.* from a join b using (id);
                                QUERY PLAN
------------------------------------------------------------------------
 Nested Loop  (cost=0.14..189349.30 rows=100 width=90)
   ->  Seq Scan on a  (cost=0.00..19346.00 rows=1000000 width=45)
   ->  Index Scan using idx_b_1 on b  (cost=0.14..0.16 rows=1 width=45)
         Index Cond: (id = a.id)
(4 rows)

postgres=# explain (analyze,verbose,timing,costs,buffers) select a.*,b.* from a join b using (id);
                                                           QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------
 Nested Loop  (cost=0.14..189349.30 rows=100 width=90) (actual time=0.078..673.957 rows=100 loops=1)
   Output: a.id, a.info, a.crt_time, b.id, b.info, b.crt_time
   Buffers: shared hit=1009445 read=1
   I/O Timings: read=0.023
   ->  Seq Scan on public.a  (cost=0.00..19346.00 rows=1000000 width=45) (actual time=0.011..93.566 rows=1000000 loops=1)
         Output: a.id, a.info, a.crt_time
         Buffers: shared hit=9346
   ->  Index Scan using idx_b_1 on public.b  (cost=0.14..0.16 rows=1 width=45) (actual time=0.000..0.000 rows=0 loops=1000000)
         Output: b.id, b.info, b.crt_time
         Index Cond: (b.id = a.id)
         Buffers: shared hit=1000099 read=1
         I/O Timings: read=0.023
 Query Identifier: 5474919903957737384
 Planning Time: 0.125 ms
 Execution Time: 674.002 ms
(15 rows)
```
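
Both nestloop totals above can be reconstructed from the pieces we have seen. This is a rough back-of-the-envelope under default cost GUCs; the term for the 1,000,000 × 100 join-filter comparisons (roughly cpu_tuple_cost + cpu_operator_cost per tuple pair) comes from the nestloop costing itself, not from cost_material or cost_rescan:

```
-- Reconstructing the two nestloop totals shown above (default GUCs assumed)
-- With Materialize: outer seq scan on a + first Materialize scan + 999,999 rescans + per-pair CPU
select 19346 + 2.5 + 999999 * 0.25 + 100000000 * (0.01 + 0.0025)  as nestloop_with_material;     -- 1519348.25
-- Without Materialize (b outer, a inner): outer seq scan on b + 100 full scans of a + per-pair CPU
select 2 + 100 * 19346 + 100000000 * (0.01 + 0.0025)              as nestloop_without_material;  -- 3184602.00
```

The materialized plan wins because the 999,999 rescans of the 100-row inner rel cost only 0.25 each instead of a full 2.00 seq scan. Once the index on b.id exists, the join filter and its 100,000,000 comparisons disappear entirely, which is why the index-scan plan (189349.30) is cheaper still.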



#### [What features do you want PostgreSQL to add?](https://github.com/digoal/blog/issues/76 "269ac3d1c492e938c0191101c7238216")


#### [PolarDB for PostgreSQL: a cloud-native, distributed, open-source database](https://github.com/ApsaraDB/PolarDB-for-PostgreSQL "57258f76c37864c6e6d23383d05714ea")


#### [A collection of PostgreSQL solutions](https://yq.aliyun.com/topic/118 "40cff096e9ed7122c512b35d8561d9c8")


#### [德哥 / digoal's github - public welfare is a lifelong cause.](https://github.com/digoal/blog/blob/master/README.md "22709685feb7cab07d30f30387f0a9ae")