|
| 1 | +## 除了xlog,哪些操作可能还需要fsync ? |
| 2 | + |
| 3 | +### 作者 |
| 4 | +digoal |
| 5 | + |
| 6 | +### 日期 |
| 7 | +2015-09-04 |
| 8 | + |
| 9 | +### 标签 |
| 10 | +PostgreSQL , fsync , xlog , wal |
| 11 | + |
| 12 | +---- |
| 13 | + |
| 14 | +## 背景 |
| 15 | +我们知道xlog的一个重要职责是保证用户已提交事务在数据库中的持久性。 |
| 16 | + |
| 17 | +那么就涉及到用户提交事务后,必须先等待这笔事务对应的XLOG fsync完成。所以xlog会涉及不断的fsync(由wal writer间歇性发起,用户进程仅仅在申请不到XLOG BUFFER时会调用fsync) ([《PostgreSQL can continue when postmaster killed》](../201508/20150803_01.md) )。 |
| 18 | + |
| 19 | +另一方面,XLOG还有一个设计初衷,就是将离散的IO归为连续的IO,因为XLOG文件是预分配的,连续写入的。 |
| 20 | + |
| 21 | +如果没有XLOG,用户事务提交时,必须对操作对象fsync,可能涉及大量的离散IO,也不利于操作系统合并IO。 |
| 22 | + |
| 23 | +那么问题来了,除了xlog需要fsync,还有没有其他操作需要fsync呢? |
| 24 | + |
| 25 | +答案是肯定的,只是这种fsync会越来越少,至少在对操作响应要求高的场景会尽力避免非XLOG的fsync需求。 |
| 26 | + |
| 27 | +所以在一些对响应要求不是那么高的操作中还是有非xlog的fsync需求的。 |
| 28 | + |
| 29 | +例如 |
| 30 | + |
| 31 | +1\. initdb |
| 32 | + |
| 33 | +src/bin/initdb/initdb.c |
| 34 | + |
| 35 | +``` |
| 36 | +/* |
| 37 | + * Issue fsync recursively on PGDATA and all its contents. |
| 38 | + * |
| 39 | + * We fsync regular files and directories wherever they are, but we |
| 40 | + * follow symlinks only for pg_xlog and immediately under pg_tblspc. |
| 41 | + * Other symlinks are presumed to point at files we're not responsible |
| 42 | + * for fsyncing, and might not have privileges to write at all. |
| 43 | + * |
| 44 | + * Errors are reported but not considered fatal. |
| 45 | + */ |
| 46 | +static void |
| 47 | +fsync_pgdata(void) |
| 48 | +{ |
| 49 | + bool xlog_is_symlink; |
| 50 | + char pg_xlog[MAXPGPATH]; |
| 51 | + char pg_tblspc[MAXPGPATH]; |
| 52 | + |
| 53 | + fputs(_("syncing data to disk ... "), stdout); |
| 54 | + fflush(stdout); |
| 55 | + |
| 56 | + snprintf(pg_xlog, MAXPGPATH, "%s/pg_xlog", pg_data); |
| 57 | + snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data); |
| 58 | + |
| 59 | + /* |
| 60 | + * If pg_xlog is a symlink, we'll need to recurse into it separately, |
| 61 | + * because the first walkdir below will ignore it. |
| 62 | + */ |
| 63 | + xlog_is_symlink = false; |
| 64 | + |
| 65 | +#ifndef WIN32 |
| 66 | + { |
| 67 | + struct stat st; |
| 68 | + |
| 69 | + if (lstat(pg_xlog, &st) < 0) |
| 70 | + fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"), |
| 71 | + progname, pg_xlog, strerror(errno)); |
| 72 | + else if (S_ISLNK(st.st_mode)) |
| 73 | + xlog_is_symlink = true; |
| 74 | + } |
| 75 | +#else |
| 76 | + if (pgwin32_is_junction(pg_xlog)) |
| 77 | + xlog_is_symlink = true; |
| 78 | +#endif |
| 79 | + |
| 80 | + /* |
| 81 | + * If possible, hint to the kernel that we're soon going to fsync the data |
| 82 | + * directory and its contents. |
| 83 | + */ |
| 84 | +#ifdef PG_FLUSH_DATA_WORKS |
| 85 | + walkdir(pg_data, pre_sync_fname, false); |
| 86 | + if (xlog_is_symlink) |
| 87 | + walkdir(pg_xlog, pre_sync_fname, false); |
| 88 | + walkdir(pg_tblspc, pre_sync_fname, true); |
| 89 | +#endif |
| 90 | + |
| 91 | + /* |
| 92 | + * Now we do the fsync()s in the same order. |
| 93 | + * |
| 94 | + * The main call ignores symlinks, so in addition to specially processing |
| 95 | + * pg_xlog if it's a symlink, pg_tblspc has to be visited separately with |
| 96 | + * process_symlinks = true. Note that if there are any plain directories |
| 97 | + * in pg_tblspc, they'll get fsync'd twice. That's not an expected case |
| 98 | + * so we don't worry about optimizing it. |
| 99 | + */ |
| 100 | + walkdir(pg_data, fsync_fname_ext, false); |
| 101 | + if (xlog_is_symlink) |
| 102 | + walkdir(pg_xlog, fsync_fname_ext, false); |
| 103 | + walkdir(pg_tblspc, fsync_fname_ext, true); |
| 104 | + |
| 105 | + check_ok(); |
| 106 | +} |
| 107 | +``` |
| 108 | + |
| 109 | +2\. create database 或 alter database move tablespace |
| 110 | + |
| 111 | +src/backend/commands/dbcommands.c |
| 112 | + |
| 113 | + |
| 114 | +copydir@src/backend/storage/file/copydir.c |
| 115 | + |
| 116 | +每一个文件都需要fsync,量比较大。 |
| 117 | + |
| 118 | +3\. 当 wal_level=minimal 时的 rewrite table、create table as、copy from file 或刷新物化视图。 |
| 119 | + |
| 120 | +调用heap_sync : |
| 121 | + |
| 122 | +src/include/access/xlog.h: |
| 123 | + |
| 124 | +``` |
| 125 | +#define XLogIsNeeded() (wal_level >= WAL_LEVEL_ARCHIVE) |
| 126 | + |
| 127 | +... |
| 128 | + if (!XLogIsNeeded()) |
| 129 | + myState->hi_options |= HEAP_INSERT_SKIP_WAL; |
| 130 | +... |
| 131 | + /* If we skipped using WAL, must heap_sync before commit */ |
| 132 | + if (myState->hi_options & HEAP_INSERT_SKIP_WAL) |
| 133 | + heap_sync(myState->rel); |
| 134 | +``` |
| 135 | + |
| 136 | +4\. 2pc事务文件 |
| 137 | + |
| 138 | +发生在WAL replay时。 |
| 139 | + |
| 140 | +``` |
| 141 | +RecreateTwoPhaseFile |
| 142 | +``` |
| 143 | + |
| 144 | +5\. 时间线文件 |
| 145 | + |
| 146 | +发生在promote,或者walreceiver接收到时间线文件、需要创建新的时间线文件时。 |
| 147 | + |
| 148 | +6\. replication slot文件 |
| 149 | + |
| 150 | +创建slot时,需要在pg_replslot目录中创建对应的文件。 |
| 151 | + |
| 152 | +7\. pg_clog, pg_multixact |
| 153 | + |
| 154 | +``` |
| 155 | +/* |
| 156 | + * SlruCtlData is an unshared structure that points to the active information |
| 157 | + * in shared memory. |
| 158 | + */ |
| 159 | +typedef struct SlruCtlData |
| 160 | +{ |
| 161 | + SlruShared shared; |
| 162 | + |
| 163 | + /* |
| 164 | + * This flag tells whether to fsync writes (true for pg_clog and multixact |
| 165 | + * stuff, false for pg_subtrans and pg_notify). |
| 166 | + */ |
| 167 | + bool do_fsync; |
| 168 | + |
| 169 | + /* |
| 170 | + * Decide which of two page numbers is "older" for truncation purposes. We |
| 171 | + * need to use comparison of TransactionIds here in order to do the right |
| 172 | + * thing with wraparound XID arithmetic. |
| 173 | + */ |
| 174 | + bool (*PagePrecedes) (int, int); |
| 175 | + |
| 176 | + /* |
| 177 | + * Dir is set during SimpleLruInit and does not change thereafter. Since |
| 178 | + * it's always the same, it doesn't need to be in shared memory. |
| 179 | + */ |
| 180 | + char Dir[64]; |
| 181 | +} SlruCtlData; |
| 182 | +``` |
| 183 | + |
| 184 | +其他 |
| 185 | + |
| 186 | +...... |
0 commit comments