-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
core[WIP] Readdir improvement for fuse #4207
base: devel
Are you sure you want to change the base?
Conversation
There are multiple readdir improvement with this patch for fuse. 1) Set the buffer size 128KB during wind a readdir call by fuse 2) Wind a lookup call for all entries parallel by fuse_readdir_cbk 3) Keep the inode in active list after taking extra reference for 10m(600s) sothat in next lookup is served by md-cache 4) Take the reference only while table->active size is less than 1M Fixes: gluster#4176 Change-Id: Ic9c75799883d94ca473c230e6213cbf00b383684 Signed-off-by: Mohit Agrawal <[email protected]>
CLANG-FORMAT FAILURE: index a95a9ce0c..077d0be76 100644
--- a/glusterfsd/src/glusterfsd.c
+++ b/glusterfsd/src/glusterfsd.c
@@ -232,7 +232,8 @@ static struct argp_option gf_options[] = {
{"use-readdirp", ARGP_FUSE_USE_READDIRP_KEY, "BOOL", OPTION_ARG_OPTIONAL,
"Use readdirp mode in fuse kernel module"
" [default: \"yes\"]"},
- {"readdir-optimize", ARGP_FUSE_USE_READDIR_OPTIMIZE, "BOOL", OPTION_ARG_OPTIONAL,
+ {"readdir-optimize", ARGP_FUSE_USE_READDIR_OPTIMIZE, "BOOL",
+ OPTION_ARG_OPTIONAL,
"Enable readdir optimize mode for fuse "
" [default: \"no\"]"},
{"secure-mgmt", ARGP_SECURE_MGMT_KEY, "BOOL", OPTION_ARG_OPTIONAL,
@@ -1254,8 +1255,8 @@ parse_opts(int key, char *arg, struct argp_state *state)
break;
}
- argp_failure(state, -1, 0, "unknown readdir-optimize setting \"%s\"",
- arg);
+ argp_failure(state, -1, 0,
+ "unknown readdir-optimize setting \"%s\"", arg);
break;
case ARGP_LOGGER:
diff --git a/libglusterfs/src/glusterfs/glusterfs.h b/libglusterfs/src/glusterfs/glusterfs.h
index b8f66d861..3d709085e 100644
--- a/libglusterfs/src/glusterfs/glusterfs.h
+++ b/libglusterfs/src/glusterfs/glusterfs.h
@@ -63,7 +63,6 @@
#define O_PATH 010000000 /* from asm-generic/fcntl.h */
#endif
-
#ifndef EBADFD
/* Mac OS X does not have EBADFD */
#define EBADFD EBADF
diff --git a/xlators/features/upcall/src/upcall.c b/xlators/features/upcall/src/upcall.c
index d13ac364f..fbddfd0af 100644
--- a/xlators/features/upcall/src/upcall.c
+++ b/xlators/features/upcall/src/upcall.c
@@ -745,14 +745,13 @@ out:
return 0;
}
-static void
+static void
up_inode_unref(void *data)
{
inode_t *inode = data;
if (inode)
inode_unref(inode);
-
}
static int32_t
@@ -774,8 +773,8 @@ up_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
goto err;
}
- //delay.tv_sec = 600;
- //delay.tv_nsec = 0;
+ // delay.tv_sec = 600;
+ // delay.tv_nsec = 0;
if (!dict_get_int32_sizen(xattr_req, "inode-unref-delay", &delay_time)) {
delay.tv_sec = delay_time;
delay.tv_nsec = 0;
diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
index de1905156..822a6bcd1 100644
--- a/xlators/mount/fuse/src/fuse-bridge.c
+++ b/xlators/mount/fuse/src/fuse-bridge.c
@@ -1042,14 +1042,14 @@ fuse_entry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
feo.attr_valid = calc_timeout_sec(priv->attribute_timeout);
feo.attr_valid_nsec = calc_timeout_nsec(priv->attribute_timeout);
if (!state->wind_from_readdir) {
- #if FUSE_KERNEL_MINOR_VERSION >= 9
- priv->proto_minor >= 9
- ? send_fuse_obj(this, finh, &feo)
- : send_fuse_data(this, finh, &feo, FUSE_COMPAT_ENTRY_OUT_SIZE);
- #else
- send_fuse_obj(this, finh, &feo);
- #endif
- }
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+ priv->proto_minor >= 9
+ ? send_fuse_obj(this, finh, &feo)
+ : send_fuse_data(this, finh, &feo, FUSE_COMPAT_ENTRY_OUT_SIZE);
+#else
+ send_fuse_obj(this, finh, &feo);
+#endif
+ }
} else {
gf_log("glusterfs-fuse",
(op_errno == ENOENT ? GF_LOG_TRACE : GF_LOG_WARNING),
@@ -1060,7 +1060,8 @@ fuse_entry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (!state->wind_from_readdir) {
if ((op_errno == ENOENT) && (priv->negative_timeout != 0)) {
feo.entry_valid = calc_timeout_sec(priv->negative_timeout);
- feo.entry_valid_nsec = calc_timeout_nsec(priv->negative_timeout);
+ feo.entry_valid_nsec = calc_timeout_nsec(
+ priv->negative_timeout);
send_fuse_obj(this, finh, &feo);
} else {
send_fuse_err(this, state->finh, op_errno);
@@ -1153,54 +1154,54 @@ fuse_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
UNLOCK(&old_state->lock);
if ((lkup_recvd == (lkup_sent - 1)) && (total_lookup == lkup_sent)) {
+ /* The above condition is true it means this is the last lookup
+ fop wind by readdir fop
+ */
+ fdctx = fd_ctx_get_ptr(old_state->fd, old_state->this);
+ ecache = fdctx->equeue;
+ main_frame = state->main_frame;
+ priv = old_state->this->private;
+
+ if (ecache) {
+ INIT_LIST_HEAD(&local_entries.list);
+ list_for_each_entry_safe(entry, tmp, &ecache->list, list)
+ {
+ size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET +
+ (entry->d_len));
+ max_size += fde_size;
- /* The above condition is true it means this is the last lookup
- fop wind by readdir fop
- */
- fdctx = fd_ctx_get_ptr(old_state->fd, old_state->this);
- ecache = fdctx->equeue;
- main_frame = state->main_frame;
- priv = old_state->this->private;
+ if (max_size > old_state->size) {
+ /* we received too many entries to fit in the reply */
+ max_size -= fde_size;
+ break;
+ }
+ }
- if (ecache) {
- INIT_LIST_HEAD(&local_entries.list);
- list_for_each_entry_safe(entry, tmp, &ecache->list, list)
- {
- size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (entry->d_len));
- max_size += fde_size;
+ if (max_size == 0)
+ goto skip;
- if (max_size > old_state->size) {
- /* we received too many entries to fit in the reply */
- max_size -= fde_size;
- break;
+ buf = GF_CALLOC(1, max_size, gf_fuse_mt_char);
+ if (!buf) {
+ gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+ ": READDIR => -1 (%s)", strerror(ENOMEM));
+ goto skip;
}
- }
- if (max_size == 0)
- goto skip;
-
- buf = GF_CALLOC(1, max_size, gf_fuse_mt_char);
- if (!buf) {
- gf_log("glusterfs-fuse", GF_LOG_DEBUG,
- ": READDIR => -1 (%s)", strerror(ENOMEM));
- goto skip;
- }
+ list_for_each_entry_safe(entry, tmp, &ecache->list, list)
+ {
+ fde = (struct fuse_dirent *)(buf + size);
+ gf_fuse_fill_dirent(entry, fde, priv->enable_ino32);
+ size += FUSE_DIRENT_SIZE(fde);
+ list_del_init(&entry->list);
+ list_add_tail(&entry->list, &local_entries.list);
- list_for_each_entry_safe(entry, tmp, &ecache->list, list)
- {
- fde = (struct fuse_dirent *)(buf + size);
- gf_fuse_fill_dirent(entry, fde, priv->enable_ino32);
- size += FUSE_DIRENT_SIZE(fde);
- list_del_init(&entry->list);
- list_add_tail(&entry->list, &local_entries.list);
+ if (size == max_size)
+ break;
+ }
- if (size == max_size)
- break;
+ send_fuse_data(old_state->this, old_state->finh, buf, size);
+ gf_dirent_free(&local_entries);
}
-
- send_fuse_data(old_state->this, old_state->finh, buf, size);
- gf_dirent_free(&local_entries);
- }
skip:
state->old_state = NULL;
free_fuse_state(old_state);
@@ -1316,17 +1317,17 @@ fuse_lookup_resume(fuse_state_t *state)
fuse_gfid_set(state);
/* It means the lookup operations winds from readdir */
if (state->wind_from_readdir) {
- /* TODO Current value is 600s fixed but we can provide a configurable
- option
+ /* TODO Current value is 600s fixed but we can provide a
+ configurable option
*/
delay.tv_sec = 600;
delay.tv_nsec = 0;
- /* Take extra ref so that inode will be keep in active list */
+ /* Take extra ref so that inode will be keep in active list */
inode = inode_new(state->loc.parent->table);
table = state->loc.parent->table;
if (table->active_size < 1000000) {
- //if (table->active_size < 1000000) {
- dict_set_int32(state->xdata, "inode-unref-delay", 600);
+ // if (table->active_size < 1000000) {
+ dict_set_int32(state->xdata, "inode-unref-delay", 600);
state->loc.inode = inode_ref(inode);
gf_timer_call_after(this->ctx, delay, fuse_inode_unref, inode);
} else {
@@ -1348,7 +1349,7 @@ fuse_lookup_resume(fuse_state_t *state)
out:
if (!state->wind_from_readdir)
send_fuse_err(state->this, finh, op_errno);
- free_fuse_state(state);
+ free_fuse_state(state);
}
static void
@@ -3727,9 +3728,9 @@ fuse_opendir(xlator_t *this, fuse_in_header_t *finh, void *msg,
fuse_resolve_and_resume(state, fuse_opendir_resume);
}
-
static void
-fuse_readdir_lookup(xlator_t *this, fuse_in_header_t *finh, void *msg, void *old_state, void *main_frame)
+fuse_readdir_lookup(xlator_t *this, fuse_in_header_t *finh, void *msg,
+ void *old_state, void *main_frame)
{
char *name = msg;
fuse_state_t *state = NULL;
@@ -3738,15 +3739,15 @@ fuse_readdir_lookup(xlator_t *this, fuse_in_header_t *finh, void *msg, void *old
if (state) {
state->old_state = old_state;
state->main_frame = main_frame;
- state->wind_from_readdir = _gf_true;
- (void)fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, name);
+ state->wind_from_readdir = _gf_true;
+ (void)fuse_resolve_entry_init(state, &state->resolve, finh->nodeid,
+ name);
fuse_resolve_and_resume(state, fuse_lookup_resume);
}
return;
}
-
static int
fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
@@ -3789,19 +3790,20 @@ fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
readdir_optimize = priv->readdir_optimize;
if (readdir_optimize) {
- fdctx = fd_ctx_get_ptr(state->fd, this);
+ fdctx = fd_ctx_get_ptr(state->fd, this);
ecache = fdctx->equeue;
INIT_LIST_HEAD(&local_entries.list);
list_for_each_entry(orig_entry, (&entries->list), list)
{
- if (!strcmp(orig_entry->d_name, ".") || !strcmp(orig_entry->d_name, ".."))
+ if (!strcmp(orig_entry->d_name, ".") ||
+ !strcmp(orig_entry->d_name, ".."))
continue;
total_lookup++;
}
state->total_lookup = total_lookup;
if ((!total_lookup) || (table->active_size >= 1000000)) {
- //if (!total_lookup) {
+ // if (!total_lookup) {
readdir_optimize = 0;
goto skip_lookup;
}
@@ -3809,12 +3811,13 @@ fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
flag = 1;
list_for_each_entry(orig_entry, (&entries->list), list)
{
- new_entry = gf_dirent_for_name2(orig_entry->d_name, orig_entry->d_len,
- orig_entry->d_ino, orig_entry->d_off,
- orig_entry->d_type, NULL);
+ new_entry = gf_dirent_for_name2(
+ orig_entry->d_name, orig_entry->d_len, orig_entry->d_ino,
+ orig_entry->d_off, orig_entry->d_type, NULL);
list_add_tail(&new_entry->list, &ecache->list);
if (flag) {
- fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + new_entry->d_len);
+ fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET +
+ new_entry->d_len);
max_size += fde_size;
}
@@ -3822,17 +3825,17 @@ fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
max_size -= fde_size;
flag = 0;
}
- if (!strcmp(orig_entry->d_name, ".") || !strcmp(orig_entry->d_name, ".."))
+ if (!strcmp(orig_entry->d_name, ".") ||
+ !strcmp(orig_entry->d_name, ".."))
continue;
/* Wind a lookup fop for every entry */
fuse_readdir_lookup(this, finh, new_entry->d_name, state, frame);
- }
- return 0;
+ }
+ return 0;
}
skip_lookup:
if (!readdir_optimize) {
-
gf_log("glusterfs-fuse", GF_LOG_TRACE,
"%" PRIu64 ": READDIR => %d/%" GF_PRI_SIZET ",%" PRId64,
frame->root->unique, op_ret, state->size, state->off);
@@ -3876,8 +3879,8 @@ skip_lookup:
send_fuse_data(this, finh, buf, size);
- /* TODO: */
- /* gf_link_inodes_from_dirent (state->fd->inode, entries); */
+ /* TODO: */
+ /* gf_link_inodes_from_dirent (state->fd->inode, entries); */
}
out:
free_fuse_state(state);
@@ -3903,7 +3906,7 @@ fuse_readdir_resume(fuse_state_t *state)
size_t new_size = state->size;
if (priv->readdir_optimize) {
- fdctx = fd_ctx_get_ptr(state->fd, this);
+ fdctx = fd_ctx_get_ptr(state->fd, this);
ecache = fdctx->equeue;
new_size = 131072;
@@ -3917,7 +3920,8 @@ fuse_readdir_resume(fuse_state_t *state)
INIT_LIST_HEAD(&local_entries.list);
list_for_each_entry(entry, &ecache->list, list)
{
- size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (entry->d_len));
+ size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET +
+ (entry->d_len));
max_size += fde_size;
if (max_size > state->size) {
@@ -3932,8 +3936,8 @@ fuse_readdir_resume(fuse_state_t *state)
buf = GF_CALLOC(1, max_size, gf_fuse_mt_char);
if (!buf) {
- gf_log("glusterfs-fuse", GF_LOG_DEBUG,
- ": READDIR => -1 (%s)", strerror(ENOMEM));
+ gf_log("glusterfs-fuse", GF_LOG_DEBUG, ": READDIR => -1 (%s)",
+ strerror(ENOMEM));
send_fuse_err(this, state->finh, ENOMEM);
goto out;
}
@@ -5460,7 +5464,8 @@ fuse_init(xlator_t *this, fuse_in_header_t *finh, void *msg,
priv->init_recvd = 1;
- gf_log("MY_LOG", GF_LOG_INFO, "readdir-optimize is %d", priv->readdir_optimize);
+ gf_log("MY_LOG", GF_LOG_INFO, "readdir-optimize is %d",
+ priv->readdir_optimize);
if (fini->major != FUSE_KERNEL_VERSION) {
gf_log("glusterfs-fuse", GF_LOG_ERROR,
@@ -6673,7 +6678,8 @@ fuse_priv_dump(xlator_t *this)
if (!this)
return -1;
- private = this->private;
+ private
+ = this->private;
if (!private)
return -1;
@@ -6827,7 +6833,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
glusterfs_graph_t *graph = NULL;
struct pollfd pfd = {0};
- private = this->private;
+ private
+ = this->private;
graph = data;
@@ -6849,7 +6856,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
(event == GF_EVENT_CHILD_DOWN)) {
pthread_mutex_lock(&private->sync_mutex);
{
- private->event_recvd = 1;
+ private
+ ->event_recvd = 1;
pthread_cond_broadcast(&private->sync_cond);
}
pthread_mutex_unlock(&private->sync_mutex);
@@ -6858,16 +6866,18 @@ notify(xlator_t *this, int32_t event, void *data, ...)
pthread_mutex_lock(&private->sync_mutex);
{
if (!private->fuse_thread_started) {
- private->fuse_thread_started = 1;
+ private
+ ->fuse_thread_started = 1;
start_thread = _gf_true;
}
}
pthread_mutex_unlock(&private->sync_mutex);
if (start_thread) {
- private->fuse_thread = GF_CALLOC(private->reader_thread_count,
- sizeof(pthread_t),
- gf_fuse_mt_pthread_t);
+ private
+ ->fuse_thread = GF_CALLOC(private->reader_thread_count,
+ sizeof(pthread_t),
+ gf_fuse_mt_pthread_t);
for (i = 0; i < private->reader_thread_count; i++) {
ret = gf_thread_create(&private->fuse_thread[i], NULL,
fuse_thread_proc, this, "fuseproc");
@@ -6901,7 +6911,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
if (fuse_get_mount_status(this) != 0) {
goto auth_fail_unlock;
}
- private->mount_finished = _gf_true;
+ private
+ ->mount_finished = _gf_true;
} else if (pfd.revents) {
gf_log(this->name, GF_LOG_ERROR,
"mount pipe closed without status");
@@ -7144,7 +7155,8 @@ init(xlator_t *this_xl)
GF_OPTION_INIT("use-readdirp", priv->use_readdirp, bool, cleanup_exit);
if (!priv->use_readdirp)
- GF_OPTION_INIT("readdir-optimize", priv->readdir_optimize, bool, cleanup_exit);
+ GF_OPTION_INIT("readdir-optimize", priv->readdir_optimize, bool,
+ cleanup_exit);
priv->fuse_dump_fd = -1;
ret = dict_get_str(options, "dump-fuse", &value_string);
diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h
index 4a1f7367c..e3fc38b20 100644
--- a/xlators/mount/fuse/src/fuse-bridge.h
+++ b/xlators/mount/fuse/src/fuse-bridge.h
@@ -130,7 +130,6 @@ struct fuse_private {
/* for using fuse-kernel readdirp*/
gf_boolean_t use_readdirp;
-
/* The option to enable/disable readdir_optimize at fuse level*/
gf_boolean_t readdir_optimize;
|
/* The above condition is true it means this is the last lookup | ||
fop wind by readdir fop | ||
*/ | ||
fdctx = fd_ctx_get_ptr(old_state->fd, old_state->this); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
indention?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good improvement W.r.to small file performance and specifically ls
performance. Would like to merge this in if no major concerns on the PR raised by anyone in next week.
/run regression |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I haven't checked the patch in detail, but this change was tested by a user (#4176) and the results weren't good at all.
I can't say if the problem is in the patch or the user environment, but we cannot merge it without understanding what happened and doing proper testing.
thanks for the information. I will keep it intact for now till we see more testing |
@@ -757,6 +774,22 @@ up_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req) | |||
goto err; | |||
} | |||
|
|||
//delay.tv_sec = 600; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: debug leftovers?
There are multiple readdir improvement with this patch for fuse.
10m(600s) sothat in next lookup is served by md-cache
Fixes: #4176
Change-Id: Ic9c75799883d94ca473c230e6213cbf00b383684