2 changes: 1 addition & 1 deletion include/sys/arc_impl.h
@@ -952,7 +952,7 @@ typedef struct arc_sums {
 	wmsum_t arcstat_data_size;
 	wmsum_t arcstat_metadata_size;
 	wmsum_t arcstat_dbuf_size;
-	wmsum_t arcstat_dnode_size;
+	aggsum_t arcstat_dnode_size;
 	wmsum_t arcstat_bonus_size;
 	wmsum_t arcstat_l2_hits;
 	wmsum_t arcstat_l2_misses;
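Note: the dnode size counter switches from wmsum_t to aggsum_t, presumably because arc_is_overflowing() later in this diff needs a cheap aggsum_lower_bound() read of the dnode total on the allocation path, whereas the wmsum counters elsewhere in the diff are only read with an exact wmsum_value(). The toy program below is a rough userspace model of that idea, not the OpenZFS aggsum implementation (the bucket count, border value, and flushing policy are made up): per-bucket deltas are folded into a shared total once they exceed a border, so a conservative lower bound can be read without visiting every bucket.

/*
 * Rough userspace model of an aggsum-style counter (illustrative only;
 * the real counters live in module/zfs/aggsum.c and wmsum.h, and their
 * bucket/border handling is more involved than this).
 */
#include <stdint.h>
#include <stdio.h>

#define	NBUCKETS	4
#define	BORDER		1024

struct toy_aggsum {
	int64_t total;			/* portion already flushed to the global sum */
	int64_t delta[NBUCKETS];	/* unflushed per-bucket deltas */
};

static void
toy_add(struct toy_aggsum *as, int bucket, int64_t v)
{
	as->delta[bucket] += v;
	if (as->delta[bucket] > BORDER || as->delta[bucket] < -BORDER) {
		as->total += as->delta[bucket];	/* flush to the global total */
		as->delta[bucket] = 0;
	}
}

/* Cheap conservative lower bound: each unflushed delta is within +/-BORDER. */
static int64_t
toy_lower_bound(const struct toy_aggsum *as)
{
	return (as->total - (int64_t)NBUCKETS * BORDER);
}

/* Exact value: has to visit every bucket, like an exact per-CPU sum read. */
static int64_t
toy_value(const struct toy_aggsum *as)
{
	int64_t v = as->total;
	for (int i = 0; i < NBUCKETS; i++)
		v += as->delta[i];
	return (v);
}

int
main(void)
{
	struct toy_aggsum as = { 0 };

	for (int i = 0; i < 10000; i++)
		toy_add(&as, i % NBUCKETS, 100);
	printf("value=%lld lower_bound=%lld\n",
	    (long long)toy_value(&as), (long long)toy_lower_bound(&as));
	return (0);
}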
65 changes: 65 additions & 0 deletions module/os/linux/zfs/zfs_vfsops.c
@@ -1179,6 +1179,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
 	return (error);
 }
 
+/*
+ * Dentry and inode caches referenced by a task in non-root memcg are
+ * not going to be scanned by the kernel-provided shrinker. So, if
+ * kernel prunes nothing, fall back to this manual walk to free dnodes.
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list. New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+	znode_t **zp_array, *zp;
+	int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+	int objects = 0;
+	int i = 0, j = 0;
+
+	zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+		if ((i++ > nr_to_scan) || (j >= max_array))
+			break;
+
+		ASSERT(list_link_active(&zp->z_link_node));
+		list_remove(&zfsvfs->z_all_znodes, zp);
+		list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+		/* Skip active znodes and .zfs entries */
+		if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+			continue;
+
+		if (igrab(ZTOI(zp)) == NULL)
+			continue;
+
+		zp_array[j] = zp;
+		j++;
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	for (i = 0; i < j; i++) {
+		zp = zp_array[i];
+
+		ASSERT3P(zp, !=, NULL);
+		d_prune_aliases(ZTOI(zp));
+
+		if (atomic_read(&ZTOI(zp)->i_count) == 1)
+			objects++;
+
+		zrele(zp);
+	}
+
+	vmem_free(zp_array, max_array * sizeof (znode_t *));
+
+	return (objects);
+}
+
 /*
  * The ARC has requested that the filesystem drop entries from the dentry
  * and inode caches. This can occur when the ARC needs to free meta data
@@ -1222,6 +1279,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
 	*objects = (*shrinker->scan_objects)(shrinker, &sc);
 #endif
 
+	/*
+	 * Fall back to zfs_prune_aliases if kernel's shrinker did nothing
+	 * due to dentry and inode caches being referenced by a task running
+	 * in non-root memcg.
+	 */
+	if (*objects == 0)
+		*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+
 	zfs_exit(zfsvfs, FTAG);
 
 	dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
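A minimal userspace sketch of the scan strategy used by zfs_prune_aliases() above, under stated simplifications: a plain doubly linked list stands in for z_all_znodes, and a busy flag stands in for active znodes and .zfs entries. The point being modeled is the rotation: each visited node is moved to the tail before it is examined, so repeated bounded scans work through the oldest entries instead of re-checking the same head nodes.

/* Simplified model of the rotate-to-tail bounded scan; types are stand-ins, not ZFS. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *prev, *next;
	int id;
	bool busy;		/* stands in for an active znode or .zfs entry */
};

struct list {
	struct node *head, *tail;
};

static void
list_remove(struct list *l, struct node *n)
{
	if (n->prev) n->prev->next = n->next; else l->head = n->next;
	if (n->next) n->next->prev = n->prev; else l->tail = n->prev;
	n->prev = n->next = NULL;
}

static void
list_insert_tail(struct list *l, struct node *n)
{
	n->prev = l->tail;
	n->next = NULL;
	if (l->tail) l->tail->next = n; else l->head = n;
	l->tail = n;
}

/* Scan at most nr_to_scan nodes from the head, rotating each one to the tail. */
static int
prune_scan(struct list *l, int nr_to_scan)
{
	int pruned = 0;

	for (int i = 0; i < nr_to_scan && l->head != NULL; i++) {
		struct node *n = l->head;

		list_remove(l, n);
		list_insert_tail(l, n);	/* don't revisit it during this pass */

		if (n->busy)
			continue;	/* skipped, like active znodes */
		pruned++;		/* a real pruner would drop caches here */
	}
	return (pruned);
}

int
main(void)
{
	struct list l = { NULL, NULL };

	for (int i = 0; i < 8; i++) {
		struct node *n = calloc(1, sizeof (*n));
		n->id = i;
		n->busy = (i % 3 == 0);
		list_insert_tail(&l, n);
	}
	printf("pruned %d of 8 in one bounded pass\n", prune_scan(&l, 5));
	return (0);
}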
27 changes: 17 additions & 10 deletions module/zfs/arc.c
@@ -2597,7 +2597,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
 		ARCSTAT_INCR(arcstat_bonus_size, space);
 		break;
 	case ARC_SPACE_DNODE:
-		ARCSTAT_INCR(arcstat_dnode_size, space);
+		aggsum_add(&arc_sums.arcstat_dnode_size, space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, space);
@@ -2643,7 +2643,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
 		ARCSTAT_INCR(arcstat_bonus_size, -space);
 		break;
 	case ARC_SPACE_DNODE:
-		ARCSTAT_INCR(arcstat_dnode_size, -space);
+		aggsum_add(&arc_sums.arcstat_dnode_size, -space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, -space);
@@ -4292,7 +4292,7 @@ arc_evict(void)
 	 * target is not evictable or if they go over arc_dnode_limit.
 	 */
 	int64_t prune = 0;
-	int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
+	int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
 	w = wt * (int64_t)(arc_meta >> 16) >> 16;
 	if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
 	    zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) -
@@ -4775,12 +4775,19 @@ arc_is_overflowing(boolean_t use_reserve)
 	 * in the ARC. In practice, that's in the tens of MB, which is low
 	 * enough to be safe.
 	 */
-	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
+	int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) -
 	    arc_c - overflow / 2;
 	if (!use_reserve)
 		overflow /= 2;
-	return (over < 0 ? ARC_OVF_NONE :
-	    over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
+
+	int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
+	    arc_dnode_limit;
+
+	/* Always allow at least one block of overflow. */
+	if (arc_over < 0 && dn_over <= 0)
+		return (ARC_OVF_NONE);
+
+	return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
 }
 
 static abd_t *
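For reference, the new arc_is_overflowing() decision above can be restated as a small pure function: ARC overflow is still graded against the allowed slop, but a dnode total above arc_dnode_limit now also blocks the early ARC_OVF_NONE return. The sketch below only restates the patched logic with plain integers; the function name, enum, and sample numbers are illustrative, not part of the ARC API.

/* Standalone restatement of the patched arc_is_overflowing() decision. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef enum { OVF_NONE, OVF_SOME, OVF_SEVERE } ovf_t;

static ovf_t
classify_overflow(int64_t arc_size_lb, int64_t arc_c, int64_t overflow,
    int64_t dnode_size_lb, int64_t dnode_limit, bool use_reserve)
{
	int64_t arc_over = arc_size_lb - arc_c - overflow / 2;

	if (!use_reserve)
		overflow /= 2;

	int64_t dn_over = dnode_size_lb - dnode_limit;

	/* "No overflow" only if neither the ARC nor the dnode limit is over. */
	if (arc_over < 0 && dn_over <= 0)
		return (OVF_NONE);

	return (arc_over < overflow ? OVF_SOME : OVF_SEVERE);
}

int
main(void)
{
	/* ARC itself is under target, but dnodes exceed their limit. */
	ovf_t r = classify_overflow(900 << 20, 1024 << 20, 64 << 20,
	    300 << 20, 256 << 20, false);
	printf("result=%d (0=none 1=some 2=severe)\n", r);
	return (0);
}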
@@ -6938,7 +6945,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
 #if defined(COMPAT_FREEBSD11)
 	as->arcstat_other_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size) +
-	    wmsum_value(&arc_sums.arcstat_dnode_size) +
+	    aggsum_value(&arc_sums.arcstat_dnode_size) +
 	    wmsum_value(&arc_sums.arcstat_dbuf_size);
 #endif

@@ -6980,7 +6987,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
 	    &as->arcstat_uncached_evictable_metadata);
 
 	as->arcstat_dnode_size.value.ui64 =
-	    wmsum_value(&arc_sums.arcstat_dnode_size);
+	    aggsum_value(&arc_sums.arcstat_dnode_size);
 	as->arcstat_bonus_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size);
 	as->arcstat_l2_hits.value.ui64 =
@@ -7349,7 +7356,7 @@ arc_state_init(void)
 	wmsum_init(&arc_sums.arcstat_data_size, 0);
 	wmsum_init(&arc_sums.arcstat_metadata_size, 0);
 	wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
-	wmsum_init(&arc_sums.arcstat_dnode_size, 0);
+	aggsum_init(&arc_sums.arcstat_dnode_size, 0);
 	wmsum_init(&arc_sums.arcstat_bonus_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_hits, 0);
 	wmsum_init(&arc_sums.arcstat_l2_misses, 0);
@@ -7507,7 +7514,7 @@ arc_state_fini(void)
 	wmsum_fini(&arc_sums.arcstat_data_size);
 	wmsum_fini(&arc_sums.arcstat_metadata_size);
 	wmsum_fini(&arc_sums.arcstat_dbuf_size);
-	wmsum_fini(&arc_sums.arcstat_dnode_size);
+	aggsum_fini(&arc_sums.arcstat_dnode_size);
 	wmsum_fini(&arc_sums.arcstat_bonus_size);
 	wmsum_fini(&arc_sums.arcstat_l2_hits);
 	wmsum_fini(&arc_sums.arcstat_l2_misses);