From 1334be3657dd02af0591d6d8adf0e6a60a7710a6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Nov 2017 15:26:12 -0800 Subject: [PATCH 01/94] mm: fix nodemask printing The cleanup caused build warnings for constant mask pointers: mm/mempolicy.c: In function `mpol_to_str': ./include/linux/nodemask.h:108:11: warning: the comparison will always evaluate as `true' for the address of `nodes' will never be NULL [-Waddress] An earlier workaround I suggested was incorporated in the version that got merged, but that only solved the problem for gcc-7 and higher, while gcc-4.6 through gcc-6.x still warn. This changes the printing again to use inline functions that make it clear to the compiler that the line that does the NULL check has no idea whether the argument is a constant NULL. Link: http://lkml.kernel.org/r/20171117101545.119689-1-arnd@arndb.de Fixes: 0205f75571e3 ("mm: simplify nodemask printing") Signed-off-by: Arnd Bergmann Cc: Michal Hocko Cc: Stephen Rothwell Cc: Zhangshaokun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 15cab3967d6dfe..1fbde8a880d9a0 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -104,9 +104,16 @@ extern nodemask_t _unused_nodemask_arg_; * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ -#define nodemask_pr_args(maskp) \ - ((maskp) != NULL) ? MAX_NUMNODES : 0, \ - ((maskp) != NULL) ? (maskp)->bits : NULL +#define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ + __nodemask_pr_bits(maskp) +static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) +{ + return m ? MAX_NUMNODES : 0; +} +static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) +{ + return m ? m->bits : NULL; +} /* * The inline keyword gives the compiler room to decide to inline, or From 5d03a6613957785e94af7a4a6212ad4af66aa5c2 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Fri, 17 Nov 2017 15:26:16 -0800 Subject: [PATCH 02/94] mm/z3fold.c: use kref to prevent page free/compact race There is a race in the current z3fold implementation between do_compact() called in a work queue context and the page release procedure when page's kref goes to 0. do_compact() may be waiting for page lock, which is released by release_z3fold_page_locked right before putting the page onto the "stale" list, and then the page may be freed as do_compact() modifies its contents. The mechanism currently implemented to handle that (checking the PAGE_STALE flag) is not reliable enough. Instead, we'll use page's kref counter to guarantee that the page is not released if its compaction is scheduled. It then becomes compaction function's responsibility to decrease the counter and quit immediately if the page was actually freed. Link: http://lkml.kernel.org/r/20171117092032.00ea56f42affbed19f4fcc6c@gmail.com Signed-off-by: Vitaly Wool Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index b2ba2ba585f3c0..39e19125d6a019 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -404,8 +404,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) WARN_ON(z3fold_page_trylock(zhdr)); else z3fold_page_lock(zhdr); - if (test_bit(PAGE_STALE, &page->private) || - !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) { + if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) { z3fold_page_unlock(zhdr); return; } @@ -413,6 +412,11 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + atomic64_dec(&pool->pages_nr); + return; + } + z3fold_compact_page(zhdr); unbuddied = get_cpu_ptr(pool->unbuddied); fchunks = num_free_chunks(zhdr); @@ -753,9 +757,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); zhdr->cpu = -1; + kref_get(&zhdr->refcount); do_compact_page(zhdr, true); return; } + kref_get(&zhdr->refcount); queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); z3fold_page_unlock(zhdr); } From 3aaabbf1c39effa2ac0c11103ed07ef03b0a0d89 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Fri, 17 Nov 2017 15:26:19 -0800 Subject: [PATCH 03/94] lib/dma-debug.c: fix incorrect pfn calculation dma-debug reports the following warning: WARNING: CPU: 3 PID: 298 at kernel-4.4/lib/dma-debug.c:604 debug _dma_assert_idle+0x1a8/0x230() DMA-API: cpu touching an active dma mapped cacheline [cln=0x00000882300] CPU: 3 PID: 298 Comm: vold Tainted: G W O 4.4.22+ #1 Hardware name: MT6739 (DT) Call trace: debug_dma_assert_idle+0x1a8/0x230 wp_page_copy.isra.96+0x118/0x520 do_wp_page+0x4fc/0x534 handle_mm_fault+0xd4c/0x1310 do_page_fault+0x1c8/0x394 do_mem_abort+0x50/0xec I found that debug_dma_alloc_coherent() and debug_dma_free_coherent() assume that dma_alloc_coherent() always returns a linear address. However it's possible that dma_alloc_coherent() returns a non-linear address. In this case, page_to_pfn(virt_to_page(virt)) will return an incorrect pfn. If the pfn is valid and mapped as a COW page, we will hit the warning when doing wp_page_copy(). Fix this by calculating pfn for linear and non-linear addresses. [miles.chen@mediatek.com: v4] Link: http://lkml.kernel.org/r/1510872972-23919-1-git-send-email-miles.chen@mediatek.com Link: http://lkml.kernel.org/r/1506484087-1177-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Reviewed-by: Robin Murphy Cc: Christoph Hellwig Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dma-debug.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index ea4cc3dde4f1ba..1b34d210452c5a 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -1495,14 +1495,22 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, if (!entry) return; + /* handle vmalloc and linear addresses */ + if (!is_vmalloc_addr(virt) && !virt_to_page(virt)) + return; + entry->type = dma_debug_coherent; entry->dev = dev; - entry->pfn = page_to_pfn(virt_to_page(virt)); entry->offset = offset_in_page(virt); entry->size = size; entry->dev_addr = dma_addr; entry->direction = DMA_BIDIRECTIONAL; + if (is_vmalloc_addr(virt)) + entry->pfn = vmalloc_to_pfn(virt); + else + entry->pfn = page_to_pfn(virt_to_page(virt)); + add_dma_entry(entry); } EXPORT_SYMBOL(debug_dma_alloc_coherent); @@ -1513,13 +1521,21 @@ void debug_dma_free_coherent(struct device *dev, size_t size, struct dma_debug_entry ref = { .type = dma_debug_coherent, .dev = dev, - .pfn = page_to_pfn(virt_to_page(virt)), .offset = offset_in_page(virt), .dev_addr = addr, .size = size, .direction = DMA_BIDIRECTIONAL, }; + /* handle vmalloc and linear addresses */ + if (!is_vmalloc_addr(virt) && !virt_to_page(virt)) + return; + + if (is_vmalloc_addr(virt)) + ref.pfn = vmalloc_to_pfn(virt); + else + ref.pfn = page_to_pfn(virt_to_page(virt)); + if (unlikely(dma_debug_disabled())) return; From 09af5ccea26d347041ac7ca37a61f859ef9bd1f5 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Fri, 17 Nov 2017 15:26:23 -0800 Subject: [PATCH 04/94] mm: shmem: remove unused info variable Fix the following warning by removing the unused variable: mm/shmem.c:3205:27: warning: variable 'info' set but not used [-Wunused-but-set-variable] Link: http://lkml.kernel.org/r/1510774029-30652-1-git-send-email-clabbe@baylibre.com Signed-off-by: Corentin Labbe Acked-by: Michal Hocko Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 1f97d77551c3e5..4aa9307feab0a0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3202,7 +3202,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s int len; struct inode *inode; struct page *page; - struct shmem_inode_info *info; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3222,7 +3221,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s error = 0; } - info = SHMEM_I(inode); inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { inode->i_link = kmemdup(symname, len, GFP_KERNEL); From a0647dc9208fae9124ca38d43a5c3c950d955291 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 17 Nov 2017 15:26:27 -0800 Subject: [PATCH 05/94] mm, compaction: kcompactd should not ignore pageblock skip Kcompactd is needlessly ignoring pageblock skip information. It is doing MIGRATE_SYNC_LIGHT compaction, which is no more powerful than MIGRATE_SYNC compaction. If compaction recently failed to isolate memory from a set of pageblocks, there is nothing to indicate that kcompactd will be able to do so, or that it is beneficial from attempting to isolate memory. Use the pageblock skip hint to avoid rescanning pageblocks needlessly until that information is reset. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708151638550.106658@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 85395dc6eb137f..ad40d67421f370 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1928,9 +1928,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, - .ignore_skip_hint = true, + .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, - }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.classzone_idx); From 21dc7e023611fbcf8e38f255731bcf3cc38e7638 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 17 Nov 2017 15:26:30 -0800 Subject: [PATCH 06/94] mm, compaction: persistently skip hugetlbfs pageblocks It is pointless to migrate hugetlb memory as part of memory compaction if the hugetlb size is equal to the pageblock order. No defragmentation is occurring in this condition. It is also pointless to for the freeing scanner to scan a pageblock where a hugetlb page is pinned. Unconditionally skip these pageblocks, and do so peristently so that they are not rescanned until it is observed that these hugepages are no longer pinned. It would also be possible to do this by involving the hugetlb subsystem in marking pageblocks to no longer be skipped when they hugetlb pages are freed. This is a simple solution that doesn't involve any additional subsystems in pageblock skip manipulation. [rientjes@google.com: fix build] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708201734390.117182@chino.kir.corp.google.com Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708151639130.106658@chino.kir.corp.google.com Signed-off-by: David Rientjes Tested-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pageblock-flags.h | 11 +++++++ mm/compaction.c | 56 ++++++++++++++++++++++++++------- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e942558b3585f9..9132c5cb41f10e 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -96,6 +96,17 @@ void set_pfnblock_flags_mask(struct page *page, #define set_pageblock_skip(page) \ set_pageblock_flags_group(page, 1, PB_migrate_skip, \ PB_migrate_skip) +#else +static inline bool get_pageblock_skip(struct page *page) +{ + return false; +} +static inline void clear_pageblock_skip(struct page *page) +{ +} +static inline void set_pageblock_skip(struct page *page) +{ +} #endif /* CONFIG_COMPACTION */ #endif /* PAGEBLOCK_FLAGS_H */ diff --git a/mm/compaction.c b/mm/compaction.c index ad40d67421f370..94b5c0865dd145 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -218,6 +218,20 @@ static void reset_cached_positions(struct zone *zone) pageblock_start_pfn(zone_end_pfn(zone) - 1); } +/* + * Hugetlbfs pages should consistenly be skipped until updated by the hugetlb + * subsystem. It is always pointless to compact pages of pageblock_order and + * the free scanner can reconsider when no longer huge. + */ +static bool pageblock_skip_persistent(struct page *page, unsigned int order) +{ + if (!PageHuge(page)) + return false; + if (order != pageblock_order) + return false; + return true; +} + /* * This function is called to clear all cached information on pageblocks that * should be skipped for page isolation when the migrate and free page scanner @@ -242,6 +256,8 @@ static void __reset_isolation_suitable(struct zone *zone) continue; if (zone != page_zone(page)) continue; + if (pageblock_skip_persistent(page, compound_order(page))) + continue; clear_pageblock_skip(page); } @@ -307,7 +323,13 @@ static inline bool isolation_suitable(struct compact_control *cc, return true; } -static void update_pageblock_skip(struct compact_control *cc, +static inline bool pageblock_skip_persistent(struct page *page, + unsigned int order) +{ + return false; +} + +static inline void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, bool migrate_scanner) { @@ -449,13 +471,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * and the only danger is skipping too much. */ if (PageCompound(page)) { - unsigned int comp_order = compound_order(page); - - if (likely(comp_order < MAX_ORDER)) { - blockpfn += (1UL << comp_order) - 1; - cursor += (1UL << comp_order) - 1; + const unsigned int order = compound_order(page); + + if (pageblock_skip_persistent(page, order)) { + set_pageblock_skip(page); + blockpfn = end_pfn; + } else if (likely(order < MAX_ORDER)) { + blockpfn += (1UL << order) - 1; + cursor += (1UL << order) - 1; } - goto isolate_fail; } @@ -772,11 +796,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * danger is skipping too much. */ if (PageCompound(page)) { - unsigned int comp_order = compound_order(page); - - if (likely(comp_order < MAX_ORDER)) - low_pfn += (1UL << comp_order) - 1; + const unsigned int order = compound_order(page); + if (pageblock_skip_persistent(page, order)) { + set_pageblock_skip(page); + low_pfn = end_pfn; + } else if (likely(order < MAX_ORDER)) + low_pfn += (1UL << order) - 1; goto isolate_fail; } @@ -838,7 +864,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * is safe to read and it's 0 for tail pages. */ if (unlikely(PageCompound(page))) { - low_pfn += (1UL << compound_order(page)) - 1; + const unsigned int order = compound_order(page); + + if (pageblock_skip_persistent(page, order)) { + set_pageblock_skip(page); + low_pfn = end_pfn; + } else + low_pfn += (1UL << order) - 1; goto isolate_fail; } } From b527cfe5bc23208cf9a346879501333cec638aba Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 17 Nov 2017 15:26:34 -0800 Subject: [PATCH 07/94] mm, compaction: extend pageblock_skip_persistent() to all compound pages pageblock_skip_persistent() checks for HugeTLB pages of pageblock order. When clearing pageblock skip bits for compaction, the bits are not cleared for such pageblocks, because they cannot contain base pages suitable for migration, nor free pages to use as migration targets. This optimization can be simply extended to all compound pages of order equal or larger than pageblock order, because migrating such pages (if they support it) cannot help sub-pageblock fragmentation. This includes THP's and also gigantic HugeTLB pages, which the current implementation doesn't persistently skip due to a strict pageblock_order equality check and not recognizing tail pages. While THP pages are generally less "persistent" than HugeTLB, we can still expect that if a THP exists at the point of __reset_isolation_suitable(), it will exist also during the subsequent compaction run. The time difference here could be actually smaller than between a compaction run that sets a (non-persistent) skip bit on a THP, and the next compaction run that observes it. Link: http://lkml.kernel.org/r/20171102121706.21504-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 94b5c0865dd145..e8f5b4e2cb0551 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -219,17 +219,21 @@ static void reset_cached_positions(struct zone *zone) } /* - * Hugetlbfs pages should consistenly be skipped until updated by the hugetlb - * subsystem. It is always pointless to compact pages of pageblock_order and - * the free scanner can reconsider when no longer huge. + * Compound pages of >= pageblock_order should consistenly be skipped until + * released. It is always pointless to compact pages of such order (if they are + * migratable), and the pageblocks they occupy cannot contain any free pages. */ -static bool pageblock_skip_persistent(struct page *page, unsigned int order) +static bool pageblock_skip_persistent(struct page *page) { - if (!PageHuge(page)) + if (!PageCompound(page)) return false; - if (order != pageblock_order) - return false; - return true; + + page = compound_head(page); + + if (compound_order(page) >= pageblock_order) + return true; + + return false; } /* @@ -256,7 +260,7 @@ static void __reset_isolation_suitable(struct zone *zone) continue; if (zone != page_zone(page)) continue; - if (pageblock_skip_persistent(page, compound_order(page))) + if (pageblock_skip_persistent(page)) continue; clear_pageblock_skip(page); @@ -323,8 +327,7 @@ static inline bool isolation_suitable(struct compact_control *cc, return true; } -static inline bool pageblock_skip_persistent(struct page *page, - unsigned int order) +static inline bool pageblock_skip_persistent(struct page *page) { return false; } From 2583d6713267a4c80126e4e50dd45f5cf685ebe8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 17 Nov 2017 15:26:38 -0800 Subject: [PATCH 08/94] mm, compaction: split off flag for not updating skip hints Pageblock skip hints were added as a heuristic for compaction, which shares core code with CMA. Since CMA reliability would suffer from the heuristics, compact_control flag ignore_skip_hint was added for the CMA use case. Since 6815bf3f233e ("mm/compaction: respect ignore_skip_hint in update_pageblock_skip") the flag also means that CMA won't *update* the skip hints in addition to ignoring them. Today, direct compaction can also ignore the skip hints in the last resort attempt, but there's no reason not to set them when isolation fails in such case. Thus, this patch splits off a new no_set_skip_hint flag to avoid the updating, which only CMA sets. This should improve the heuristics a bit, and allow us to simplify the persistent skip bit handling as the next step. Link: http://lkml.kernel.org/r/20171102121706.21504-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- mm/internal.h | 1 + mm/page_alloc.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index e8f5b4e2cb0551..bb1188a9d58ed5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -295,7 +295,7 @@ static void update_pageblock_skip(struct compact_control *cc, struct zone *zone = cc->zone; unsigned long pfn; - if (cc->ignore_skip_hint) + if (cc->no_set_skip_hint) return; if (!page) diff --git a/mm/internal.h b/mm/internal.h index 1df011f624801f..e6bd35182daee1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -198,6 +198,7 @@ struct compact_control { const int classzone_idx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 55ded92f9809e0..d4096f4a5c1f75 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7619,6 +7619,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, + .no_set_skip_hint = true, .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); From d3c85bad89b9153df741af14ad859ee49677f00d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 17 Nov 2017 15:26:41 -0800 Subject: [PATCH 09/94] mm, compaction: remove unneeded pageblock_skip_persistent() checks Commit f3c931633a59 ("mm, compaction: persistently skip hugetlbfs pageblocks") has introduced pageblock_skip_persistent() checks into migration and free scanners, to make sure pageblocks that should be persistently skipped are marked as such, regardless of the ignore_skip_hint flag. Since the previous patch introduced a new no_set_skip_hint flag, the ignore flag no longer prevents marking pageblocks as skipped. Therefore we can remove the special cases. The relevant pageblocks will be marked as skipped by the common logic which marks each pageblock where no page could be isolated. This makes the code simpler. Link: http://lkml.kernel.org/r/20171102121706.21504-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index bb1188a9d58ed5..10cd757f1006b8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -476,10 +476,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (PageCompound(page)) { const unsigned int order = compound_order(page); - if (pageblock_skip_persistent(page, order)) { - set_pageblock_skip(page); - blockpfn = end_pfn; - } else if (likely(order < MAX_ORDER)) { + if (likely(order < MAX_ORDER)) { blockpfn += (1UL << order) - 1; cursor += (1UL << order) - 1; } @@ -801,10 +798,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (PageCompound(page)) { const unsigned int order = compound_order(page); - if (pageblock_skip_persistent(page, order)) { - set_pageblock_skip(page); - low_pfn = end_pfn; - } else if (likely(order < MAX_ORDER)) + if (likely(order < MAX_ORDER)) low_pfn += (1UL << order) - 1; goto isolate_fail; } @@ -867,13 +861,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * is safe to read and it's 0 for tail pages. */ if (unlikely(PageCompound(page))) { - const unsigned int order = compound_order(page); - - if (pageblock_skip_persistent(page, order)) { - set_pageblock_skip(page); - low_pfn = end_pfn; - } else - low_pfn += (1UL << order) - 1; + low_pfn += (1UL << compound_order(page)) - 1; goto isolate_fail; } } From c643401218be0f4ab3522e0c0a63016596d6e9ca Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 17 Nov 2017 15:26:45 -0800 Subject: [PATCH 10/94] proc, coredump: add CoreDumping flag to /proc/pid/status Right now there is no convenient way to check if a process is being coredumped at the moment. It might be necessary to recognize such state to prevent killing the process and getting a broken coredump. Writing a large core might take significant time, and the process is unresponsive during it, so it might be killed by timeout, if another process is monitoring and killing/restarting hanging tasks. We're getting a significant number of corrupted coredump files on machines in our fleet, just because processes are being killed by timeout in the middle of the core writing process. We do have a process health check, and some agent is responsible for restarting processes which are not responding for health check requests. Writing a large coredump to the disk can easily exceed the reasonable timeout (especially on an overloaded machine). This flag will allow the agent to distinguish processes which are being coredumped, extend the timeout for them, and let them produce a full coredump file. To provide an ability to detect if a process is in the state of being coredumped, we can expose a boolean CoreDumping flag in /proc/pid/status. Example: $ cat core.sh #!/bin/sh echo "|/usr/bin/sleep 10" > /proc/sys/kernel/core_pattern sleep 1000 & PID=$! cat /proc/$PID/status | grep CoreDumping kill -ABRT $PID sleep 1 cat /proc/$PID/status | grep CoreDumping $ ./core.sh CoreDumping: 0 CoreDumping: 1 [guro@fb.com: document CoreDumping flag in /proc//status] Link: http://lkml.kernel.org/r/20170928135357.GA8470@castle.DHCP.thefacebook.com Link: http://lkml.kernel.org/r/20170920230634.31572-1-guro@fb.com Signed-off-by: Roman Gushchin Cc: Alexander Viro Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 3 +++ fs/proc/array.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ec571b9bb18a9e..2a84bb3348947c 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -181,6 +181,7 @@ read the file /proc/PID/status: VmPTE: 20 kb VmSwap: 0 kB HugetlbPages: 0 kB + CoreDumping: 0 Threads: 1 SigQ: 0/28578 SigPnd: 0000000000000000 @@ -253,6 +254,8 @@ Table 1-2: Contents of the status files (as of 4.8) VmSwap amount of swap used by anonymous private data (shmem swap usage is not included) HugetlbPages size of hugetlb memory portions + CoreDumping process's memory is currently being dumped + (killing the process may lead to a corrupted core) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/array.c b/fs/proc/array.c index 6f6fc1672ad1af..79375fc115d277 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -366,6 +366,11 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) cpumask_pr_args(&task->cpus_allowed)); } +static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -376,6 +381,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, if (mm) { task_mem(m, mm); + task_core_dumping(m, mm); mmput(mm); } task_sig(m, task); From 3ee2a19908f27b8fea8ff14ffa8b755585eb7b4a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 17 Nov 2017 15:26:49 -0800 Subject: [PATCH 11/94] proc: : uninline name_to_int() Save ~360 bytes. add/remove: 1/0 grow/shrink: 0/4 up/down: 104/-463 (-359) function old new delta name_to_int - 104 +104 proc_pid_lookup 217 126 -91 proc_lookupfd_common 212 121 -91 proc_task_lookup 289 194 -95 __proc_create 588 402 -186 Link: http://lkml.kernel.org/r/20170912194850.GA17730@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/Makefile | 1 + fs/proc/internal.h | 23 +---------------------- fs/proc/util.c | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 22 deletions(-) create mode 100644 fs/proc/util.c diff --git a/fs/proc/Makefile b/fs/proc/Makefile index f7456c4e7d0f18..ead487e8051087 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -21,6 +21,7 @@ proc-y += loadavg.o proc-y += meminfo.o proc-y += stat.o proc-y += uptime.o +proc-y += util.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a34195e92b206c..9aad373cf11d73 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -103,28 +103,7 @@ static inline struct task_struct *get_proc_task(struct inode *inode) void task_dump_owner(struct task_struct *task, mode_t mode, kuid_t *ruid, kgid_t *rgid); -static inline unsigned name_to_int(const struct qstr *qstr) -{ - const char *name = qstr->name; - int len = qstr->len; - unsigned n = 0; - - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; -} - +unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ diff --git a/fs/proc/util.c b/fs/proc/util.c new file mode 100644 index 00000000000000..c29aa497394b87 --- /dev/null +++ b/fs/proc/util.c @@ -0,0 +1,23 @@ +#include + +unsigned name_to_int(const struct qstr *qstr) +{ + const char *name = qstr->name; + int len = qstr->len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} From 0746a0bc6e6e76444098cf944848554d21d28cae Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 17 Nov 2017 15:26:52 -0800 Subject: [PATCH 12/94] proc: use do-while in name_to_int() Gcc doesn't know that "len" is guaranteed to be >=1 by dcache and generates standard while-loop prologue duplicating loop condition. add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-27 (-27) function old new delta name_to_int 104 77 -27 Link: http://lkml.kernel.org/r/20170912195213.GB17730@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/util.c b/fs/proc/util.c index c29aa497394b87..b161cfa0f9fa15 100644 --- a/fs/proc/util.c +++ b/fs/proc/util.c @@ -8,7 +8,7 @@ unsigned name_to_int(const struct qstr *qstr) if (len > 1 && *name == '0') goto out; - while (len-- > 0) { + do { unsigned c = *name++ - '0'; if (c > 9) goto out; @@ -16,7 +16,7 @@ unsigned name_to_int(const struct qstr *qstr) goto out; n *= 10; n += c; - } + } while (--len > 0); return n; out: return ~0U; From 868038bed5fb03f95ebd4517b4370b024fd13771 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:26:55 -0800 Subject: [PATCH 13/94] spelling.txt: add "unnecessary" typo variants Add unnecessary typos by copying the necessary typos. Link: http://lkml.kernel.org/r/1505074722.22023.6.camel@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index aa0cc49ad1adc5..9a058cff49d4a1 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1187,6 +1187,10 @@ unknonw||unknown unknow||unknown unkown||unknown unneded||unneeded +unneccecary||unnecessary +unneccesary||unnecessary +unneccessary||unnecessary +unnecesary||unnecessary unneedingly||unnecessarily unnsupported||unsupported unmached||unmatched From fb6cc4ac15c354fa1eb449f50a0dfe5f4bf0d42a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:26:59 -0800 Subject: [PATCH 14/94] sh/boot: add static stack-protector to pre-kernel The sh decompressor code triggers stack-protector code generation when using CONFIG_CC_STACKPROTECTOR_STRONG. As done for arm and mips, add a simple static stack-protector canary. As this wasn't protected before, the risk of using a weak canary is minimized. Once the kernel is actually up, a better canary is chosen. Link: http://lkml.kernel.org/r/1506972007-80614-2-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Yoshinori Sato Cc: Rich Felker Cc: Al Viro Cc: Ingo Molnar Cc: Laura Abbott Cc: Masahiro Yamada Cc: Michal Marek Cc: Nicholas Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/boot/compressed/misc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c index f2d9d3079d4e62..627ce8e75e0164 100644 --- a/arch/sh/boot/compressed/misc.c +++ b/arch/sh/boot/compressed/misc.c @@ -104,6 +104,18 @@ static void error(char *x) while(1); /* Halt */ } +unsigned long __stack_chk_guard; + +void __stack_chk_guard_setup(void) +{ + __stack_chk_guard = 0x000a0dff; +} + +void __stack_chk_fail(void) +{ + error("stack-protector: Kernel stack is corrupted\n"); +} + #ifdef CONFIG_SUPERH64 #define stackalign 8 #else @@ -118,6 +130,8 @@ void decompress_kernel(void) { unsigned long output_addr; + __stack_chk_guard_setup(); + #ifdef CONFIG_SUPERH64 output_addr = (CONFIG_MEMORY_START + 0x2000); #else From b1fca27d384e8418aac84b39f6f5179aecc1b64f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 17 Nov 2017 15:27:03 -0800 Subject: [PATCH 15/94] kernel debug: support resetting WARN*_ONCE I like _ONCE warnings because it's guaranteed that they don't flood the log. During testing I find it useful to reset the state of the once warnings, so that I can rerun tests and see if they trigger again, or can guarantee that a test run always hits the same warnings. This patch adds a debugfs interface to reset all the _ONCE warnings so that they appear again: echo 1 > /sys/kernel/debug/clear_warn_once This is implemented by putting all the warning booleans into a special section, and clearing it. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20171017221455.6740-1-andi@firstfloor.org Signed-off-by: Andi Kleen Tested-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/clearing-warn-once.txt | 7 +++++++ include/asm-generic/bug.h | 6 +++--- include/asm-generic/sections.h | 1 + include/asm-generic/vmlinux.lds.h | 3 +++ kernel/panic.c | 28 ++++++++++++++++++++++++++++ 5 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 Documentation/clearing-warn-once.txt diff --git a/Documentation/clearing-warn-once.txt b/Documentation/clearing-warn-once.txt new file mode 100644 index 00000000000000..5b1f5d547be12d --- /dev/null +++ b/Documentation/clearing-warn-once.txt @@ -0,0 +1,7 @@ + +WARN_ONCE / WARN_ON_ONCE only print a warning once. + +echo 1 > /sys/kernel/debug/clear_warn_once + +clears the state and allows the warnings to print once again. +This can be useful after test suite runs to reproduce problems. diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index af2cc94a61bf9e..7844b0df88cd63 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -130,7 +130,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #ifndef WARN_ON_ONCE #define WARN_ON_ONCE(condition) ({ \ - static bool __section(.data.unlikely) __warned; \ + static bool __section(.data.once) __warned; \ int __ret_warn_once = !!(condition); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ @@ -142,7 +142,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #endif #define WARN_ONCE(condition, format...) ({ \ - static bool __section(.data.unlikely) __warned; \ + static bool __section(.data.once) __warned; \ int __ret_warn_once = !!(condition); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ @@ -153,7 +153,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, }) #define WARN_TAINT_ONCE(condition, taint, format...) ({ \ - static bool __section(.data.unlikely) __warned; \ + static bool __section(.data.once) __warned; \ int __ret_warn_once = !!(condition); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 6d95769310842b..03cc5f9bba71c1 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -44,6 +44,7 @@ extern char __entry_text_start[], __entry_text_end[]; extern char __start_rodata[], __end_rodata[]; extern char __irqentry_text_start[], __irqentry_text_end[]; extern char __softirqentry_text_start[], __softirqentry_text_end[]; +extern char __start_once[], __end_once[]; /* Start and end of .ctors section - used for constructor calls. */ extern char __ctors_start[], __ctors_end[]; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index bdcd1caae0923d..ee8b707d9fa9c6 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -223,6 +223,9 @@ MEM_KEEP(init.data) \ MEM_KEEP(exit.data) \ *(.data.unlikely) \ + VMLINUX_SYMBOL(__start_once) = .; \ + *(.data.once) \ + VMLINUX_SYMBOL(__end_once) = .; \ STRUCT_ALIGN(); \ *(__tracepoints) \ /* implement dynamic printk debug */ \ diff --git a/kernel/panic.c b/kernel/panic.c index bdd18afa19a486..672a91dc20fe83 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -587,6 +589,32 @@ void warn_slowpath_null(const char *file, int line) EXPORT_SYMBOL(warn_slowpath_null); #endif +#ifdef CONFIG_BUG + +/* Support resetting WARN*_ONCE state */ + +static int clear_warn_once_set(void *data, u64 val) +{ + memset(__start_once, 0, __end_once - __start_once); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, + NULL, + clear_warn_once_set, + "%lld\n"); + +static __init int register_warn_debugfs(void) +{ + /* Don't care about failure */ + debugfs_create_file("clear_warn_once", 0644, NULL, + NULL, &clear_warn_once_fops); + return 0; +} + +device_initcall(register_warn_debugfs); +#endif + #ifdef CONFIG_CC_STACKPROTECTOR /* From aaf5dcfb223617ac2d16113e4b500199c65689de Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 17 Nov 2017 15:27:06 -0800 Subject: [PATCH 16/94] kernel debug: support resetting WARN_ONCE for all architectures Some architectures store the WARN_ONCE state in the flags field of the bug_entry. Clear that one too when resetting once state through /sys/kernel/debug/clear_warn_once Pointed out by Michael Ellerman Improves the earlier patch that add clear_warn_once. [ak@linux.intel.com: add a missing ifdef CONFIG_MODULES] Link: http://lkml.kernel.org/r/20171020170633.9593-1-andi@firstfloor.org [akpm@linux-foundation.org: fix unused var warning] [akpm@linux-foundation.org: Use 0200 for clear_warn_once file, per mpe] [akpm@linux-foundation.org: clear BUGFLAG_DONE in clear_once_table(), per mpe] Link: http://lkml.kernel.org/r/20171019204642.7404-1-andi@firstfloor.org Signed-off-by: Andi Kleen Tested-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 5 +++++ kernel/panic.c | 3 ++- lib/bug.c | 23 +++++++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/include/linux/bug.h b/include/linux/bug.h index da4231c905c85a..fe5916550da8c5 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -43,6 +43,8 @@ enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); /* These are defined by the architecture */ int is_valid_bugaddr(unsigned long addr); +void generic_bug_clear_once(void); + #else /* !CONFIG_GENERIC_BUG */ static inline enum bug_trap_type report_bug(unsigned long bug_addr, @@ -51,6 +53,9 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, return BUG_TRAP_TYPE_BUG; } + +static inline void generic_bug_clear_once(void) {} + #endif /* CONFIG_GENERIC_BUG */ /* diff --git a/kernel/panic.c b/kernel/panic.c index 672a91dc20fe83..67cebf2a3b6782 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -595,6 +595,7 @@ EXPORT_SYMBOL(warn_slowpath_null); static int clear_warn_once_set(void *data, u64 val) { + generic_bug_clear_once(); memset(__start_once, 0, __end_once - __start_once); return 0; } @@ -607,7 +608,7 @@ DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, static __init int register_warn_debugfs(void) { /* Don't care about failure */ - debugfs_create_file("clear_warn_once", 0644, NULL, + debugfs_create_file("clear_warn_once", 0200, NULL, NULL, &clear_warn_once_fops); return 0; } diff --git a/lib/bug.c b/lib/bug.c index 1e094408c893db..f66be6bf620629 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -196,3 +196,26 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_BUG; } + +static void clear_once_table(struct bug_entry *start, struct bug_entry *end) +{ + struct bug_entry *bug; + + for (bug = start; bug < end; bug++) + bug->flags &= ~BUGFLAG_DONE; +} + +void generic_bug_clear_once(void) +{ +#ifdef CONFIG_MODULES + struct module *mod; + + rcu_read_lock_sched(); + list_for_each_entry_rcu(mod, &module_bug_list, bug_list) + clear_once_table(mod->bug_table, + mod->bug_table + mod->num_bugs); + rcu_read_unlock_sched(); +#endif + + clear_once_table(__start___bug_table, __stop___bug_table); +} From 1e6270d07cde9bfbc27f8d0f16b323cf3a3b63dd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:27:10 -0800 Subject: [PATCH 17/94] parse-maintainers: add ability to specify filenames parse-maintainers.pl is convenient, but currently hard-codes the filenames that are used. Allow user-specified filenames to simplify the use of the script. Link: http://lkml.kernel.org/r/48703c068b3235223ffa3b2eb268fa0a125b25e0.1502251549.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/parse-maintainers.pl | 52 ++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/scripts/parse-maintainers.pl b/scripts/parse-maintainers.pl index 5dbd2faa244949..255cef1b098d8f 100644 --- a/scripts/parse-maintainers.pl +++ b/scripts/parse-maintainers.pl @@ -2,9 +2,44 @@ # SPDX-License-Identifier: GPL-2.0 use strict; +use Getopt::Long qw(:config no_auto_abbrev); + +my $input_file = "MAINTAINERS"; +my $output_file = "MAINTAINERS.new"; +my $output_section = "SECTION.new"; +my $help = 0; my $P = $0; +if (!GetOptions( + 'input=s' => \$input_file, + 'output=s' => \$output_file, + 'section=s' => \$output_section, + 'h|help|usage' => \$help, + )) { + die "$P: invalid argument - use --help if necessary\n"; +} + +if ($help != 0) { + usage(); + exit 0; +} + +sub usage { + print < + + --input => MAINTAINERS file to read (default: MAINTAINERS) + --output => sorted MAINTAINERS file to write (default: MAINTAINERS.new) + --section => new sorted MAINTAINERS file to write to (default: SECTION.new) + +If exist, then the sections that match the +regexes are not written to the output file but are written to the +section file. + +EOT +} + # sort comparison functions sub by_category($$) { my ($a, $b) = @_; @@ -56,13 +91,20 @@ sub trim { sub alpha_output { my ($hashref, $filename) = (@_); + return if ! scalar(keys %$hashref); + open(my $file, '>', "$filename") or die "$P: $filename: open failed - $!\n"; + my $separator; foreach my $key (sort by_category keys %$hashref) { if ($key eq " ") { - chomp $$hashref{$key}; print $file $$hashref{$key}; } else { - print $file "\n" . $key . "\n"; + if (! defined $separator) { + $separator = "\n"; + } else { + print $file $separator; + } + print $file $key . "\n"; foreach my $pattern (sort by_pattern split('\n', %$hashref{$key})) { print $file ($pattern . "\n"); } @@ -112,7 +154,7 @@ sub file_input { my %hash; my %new_hash; -file_input(\%hash, "MAINTAINERS"); +file_input(\%hash, $input_file); foreach my $type (@ARGV) { foreach my $key (keys %hash) { @@ -123,7 +165,7 @@ sub file_input { } } -alpha_output(\%hash, "MAINTAINERS.new"); -alpha_output(\%new_hash, "SECTION.new"); +alpha_output(\%hash, $output_file); +alpha_output(\%new_hash, $output_section); exit(0); From d15809f3649e2ee04713a9dba9aa7bd2c208ad82 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Nov 2017 15:27:13 -0800 Subject: [PATCH 18/94] iopoll: avoid -Wint-in-bool-context warning When we pass the result of a multiplication as the timeout or the delay, we can get a warning from gcc-7: drivers/mmc/host/bcm2835.c:596:149: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] drivers/mfd/arizona-core.c:247:195: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] drivers/gpu/drm/sun4i/sun4i_hdmi_i2c.c:49:27: error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] The warning is a bit questionable inside of a macro, but this is intentional on the side of the gcc developers. It is also an indication of another problem: we evaluate the timeout and sleep arguments multiple times, which can have undesired side-effects when those are complex expressions. This changes the two iopoll variants to use local variables for storing copies of the timeouts. This adds some more type safety, and avoids both the double-evaluation and the gcc warning. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81484 Link: http://lkml.kernel.org/r/20170726133756.2161367-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20171102114048.1526955-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Reviewed-by: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/iopoll.h | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index d29e1e21bf3f80..b1d861caca161a 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -42,18 +42,21 @@ */ #define readx_poll_timeout(op, addr, val, cond, sleep_us, timeout_us) \ ({ \ - ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ - might_sleep_if(sleep_us); \ + u64 __timeout_us = (timeout_us); \ + unsigned long __sleep_us = (sleep_us); \ + ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ + might_sleep_if((__sleep_us) != 0); \ for (;;) { \ (val) = op(addr); \ if (cond) \ break; \ - if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + if (__timeout_us && \ + ktime_compare(ktime_get(), __timeout) > 0) { \ (val) = op(addr); \ break; \ } \ - if (sleep_us) \ - usleep_range((sleep_us >> 2) + 1, sleep_us); \ + if (__sleep_us) \ + usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ } \ (cond) ? 0 : -ETIMEDOUT; \ }) @@ -77,17 +80,20 @@ */ #define readx_poll_timeout_atomic(op, addr, val, cond, delay_us, timeout_us) \ ({ \ - ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + u64 __timeout_us = (timeout_us); \ + unsigned long __delay_us = (delay_us); \ + ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ for (;;) { \ (val) = op(addr); \ if (cond) \ break; \ - if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + if (__timeout_us && \ + ktime_compare(ktime_get(), __timeout) > 0) { \ (val) = op(addr); \ break; \ } \ - if (delay_us) \ - udelay(delay_us); \ + if (__delay_us) \ + udelay(__delay_us); \ } \ (cond) ? 0 : -ETIMEDOUT; \ }) From d32f11ba281b7e203932c0a65ec1fb302493cbbe Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:27:17 -0800 Subject: [PATCH 19/94] lkdtm: include WARN format string In order to test the ordering of WARN format strings, actually include one in LKDTM. Link: http://lkml.kernel.org/r/1510100869-73751-2-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Fengguang Wu Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/lkdtm_bugs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/misc/lkdtm_bugs.c b/drivers/misc/lkdtm_bugs.c index b0f7af872bb56c..7eebbdfbcacd0a 100644 --- a/drivers/misc/lkdtm_bugs.c +++ b/drivers/misc/lkdtm_bugs.c @@ -63,9 +63,11 @@ void lkdtm_BUG(void) BUG(); } +static int warn_counter; + void lkdtm_WARNING(void) { - WARN_ON(1); + WARN(1, "Warning message trigger count: %d\n", warn_counter++); } void lkdtm_EXCEPTION(void) From 2a8358d8a339540f00ec596526690e8eeca931a3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:27:21 -0800 Subject: [PATCH 20/94] bug: define the "cut here" string in a single place The "cut here" string is used in a few paths. Define it in a single place. Link: http://lkml.kernel.org/r/1510100869-73751-3-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Fengguang Wu Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mn10300/mm/fault.c | 2 +- include/asm-generic/bug.h | 2 ++ kernel/panic.c | 2 +- lib/bug.c | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index f23781d6bbb3fe..f0bfa144874474 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -60,7 +60,7 @@ void bust_spinlocks(int yes) void do_BUG(const char *file, int line) { bust_spinlocks(1); - printk(KERN_EMERG "------------[ cut here ]------------\n"); + printk(KERN_EMERG CUT_HERE); printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); } diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 7844b0df88cd63..1283473f234e13 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -4,6 +4,8 @@ #include +#define CUT_HERE "------------[ cut here ]------------\n" + #ifdef CONFIG_GENERIC_BUG #define BUGFLAG_WARNING (1 << 0) #define BUGFLAG_ONCE (1 << 1) diff --git a/kernel/panic.c b/kernel/panic.c index 67cebf2a3b6782..89df5fa2f798f1 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -520,7 +520,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - pr_warn("------------[ cut here ]------------\n"); + pr_warn(CUT_HERE); if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", diff --git a/lib/bug.c b/lib/bug.c index f66be6bf620629..c1b0fad31b1091 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -186,7 +186,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_WARN; } - printk(KERN_DEFAULT "------------[ cut here ]------------\n"); + printk(KERN_DEFAULT CUT_HERE); if (file) pr_crit("kernel BUG at %s:%u!\n", file, line); From a7bed27af194aa3f67915688039d93188ed95e2a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:27:24 -0800 Subject: [PATCH 21/94] bug: fix "cut here" location for __WARN_TAINT architectures Prior to v4.11, x86 used warn_slowpath_fmt() for handling WARN()s. After WARN() was moved to using UD0 on x86, the warning text started appearing _before_ the "cut here" line. This appears to have been a long-standing bug on architectures that used __WARN_TAINT, but it didn't get fixed. v4.11 and earlier on x86: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 2956 at drivers/misc/lkdtm_bugs.c:65 lkdtm_WARNING+0x21/0x30 This is a warning message Modules linked in: v4.12 and later on x86: This is a warning message ------------[ cut here ]------------ WARNING: CPU: 1 PID: 2982 at drivers/misc/lkdtm_bugs.c:68 lkdtm_WARNING+0x15/0x20 Modules linked in: With this fix: ------------[ cut here ]------------ This is a warning message WARNING: CPU: 3 PID: 3009 at drivers/misc/lkdtm_bugs.c:67 lkdtm_WARNING+0x15/0x20 Since the __FILE__ reporting happens as part of the UD0 handler, it isn't trivial to move the message to after the WARNING line, but at least we can fix the position of the "cut here" line so all the various logging tools will start including the actual runtime warning message again, when they follow the instruction and "cut here". Link: http://lkml.kernel.org/r/1510100869-73751-4-git-send-email-keescook@chromium.org Fixes: 9a93848fe787 ("x86/debug: Implement __WARN() using UD0") Signed-off-by: Kees Cook Cc: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Fengguang Wu Cc: Arnd Bergmann Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 5 +++-- kernel/panic.c | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 1283473f234e13..963b755d19b032 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -92,10 +92,11 @@ extern void warn_slowpath_null(const char *file, const int line); #define __WARN_printf_taint(taint, arg...) \ warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg) #else +extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define __WARN() __WARN_TAINT(TAINT_WARN) -#define __WARN_printf(arg...) do { printk(arg); __WARN(); } while (0) +#define __WARN_printf(arg...) do { __warn_printk(arg); __WARN(); } while (0) #define __WARN_printf_taint(taint, arg...) \ - do { printk(arg); __WARN_TAINT(taint); } while (0) + do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) #endif /* used internally by panic.c */ diff --git a/kernel/panic.c b/kernel/panic.c index 89df5fa2f798f1..3242b64b1956de 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -520,7 +520,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - pr_warn(CUT_HERE); + if (args) + pr_warn(CUT_HERE); if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", @@ -584,9 +585,22 @@ EXPORT_SYMBOL(warn_slowpath_fmt_taint); void warn_slowpath_null(const char *file, int line) { + pr_warn(CUT_HERE); __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); } EXPORT_SYMBOL(warn_slowpath_null); +#else +void __warn_printk(const char *fmt, ...) +{ + va_list args; + + pr_warn(CUT_HERE); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); +} +EXPORT_SYMBOL(__warn_printk); #endif #ifdef CONFIG_BUG From 4ca59b14e588f873795a11cdc77a25c686a29d23 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 17 Nov 2017 15:27:28 -0800 Subject: [PATCH 22/94] include/linux/compiler-clang.h: handle randomizable anonymous structs The GCC randomize layout plugin can randomize the member offsets of sensitive kernel data structures. To use this feature, certain annotations and members are added to the structures which affect the member offsets even if this plugin is not used. All of these structures are completely randomized, except for task_struct which leaves out some of its members. All the other members are wrapped within an anonymous struct with the __randomize_layout attribute. This is done using the randomized_struct_fields_start and randomized_struct_fields_end defines. When the plugin is disabled, the behaviour of this attribute can vary based on the GCC version. For GCC 5.1+, this attribute maps to __designated_init otherwise it is just an empty define but the anonymous structure is still present. For other compilers, both randomized_struct_fields_start and randomized_struct_fields_end default to empty defines meaning the anonymous structure is not introduced at all. So, if a module compiled with Clang, such as a BPF program, needs to access task_struct fields such as pid and comm, the offsets of these members as recognized by Clang are different from those recognized by modules compiled with GCC. If GCC 4.6+ is used to build the kernel, this can be solved by introducing appropriate defines for Clang so that the anonymous structure is seen when determining the offsets for the members. Link: http://lkml.kernel.org/r/20171109064645.25581-1-sandipan@linux.vnet.ibm.com Signed-off-by: Sandipan Das Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Kate Stewart Cc: Thomas Gleixner Cc: Naveen N. Rao Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index a06583e41f8052..3b609edffa8fb6 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -16,3 +16,6 @@ * with any version that can compile the kernel */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) + +#define randomized_struct_fields_start struct { +#define randomized_struct_fields_end }; From 8c703d660450c4df72ff24f79a335dc7973a9dc8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 17 Nov 2017 15:27:32 -0800 Subject: [PATCH 23/94] kernel/umh.c: optimize 'proc_cap_handler()' If 'write' is 0, we can avoid a call to spin_lock/spin_unlock. Link: http://lkml.kernel.org/r/20171020193331.7233-1-christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Acked-by: Luis R. Rodriguez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/umh.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/umh.c b/kernel/umh.c index 6ff9905250ff05..18e5fa4b0e7191 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -537,14 +537,14 @@ static int proc_cap_handler(struct ctl_table *table, int write, /* * Drop everything not in the new_cap (but don't add things) */ - spin_lock(&umh_sysctl_lock); if (write) { + spin_lock(&umh_sysctl_lock); if (table->data == CAP_BSET) usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); if (table->data == CAP_PI) usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + spin_unlock(&umh_sysctl_lock); } - spin_unlock(&umh_sysctl_lock); return 0; } From 1f3c790bd5989fcfec9e53ad8fa09f5b740c958f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Nov 2017 15:27:35 -0800 Subject: [PATCH 24/94] dynamic-debug-howto: fix optional/omitted ending line number to be LARGE instead of 0 line-range is supposed to treat "1-" as "1-endoffile", so handle the special case by setting last_lineno to UINT_MAX. Fixes this error: dynamic_debug:ddebug_parse_query: last-line:0 < 1st-line:1 dynamic_debug:ddebug_exec_query: query parse failed Link: http://lkml.kernel.org/r/10a6a101-e2be-209f-1f41-54637824788e@infradead.org Signed-off-by: Randy Dunlap Acked-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dynamic_debug.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index da796e2dc4f506..c7c96bc7654af7 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -360,6 +360,10 @@ static int ddebug_parse_query(char *words[], int nwords, if (parse_lineno(last, &query->last_lineno) < 0) return -EINVAL; + /* special case for last lineno not specified */ + if (query->last_lineno == 0) + query->last_lineno = UINT_MAX; + if (query->last_lineno < query->first_lineno) { pr_err("last-line:%d < 1st-line:%d\n", query->last_lineno, From 8c188759fb5788579b5975d5a8c9f529e25c2171 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Nov 2017 15:27:39 -0800 Subject: [PATCH 25/94] dynamic_debug documentation: minor fixes Fix minor typo. Fix missing words in explaining parsing of last line number. Link: http://lkml.kernel.org/r/ebb7ff42-4945-103f-d5b4-f07a6f3343a7@infradead.org Signed-off-by: Randy Dunlap Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/dynamic-debug-howto.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 12278a926370a1..fdf72429f8019c 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -18,7 +18,7 @@ shortcut for ``print_hex_dump(KERN_DEBUG)``. For ``print_hex_dump_debug()``/``print_hex_dump_bytes()``, format string is its ``prefix_str`` argument, if it is constant string; or ``hexdump`` -in case ``prefix_str`` is build dynamically. +in case ``prefix_str`` is built dynamically. Dynamic debug has even more useful features: @@ -197,8 +197,8 @@ line line number matches the callsite line number exactly. A range of line numbers matches any callsite between the first and last line number inclusive. An empty first number means - the first line in the file, an empty line number means the - last number in the file. Examples:: + the first line in the file, an empty last line number means the + last line number in the file. Examples:: line 1603 // exactly line 1603 line 1600-1605 // the six lines from line 1600 to line 1605 From e1f7590488541845d16afd6003d31773b9155270 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Fri, 17 Nov 2017 15:27:42 -0800 Subject: [PATCH 26/94] get_maintainer: add --self-test for internal consistency tests Add "--self-test" option to get_maintainer.pl to show potential issues in MAINTAINERS file(s) content. Pattern check warnings are shown for "F" and "X" patterns found in MAINTAINERS file(s) which do not match any files known by git. Link: http://lkml.kernel.org/r/64994f911b3510d0f4c8ac2e113501dfcec1f3c9.1509559540.git.tom.saeger@oracle.com Signed-off-by: Tom Saeger Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 94 ++++++++++++++++++++++++++++++++------- 1 file changed, 77 insertions(+), 17 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index bc443201d3ef00..c68a5d1ba709e7 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -57,6 +57,7 @@ my $file_emails = 0; my $from_filename = 0; my $pattern_depth = 0; +my $self_test = 0; my $version = 0; my $help = 0; my $find_maintainer_files = 0; @@ -138,6 +139,7 @@ "subject_pattern" => "^GitSubject: (.*)", "stat_pattern" => "^(\\d+)\\t(\\d+)\\t\$file\$", "file_exists_cmd" => "git ls-files \$file", + "list_files_cmd" => "git ls-files \$file", ); my %VCS_cmds_hg = ( @@ -167,6 +169,7 @@ "subject_pattern" => "^HgSubject: (.*)", "stat_pattern" => "^(\\d+)\t(\\d+)\t\$file\$", "file_exists_cmd" => "hg files \$file", + "list_files_cmd" => "hg manifest -R \$file", ); my $conf = which_conf(".get_maintainer.conf"); @@ -216,6 +219,14 @@ close($ignore); } +if ($#ARGV > 0) { + foreach (@ARGV) { + if ($_ eq "-self-test" || $_ eq "--self-test") { + die "$P: using --self-test does not allow any other option or argument\n"; + } + } +} + if (!GetOptions( 'email!' => \$email, 'git!' => \$email_git, @@ -252,6 +263,7 @@ 'fe|file-emails!' => \$file_emails, 'f|file' => \$from_filename, 'find-maintainer-files' => \$find_maintainer_files, + 'self-test' => \$self_test, 'v|version' => \$version, 'h|help|usage' => \$help, )) { @@ -268,6 +280,12 @@ exit 0; } +if ($self_test) { + read_all_maintainer_files(); + check_maintainers_patterns(); + exit 0; +} + if (-t STDIN && !@ARGV) { # We're talking to a terminal, but have no command line arguments. die "$P: missing patchfile or -f file - use --help if necessary\n"; @@ -311,12 +329,14 @@ my @typevalue = (); my %keyword_hash; my @mfiles = (); +my @self_test_pattern_info = (); sub read_maintainer_file { my ($file) = @_; open (my $maint, '<', "$file") or die "$P: Can't open MAINTAINERS file '$file': $!\n"; + my $i = 1; while (<$maint>) { my $line = $_; @@ -333,6 +353,9 @@ sub read_maintainer_file { if ((-d $value)) { $value =~ s@([^/])$@$1/@; } + if ($self_test) { + push(@self_test_pattern_info, {file=>$file, line=>$line, linenr=>$i, pat=>$value}); + } } elsif ($type eq "K") { $keyword_hash{@typevalue} = $value; } @@ -341,6 +364,7 @@ sub read_maintainer_file { $line =~ s/\n$//g; push(@typevalue, $line); } + $i++; } close($maint); } @@ -357,26 +381,30 @@ sub find_ignore_git { return grep { $_ !~ /^\.git$/; } @_; } -if (-d "${lk_path}MAINTAINERS") { - opendir(DIR, "${lk_path}MAINTAINERS") or die $!; - my @files = readdir(DIR); - closedir(DIR); - foreach my $file (@files) { - push(@mfiles, "${lk_path}MAINTAINERS/$file") if ($file !~ /^\./); +read_all_maintainer_files(); + +sub read_all_maintainer_files { + if (-d "${lk_path}MAINTAINERS") { + opendir(DIR, "${lk_path}MAINTAINERS") or die $!; + my @files = readdir(DIR); + closedir(DIR); + foreach my $file (@files) { + push(@mfiles, "${lk_path}MAINTAINERS/$file") if ($file !~ /^\./); + } } -} -if ($find_maintainer_files) { - find( { wanted => \&find_is_maintainer_file, - preprocess => \&find_ignore_git, - no_chdir => 1, - }, "${lk_path}"); -} else { - push(@mfiles, "${lk_path}MAINTAINERS") if -f "${lk_path}MAINTAINERS"; -} + if ($find_maintainer_files) { + find( { wanted => \&find_is_maintainer_file, + preprocess => \&find_ignore_git, + no_chdir => 1, + }, "${lk_path}"); + } else { + push(@mfiles, "${lk_path}MAINTAINERS") if -f "${lk_path}MAINTAINERS"; + } -foreach my $file (@mfiles) { - read_maintainer_file("$file"); + foreach my $file (@mfiles) { + read_maintainer_file("$file"); + } } # @@ -586,6 +614,20 @@ sub read_mailmap { exit($exit); +sub check_maintainers_patterns { + my @lsfiles = (); + + @lsfiles = vcs_list_files($lk_path); + + for my $x (@self_test_pattern_info) { + if (!grep(m@^$x->{pat}@, @lsfiles)) { + my $line = $x->{line}; + chomp($line); + print("$x->{file}:$x->{linenr}: warning: no matches $line\n"); + } + } +} + sub ignore_email_address { my ($address) = @_; @@ -863,6 +905,7 @@ sub usage { --sections => print all of the subsystem sections with pattern matches --letters => print all matching 'letter' types from all matching sections --mailmap => use .mailmap file (default: $email_use_mailmap) + --self-test => show potential issues with MAINTAINERS file content --version => show version --help => show this help information @@ -2192,6 +2235,23 @@ sub vcs_file_exists { return $exists; } +sub vcs_list_files { + my ($file) = @_; + + my @lsfiles = (); + + my $vcs_used = vcs_exists(); + return 0 if (!$vcs_used); + + my $cmd = $VCS_cmds{"list_files_cmd"}; + $cmd =~ s/(\$\w+)/$1/eeg; # interpolate $cmd + @lsfiles = &{$VCS_cmds{"execute_cmd"}}($cmd); + + return () if ($? != 0); + + return @lsfiles; +} + sub uniq { my (@parms) = @_; From 083bf9c56d067559bd2727e1ec6ffb67d0ff53be Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:27:46 -0800 Subject: [PATCH 27/94] get_maintainer: add more --self-test options Add tests for duplicate section headers, missing section content, link and scm reachability. Miscellanea: o Add --self-test= options (a comma separated list of any of sections, patterns, links or scm) where the default without options is all tests o Rename check_maintainers_patterns to self_test o Rename self_test_pattern_info to self_test_info [tom.saeger@oracle.com: improvements] Link: http://lkml.kernel.org/r/13e3986c374902fcf08ae947e36c5c608bbe3b79.1510075301.git.joe@perches.com Signed-off-by: Joe Perches Reviewed-by: Tom Saeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 149 +++++++++++++++++++++++++++++++++----- 1 file changed, 132 insertions(+), 17 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index c68a5d1ba709e7..99c96e86eccb64 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -57,7 +57,7 @@ my $file_emails = 0; my $from_filename = 0; my $pattern_depth = 0; -my $self_test = 0; +my $self_test = undef; my $version = 0; my $help = 0; my $find_maintainer_files = 0; @@ -221,7 +221,7 @@ if ($#ARGV > 0) { foreach (@ARGV) { - if ($_ eq "-self-test" || $_ eq "--self-test") { + if ($_ =~ /^-{1,2}self-test(?:=|$)/) { die "$P: using --self-test does not allow any other option or argument\n"; } } @@ -263,7 +263,7 @@ 'fe|file-emails!' => \$file_emails, 'f|file' => \$from_filename, 'find-maintainer-files' => \$find_maintainer_files, - 'self-test' => \$self_test, + 'self-test:s' => \$self_test, 'v|version' => \$version, 'h|help|usage' => \$help, )) { @@ -280,9 +280,9 @@ exit 0; } -if ($self_test) { +if (defined $self_test) { read_all_maintainer_files(); - check_maintainers_patterns(); + self_test(); exit 0; } @@ -329,7 +329,7 @@ my @typevalue = (); my %keyword_hash; my @mfiles = (); -my @self_test_pattern_info = (); +my @self_test_info = (); sub read_maintainer_file { my ($file) = @_; @@ -339,6 +339,7 @@ sub read_maintainer_file { my $i = 1; while (<$maint>) { my $line = $_; + chomp $line; if ($line =~ m/^([A-Z]):\s*(.*)/) { my $type = $1; @@ -353,17 +354,16 @@ sub read_maintainer_file { if ((-d $value)) { $value =~ s@([^/])$@$1/@; } - if ($self_test) { - push(@self_test_pattern_info, {file=>$file, line=>$line, linenr=>$i, pat=>$value}); - } } elsif ($type eq "K") { $keyword_hash{@typevalue} = $value; } push(@typevalue, "$type:$value"); } elsif (!(/^\s*$/ || /^\s*\#/)) { - $line =~ s/\n$//g; push(@typevalue, $line); } + if (defined $self_test) { + push(@self_test_info, {file=>$file, linenr=>$i, line=>$line}); + } $i++; } close($maint); @@ -614,17 +614,132 @@ sub read_mailmap { exit($exit); -sub check_maintainers_patterns { +sub self_test { my @lsfiles = (); + my @good_links = (); + my @bad_links = (); + my @section_headers = (); + my $index = 0; @lsfiles = vcs_list_files($lk_path); - for my $x (@self_test_pattern_info) { - if (!grep(m@^$x->{pat}@, @lsfiles)) { - my $line = $x->{line}; - chomp($line); - print("$x->{file}:$x->{linenr}: warning: no matches $line\n"); - } + for my $x (@self_test_info) { + $index++; + + ## Section header duplication and missing section content + if (($self_test eq "" || $self_test =~ /\bsections\b/) && + $x->{line} =~ /^\S[^:]/ && + defined $self_test_info[$index] && + $self_test_info[$index]->{line} =~ /^([A-Z]):\s*\S/) { + my $has_S = 0; + my $has_F = 0; + my $has_ML = 0; + my $status = ""; + if (grep(m@^\Q$x->{line}\E@, @section_headers)) { + print("$x->{file}:$x->{linenr}: warning: duplicate section header\t$x->{line}\n"); + } else { + push(@section_headers, $x->{line}); + } + my $nextline = $index; + while (defined $self_test_info[$nextline] && + $self_test_info[$nextline]->{line} =~ /^([A-Z]):\s*(\S.*)/) { + my $type = $1; + my $value = $2; + if ($type eq "S") { + $has_S = 1; + $status = $value; + } elsif ($type eq "F" || $type eq "N") { + $has_F = 1; + } elsif ($type eq "M" || $type eq "R" || $type eq "L") { + $has_ML = 1; + } + $nextline++; + } + if (!$has_ML && $status !~ /orphan|obsolete/i) { + print("$x->{file}:$x->{linenr}: warning: section without email address\t$x->{line}\n"); + } + if (!$has_S) { + print("$x->{file}:$x->{linenr}: warning: section without status \t$x->{line}\n"); + } + if (!$has_F) { + print("$x->{file}:$x->{linenr}: warning: section without file pattern\t$x->{line}\n"); + } + } + + next if ($x->{line} !~ /^([A-Z]):\s*(.*)/); + + my $type = $1; + my $value = $2; + + ## Filename pattern matching + if (($type eq "F" || $type eq "X") && + ($self_test eq "" || $self_test =~ /\bpatterns\b/)) { + $value =~ s@\.@\\\.@g; ##Convert . to \. + $value =~ s/\*/\.\*/g; ##Convert * to .* + $value =~ s/\?/\./g; ##Convert ? to . + ##if pattern is a directory and it lacks a trailing slash, add one + if ((-d $value)) { + $value =~ s@([^/])$@$1/@; + } + if (!grep(m@^$value@, @lsfiles)) { + print("$x->{file}:$x->{linenr}: warning: no file matches\t$x->{line}\n"); + } + + ## Link reachability + } elsif (($type eq "W" || $type eq "Q" || $type eq "B") && + $value =~ /^https?:/ && + ($self_test eq "" || $self_test =~ /\blinks\b/)) { + next if (grep(m@^\Q$value\E$@, @good_links)); + my $isbad = 0; + if (grep(m@^\Q$value\E$@, @bad_links)) { + $isbad = 1; + } else { + my $output = `wget --spider -q --no-check-certificate --timeout 10 --tries 1 $value`; + if ($? == 0) { + push(@good_links, $value); + } else { + push(@bad_links, $value); + $isbad = 1; + } + } + if ($isbad) { + print("$x->{file}:$x->{linenr}: warning: possible bad link\t$x->{line}\n"); + } + + ## SCM reachability + } elsif ($type eq "T" && + ($self_test eq "" || $self_test =~ /\bscm\b/)) { + next if (grep(m@^\Q$value\E$@, @good_links)); + my $isbad = 0; + if (grep(m@^\Q$value\E$@, @bad_links)) { + $isbad = 1; + } elsif ($value !~ /^(?:git|quilt|hg)\s+\S/) { + print("$x->{file}:$x->{linenr}: warning: malformed entry\t$x->{line}\n"); + } elsif ($value =~ /^git\s+(\S+)(\s+([^\(]+\S+))?/) { + my $url = $1; + my $branch = ""; + $branch = $3 if $3; + my $output = `git ls-remote --exit-code -h "$url" $branch > /dev/null 2>&1`; + if ($? == 0) { + push(@good_links, $value); + } else { + push(@bad_links, $value); + $isbad = 1; + } + } elsif ($value =~ /^(?:quilt|hg)\s+(https?:\S+)/) { + my $url = $1; + my $output = `wget --spider -q --no-check-certificate --timeout 10 --tries 1 $url`; + if ($? == 0) { + push(@good_links, $value); + } else { + push(@bad_links, $value); + $isbad = 1; + } + } + if ($isbad) { + print("$x->{file}:$x->{linenr}: warning: possible bad link\t$x->{line}\n"); + } + } } } From 8001541cc333b1a3c2c38e3b34475fa446b053da Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2017 15:27:49 -0800 Subject: [PATCH 28/94] include/linux/bitfield.h: include instead of Since commit bc6245e5efd7 ("bug: split BUILD_BUG stuff out into "), #include is better to pull minimal headers needed for BUILG_BUG() family. Link: http://lkml.kernel.org/r/1505700775-19826-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Jakub Kicinski Cc: Dinan Gunawardena Cc: Kalle Valo Cc: Ian Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitfield.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index f2deb71958b2db..1030651f83098f 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -15,7 +15,7 @@ #ifndef _LINUX_BITFIELD_H #define _LINUX_BITFIELD_H -#include +#include /* * Bitfield access macros From f5bba9d11a256ad2a1c2f8e7fc6aabe6416b7890 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2017 15:27:53 -0800 Subject: [PATCH 29/94] include/linux/radix-tree.h: remove unneeded #include This include was added by commit 187f1882b5b0 ("BUG: headers with BUG/BUG_ON etc. need linux/bug.h") because BUG_ON() was used in this header at that time. Some time later, commit 6d75f366b924 ("lib: radix-tree: check accounting of existing slot replacement users") removed the use of BUG_ON() from this header. Since then, there is no reason to include . Link: http://lkml.kernel.org/r/1505660151-4383-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Matthew Wilcox Cc: Masahiro Yamada Cc: Jan Kara Cc: Johannes Weiner Cc: Chris Mi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 0ca448c1cb42f3..23a9c89c7ad962 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -22,7 +22,6 @@ #define _LINUX_RADIX_TREE_H #include -#include #include #include #include From d6b28e0996962aeadd3777ae565ae03dd5c59f18 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 17 Nov 2017 15:27:56 -0800 Subject: [PATCH 30/94] lib: add module support to string tests Extract the string test code into its own source file, to allow compiling it either to a loadable module, or built into the kernel. Fixes: 03270c13c5ffaa6a ("lib/string.c: add testcases for memset16/32/64") Link: http://lkml.kernel.org/r/1505397744-3387-1-git-send-email-geert@linux-m68k.org Signed-off-by: Geert Uytterhoeven Cc: Matthew Wilcox Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig | 2 +- lib/Makefile | 1 + lib/string.c | 141 ---------------------------------------------- lib/test_string.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 143 insertions(+), 142 deletions(-) create mode 100644 lib/test_string.c diff --git a/lib/Kconfig b/lib/Kconfig index a2b6745324ab35..368972f0db783d 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -584,7 +584,7 @@ config PRIME_NUMBERS tristate config STRING_SELFTEST - bool "Test string functions" + tristate "Test string functions" endmenu diff --git a/lib/Makefile b/lib/Makefile index 136a0b2545641d..e3cb2a2413380a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -40,6 +40,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \ once.o refcount.o usercopy.o errseq.o +obj-$(CONFIG_STRING_SELFTEST) += test_string.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += hexdump.o diff --git a/lib/string.c b/lib/string.c index 5e8d410a93df5e..64a9e33f1daae4 100644 --- a/lib/string.c +++ b/lib/string.c @@ -1052,144 +1052,3 @@ void fortify_panic(const char *name) BUG(); } EXPORT_SYMBOL(fortify_panic); - -#ifdef CONFIG_STRING_SELFTEST -#include -#include - -static __init int memset16_selftest(void) -{ - unsigned i, j, k; - u16 v, *p; - - p = kmalloc(256 * 2 * 2, GFP_KERNEL); - if (!p) - return -1; - - for (i = 0; i < 256; i++) { - for (j = 0; j < 256; j++) { - memset(p, 0xa1, 256 * 2 * sizeof(v)); - memset16(p + i, 0xb1b2, j); - for (k = 0; k < 512; k++) { - v = p[k]; - if (k < i) { - if (v != 0xa1a1) - goto fail; - } else if (k < i + j) { - if (v != 0xb1b2) - goto fail; - } else { - if (v != 0xa1a1) - goto fail; - } - } - } - } - -fail: - kfree(p); - if (i < 256) - return (i << 24) | (j << 16) | k; - return 0; -} - -static __init int memset32_selftest(void) -{ - unsigned i, j, k; - u32 v, *p; - - p = kmalloc(256 * 2 * 4, GFP_KERNEL); - if (!p) - return -1; - - for (i = 0; i < 256; i++) { - for (j = 0; j < 256; j++) { - memset(p, 0xa1, 256 * 2 * sizeof(v)); - memset32(p + i, 0xb1b2b3b4, j); - for (k = 0; k < 512; k++) { - v = p[k]; - if (k < i) { - if (v != 0xa1a1a1a1) - goto fail; - } else if (k < i + j) { - if (v != 0xb1b2b3b4) - goto fail; - } else { - if (v != 0xa1a1a1a1) - goto fail; - } - } - } - } - -fail: - kfree(p); - if (i < 256) - return (i << 24) | (j << 16) | k; - return 0; -} - -static __init int memset64_selftest(void) -{ - unsigned i, j, k; - u64 v, *p; - - p = kmalloc(256 * 2 * 8, GFP_KERNEL); - if (!p) - return -1; - - for (i = 0; i < 256; i++) { - for (j = 0; j < 256; j++) { - memset(p, 0xa1, 256 * 2 * sizeof(v)); - memset64(p + i, 0xb1b2b3b4b5b6b7b8ULL, j); - for (k = 0; k < 512; k++) { - v = p[k]; - if (k < i) { - if (v != 0xa1a1a1a1a1a1a1a1ULL) - goto fail; - } else if (k < i + j) { - if (v != 0xb1b2b3b4b5b6b7b8ULL) - goto fail; - } else { - if (v != 0xa1a1a1a1a1a1a1a1ULL) - goto fail; - } - } - } - } - -fail: - kfree(p); - if (i < 256) - return (i << 24) | (j << 16) | k; - return 0; -} - -static __init int string_selftest_init(void) -{ - int test, subtest; - - test = 1; - subtest = memset16_selftest(); - if (subtest) - goto fail; - - test = 2; - subtest = memset32_selftest(); - if (subtest) - goto fail; - - test = 3; - subtest = memset64_selftest(); - if (subtest) - goto fail; - - pr_info("String selftests succeeded\n"); - return 0; -fail: - pr_crit("String selftest failure %d.%08x\n", test, subtest); - return 0; -} - -module_init(string_selftest_init); -#endif /* CONFIG_STRING_SELFTEST */ diff --git a/lib/test_string.c b/lib/test_string.c new file mode 100644 index 00000000000000..0fcdb82dca8667 --- /dev/null +++ b/lib/test_string.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include + +static __init int memset16_selftest(void) +{ + unsigned i, j, k; + u16 v, *p; + + p = kmalloc(256 * 2 * 2, GFP_KERNEL); + if (!p) + return -1; + + for (i = 0; i < 256; i++) { + for (j = 0; j < 256; j++) { + memset(p, 0xa1, 256 * 2 * sizeof(v)); + memset16(p + i, 0xb1b2, j); + for (k = 0; k < 512; k++) { + v = p[k]; + if (k < i) { + if (v != 0xa1a1) + goto fail; + } else if (k < i + j) { + if (v != 0xb1b2) + goto fail; + } else { + if (v != 0xa1a1) + goto fail; + } + } + } + } + +fail: + kfree(p); + if (i < 256) + return (i << 24) | (j << 16) | k; + return 0; +} + +static __init int memset32_selftest(void) +{ + unsigned i, j, k; + u32 v, *p; + + p = kmalloc(256 * 2 * 4, GFP_KERNEL); + if (!p) + return -1; + + for (i = 0; i < 256; i++) { + for (j = 0; j < 256; j++) { + memset(p, 0xa1, 256 * 2 * sizeof(v)); + memset32(p + i, 0xb1b2b3b4, j); + for (k = 0; k < 512; k++) { + v = p[k]; + if (k < i) { + if (v != 0xa1a1a1a1) + goto fail; + } else if (k < i + j) { + if (v != 0xb1b2b3b4) + goto fail; + } else { + if (v != 0xa1a1a1a1) + goto fail; + } + } + } + } + +fail: + kfree(p); + if (i < 256) + return (i << 24) | (j << 16) | k; + return 0; +} + +static __init int memset64_selftest(void) +{ + unsigned i, j, k; + u64 v, *p; + + p = kmalloc(256 * 2 * 8, GFP_KERNEL); + if (!p) + return -1; + + for (i = 0; i < 256; i++) { + for (j = 0; j < 256; j++) { + memset(p, 0xa1, 256 * 2 * sizeof(v)); + memset64(p + i, 0xb1b2b3b4b5b6b7b8ULL, j); + for (k = 0; k < 512; k++) { + v = p[k]; + if (k < i) { + if (v != 0xa1a1a1a1a1a1a1a1ULL) + goto fail; + } else if (k < i + j) { + if (v != 0xb1b2b3b4b5b6b7b8ULL) + goto fail; + } else { + if (v != 0xa1a1a1a1a1a1a1a1ULL) + goto fail; + } + } + } + } + +fail: + kfree(p); + if (i < 256) + return (i << 24) | (j << 16) | k; + return 0; +} + +static __init int string_selftest_init(void) +{ + int test, subtest; + + test = 1; + subtest = memset16_selftest(); + if (subtest) + goto fail; + + test = 2; + subtest = memset32_selftest(); + if (subtest) + goto fail; + + test = 3; + subtest = memset64_selftest(); + if (subtest) + goto fail; + + pr_info("String selftests succeeded\n"); + return 0; +fail: + pr_crit("String selftest failure %d.%08x\n", test, subtest); + return 0; +} + +module_init(string_selftest_init); +MODULE_LICENSE("GPL v2"); From dc2bf000a2848cf1dee373db14ce2cd1fe3ee394 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 17 Nov 2017 15:28:00 -0800 Subject: [PATCH 31/94] lib/test: delete five error messages for failed memory allocations Omit extra messages for a memory allocation failure in these functions. This issue was detected by using the Coccinelle software. Link: http://lkml.kernel.org/r/410a4c5a-4ee0-6fcc-969c-103d8e496b78@users.sourceforge.net Signed-off-by: Markus Elfring Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 5 ++--- lib/test_kmod.c | 8 ++------ lib/test_list_sort.c | 9 +++------ 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index a25c9763fce19f..ef1a3ac1397e88 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -353,10 +353,9 @@ static noinline void __init memcg_accounted_kmem_cache(void) */ for (i = 0; i < 5; i++) { p = kmem_cache_alloc(cache, GFP_KERNEL); - if (!p) { - pr_err("Allocation failed\n"); + if (!p) goto free_cache; - } + kmem_cache_free(cache, p); msleep(100); } diff --git a/lib/test_kmod.c b/lib/test_kmod.c index fba78d25e82569..337f408b4de605 100644 --- a/lib/test_kmod.c +++ b/lib/test_kmod.c @@ -783,10 +783,8 @@ static int kmod_config_sync_info(struct kmod_test_device *test_dev) free_test_dev_info(test_dev); test_dev->info = vzalloc(config->num_threads * sizeof(struct kmod_test_device_info)); - if (!test_dev->info) { - dev_err(test_dev->dev, "Cannot alloc test_dev info\n"); + if (!test_dev->info) return -ENOMEM; - } return 0; } @@ -1089,10 +1087,8 @@ static struct kmod_test_device *alloc_test_dev_kmod(int idx) struct miscdevice *misc_dev; test_dev = vzalloc(sizeof(struct kmod_test_device)); - if (!test_dev) { - pr_err("Cannot alloc test_dev\n"); + if (!test_dev) goto err_out; - } mutex_init(&test_dev->config_mutex); mutex_init(&test_dev->trigger_mutex); diff --git a/lib/test_list_sort.c b/lib/test_list_sort.c index 28e817387b04f1..5474f3f3e41d07 100644 --- a/lib/test_list_sort.c +++ b/lib/test_list_sort.c @@ -76,17 +76,14 @@ static int __init list_sort_test(void) pr_debug("start testing list_sort()\n"); elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); - if (!elts) { - pr_err("error: cannot allocate memory\n"); + if (!elts) return err; - } for (i = 0; i < TEST_LIST_LEN; i++) { el = kmalloc(sizeof(*el), GFP_KERNEL); - if (!el) { - pr_err("error: cannot allocate memory\n"); + if (!el) goto exit; - } + /* force some equivalencies */ el->value = prandom_u32() % (TEST_LIST_LEN / 3); el->serial = i; From 3f3295709edea6268ff1609855f498035286af73 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Nov 2017 15:28:04 -0800 Subject: [PATCH 32/94] lib/int_sqrt: optimize small argument The current int_sqrt() computation is sub-optimal for the case of small @x. Which is the interesting case when we're going to do cumulative distribution functions on idle times, which we assume to be a random variable, where the target residency of the deepest idle state gives an upper bound on the variable (5e6ns on recent Intel chips). In the case of small @x, the compute loop: while (m != 0) { b = y + m; y >>= 1; if (x >= b) { x -= b; y += m; } m >>= 2; } can be reduced to: while (m > x) m >>= 2; Because y==0, b==m and until x>=m y will remain 0. And while this is computationally equivalent, it runs much faster because there's less code, in particular less branches. cycles: branches: branch-misses: OLD: hot: 45.109444 +- 0.044117 44.333392 +- 0.002254 0.018723 +- 0.000593 cold: 187.737379 +- 0.156678 44.333407 +- 0.002254 6.272844 +- 0.004305 PRE: hot: 67.937492 +- 0.064124 66.999535 +- 0.000488 0.066720 +- 0.001113 cold: 232.004379 +- 0.332811 66.999527 +- 0.000488 6.914634 +- 0.006568 POST: hot: 43.633557 +- 0.034373 45.333132 +- 0.002277 0.023529 +- 0.000681 cold: 207.438411 +- 0.125840 45.333132 +- 0.002277 6.976486 +- 0.004219 Averages computed over all values <128k using a LFSR to generate order. Cold numbers have a LFSR based branch trace buffer 'confuser' ran between each int_sqrt() invocation. Link: http://lkml.kernel.org/r/20171020164644.876503355@infradead.org Fixes: 30493cc9dddb ("lib/int_sqrt.c: optimize square root algorithm") Signed-off-by: Peter Zijlstra (Intel) Suggested-by: Anshul Garg Acked-by: Linus Torvalds Cc: Davidlohr Bueso Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Will Deacon Cc: Joe Perches Cc: David Miller Cc: Matthew Wilcox Cc: Kees Cook Cc: Michael Davidson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/int_sqrt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index db0b5aa071fc14..036c96781ea850 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -23,6 +23,9 @@ unsigned long int_sqrt(unsigned long x) return x; m = 1UL << (BITS_PER_LONG - 2); + while (m > x) + m >>= 2; + while (m != 0) { b = y + m; y >>= 1; From f8ae107eef209bff29a5816bc1aad40d5cd69a80 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Nov 2017 15:28:08 -0800 Subject: [PATCH 33/94] lib/int_sqrt: optimize initial value compute The initial value (@m) compute is: m = 1UL << (BITS_PER_LONG - 2); while (m > x) m >>= 2; Which is a linear search for the highest even bit smaller or equal to @x We can implement this using a binary search using __fls() (or better when its hardware implemented). m = 1UL << (__fls(x) & ~1UL); Especially for small values of @x; which are the more common arguments when doing a CDF on idle times; the linear search is near to worst case, while the binary search of __fls() is a constant 6 (or 5 on 32bit) branches. cycles: branches: branch-misses: PRE: hot: 43.633557 +- 0.034373 45.333132 +- 0.002277 0.023529 +- 0.000681 cold: 207.438411 +- 0.125840 45.333132 +- 0.002277 6.976486 +- 0.004219 SOFTWARE FLS: hot: 29.576176 +- 0.028850 26.666730 +- 0.004511 0.019463 +- 0.000663 cold: 165.947136 +- 0.188406 26.666746 +- 0.004511 6.133897 +- 0.004386 HARDWARE FLS: hot: 24.720922 +- 0.025161 20.666784 +- 0.004509 0.020836 +- 0.000677 cold: 132.777197 +- 0.127471 20.666776 +- 0.004509 5.080285 +- 0.003874 Averages computed over all values <128k using a LFSR to generate order. Cold numbers have a LFSR based branch trace buffer 'confuser' ran between each int_sqrt() invocation. Link: http://lkml.kernel.org/r/20171020164644.936577234@infradead.org Signed-off-by: Peter Zijlstra (Intel) Suggested-by: Joe Perches Acked-by: Will Deacon Acked-by: Linus Torvalds Cc: Anshul Garg Cc: Davidlohr Bueso Cc: David Miller Cc: Ingo Molnar Cc: Kees Cook Cc: Matthew Wilcox Cc: Michael Davidson Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/int_sqrt.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index 036c96781ea850..67bb300b5b4673 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -8,6 +8,7 @@ #include #include +#include /** * int_sqrt - rough approximation to sqrt @@ -22,10 +23,7 @@ unsigned long int_sqrt(unsigned long x) if (x <= 1) return x; - m = 1UL << (BITS_PER_LONG - 2); - while (m > x) - m >>= 2; - + m = 1UL << (__fls(x) & ~1UL); while (m != 0) { b = y + m; y >>= 1; From e813a614007e3a8a7b53d976e86d9a20f21f81ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Nov 2017 15:28:12 -0800 Subject: [PATCH 34/94] lib/int_sqrt: adjust comments Our current int_sqrt() is not rough nor any approximation; it calculates the exact value of: floor(sqrt()). Document this. Link: http://lkml.kernel.org/r/20171020164645.001652117@infradead.org Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Anshul Garg Cc: Davidlohr Bueso Cc: David Miller Cc: Ingo Molnar Cc: Joe Perches Cc: Kees Cook Cc: Matthew Wilcox Cc: Michael Davidson Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/int_sqrt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index 67bb300b5b4673..e2d329099bf748 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -11,10 +11,10 @@ #include /** - * int_sqrt - rough approximation to sqrt + * int_sqrt - computes the integer square root * @x: integer of which to calculate the sqrt * - * A very rough approximation to the sqrt() function. + * Computes: floor(sqrt(x)) */ unsigned long int_sqrt(unsigned long x) { From 36a3d1dd4e16bcd0d2ddfb4a2ec7092f0ae0d931 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Fri, 17 Nov 2017 15:28:16 -0800 Subject: [PATCH 35/94] lib/genalloc.c: make the avail variable an atomic_long_t If the amount of resources allocated to a gen_pool exceeds 2^32 then the avail atomic overflows and this causes problems when clients try and borrow resources from the pool. This is only expected to be an issue on 64 bit systems. Add the header to pull in atomic_long* operations. So that 32 bit systems continue to use atomic32_t but 64 bit systems can use atomic64_t. Link: http://lkml.kernel.org/r/1509033843-25667-1-git-send-email-sbates@raithlin.com Signed-off-by: Stephen Bates Reviewed-by: Logan Gunthorpe Reviewed-by: Mathieu Desnoyers Reviewed-by: Daniel Mentz Cc: Jonathan Corbet Cc: Andrew Morton Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 3 ++- lib/genalloc.c | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 6dfec4d638df3e..872f930f1b06d4 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -32,6 +32,7 @@ #include #include +#include struct device; struct device_node; @@ -71,7 +72,7 @@ struct gen_pool { */ struct gen_pool_chunk { struct list_head next_chunk; /* next chunk in pool */ - atomic_t avail; + atomic_long_t avail; phys_addr_t phys_addr; /* physical starting address of memory chunk */ unsigned long start_addr; /* start address of memory chunk */ unsigned long end_addr; /* end address of memory chunk (inclusive) */ diff --git a/lib/genalloc.c b/lib/genalloc.c index 144fe6b1a03ea5..ca06adc4f44510 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -194,7 +194,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy chunk->phys_addr = phys; chunk->start_addr = virt; chunk->end_addr = virt + size - 1; - atomic_set(&chunk->avail, size); + atomic_long_set(&chunk->avail, size); spin_lock(&pool->lock); list_add_rcu(&chunk->next_chunk, &pool->chunks); @@ -304,7 +304,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, nbits = (size + (1UL << order) - 1) >> order; rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { - if (size > atomic_read(&chunk->avail)) + if (size > atomic_long_read(&chunk->avail)) continue; start_bit = 0; @@ -324,7 +324,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, addr = chunk->start_addr + ((unsigned long)start_bit << order); size = nbits << order; - atomic_sub(size, &chunk->avail); + atomic_long_sub(size, &chunk->avail); break; } rcu_read_unlock(); @@ -390,7 +390,7 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); BUG_ON(remain); size = nbits << order; - atomic_add(size, &chunk->avail); + atomic_long_add(size, &chunk->avail); rcu_read_unlock(); return; } @@ -464,7 +464,7 @@ size_t gen_pool_avail(struct gen_pool *pool) rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) - avail += atomic_read(&chunk->avail); + avail += atomic_long_read(&chunk->avail); rcu_read_unlock(); return avail; } From 2f9b7e08cb27d6d8d4579bb5301fb0940ff63d19 Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Fri, 17 Nov 2017 15:28:20 -0800 Subject: [PATCH 36/94] lib/nmi_backtrace.c: fix kernel text address leak Don't leak idle function address in NMI backtrace. Link: http://lkml.kernel.org/r/20171106165648.GA95243@sofia Signed-off-by: Liu Changcheng Reviewed-by: Petr Mladek Reviewed-by: Josh Poimboeuf Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/nmi_backtrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 46e4c749e4eba6..61a6b5aab07e75 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -93,8 +93,8 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { arch_spin_lock(&lock); if (regs && cpu_in_idle(instruction_pointer(regs))) { - pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n", - cpu, instruction_pointer(regs)); + pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", + cpu, (void *)instruction_pointer(regs)); } else { pr_warn("NMI backtrace for cpu %d\n", cpu); if (regs) From e4795e3bb7d7b3b3d066cea57fb459f869500284 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 17 Nov 2017 15:28:23 -0800 Subject: [PATCH 37/94] tools/lib/traceevent/parse-filter.c: clean up clang build warning The uniform structure filter_arg sets its union based on the difference of enum filter_arg_type, However, some functions use implicit type conversion obviously. warning: implicit conversion from enumeration type 'enum filter_exp_type' to different enumeration type 'enum filter_op_type' warning: implicit conversion from enumeration type 'enum filter_cmp_type' to different enumeration type 'enum filter_exp_type' Link: http://lkml.kernel.org/r/1509938415-113825-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Cheng Jian Cc: Kate Stewart Cc: Xie XiuQi Cc: Li Bin Cc: Steven Rostedt (Red Hat) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/lib/traceevent/parse-filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index 7c214ceb93860a..315df0a7026529 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -436,13 +436,13 @@ create_arg_exp(enum filter_exp_type etype) return NULL; arg->type = FILTER_ARG_EXP; - arg->op.type = etype; + arg->exp.type = etype; return arg; } static struct filter_arg * -create_arg_cmp(enum filter_exp_type etype) +create_arg_cmp(enum filter_cmp_type ctype) { struct filter_arg *arg; @@ -452,7 +452,7 @@ create_arg_cmp(enum filter_exp_type etype) /* Use NUM and change if necessary */ arg->type = FILTER_ARG_NUM; - arg->op.type = etype; + arg->num.type = ctype; return arg; } From 0b548e33e6cb2bff240fdaf1783783be15c29080 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:28:27 -0800 Subject: [PATCH 38/94] lib/rbtree-test: lower default params Fengguang reported soft lockups while running the rbtree and interval tree test modules. The logic for these tests all occur in init phase, and we currently are pounding with the default values for number of nodes and number of iterations of each test. Reduce the latter by two orders of magnitude. This does not influence the value of the tests in that one thousand times by default is enough to get the picture. Link: http://lkml.kernel.org/r/20171109161715.xai2dtwqw2frhkcm@linux-n805 Signed-off-by: Davidlohr Bueso Reported-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/interval_tree_test.c | 4 ++-- lib/rbtree_test.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c index 0e343fd2957019..835242e74aaa33 100644 --- a/lib/interval_tree_test.c +++ b/lib/interval_tree_test.c @@ -11,10 +11,10 @@ MODULE_PARM_DESC(name, msg); __param(int, nnodes, 100, "Number of nodes in the interval tree"); -__param(int, perf_loops, 100000, "Number of iterations modifying the tree"); +__param(int, perf_loops, 1000, "Number of iterations modifying the tree"); __param(int, nsearches, 100, "Number of searches to the interval tree"); -__param(int, search_loops, 10000, "Number of iterations searching the tree"); +__param(int, search_loops, 1000, "Number of iterations searching the tree"); __param(bool, search_all, false, "Searches will iterate all nodes in the tree"); __param(uint, max_endpoint, ~0, "Largest value for the interval's endpoint"); diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 191a238e5a9d73..7d36c1e27ff655 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -11,7 +11,7 @@ MODULE_PARM_DESC(name, msg); __param(int, nnodes, 100, "Number of nodes in the rb-tree"); -__param(int, perf_loops, 100000, "Number of iterations modifying the rb-tree"); +__param(int, perf_loops, 1000, "Number of iterations modifying the rb-tree"); __param(int, check_loops, 100, "Number of iterations modifying and verifying the rb-tree"); struct test_node { From 4441fca0a27f5f0e2c652584ae9d7abec6255c1f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 17 Nov 2017 15:28:31 -0800 Subject: [PATCH 39/94] lib: test module for find_*_bit() functions find_bit functions are widely used in the kernel, including hot paths. This module tests performance of those functions in 2 typical scenarios: randomly filled bitmap with relatively equal distribution of set and cleared bits, and sparse bitmap which has 1 set bit for 500 cleared bits. On ThunderX machine: Start testing find_bit() with random-filled bitmap find_next_bit: 240043 cycles, 164062 iterations find_next_zero_bit: 312848 cycles, 163619 iterations find_last_bit: 193748 cycles, 164062 iterations find_first_bit: 177720874 cycles, 164062 iterations Start testing find_bit() with sparse bitmap find_next_bit: 3633 cycles, 656 iterations find_next_zero_bit: 620399 cycles, 327025 iterations find_last_bit: 3038 cycles, 656 iterations find_first_bit: 691407 cycles, 656 iterations [arnd@arndb.de: use correct format string for find-bit tests] Link: http://lkml.kernel.org/r/20171113135605.3166307-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20171109140714.13168-1-ynorov@caviumnetworks.com Signed-off-by: Yury Norov Signed-off-by: Arnd Bergmann Reviewed-by: Clement Courbet Cc: Alexey Dobriyan Cc: Matthew Wilcox Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 9 +++ lib/Makefile | 1 + lib/test_find_bit.c | 144 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+) create mode 100644 lib/test_find_bit.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8ffd891857aba7..ce813731269f6c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1850,6 +1850,15 @@ config TEST_BPF If unsure, say N. +config TEST_FIND_BIT + tristate "Test find_bit functions" + default n + help + This builds the "test_find_bit" module that measure find_*_bit() + functions performance. + + If unsure, say N. + config TEST_FIRMWARE tristate "Test firmware loading via userspace interface" default n diff --git a/lib/Makefile b/lib/Makefile index e3cb2a2413380a..d11c48ec8ffdbd 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -47,6 +47,7 @@ obj-y += hexdump.o obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_BPF) += test_bpf.o +obj-$(CONFIG_TEST_FIND_BIT) += test_find_bit.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o diff --git a/lib/test_find_bit.c b/lib/test_find_bit.c new file mode 100644 index 00000000000000..f4394a36f9aa2b --- /dev/null +++ b/lib/test_find_bit.c @@ -0,0 +1,144 @@ +/* + * Test for find_*_bit functions. + * + * Copyright (c) 2017 Cavium. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* + * find_bit functions are widely used in kernel, so the successful boot + * is good enough test for correctness. + * + * This test is focused on performance of traversing bitmaps. Two typical + * scenarios are reproduced: + * - randomly filled bitmap with approximately equal number of set and + * cleared bits; + * - sparse bitmap with few set bits at random positions. + */ + +#include +#include +#include +#include +#include +#include + +#define BITMAP_LEN (4096UL * 8 * 10) +#define SPARSE 500 + +static DECLARE_BITMAP(bitmap, BITMAP_LEN) __initdata; + +/* + * This is Schlemiel the Painter's algorithm. It should be called after + * all other tests for the same bitmap because it sets all bits of bitmap to 1. + */ +static int __init test_find_first_bit(void *bitmap, unsigned long len) +{ + unsigned long i, cnt; + cycles_t cycles; + + cycles = get_cycles(); + for (cnt = i = 0; i < len; cnt++) { + i = find_first_bit(bitmap, len); + __clear_bit(i, bitmap); + } + cycles = get_cycles() - cycles; + pr_err("find_first_bit:\t\t%llu cycles,\t%ld iterations\n", + (u64)cycles, cnt); + + return 0; +} + +static int __init test_find_next_bit(const void *bitmap, unsigned long len) +{ + unsigned long i, cnt; + cycles_t cycles; + + cycles = get_cycles(); + for (cnt = i = 0; i < BITMAP_LEN; cnt++) + i = find_next_bit(bitmap, BITMAP_LEN, i) + 1; + cycles = get_cycles() - cycles; + pr_err("find_next_bit:\t\t%llu cycles,\t%ld iterations\n", + (u64)cycles, cnt); + + return 0; +} + +static int __init test_find_next_zero_bit(const void *bitmap, unsigned long len) +{ + unsigned long i, cnt; + cycles_t cycles; + + cycles = get_cycles(); + for (cnt = i = 0; i < BITMAP_LEN; cnt++) + i = find_next_zero_bit(bitmap, len, i) + 1; + cycles = get_cycles() - cycles; + pr_err("find_next_zero_bit:\t%llu cycles,\t%ld iterations\n", + (u64)cycles, cnt); + + return 0; +} + +static int __init test_find_last_bit(const void *bitmap, unsigned long len) +{ + unsigned long l, cnt = 0; + cycles_t cycles; + + cycles = get_cycles(); + do { + cnt++; + l = find_last_bit(bitmap, len); + if (l >= len) + break; + len = l; + } while (len); + cycles = get_cycles() - cycles; + pr_err("find_last_bit:\t\t%llu cycles,\t%ld iterations\n", + (u64)cycles, cnt); + + return 0; +} + +static int __init find_bit_test(void) +{ + unsigned long nbits = BITMAP_LEN / SPARSE; + + pr_err("\nStart testing find_bit() with random-filled bitmap\n"); + + get_random_bytes(bitmap, sizeof(bitmap)); + + test_find_next_bit(bitmap, BITMAP_LEN); + test_find_next_zero_bit(bitmap, BITMAP_LEN); + test_find_last_bit(bitmap, BITMAP_LEN); + test_find_first_bit(bitmap, BITMAP_LEN); + + pr_err("\nStart testing find_bit() with sparse bitmap\n"); + + bitmap_zero(bitmap, BITMAP_LEN); + + while (nbits--) + __set_bit(prandom_u32() % BITMAP_LEN, bitmap); + + test_find_next_bit(bitmap, BITMAP_LEN); + test_find_next_zero_bit(bitmap, BITMAP_LEN); + test_find_last_bit(bitmap, BITMAP_LEN); + test_find_first_bit(bitmap, BITMAP_LEN); + + return 0; +} +module_init(find_bit_test); + +static void __exit test_find_bit_cleanup(void) +{ +} +module_exit(test_find_bit_cleanup); + +MODULE_LICENSE("GPL"); From 25bdda2bd68ae3008759f26058f6f9b9bb2b1cc5 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Fri, 17 Nov 2017 15:28:34 -0800 Subject: [PATCH 40/94] checkpatch: support function pointers for unnamed function definition arguments Current unnamed function definition argument does not include function pointer cases and it reports something like: WARNING: function definition argument 'void' should also have an identifier name +unsigned int (*dummy)(void); Support function pointers for unnamed function arguments Link: http://lkml.kernel.org/r/1505389925-31087-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8b80bac055e490..aacbe918027b0b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5957,7 +5957,7 @@ sub process { # check for function declarations that have arguments without identifier names if (defined $stat && - $stat =~ /^.\s*(?:extern\s+)?$Type\s*$Ident\s*\(\s*([^{]+)\s*\)\s*;/s && + $stat =~ /^.\s*(?:extern\s+)?$Type\s*(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*\(\s*([^{]+)\s*\)\s*;/s && $1 ne "void") { my $args = trim($1); while ($args =~ m/\s*($Type\s*(?:$Ident|\(\s*\*\s*$Ident?\s*\)\s*$balanced_parens)?)/g) { From 258f79d5a1e49271f5aff38e6c1baeeaad0d82aa Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Fri, 17 Nov 2017 15:28:38 -0800 Subject: [PATCH 41/94] scripts/checkpatch.pl: avoid false warning missing break void foo(int a) switch (a) { case 'h': fun1(); exit(1); default: } creates a warning "Possible switch case/default not preceded by break or fallthrough comment". exit( should be treated like return. Link: http://lkml.kernel.org/r/20170910154618.25819-1-xypron.glpk@gmx.de Signed-off-by: Heinrich Schuchardt Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index aacbe918027b0b..8dce8a8d9ed0ee 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6109,7 +6109,7 @@ sub process { next if ($fline =~ /^.[\s$;]*$/); $has_statement = 1; $count++; - $has_break = 1 if ($fline =~ /\bswitch\b|\b(?:break\s*;[\s$;]*$|return\b|goto\b|continue\b)/); + $has_break = 1 if ($fline =~ /\bswitch\b|\b(?:break\s*;[\s$;]*$|exit\s*\(\b|return\b|goto\b|continue\b)/); } if (!$has_break && $has_statement) { WARN("MISSING_BREAK", From eeef5733e30e736926c762fa3336c4dd5702bcdf Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:28:41 -0800 Subject: [PATCH 42/94] checkpatch: printks always need a KERN_ There was code in checkpatch that allowed continuation printks to be used without KERN_CONT. Remove the continuation check and always require a KERN_. Link: http://lkml.kernel.org/r/61980ef41d5b9b6543da1c49055042e0ab74d308.1507047008.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8dce8a8d9ed0ee..2a8c6c3c1bdb4d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3829,28 +3829,10 @@ sub process { "Prefer printk_ratelimited or pr__ratelimited to printk_ratelimit\n" . $herecurr); } -# printk should use KERN_* levels. Note that follow on printk's on the -# same line do not need a level, so we use the current block context -# to try and find and validate the current printk. In summary the current -# printk includes all preceding printk's which have no newline on the end. -# we assume the first bad printk is the one to report. - if ($line =~ /\bprintk\((?!KERN_)\s*"/) { - my $ok = 0; - for (my $ln = $linenr - 1; $ln >= $first_line; $ln--) { - #print "CHECK<$lines[$ln - 1]\n"; - # we have a preceding printk if it ends - # with "\n" ignore it, else it is to blame - if ($lines[$ln - 1] =~ m{\bprintk\(}) { - if ($rawlines[$ln - 1] !~ m{\\n"}) { - $ok = 1; - } - last; - } - } - if ($ok == 0) { - WARN("PRINTK_WITHOUT_KERN_LEVEL", - "printk() should include KERN_ facility level\n" . $herecurr); - } +# printk should use KERN_* levels + if ($line =~ /\bprintk\s*\(\s*(?!KERN_[A-Z]+\b)/) { + WARN("PRINTK_WITHOUT_KERN_LEVEL", + "printk() should include KERN_ facility level\n" . $herecurr); } if ($line =~ /\bprintk\s*\(\s*KERN_([A-Z]+)/) { From cc147506bef96f31af799afe94d0c0e81161ee6c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:28:44 -0800 Subject: [PATCH 43/94] checkpatch: allow DEFINE_PER_CPU definitions to exceed line length Some of the definitions are very long and can't be split into multiple lines because ctags is limited. Exempt these lines from the line length checks. See commit 25528213fe9f ("tags: Fix DEFINE_PER_CPU expansions") for more details. Link: http://lkml.kernel.org/r/1508170320.6530.15.camel@perches.com Signed-off-by: Joe Perches Acked-by: Mark Rutland Cc: Daniel Lezcano Cc: Marc Zyngier Cc: Thomas Gleixner Cc: Ard Biesheuvel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2a8c6c3c1bdb4d..5fa0f5467d9936 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2900,8 +2900,9 @@ sub process { $line =~ /^\+\s*#\s*define\s+\w+\s+$String$/) { $msg_type = ""; - # EFI_GUID is another special case - } elsif ($line =~ /^\+.*\bEFI_GUID\s*\(/) { + # More special cases + } elsif ($line =~ /^\+.*\bEFI_GUID\s*\(/ || + $line =~ /^\+\s*(?:\w+)?\s*DEFINE_PER_CPU/) { $msg_type = ""; # Otherwise set the alternate message types From 87bd499af5cd663c150032cca4bac822a729263b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:28:48 -0800 Subject: [PATCH 44/94] checkpatch: add TP_printk to list of logging functions So the line length check can be bypassed by its callers. Link: http://lkml.kernel.org/r/7de542c08a6e79f2ebe7c1416c9f403c23fdcc09.1508282823.git.joe@perches.com Signed-off-by: Joe Perches Reported-by: Song Liu Tested-by: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5fa0f5467d9936..6bdd43d5dec55f 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -454,6 +454,7 @@ sub hash_show_words { our $logFunctions = qr{(?x: printk(?:_ratelimited|_once|_deferred_once|_deferred|)| (?:[a-z0-9]+_){1,2}(?:printk|emerg|alert|crit|err|warning|warn|notice|info|debug|dbg|vdbg|devel|cont|WARN)(?:_ratelimited|_once|)| + TP_printk| WARN(?:_RATELIMIT|_ONCE|)| panic| MODULE_[A-Z_]+| From 5751a24edfd43a91e072d63cde2b99b5a421645f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 17 Nov 2017 15:28:52 -0800 Subject: [PATCH 45/94] checkpatch: add --strict test for lines ending in [ or ( Lines that end in an open bracket or open parenthesis are generally hard to follow. Lines following those ending with open parenthesis are also rarely aligned to that open parenthesis. Suggest not ending lines with '[' or '(' Link: http://lkml.kernel.org/r/8fd0b2b4a7482064254e37931eb9302a81d5aa2f.1508340786.git.joe@perches.com Signed-off-by: Joe Perches Suggested-by: Vivien Didelot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 6bdd43d5dec55f..3453df9f90ab40 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3184,6 +3184,12 @@ sub process { # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); +# check for unusual line ending [ or ( + if ($line =~ /^\+.*([\[\(])\s*$/) { + CHK("OPEN_ENDED_LINE", + "Lines should not end with a '$1'\n" . $herecurr); + } + # check if this appears to be the start function declaration, save the name if ($sline =~ /^\+\{\s*$/ && $prevline =~ /^\+(?:(?:(?:$Storage|$Inline)\s*)*\s*$Type\s*)?($Ident)\(/) { From 0bc989ffc802752c5256192b4a9c8f16a00feca7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2017 15:28:55 -0800 Subject: [PATCH 46/94] checkpatch: do not check missing blank line before builtin_*_driver checkpatch.pl does not check missing blank line before module_*_driver. I want it to behave likewise for builtin_*_driver. Link: http://lkml.kernel.org/r/1505700081-12854-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3453df9f90ab40..95cda3ecc66b85 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3105,6 +3105,7 @@ sub process { $line =~ /^\+[a-z_]*init/ || $line =~ /^\+\s*(?:static\s+)?[A-Z_]*ATTR/ || $line =~ /^\+\s*DECLARE/ || + $line =~ /^\+\s*builtin_[\w_]*driver/ || $line =~ /^\+\s*__setup/)) { if (CHK("LINE_SPACING", "Please use a blank line after function/struct/union/enum declarations\n" . $hereprev) && From 2ae928a9441a3b5f13952e1e8a97d03cb23ea603 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 17 Nov 2017 15:28:59 -0800 Subject: [PATCH 47/94] epoll: account epitem and eppoll_entry to kmemcg A userspace application can directly trigger the allocations from eventpoll_epi and eventpoll_pwq slabs. A buggy or malicious application can consume a significant amount of system memory by triggering such allocations. Indeed we have seen in production where a buggy application was leaking the epoll references and causing a burst of eventpoll_epi and eventpoll_pwq slab allocations. This patch opt-in the charging of eventpoll_epi and eventpoll_pwq slabs. There is a per-user limit (~4% of total memory if no highmem) on these caches. I think it is too generous particularly in the scenario where jobs of multiple users are running on the system and the administrator is reducing cost by overcomitting the memory. This is unaccounted kernel memory and will not be considered by the oom-killer. I think by accounting it to kmemcg, for systems with kmem accounting enabled, we can provide better isolation between jobs of different users. Link: http://lkml.kernel.org/r/20171003021519.23907-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Alexander Viro Cc: Vladimir Davydov Cc: Johannes Weiner Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea76..a45360444895a5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2329,11 +2329,11 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", - sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); + sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } From 57a173bdf5baab48e8e78825c7366c634acd087c Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Nov 2017 15:29:02 -0800 Subject: [PATCH 48/94] epoll: avoid calling ep_call_nested() from ep_poll_safewake() ep_poll_safewake() is used to wakeup potentially nested epoll file descriptors. The function uses ep_call_nested() to prevent entering the same wake up queue more than once, and to prevent excessively deep wakeup paths (deeper than EP_MAX_NESTS). However, this is not necessary since we are already preventing these conditions during EPOLL_CTL_ADD. This saves extra function calls, and avoids taking a global lock during the ep_call_nested() calls. I have, however, left ep_call_nested() for the CONFIG_DEBUG_LOCK_ALLOC case, since ep_call_nested() keeps track of the nesting level, and this is required by the call to spin_lock_irqsave_nested(). It would be nice to remove the ep_call_nested() calls for the CONFIG_DEBUG_LOCK_ALLOC case as well, however its not clear how to simply pass the nesting level through multiple wake_up() levels without more surgery. In any case, I don't think CONFIG_DEBUG_LOCK_ALLOC is generally used for production. This patch, also apparently fixes a workload at Google that Salman Qazi reported by completely removing the poll_safewake_ncalls->lock from wakeup paths. Link: http://lkml.kernel.org/r/1507920533-8812-1-git-send-email-jbaron@akamai.com Signed-off-by: Jason Baron Acked-by: Davidlohr Bueso Cc: Alexander Viro Cc: Salman Qazi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 47 ++++++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a45360444895a5..dc15bb02ee2a86 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -276,9 +276,6 @@ static DEFINE_MUTEX(epmutex); /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; -/* Used for safe wake up implementation */ -static struct nested_calls poll_safewake_ncalls; - /* Used to call file's f_op->poll() under the nested calls boundaries */ static struct nested_calls poll_readywalk_ncalls; @@ -551,40 +548,21 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) + +static struct nested_calls poll_safewake_ncalls; + +static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) { unsigned long flags; + wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; - spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); - wake_up_locked_poll(wqueue, events); + spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); + wake_up_locked_poll(wqueue, POLLIN); spin_unlock_irqrestore(&wqueue->lock, flags); -} -#else -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) -{ - wake_up_poll(wqueue, events); -} -#endif -static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) -{ - ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, - 1 + call_nests); return 0; } -/* - * Perform a safe wake up of the poll wait list. The problem is that - * with the new callback'd wake up system, it is possible that the - * poll callback is reentered from inside the call to wake_up() done - * on the poll wait queue head. The rule is that we cannot reenter the - * wake up code from the same task more than EP_MAX_NESTS times, - * and we cannot reenter the same wait queue head at all. This will - * enable to have a hierarchy of epoll file descriptor of no more than - * EP_MAX_NESTS deep. - */ static void ep_poll_safewake(wait_queue_head_t *wq) { int this_cpu = get_cpu(); @@ -595,6 +573,15 @@ static void ep_poll_safewake(wait_queue_head_t *wq) put_cpu(); } +#else + +static void ep_poll_safewake(wait_queue_head_t *wq) +{ + wake_up_poll(wq, POLLIN); +} + +#endif + static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; @@ -2315,8 +2302,10 @@ static int __init eventpoll_init(void) */ ep_nested_calls_init(&poll_loop_ncalls); +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); +#endif /* Initialize the structure used to perform file's f_op->poll() calls */ ep_nested_calls_init(&poll_readywalk_ncalls); From 37b5e5212a448bac0fe29d2a51f088014fbaaa41 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Nov 2017 15:29:06 -0800 Subject: [PATCH 49/94] epoll: remove ep_call_nested() from ep_eventpoll_poll() The use of ep_call_nested() in ep_eventpoll_poll(), which is the .poll routine for an epoll fd, is used to prevent excessively deep epoll nesting, and to prevent circular paths. However, we are already preventing these conditions during EPOLL_CTL_ADD. In terms of too deep epoll chains, we do in fact allow deep nesting of the epoll fds themselves (deeper than EP_MAX_NESTS), however we don't allow more than EP_MAX_NESTS when an epoll file descriptor is actually connected to a wakeup source. Thus, we do not require the use of ep_call_nested(), since ep_eventpoll_poll(), which is called via ep_scan_ready_list() only continues nesting if there are events available. Since ep_call_nested() is implemented using a global lock, applications that make use of nested epoll can see large performance improvements with this change. Davidlohr said: : Improvements are quite obscene actually, such as for the following : epoll_wait() benchmark with 2 level nesting on a 80 core IvyBridge: : : ncpus vanilla dirty delta : 1 2447092 3028315 +23.75% : 4 231265 2986954 +1191.57% : 8 121631 2898796 +2283.27% : 16 59749 2902056 +4757.07% : 32 26837 2326314 +8568.30% : 64 12926 1341281 +10276.61% : : (http://linux-scalability.org/epoll/epoll-test.c) Link: http://lkml.kernel.org/r/1509430214-5599-1-git-send-email-jbaron@akamai.com Signed-off-by: Jason Baron Cc: Davidlohr Bueso Cc: Alexander Viro Cc: Salman Qazi Cc: Hou Tao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 80 ++++++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 45 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index dc15bb02ee2a86..1e048144f17c40 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -276,9 +276,6 @@ static DEFINE_MUTEX(epmutex); /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; -/* Used to call file's f_op->poll() under the nested calls boundaries */ -static struct nested_calls poll_readywalk_ncalls; - /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -867,11 +864,33 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } -static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv); +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt); + +/* + * Differs from ep_eventpoll_poll() in that internal callers already have + * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() + * is correctly annotated. + */ +static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth) { + struct eventpoll *ep; + bool locked; + pt->_key = epi->event.events; + if (!is_file_epoll(epi->ffd.file)) + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & + epi->event.events; - return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; + ep = epi->ffd.file->private_data; + poll_wait(epi->ffd.file, &ep->poll_wait, pt); + locked = pt && (pt->_qproc == ep_ptable_queue_proc); + + return ep_scan_ready_list(epi->ffd.file->private_data, + ep_read_events_proc, &depth, depth, + locked) & epi->event.events; } static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, @@ -879,13 +898,15 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, { struct epitem *epi, *tmp; poll_table pt; + int depth = *(int *)priv; init_poll_funcptr(&pt, NULL); + depth++; list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (ep_item_poll(epi, &pt)) + if (ep_item_poll(epi, &pt, depth)) { return POLLIN | POLLRDNORM; - else { + } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as @@ -899,48 +920,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, return 0; } -static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, - poll_table *pt); - -struct readyevents_arg { - struct eventpoll *ep; - bool locked; -}; - -static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) -{ - struct readyevents_arg *arg = priv; - - return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL, - call_nests + 1, arg->locked); -} - static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { - int pollflags; struct eventpoll *ep = file->private_data; - struct readyevents_arg arg; - - /* - * During ep_insert() we already hold the ep->mtx for the tfile. - * Prevent re-aquisition. - */ - arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc); - arg.ep = ep; + int depth = 0; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside - * the ready list. This need to be done under ep_call_nested() - * supervision, since the call to f_op->poll() done on listed files - * could re-enter here. + * the ready list. */ - pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, - ep_poll_readyevents_proc, &arg, ep, current); - - return pollflags != -1 ? pollflags : 0; + return ep_scan_ready_list(ep, ep_read_events_proc, + &depth, depth, false); } #ifdef CONFIG_PROC_FS @@ -1459,7 +1452,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * this operation completes, the poll callback can start hitting * the new item. */ - revents = ep_item_poll(epi, &epq.pt); + revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue @@ -1593,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the item is "hot" and it is not registered inside the ready @@ -1661,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, list_del_init(&epi->rdllink); - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the event mask intersect the caller-requested one, @@ -2307,9 +2300,6 @@ static int __init eventpoll_init(void) ep_nested_calls_init(&poll_safewake_ncalls); #endif - /* Initialize the structure used to perform file's f_op->poll() calls */ - ep_nested_calls_init(&poll_readywalk_ncalls); - /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs From e4f02fdabd1092065ddd8366fcb3c8483e627fc0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2017 15:29:10 -0800 Subject: [PATCH 50/94] init/version.c: include instead of init/version.c has nothing to do with modules, so remove the . Instead, include for EXPORT_SYMBOL_GPL. This cuts off a lot of unnecessary header parsing. Link: http://lkml.kernel.org/r/1505920984-8523-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/version.c b/init/version.c index 5606341e9efd55..bfb4e3f4955e0c 100644 --- a/init/version.c +++ b/init/version.c @@ -7,7 +7,7 @@ */ #include -#include +#include #include #include #include From ecc0c469f27765ed1e2b967be0aa17cee1a60b76 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 17 Nov 2017 15:29:13 -0800 Subject: [PATCH 51/94] autofs: don't fail mount for transient error Currently if the autofs kernel module gets an error when writing to the pipe which links to the daemon, then it marks the whole moutpoint as catatonic, and it will stop working. It is possible that the error is transient. This can happen if the daemon is slow and more than 16 requests queue up. If a subsequent process tries to queue a request, and is then signalled, the write to the pipe will return -ERESTARTSYS and autofs will take that as total failure. So change the code to assess -ERESTARTSYS and -ENOMEM as transient failures which only abort the current request, not the whole mountpoint. It isn't a crash or a data corruption, but having autofs mountpoints suddenly stop working is rather inconvenient. Ian said: : And given the problems with a half dozen (or so) user space applications : consuming large amounts of CPU under heavy mount and umount activity this : could happen more easily than we expect. Link: http://lkml.kernel.org/r/87y3norvgp.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Acked-by: Ian Kent Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 4ac49d038bf38f..8fc41705c7cd50 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -81,7 +81,8 @@ static int autofs4_write(struct autofs_sb_info *sbi, spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - return (bytes > 0); + /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ + return bytes == 0 ? 0 : wr < 0 ? wr : -EIO; } static void autofs4_notify_daemon(struct autofs_sb_info *sbi, @@ -95,6 +96,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } pkt; struct file *pipe = NULL; size_t pktsz; + int ret; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", (unsigned long) wq->wait_queue_token, @@ -169,7 +171,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); if (autofs4_write(sbi, pipe, &pkt, pktsz)) + switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + case 0: + break; + case -ENOMEM: + case -ERESTARTSYS: + /* Just fail this one */ + autofs4_wait_release(sbi, wq->wait_queue_token, ret); + break; + default: autofs4_catatonic_mode(sbi); + break; + } fput(pipe); } From 98159d977f71c3b3dee898d1c34e56f520b094e7 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:17 -0800 Subject: [PATCH 52/94] pipe: match pipe_max_size data type with procfs Patch series "A few round_pipe_size() and pipe-max-size fixups", v3. While backporting Michael's "pipe: fix limit handling" patchset to a distro-kernel, Mikulas noticed that current upstream pipe limit handling contains a few problems: 1 - procfs signed wrap: echo'ing a large number into /proc/sys/fs/pipe-max-size and then cat'ing it back out shows a negative value. 2 - round_pipe_size() nr_pages overflow on 32bit: this would subsequently try roundup_pow_of_two(0), which is undefined. 3 - visible non-rounded pipe-max-size value: there is no mutual exclusion or protection between the time pipe_max_size is assigned a raw value from proc_dointvec_minmax() and when it is rounded. 4 - unsigned long -> unsigned int conversion makes for potential odd return errors from do_proc_douintvec_minmax_conv() and do_proc_dopipe_max_size_conv(). This version underwent the same testing as v1: https://marc.info/?l=linux-kernel&m=150643571406022&w=2 This patch (of 4): pipe_max_size is defined as an unsigned int: unsigned int pipe_max_size = 1048576; but its procfs/sysctl representation is an integer: static struct ctl_table fs_table[] = { ... { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, ... that is signed: int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { ... ret = proc_dointvec_minmax(table, write, buf, lenp, ppos) This leads to signed results via procfs for large values of pipe_max_size: % echo 2147483647 >/proc/sys/fs/pipe-max-size % cat /proc/sys/fs/pipe-max-size -2147483648 Use unsigned operations on this variable to avoid such negative values. Link: http://lkml.kernel.org/r/1507658689-11669-2-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Al Viro Cc: Jens Axboe Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 2 +- kernel/sysctl.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 349c9d56d4b34a..3909c55ed389bb 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1125,7 +1125,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, { int ret; - ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + ret = proc_douintvec_minmax(table, write, buf, lenp, ppos); if (ret < 0 || !write) return ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4a13a389e99b24..2d42183b4c9898 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1816,7 +1816,7 @@ static struct ctl_table fs_table[] = { { .procname = "pipe-max-size", .data = &pipe_max_size, - .maxlen = sizeof(int), + .maxlen = sizeof(pipe_max_size), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, From d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:21 -0800 Subject: [PATCH 53/94] pipe: avoid round_pipe_size() nr_pages overflow on 32-bit round_pipe_size() contains a right-bit-shift expression which may overflow, which would cause undefined results in a subsequent roundup_pow_of_two() call. static inline unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } PAGE_SIZE is defined as (1UL << PAGE_SHIFT), so: - 4 bytes wide on 32-bit (0 to 0xffffffff) - 8 bytes wide on 64-bit (0 to 0xffffffffffffffff) That means that 32-bit round_pipe_size(), nr_pages may overflow to 0: size=0x00000000 nr_pages=0x0 size=0x00000001 nr_pages=0x1 size=0xfffff000 nr_pages=0xfffff size=0xfffff001 nr_pages=0x0 << ! size=0xffffffff nr_pages=0x0 << ! This is bad because roundup_pow_of_two(n) is undefined when n == 0! 64-bit is not a problem as the unsigned int size is 4 bytes wide (similar to 32-bit) and the larger, 8 byte wide unsigned long, is sufficient to handle the largest value of the bit shift expression: size=0xffffffff nr_pages=100000 Modify round_pipe_size() to return 0 if n == 0 and updates its callers to handle accordingly. Link: http://lkml.kernel.org/r/1507658689-11669-3-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 3909c55ed389bb..f0f4ab36c444d4 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1018,13 +1018,19 @@ const struct file_operations pipefifo_fops = { /* * Currently we rely on the pipe array holding a power-of-2 number - * of pages. + * of pages. Returns 0 on error. */ static inline unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; + if (size < pipe_min_size) + size = pipe_min_size; + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages == 0) + return 0; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } @@ -1040,6 +1046,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) long ret = 0; size = round_pipe_size(arg); + if (size == 0) + return -EINVAL; nr_pages = size >> PAGE_SHIFT; if (!nr_pages) @@ -1123,13 +1131,18 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { + unsigned int rounded_pipe_max_size; int ret; ret = proc_douintvec_minmax(table, write, buf, lenp, ppos); if (ret < 0 || !write) return ret; - pipe_max_size = round_pipe_size(pipe_max_size); + rounded_pipe_max_size = round_pipe_size(pipe_max_size); + if (rounded_pipe_max_size == 0) + return -EINVAL; + + pipe_max_size = rounded_pipe_max_size; return ret; } From 7a8d181949fb2c16be00f8cdb354794a30e46b39 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:24 -0800 Subject: [PATCH 54/94] pipe: add proc_dopipe_max_size() to safely assign pipe_max_size pipe_max_size is assigned directly via procfs sysctl: static struct ctl_table fs_table[] = { ... { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, ... int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { ... ret = proc_dointvec_minmax(table, write, buf, lenp, ppos) ... and then later rounded in-place a few statements later: ... pipe_max_size = round_pipe_size(pipe_max_size); ... This leaves a window of time between initial assignment and rounding that may be visible to other threads. (For example, one thread sets a non-rounded value to pipe_max_size while another reads its value.) Similar reads of pipe_max_size are potentially racy: pipe.c :: alloc_pipe_info() pipe.c :: pipe_set_size() Add a new proc_dopipe_max_size() that consolidates reading the new value from the user buffer, verifying bounds, and calling round_pipe_size() with a single assignment to pipe_max_size. Link: http://lkml.kernel.org/r/1507658689-11669-4-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 18 +++----------- include/linux/pipe_fs_i.h | 1 + include/linux/sysctl.h | 3 +++ kernel/sysctl.c | 49 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 15 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index f0f4ab36c444d4..6d98566201ef5c 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1020,7 +1020,7 @@ const struct file_operations pipefifo_fops = { * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ -static inline unsigned int round_pipe_size(unsigned int size) +unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; @@ -1125,25 +1125,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) } /* - * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size * will return an error. */ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { - unsigned int rounded_pipe_max_size; - int ret; - - ret = proc_douintvec_minmax(table, write, buf, lenp, ppos); - if (ret < 0 || !write) - return ret; - - rounded_pipe_max_size = round_pipe_size(pipe_max_size); - if (rounded_pipe_max_size == 0) - return -EINVAL; - - pipe_max_size = rounded_pipe_max_size; - return ret; + return proc_dopipe_max_size(table, write, buf, lenp, ppos); } /* diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 6a80cfc63e0cbf..2dc5e9870fcd7c 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -191,5 +191,6 @@ long pipe_fcntl(struct file *, unsigned int, unsigned long arg); struct pipe_inode_info *get_pipe_info(struct file *file); int create_pipe_files(struct file **, int); +unsigned int round_pipe_size(unsigned int size); #endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b769ecfcc3bd41..992bc9948232bd 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -51,6 +51,9 @@ extern int proc_dointvec_minmax(struct ctl_table *, int, extern int proc_douintvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d42183b4c9898..138b6484f277fa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -2620,6 +2621,47 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, do_proc_douintvec_minmax_conv, ¶m); } +struct do_proc_dopipe_max_size_conv_param { + unsigned int *min; +}; + +static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_dopipe_max_size_conv_param *param = data; + + if (write) { + unsigned int val = round_pipe_size(*lvalp); + + if (val == 0) + return -EINVAL; + + if (param->min && *param->min > val) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dopipe_max_size_conv_param param = { + .min = (unsigned int *) table->extra1, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, ¶m); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3125,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3168,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); +EXPORT_SYMBOL_GPL(proc_dopipe_max_size); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); From fb910c42ccebf853c29296185c45c11164a56098 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:28 -0800 Subject: [PATCH 55/94] sysctl: check for UINT_MAX before unsigned int min/max Mikulas noticed in the existing do_proc_douintvec_minmax_conv() and do_proc_dopipe_max_size_conv() introduced in this patchset, that they inconsistently handle overflow and min/max range inputs: For example: 0 ... param->min - 1 ---> ERANGE param->min ... param->max ---> the value is accepted param->max + 1 ... 0x100000000L + param->min - 1 ---> ERANGE 0x100000000L + param->min ... 0x100000000L + param->max ---> EINVAL 0x100000000L + param->max + 1, 0x200000000L + param->min - 1 ---> ERANGE 0x200000000L + param->min ... 0x200000000L + param->max ---> EINVAL 0x200000000L + param->max + 1, 0x300000000L + param->min - 1 ---> ERANGE In do_proc_do*() routines which store values into unsigned int variables (4 bytes wide for 64-bit builds), first validate that the input unsigned long value (8 bytes wide for 64-bit builds) will fit inside the smaller unsigned int variable. Then check that the unsigned int value falls inside the specified parameter min, max range. Otherwise the unsigned long -> unsigned int conversion drops leading bits from the input value, leading to the inconsistent pattern Mikulas documented above. Link: http://lkml.kernel.org/r/1507658689-11669-5-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 138b6484f277fa..dd25d90896fc83 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2576,12 +2576,13 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, if (write) { unsigned int val = *lvalp; + if (*lvalp > UINT_MAX) + return -EINVAL; + if ((param->min && *param->min > val) || (param->max && *param->max < val)) return -ERANGE; - if (*lvalp > UINT_MAX) - return -EINVAL; *valp = val; } else { unsigned int val = *valp; @@ -2632,17 +2633,18 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, struct do_proc_dopipe_max_size_conv_param *param = data; if (write) { - unsigned int val = round_pipe_size(*lvalp); + unsigned int val; + if (*lvalp > UINT_MAX) + return -EINVAL; + + val = round_pipe_size(*lvalp); if (val == 0) return -EINVAL; if (param->min && *param->min > val) return -ERANGE; - if (*lvalp > UINT_MAX) - return -EINVAL; - *valp = val; } else { unsigned int val = *valp; From 7554e9c4cfa208acf3164a86c05aaa967b043425 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:29:32 -0800 Subject: [PATCH 56/94] fs/nilfs2: convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. This requires adding a pointer to hold the timer's target task, as the lifetime of sc_task doesn't appear to match the timer's task. Link: http://lkml.kernel.org/r/20171016235900.GA102729@beast Signed-off-by: Kees Cook Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 11 +++++------ fs/nilfs2/segment.h | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f65392fecb5c7f..472f0b53a7247b 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2400,11 +2400,11 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) return err; } -static void nilfs_construction_timeout(unsigned long data) +static void nilfs_construction_timeout(struct timer_list *t) { - struct task_struct *p = (struct task_struct *)data; + struct nilfs_sc_info *sci = from_timer(sci, t, sc_timer); - wake_up_process(p); + wake_up_process(sci->sc_timer_task); } static void @@ -2542,8 +2542,7 @@ static int nilfs_segctor_thread(void *arg) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int timeout = 0; - sci->sc_timer.data = (unsigned long)current; - sci->sc_timer.function = nilfs_construction_timeout; + sci->sc_timer_task = current; /* start sync. */ sci->sc_task = current; @@ -2674,7 +2673,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_iput_queue); INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); - init_timer(&sci->sc_timer); + timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 1060949d7dd2a6..84084a4d9b3e83 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -180,6 +180,7 @@ struct nilfs_sc_info { unsigned long sc_watermark; struct timer_list sc_timer; + struct task_struct *sc_timer_task; struct task_struct *sc_task; }; From 31ccb1f7ba3cfe29631587d451cf5bb8ab593550 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Fri, 17 Nov 2017 15:29:35 -0800 Subject: [PATCH 57/94] nilfs2: fix race condition that causes file system corruption There is a race condition between nilfs_dirty_inode() and nilfs_set_file_dirty(). When a file is opened, nilfs_dirty_inode() is called to update the access timestamp in the inode. It calls __nilfs_mark_inode_dirty() in a separate transaction. __nilfs_mark_inode_dirty() caches the ifile buffer_head in the i_bh field of the inode info structure and marks it as dirty. After some data was written to the file in another transaction, the function nilfs_set_file_dirty() is called, which adds the inode to the ns_dirty_files list. Then the segment construction calls nilfs_segctor_collect_dirty_files(), which goes through the ns_dirty_files list and checks the i_bh field. If there is a cached buffer_head in i_bh it is not marked as dirty again. Since nilfs_dirty_inode() and nilfs_set_file_dirty() use separate transactions, it is possible that a segment construction that writes out the ifile occurs in-between the two. If this happens the inode is not on the ns_dirty_files list, but its ifile block is still marked as dirty and written out. In the next segment construction, the data for the file is written out and nilfs_bmap_propagate() updates the b-tree. Eventually the bmap root is written into the i_bh block, which is not dirty, because it was written out in another segment construction. As a result the bmap update can be lost, which leads to file system corruption. Either the virtual block address points to an unallocated DAT block, or the DAT entry will be reused for something different. The error can remain undetected for a long time. A typical error message would be one of the "bad btree" errors or a warning that a DAT entry could not be found. This bug can be reproduced reliably by a simple benchmark that creates and overwrites millions of 4k files. Link: http://lkml.kernel.org/r/1509367935-3086-2-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Tested-by: Andreas Rohner Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 472f0b53a7247b..f572538dcc4f53 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1954,8 +1954,6 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, err, ii->vfs_inode.i_ino); return err; } - mark_buffer_dirty(ibh); - nilfs_mdt_mark_dirty(ifile); spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) ii->i_bh = ibh; @@ -1964,6 +1962,10 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, goto retry; } + // Always redirty the buffer to avoid race condition + mark_buffer_dirty(ii->i_bh); + nilfs_mdt_mark_dirty(ifile); + clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); From d4f0284a5969fd7809ec8df710eb10598b701638 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Nov 2017 15:29:39 -0800 Subject: [PATCH 58/94] fs, nilfs: convert nilfs_root.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable nilfs_root.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. Link: http://lkml.kernel.org/r/1509367935-3086-3-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Elena Reshetova Signed-off-by: Ryusuke Konishi Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/the_nilfs.c | 8 ++++---- fs/nilfs2/the_nilfs.h | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2dd75bf619ad0e..afebb5067cec83 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -737,7 +737,7 @@ struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { n = n->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); return root; } @@ -776,7 +776,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { p = &(*p)->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); kfree(new); return root; @@ -786,7 +786,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) new->cno = cno; new->ifile = NULL; new->nilfs = nilfs; - atomic_set(&new->count, 1); + refcount_set(&new->count, 1); atomic64_set(&new->inodes_count, 0); atomic64_set(&new->blocks_count, 0); @@ -806,7 +806,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) void nilfs_put_root(struct nilfs_root *root) { - if (atomic_dec_and_test(&root->count)) { + if (refcount_dec_and_test(&root->count)) { struct the_nilfs *nilfs = root->nilfs; nilfs_sysfs_delete_snapshot_group(root); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index b305c6f033e7c4..883d732b025950 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -27,6 +27,7 @@ #include #include #include +#include struct nilfs_sc_info; struct nilfs_sysfs_dev_subgroups; @@ -246,7 +247,7 @@ struct nilfs_root { __u64 cno; struct rb_node rb_node; - atomic_t count; + refcount_t count; struct the_nilfs *nilfs; struct inode *ifile; @@ -299,7 +300,7 @@ void nilfs_swap_super_block(struct the_nilfs *); static inline void nilfs_get_root(struct nilfs_root *root) { - atomic_inc(&root->count); + refcount_inc(&root->count); } static inline int nilfs_valid_fs(struct the_nilfs *nilfs) From 4d685f930a53632ff6b86efe43b95637006371fe Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 17 Nov 2017 15:29:43 -0800 Subject: [PATCH 59/94] nilfs2: align block comments of nilfs_sufile_truncate_range() at * Fix the following checkpatch warning: WARNING: Block comments should align the * on each line #633: FILE: sufile.c:633: +/** + * nilfs_sufile_truncate_range - truncate range of segment array Link: http://lkml.kernel.org/r/1509367935-3086-4-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/sufile.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1541a1e9221a5c..1341a41e7b43ae 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -630,22 +630,22 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, } /** - * nilfs_sufile_truncate_range - truncate range of segment array - * @sufile: inode of segment usage file - * @start: start segment number (inclusive) - * @end: end segment number (inclusive) - * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid number of segments specified - * - * %-EBUSY - Dirty or active segments are present in the range - */ + * nilfs_sufile_truncate_range - truncate range of segment array + * @sufile: inode of segment usage file + * @start: start segment number (inclusive) + * @end: end segment number (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid number of segments specified + * + * %-EBUSY - Dirty or active segments are present in the range + */ static int nilfs_sufile_truncate_range(struct inode *sufile, __u64 start, __u64 end) { From 3147db8938c7968b7be07f9b87510e334fe42ce1 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 17 Nov 2017 15:29:46 -0800 Subject: [PATCH 60/94] nilfs2: use octal for unreadable permission macro Replace S_IRWXUGO with 0777 because symbolic permissions are considered harmful: https://lwn.net/Articles/696229/ Link: http://lkml.kernel.org/r/1509367935-3086-5-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 515d13c196daf8..1a2894aa019425 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -150,7 +150,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, if (err) return err; - inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + inode = nilfs_new_inode(dir, S_IFLNK | 0777); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out; From 577753cc57b19949b7ce0fc848c669d37e448c20 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 17 Nov 2017 15:29:50 -0800 Subject: [PATCH 61/94] nilfs2: remove inode->i_version initialization It's never used in nilfs2. Link: http://lkml.kernel.org/r/1510064486-1728-2-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Jeff Layton Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4fc018dfcfae35..3ce20cd44a2096 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -160,7 +160,6 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_bh = NULL; ii->i_state = 0; ii->i_cno = 0; - ii->vfs_inode.i_version = 1; nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); return &ii->vfs_inode; } From 15ec37185ec66b8e199188bf8df3e7baf50ef77d Mon Sep 17 00:00:00 2001 From: Christos Gkekas Date: Fri, 17 Nov 2017 15:29:54 -0800 Subject: [PATCH 62/94] hfs/hfsplus: clean up unused variables in bnode.c Delete variables 'tree' and 'sb', which are set but never used. Link: http://lkml.kernel.org/r/1507977146-15875-1-git-send-email-chris.gekas@gmail.com Signed-off-by: Christos Gkekas Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/bnode.c | 4 ---- fs/hfsplus/bnode.c | 4 ---- 2 files changed, 8 deletions(-) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 8aec5e732abf94..b63a4df7327b6d 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -98,13 +98,11 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page *src_page, *dst_page; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page[0]; @@ -237,7 +235,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block *sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -249,7 +246,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index d77015c3f22c22..177fae4e658143 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -127,14 +127,12 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page **src_page, **dst_page; int l; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page + (src >> PAGE_SHIFT); @@ -401,7 +399,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block *sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -414,7 +411,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); From eecd7f4f5b9c2021dbde0a361b365f5970db52aa Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 Nov 2017 15:29:57 -0800 Subject: [PATCH 63/94] fat: remove redundant assignment of 0 to slots The variable slots is being assigned a value of zero that is never read, slots is being updated again a few lines later. Remove this redundant assignment. Cleans clang warning: Value stored to 'slots' is never read Link: http://lkml.kernel.org/r/20171017140258.22536-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 81cecbe6d7cf6b..b833ffeee1e144 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -291,7 +291,6 @@ static int fat_parse_long(struct inode *dir, loff_t *pos, } } parse_long: - slots = 0; ds = (struct msdos_dir_slot *)*de; id = ds->id; if (!(id & 0x40)) From 628c1bcba204052d19b686b5bac149a644cdb72e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:01 -0800 Subject: [PATCH 64/94] kernel/signal.c: protect the traced SIGNAL_UNKILLABLE tasks from SIGKILL The comment in sig_ignored() says "Tracers may want to know about even ignored signals" but SIGKILL can not be reported to debugger and it is just wrong to return 0 in this case: SIGKILL should only kill the SIGNAL_UNKILLABLE task if it comes from the parent ns. Change sig_ignored() to ignore ->ptrace if sig == SIGKILL and rely on sig_task_ignored(). SISGTOP coming from within the namespace is not really right too but at least debugger can intercept it, and we can't drop it here because this will break "gdb -p 1": ptrace_attach() won't work. Perhaps we will add another ->ptrace check later, we will see. Link: http://lkml.kernel.org/r/20171103184206.GB21036@redhat.com Signed-off-by: Oleg Nesterov Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index aa1fb9f905dbc5..be591313474231 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -94,13 +94,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, force)) - return 0; - /* - * Tracers may want to know about even ignored signals. + * Tracers may want to know about even ignored signal unless it + * is SIGKILL which can't be reported anyway but can be ignored + * by SIGNAL_UNKILLABLE task. */ - return !t->ptrace; + if (t->ptrace && sig != SIGKILL) + return 0; + + return sig_task_ignored(t, sig, force); } /* From ac25385089f673560867eb5179228a44ade0cfc1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:04 -0800 Subject: [PATCH 65/94] kernel/signal.c: protect the SIGNAL_UNKILLABLE tasks from !sig_kernel_only() signals Change sig_task_ignored() to drop the SIG_DFL && !sig_kernel_only() signals even if force == T. This simplifies the next change and this matches the same check in get_signal() which will drop these signals anyway. Link: http://lkml.kernel.org/r/20171103184227.GC21036@redhat.com Signed-off-by: Oleg Nesterov Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index be591313474231..01ba166a5e3ac1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -78,7 +78,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !force) + handler == SIG_DFL && !(force && sig_kernel_only(sig))) return 1; return sig_handler_ignored(handler, sig); From 426915796ccaf9c2bd9bb06dc5702225957bc2e5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:08 -0800 Subject: [PATCH 66/94] kernel/signal.c: remove the no longer needed SIGNAL_UNKILLABLE check in complete_signal() complete_signal() checks SIGNAL_UNKILLABLE before it starts to destroy the thread group, today this is wrong in many ways. If nothing else, fatal_signal_pending() should always imply that the whole thread group (except ->group_exit_task if it is not NULL) is killed, this check breaks the rule. After the previous changes we can rely on sig_task_ignored(); sig_fatal(sig) && SIGNAL_UNKILLABLE can only be true if we actually want to kill this task and sig == SIGKILL OR it is traced and debugger can intercept the signal. This should hopefully fix the problem reported by Dmitry. This test-case static int init(void *arg) { for (;;) pause(); } int main(void) { char stack[16 * 1024]; for (;;) { int pid = clone(init, stack + sizeof(stack)/2, CLONE_NEWPID | SIGCHLD, NULL); assert(pid > 0); assert(ptrace(PTRACE_ATTACH, pid, 0, 0) == 0); assert(waitpid(-1, NULL, WSTOPPED) == pid); assert(ptrace(PTRACE_DETACH, pid, 0, SIGSTOP) == 0); assert(syscall(__NR_tkill, pid, SIGKILL) == 0); assert(pid == wait(NULL)); } } triggers the WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)) in task_participate_group_stop(). do_signal_stop()->signal_group_exit() checks SIGNAL_GROUP_EXIT and return false, but task_set_jobctl_pending() checks fatal_signal_pending() and does not set JOBCTL_STOP_PENDING. And his should fix the minor security problem reported by Kyle, SECCOMP_RET_TRACE can miss fatal_signal_pending() the same way if the task is the root of a pid namespace. Link: http://lkml.kernel.org/r/20171103184246.GD21036@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Dmitry Vyukov Reported-by: Kyle Huey Reviewed-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 01ba166a5e3ac1..6895f6bb98a769 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -931,9 +931,9 @@ static void complete_signal(int sig, struct task_struct *p, int group) * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && - !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && + !(signal->flags & SIGNAL_GROUP_EXIT) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !t->ptrace)) { + (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group. */ From de40ccefd1f19180c0a43e4d9b9d2f4dc8856c8b Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 17 Nov 2017 15:30:12 -0800 Subject: [PATCH 67/94] kdump: print a message in case parse_crashkernel_mem resulted in zero bytes parse_crashkernel_mem() silently returns if we get zero bytes in the parsing function. It is useful for debugging to add a message, especially if the kernel cannot boot correctly. Add a pr_info instead of pr_warn because it is expected behavior for size = 0, eg. crashkernel=2G-4G:128M, size will be 0 in case system memory is less than 2G. Link: http://lkml.kernel.org/r/20171114080129.GA6115@dhcp-128-65.nay.redhat.com Signed-off-by: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Bhupesh Sharma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 6db80fc0810b92..b3663896278ed7 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -108,7 +108,8 @@ static int __init parse_crashkernel_mem(char *cmdline, return -EINVAL; } } - } + } else + pr_info("crashkernel size resulted in zero bytes\n"); return 0; } From c1b1418a66928bbae3d3ff057ae0110d5b122c11 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Fri, 17 Nov 2017 15:30:15 -0800 Subject: [PATCH 68/94] rapidio: constify rio_device_id rio_device_id are not supposed to change at runtime. rio driver is working with const 'id_table'. So mark the non-const rio_device_id structs as const. Link: http://lkml.kernel.org/r/1503734627-6058-2-git-send-email-arvind.yadav.cs@gmail.com Link: http://lkml.kernel.org/r/1503734627-6058-3-git-send-email-arvind.yadav.cs@gmail.com Link: http://lkml.kernel.org/r/1503734627-6058-4-git-send-email-arvind.yadav.cs@gmail.com Link: http://lkml.kernel.org/r/1503734627-6058-5-git-send-email-arvind.yadav.cs@gmail.com Link: http://lkml.kernel.org/r/1503734627-6058-6-git-send-email-arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Acked-by: Alexandre Bounine Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/switches/idt_gen2.c | 2 +- drivers/rapidio/switches/idt_gen3.c | 2 +- drivers/rapidio/switches/idtcps.c | 2 +- drivers/rapidio/switches/tsi568.c | 2 +- drivers/rapidio/switches/tsi57x.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/rapidio/switches/idt_gen2.c b/drivers/rapidio/switches/idt_gen2.c index e67b923b1ca6a7..4931ed79042847 100644 --- a/drivers/rapidio/switches/idt_gen2.c +++ b/drivers/rapidio/switches/idt_gen2.c @@ -458,7 +458,7 @@ static void idtg2_remove(struct rio_dev *rdev) idtg2_sysfs(rdev, false); } -static struct rio_device_id idtg2_id_table[] = { +static const struct rio_device_id idtg2_id_table[] = { {RIO_DEVICE(RIO_DID_IDTCPS1848, RIO_VID_IDT)}, {RIO_DEVICE(RIO_DID_IDTCPS1616, RIO_VID_IDT)}, {RIO_DEVICE(RIO_DID_IDTVPS1616, RIO_VID_IDT)}, diff --git a/drivers/rapidio/switches/idt_gen3.c b/drivers/rapidio/switches/idt_gen3.c index c5923a547bedb6..85a3908294d94f 100644 --- a/drivers/rapidio/switches/idt_gen3.c +++ b/drivers/rapidio/switches/idt_gen3.c @@ -348,7 +348,7 @@ static void idtg3_shutdown(struct rio_dev *rdev) } } -static struct rio_device_id idtg3_id_table[] = { +static const struct rio_device_id idtg3_id_table[] = { {RIO_DEVICE(RIO_DID_IDTRXS1632, RIO_VID_IDT)}, {RIO_DEVICE(RIO_DID_IDTRXS2448, RIO_VID_IDT)}, { 0, } /* terminate list */ diff --git a/drivers/rapidio/switches/idtcps.c b/drivers/rapidio/switches/idtcps.c index 7fbb60d3179602..4058ce2c76faf2 100644 --- a/drivers/rapidio/switches/idtcps.c +++ b/drivers/rapidio/switches/idtcps.c @@ -168,7 +168,7 @@ static void idtcps_remove(struct rio_dev *rdev) spin_unlock(&rdev->rswitch->lock); } -static struct rio_device_id idtcps_id_table[] = { +static const struct rio_device_id idtcps_id_table[] = { {RIO_DEVICE(RIO_DID_IDTCPS6Q, RIO_VID_IDT)}, {RIO_DEVICE(RIO_DID_IDTCPS8, RIO_VID_IDT)}, {RIO_DEVICE(RIO_DID_IDTCPS10Q, RIO_VID_IDT)}, diff --git a/drivers/rapidio/switches/tsi568.c b/drivers/rapidio/switches/tsi568.c index 8a43561b9d17f7..1214628b7ded7d 100644 --- a/drivers/rapidio/switches/tsi568.c +++ b/drivers/rapidio/switches/tsi568.c @@ -169,7 +169,7 @@ static void tsi568_remove(struct rio_dev *rdev) spin_unlock(&rdev->rswitch->lock); } -static struct rio_device_id tsi568_id_table[] = { +static const struct rio_device_id tsi568_id_table[] = { {RIO_DEVICE(RIO_DID_TSI568, RIO_VID_TUNDRA)}, { 0, } /* terminate list */ }; diff --git a/drivers/rapidio/switches/tsi57x.c b/drivers/rapidio/switches/tsi57x.c index 2700d15f758423..9f063e214836a0 100644 --- a/drivers/rapidio/switches/tsi57x.c +++ b/drivers/rapidio/switches/tsi57x.c @@ -336,7 +336,7 @@ static void tsi57x_remove(struct rio_dev *rdev) spin_unlock(&rdev->rswitch->lock); } -static struct rio_device_id tsi57x_id_table[] = { +static const struct rio_device_id tsi57x_id_table[] = { {RIO_DEVICE(RIO_DID_TSI572, RIO_VID_TUNDRA)}, {RIO_DEVICE(RIO_DID_TSI574, RIO_VID_TUNDRA)}, {RIO_DEVICE(RIO_DID_TSI577, RIO_VID_TUNDRA)}, From b1402dcb5643b7a27d46a05edd7491d49ba0e248 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 17 Nov 2017 15:37:57 -0800 Subject: [PATCH 69/94] drivers/rapidio/devices/rio_mport_cdev.c: fix resource leak in error handling path in 'rio_dma_transfer()' If 'dma_map_sg()', we should branch to the existing error handling path to free some resources before returning. Link: http://lkml.kernel.org/r/61292a4f369229eee03394247385e955027283f8.1505687047.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Logan Gunthorpe Cc: Matt Porter Cc: Alexandre Bounine Cc: Lorenzo Stoakes Cc: Jesper Nilsson Cc: Christian K_nig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/rio_mport_cdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 5c1b6388122ad8..86805747a42299 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -963,7 +963,8 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, req->sgt.sgl, req->sgt.nents, dir); if (nents == -EFAULT) { rmcd_error("Failed to map SG list"); - return -EFAULT; + ret = -EFAULT; + goto err_pg; } ret = do_dma_request(req, xfer, sync, nents); From c46d90cd7c3c5d3a5eb6265f2b0f06f44576d5a1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 17 Nov 2017 15:38:03 -0800 Subject: [PATCH 70/94] drivers/rapidio/devices/rio_mport_cdev.c: fix error handling in 'rio_dma_transfer()' In case of error, 'dma_map_sg()' returns 0, not a negative value. There is BUG_ON() in 'dma_map_sg_attrs()' which makes sure of that. Link: http://lkml.kernel.org/r/d4235bd2b9274e99f6c86ea71b1fa1c7bd8d0c08.1505687047.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Logan Gunthorpe Cc: Matt Porter Cc: Alexandre Bounine Cc: Lorenzo Stoakes Cc: Jesper Nilsson Cc: Christian K_nig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/rio_mport_cdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 86805747a42299..dc5a33f936892e 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -961,7 +961,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, nents = dma_map_sg(chan->device->dev, req->sgt.sgl, req->sgt.nents, dir); - if (nents == -EFAULT) { + if (nents == 0) { rmcd_error("Failed to map SG list"); ret = -EFAULT; goto err_pg; From 2743232c0c4c82311718bb4ec1fb659ed64ecf7a Mon Sep 17 00:00:00 2001 From: Kangmin Park Date: Fri, 17 Nov 2017 15:30:23 -0800 Subject: [PATCH 71/94] Documentation/sysctl/vm.txt: fix typo Link: http://lkml.kernel.org/r/CAKW4uUyCi=PnKf3epgFVz8z=1tMtHSOHNm+fdNxrNw3-THvRCA@mail.gmail.com Signed-off-by: Kangmin Park Cc: Jiri Kosina Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 055c8b3e101805..b920423f88cbcb 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -818,7 +818,7 @@ tooling to work, you can do: swappiness This control is used to define how aggressive the kernel will swap -memory pages. Higher values will increase agressiveness, lower values +memory pages. Higher values will increase aggressiveness, lower values decrease the amount of swap. A value of 0 instructs the kernel not to initiate swap until the amount of free and file-backed pages is less than the high water mark in a zone. From f9eb2fdd04d4e68fbea18970bbf65ace716d25b6 Mon Sep 17 00:00:00 2001 From: "Ola N. Kaldestad" Date: Fri, 17 Nov 2017 15:30:26 -0800 Subject: [PATCH 72/94] kernel/sysctl.c: code cleanups Remove unnecessary else block, remove redundant return and call to kfree in if block. Link: http://lkml.kernel.org/r/1510238435-1655-1-git-send-email-mail@okal.no Signed-off-by: Ola N. Kaldestad Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dd25d90896fc83..557d4672857793 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3127,14 +3127,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, else bitmap_copy(bitmap, tmp_bitmap, bitmap_len); } - kfree(tmp_bitmap); *lenp -= left; *ppos += *lenp; - return 0; - } else { - kfree(tmp_bitmap); - return err; } + + kfree(tmp_bitmap); + return err; } #else /* CONFIG_PROC_SYSCTL */ From 95846ecf9dac5089aed4b144d912225f8ef86ae4 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:30 -0800 Subject: [PATCH 73/94] pid: replace pid bitmap implementation with IDR API Patch series "Replacing PID bitmap implementation with IDR API", v4. This series replaces kernel bitmap implementation of PID allocation with IDR API. These patches are written to simplify the kernel by replacing custom code with calls to generic code. The following are the stats for pid and pid_namespace object files before and after the replacement. There is a noteworthy change between the IDR and bitmap implementation. Before text data bss dec hex filename 8447 3894 64 12405 3075 kernel/pid.o After text data bss dec hex filename 3397 304 0 3701 e75 kernel/pid.o Before text data bss dec hex filename 5692 1842 192 7726 1e2e kernel/pid_namespace.o After text data bss dec hex filename 2854 216 16 3086 c0e kernel/pid_namespace.o The following are the stats for ps, pstree and calling readdir on /proc for 10,000 processes. ps: With IDR API With bitmap real 0m1.479s 0m2.319s user 0m0.070s 0m0.060s sys 0m0.289s 0m0.516s pstree: With IDR API With bitmap real 0m1.024s 0m1.794s user 0m0.348s 0m0.612s sys 0m0.184s 0m0.264s proc: With IDR API With bitmap real 0m0.059s 0m0.074s user 0m0.000s 0m0.004s sys 0m0.016s 0m0.016s This patch (of 2): Replace the current bitmap implementation for Process ID allocation. Functions that are no longer required, for example, free_pidmap(), alloc_pidmap(), etc. are removed. The rest of the functions are modified to use the IDR API. The change was made to make the PID allocation less complex by replacing custom code with calls to generic API. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-2-git-send-email-gs051095@gmail.com [avagin@openvz.org: restore the old behaviour of the ns_last_pid sysctl] Link: http://lkml.kernel.org/r/20171106183144.16368-1-avagin@openvz.org Link: http://lkml.kernel.org/r/1507583624-22146-2-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Acked-by: Oleg Nesterov Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Eric W. Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- fs/proc/loadavg.c | 2 +- include/linux/pid_namespace.h | 14 +- init/main.c | 2 +- kernel/pid.c | 201 ++++------------------ kernel/pid_namespace.c | 53 +++--- 6 files changed, 65 insertions(+), 209 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 1fbb5da17dd27f..e47761cdcb98fe 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private) LOAD_INT(c), LOAD_FRAC(c), count_active_contexts(), atomic_read(&nr_spu_contexts), - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr)); return 0; } diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bc5c58c00ee7a..a000d7547479e8 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr)); return 0; } diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c78af6061644f3..92c6aa509d2ea7 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -10,15 +10,8 @@ #include #include #include +#include -struct pidmap { - atomic_t nr_free; - void *page; -}; - -#define BITS_PER_PAGE (PAGE_SIZE * 8) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE) struct fs_pin; @@ -30,9 +23,8 @@ enum { /* definitions for pid_namespace's hide_pid field */ struct pid_namespace { struct kref kref; - struct pidmap pidmap[PIDMAP_ENTRIES]; + struct idr idr; struct rcu_head rcu; - int last_pid; unsigned int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; @@ -106,6 +98,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); -void pidmap_init(void); +void pid_idr_init(void); #endif /* _LINUX_PID_NS_H */ diff --git a/init/main.c b/init/main.c index 859a786f7c0abf..d0cbcfc06124c3 100644 --- a/init/main.c +++ b/init/main.c @@ -669,7 +669,7 @@ asmlinkage __visible void __init start_kernel(void) if (late_time_init) late_time_init(); calibrate_delay(); - pidmap_init(); + pid_idr_init(); anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled(EFI_RUNTIME_SERVICES)) diff --git a/kernel/pid.c b/kernel/pid.c index 020dedbdf066bc..0ce59369632fec 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -39,6 +39,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -static inline int mk_pid(struct pid_namespace *pid_ns, - struct pidmap *map, int off) -{ - return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; -} - -#define find_next_offset(map, off) \ - find_next_zero_bit((map)->page, BITS_PER_PAGE, off) /* * PID-map pages start out as NULL, they get allocated upon @@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns, */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), - .pidmap = { - [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } - }, - .last_pid = 0, + .idr = IDR_INIT, .nr_hashed = PIDNS_HASH_ADDING, .level = 0, .child_reaper = &init_task, @@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static void free_pidmap(struct upid *upid) -{ - int nr = upid->nr; - struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; - int offset = nr & BITS_PER_PAGE_MASK; - - clear_bit(offset, map->page); - atomic_inc(&map->nr_free); -} - -/* - * If we started walking pids at 'base', is 'a' seen before 'b'? - */ -static int pid_before(int base, int a, int b) -{ - /* - * This is the same as saying - * - * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT - * and that mapping orders 'a' and 'b' with respect to 'base'. - */ - return (unsigned)(a - base) < (unsigned)(b - base); -} - -/* - * We might be racing with someone else trying to set pid_ns->last_pid - * at the pid allocation time (there's also a sysctl for this, but racing - * with this one is OK, see comment in kernel/pid_namespace.c about it). - * We want the winner to have the "later" value, because if the - * "earlier" value prevails, then a pid may get reused immediately. - * - * Since pids rollover, it is not sufficient to just pick the bigger - * value. We have to consider where we started counting from. - * - * 'base' is the value of pid_ns->last_pid that we observed when - * we started looking for a pid. - * - * 'pid' is the pid that we eventually found. - */ -static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) -{ - int prev; - int last_write = base; - do { - prev = last_write; - last_write = cmpxchg(&pid_ns->last_pid, prev, pid); - } while ((prev != last_write) && (pid_before(base, last_write, pid))); -} - -static int alloc_pidmap(struct pid_namespace *pid_ns) -{ - int i, offset, max_scan, pid, last = pid_ns->last_pid; - struct pidmap *map; - - pid = last + 1; - if (pid >= pid_max) - pid = RESERVED_PIDS; - offset = pid & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - /* - * If last_pid points into the middle of the map->page we - * want to scan this bitmap block twice, the second time - * we start with offset == 0 (or RESERVED_PIDS). - */ - max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; - for (i = 0; i <= max_scan; ++i) { - if (unlikely(!map->page)) { - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* - * Free the page if someone raced with us - * installing it: - */ - spin_lock_irq(&pidmap_lock); - if (!map->page) { - map->page = page; - page = NULL; - } - spin_unlock_irq(&pidmap_lock); - kfree(page); - if (unlikely(!map->page)) - return -ENOMEM; - } - if (likely(atomic_read(&map->nr_free))) { - for ( ; ; ) { - if (!test_and_set_bit(offset, map->page)) { - atomic_dec(&map->nr_free); - set_last_pid(pid_ns, last, pid); - return pid; - } - offset = find_next_offset(map, offset); - if (offset >= BITS_PER_PAGE) - break; - pid = mk_pid(pid_ns, map, offset); - if (pid >= pid_max) - break; - } - } - if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { - ++map; - offset = 0; - } else { - map = &pid_ns->pidmap[0]; - offset = RESERVED_PIDS; - if (unlikely(last == offset)) - break; - } - pid = mk_pid(pid_ns, map, offset); - } - return -EAGAIN; -} - -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) -{ - int offset; - struct pidmap *map, *end; - - if (last >= PID_MAX_LIMIT) - return -1; - - offset = (last + 1) & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; - end = &pid_ns->pidmap[PIDMAP_ENTRIES]; - for (; map < end; map++, offset = 0) { - if (unlikely(!map->page)) - continue; - offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); - if (offset < BITS_PER_PAGE) - return mk_pid(pid_ns, map, offset); - } - return -1; -} - void put_pid(struct pid *pid) { struct pid_namespace *ns; @@ -266,7 +124,7 @@ void free_pid(struct pid *pid) struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; hlist_del_rcu(&upid->pid_chain); - switch(--ns->nr_hashed) { + switch (--ns->nr_hashed) { case 2: case 1: /* When all that is left in the pid namespace @@ -284,12 +142,11 @@ void free_pid(struct pid *pid) schedule_work(&ns->proc_work); break; } + + idr_remove(&ns->idr, upid->nr); } spin_unlock_irqrestore(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - free_pidmap(pid->numbers + i); - call_rcu(&pid->rcu, delayed_put_pid); } @@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = ns; pid->level = ns->level; + for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); + int pid_min = 1; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&pidmap_lock); + + /* + * init really needs pid 1, but after reaching the maximum + * wrap back to RESERVED_PIDS + */ + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) + pid_min = RESERVED_PIDS; + + /* + * Store a null pointer so find_pid_ns does not find + * a partially initialized PID (see below). + */ + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, + pid_max, GFP_ATOMIC); + spin_unlock_irq(&pidmap_lock); + idr_preload_end(); + if (nr < 0) { retval = nr; goto out_free; @@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + /* Make the PID visible to find_pid_ns. */ + idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->nr_hashed++; } spin_unlock_irq(&pidmap_lock); @@ -350,8 +230,11 @@ struct pid *alloc_pid(struct pid_namespace *ns) put_pid_ns(ns); out_free: + spin_lock_irq(&pidmap_lock); while (++i <= ns->level) - free_pidmap(pid->numbers + i); + idr_remove(&ns->idr, (pid->numbers + i)->nr); + + spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns); */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { - struct pid *pid; - - do { - pid = find_pid_ns(nr, ns); - if (pid) - break; - nr = next_pidmap(ns, nr); - } while (nr > 0); - - return pid; + return idr_get_next(&ns->idr, &nr); } /* @@ -578,7 +452,7 @@ void __init pidhash_init(void) 0, 4096); } -void __init pidmap_init(void) +void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); @@ -590,10 +464,7 @@ void __init pidmap_init(void) PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); - init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, init_pid_ns.pidmap[0].page); - atomic_dec(&init_pid_ns.pidmap[0].nr_free); + idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 4918314893bc66..ca7c8a8823b1fe 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -21,6 +21,7 @@ #include #include #include +#include struct pid_cache { int nr_ids; @@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; struct ucounts *ucounts; - int i; int err; err = -EINVAL; @@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns == NULL) goto out_dec; - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!ns->pidmap[0].page) - goto out_free; + idr_init(&ns->idr); ns->pid_cachep = create_pid_cachep(level + 1); if (ns->pid_cachep == NULL) - goto out_free_map; + goto out_free_idr; err = ns_alloc_inum(&ns->ns); if (err) - goto out_free_map; + goto out_free_idr; ns->ns.ops = &pidns_operations; kref_init(&ns->kref); @@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); - set_bit(0, ns->pidmap[0].page); - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - - for (i = 1; i < PIDMAP_ENTRIES; i++) - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - return ns; -out_free_map: - kfree(ns->pidmap[0].page); -out_free: +out_free_idr: + idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); out_dec: dec_pid_namespaces(ucounts); @@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { - int i; - ns_free_inum(&ns->ns); - for (i = 0; i < PIDMAP_ENTRIES; i++) - kfree(ns->pidmap[i].page); + + idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } @@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int rc; struct task_struct *task, *me = current; int init_pids = thread_group_leader(me) ? 1 : 2; + struct pid *pid; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * maintain a tasklist for each pid namespace. * */ + rcu_read_lock(); read_lock(&tasklist_lock); - nr = next_pidmap(pid_ns, 1); - while (nr > 0) { - rcu_read_lock(); - - task = pid_task(find_vpid(nr), PIDTYPE_PID); + nr = 2; + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { + task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) send_sig_info(SIGKILL, SEND_SIG_FORCED, task); - - rcu_read_unlock(); - - nr = next_pidmap(pid_ns, nr); } read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. @@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; + int ret, next; if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = &pid_ns->last_pid; - return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + next = idr_get_cursor(&pid_ns->idr) - 1; + + tmp.data = &next; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (!ret && write) + idr_set_cursor(&pid_ns->idr, next + 1); + + return ret; } extern int pid_max; From e8cfbc245e24887e3c30235f71e9e9405e0cfc39 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:34 -0800 Subject: [PATCH 74/94] pid: remove pidhash pidhash is no longer required as all the information can be looked up from idr tree. nr_hashed represented the number of pids that had been hashed. Since, nr_hashed and PIDNS_HASH_ADDING are no longer relevant, it has been renamed to pid_allocated and PIDNS_ADDING respectively. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-3-git-send-email-gs051095@gmail.com Link: http://lkml.kernel.org/r/1507583624-22146-3-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Tested-by: Tony Luck [ia64] Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Oleg Nesterov Cc: Eric W. Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/asm-offsets.c | 4 +-- include/linux/init_task.h | 1 - include/linux/pid.h | 2 -- include/linux/pid_namespace.h | 4 +-- init/main.c | 1 - kernel/fork.c | 2 +- kernel/pid.c | 48 +++++++--------------------------- kernel/pid_namespace.c | 6 ++--- 8 files changed, 18 insertions(+), 50 deletions(-) diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index f7693f49c57369..f4db2168d1b89e 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -31,8 +31,8 @@ void foo(void) DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe)); DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info)); - BUILD_BUG_ON(sizeof(struct upid) != 32); - DEFINE(IA64_UPID_SHIFT, 5); + BUILD_BUG_ON(sizeof(struct upid) != 16); + DEFINE(IA64_UPID_SHIFT, 4); BLANK(); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 8062e6cc607c82..6a532629c98350 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -105,7 +105,6 @@ extern struct group_info init_groups; .numbers = { { \ .nr = 0, \ .ns = &init_pid_ns, \ - .pid_chain = { .next = NULL, .pprev = NULL }, \ }, } \ } diff --git a/include/linux/pid.h b/include/linux/pid.h index dfd684ce078754..7633d55d9a2473 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -51,10 +51,8 @@ enum pid_type */ struct upid { - /* Try to keep pid_chain in the same cacheline as nr for find_vpid */ int nr; struct pid_namespace *ns; - struct hlist_node pid_chain; }; struct pid diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 92c6aa509d2ea7..49538b172483c4 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -25,7 +25,7 @@ struct pid_namespace { struct kref kref; struct idr idr; struct rcu_head rcu; - unsigned int nr_hashed; + unsigned int pid_allocated; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -49,7 +49,7 @@ struct pid_namespace { extern struct pid_namespace init_pid_ns; -#define PIDNS_HASH_ADDING (1U << 31) +#define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) diff --git a/init/main.c b/init/main.c index d0cbcfc06124c3..dfec3809e7404f 100644 --- a/init/main.c +++ b/init/main.c @@ -562,7 +562,6 @@ asmlinkage __visible void __init start_kernel(void) * kmem_cache_init() */ setup_log_buf(0); - pidhash_init(); vfs_caches_init_early(); sort_main_extable(); trap_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 4e55eedba8d689..432eadf6b58c18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1871,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process( retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } - if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_cancel_cgroup; } diff --git a/kernel/pid.c b/kernel/pid.c index 0ce59369632fec..b13b624e2c4902 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -41,10 +41,6 @@ #include #include -#define pid_hashfn(nr, ns) \ - hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) -static struct hlist_head *pid_hash; -static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -54,7 +50,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; - /* * PID-map pages start out as NULL, they get allocated upon * first use and are never deallocated. This way a low pid_max @@ -64,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT; struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), .idr = IDR_INIT, - .nr_hashed = PIDNS_HASH_ADDING, + .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, @@ -123,8 +118,7 @@ void free_pid(struct pid *pid) for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; - hlist_del_rcu(&upid->pid_chain); - switch (--ns->nr_hashed) { + switch (--ns->pid_allocated) { case 2: case 1: /* When all that is left in the pid namespace @@ -133,10 +127,10 @@ void free_pid(struct pid *pid) */ wake_up_process(ns->child_reaper); break; - case PIDNS_HASH_ADDING: + case PIDNS_ADDING: /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); - ns->nr_hashed = 0; + ns->pid_allocated = 0; /* fall through */ case 0: schedule_work(&ns->proc_work); @@ -212,14 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns) upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) + if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; for ( ; upid >= pid->numbers; --upid) { - hlist_add_head_rcu(&upid->pid_chain, - &pid_hash[pid_hashfn(upid->nr, upid->ns)]); /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); - upid->ns->nr_hashed++; + upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); @@ -243,21 +235,13 @@ struct pid *alloc_pid(struct pid_namespace *ns) void disable_pid_allocation(struct pid_namespace *ns) { spin_lock_irq(&pidmap_lock); - ns->nr_hashed &= ~PIDNS_HASH_ADDING; + ns->pid_allocated &= ~PIDNS_ADDING; spin_unlock_irq(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { - struct upid *pnr; - - hlist_for_each_entry_rcu(pnr, - &pid_hash[pid_hashfn(nr, ns)], pid_chain) - if (pnr->nr == nr && pnr->ns == ns) - return container_of(pnr, struct pid, - numbers[ns->level]); - - return NULL; + return idr_find(&ns->idr, nr); } EXPORT_SYMBOL_GPL(find_pid_ns); @@ -413,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (type != PIDTYPE_PID) { if (type == __PIDTYPE_TGID) type = PIDTYPE_PID; + task = task->group_leader; } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); @@ -439,23 +424,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } -/* - * The pid hash table is scaled according to the amount of memory in the - * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or - * more. - */ -void __init pidhash_init(void) -{ - pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL | HASH_ZERO, - &pidhash_shift, NULL, - 0, 4096); -} - void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ - BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); + BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ca7c8a8823b1fe..0b53eef7d34b1a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -133,7 +133,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; - ns->nr_hashed = PIDNS_HASH_ADDING; + ns->pid_allocated = PIDNS_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); return ns; @@ -254,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * sys_wait4() above can't reap the EXIT_DEAD children but we do not * really care, we could reparent them to the global init. We could * exit and reap ->child_reaper even if it is not the last thread in - * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), + * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), * pid_ns can not go away until proc_kill_sb() drops the reference. * * But this ns can also have other tasks injected by setns()+fork(). @@ -268,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (pid_ns->nr_hashed == init_pids) + if (pid_ns->pid_allocated == init_pids) break; schedule(); } From 4efb442cc12eb66535b7c7ed06005fd7889c1d77 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 17 Nov 2017 15:30:38 -0800 Subject: [PATCH 75/94] kernel/panic.c: add TAINT_AUX This is the gist of a patch which we've been forward-porting in our kernels for a long time now and it probably would make a good sense to have such TAINT_AUX flag upstream which can be used by each distro etc, how they see fit. This way, we won't need to forward-port a distro-only version indefinitely. Add an auxiliary taint flag to be used by distros and others. This obviates the need to forward-port whatever internal solutions people have in favor of a single flag which they can map arbitrarily to a definition of their pleasing. The "X" mnemonic could also mean eXternal, which would be taint from a distro or something else but not the upstream kernel. We will use it to mark modules for which we don't provide support. I.e., a really eXternal module. Link: http://lkml.kernel.org/r/20170911134533.dp5mtyku5bongx4c@pd.tnic Signed-off-by: Borislav Petkov Cc: Kees Cook Cc: Jessica Yu Cc: Peter Zijlstra Cc: Jiri Slaby Cc: Jiri Olsa Cc: Michal Marek Cc: Jiri Kosina Cc: Takashi Iwai Cc: Petr Mladek Cc: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 3 ++- kernel/panic.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4b484ab9e1635e..ce51455e2adf63 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -549,7 +549,8 @@ extern enum system_states { #define TAINT_UNSIGNED_MODULE 13 #define TAINT_SOFTLOCKUP 14 #define TAINT_LIVEPATCH 15 -#define TAINT_FLAGS_COUNT 16 +#define TAINT_AUX 16 +#define TAINT_FLAGS_COUNT 17 struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/kernel/panic.c b/kernel/panic.c index 3242b64b1956de..2cfef408fec931 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -324,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ { 'K', ' ', true }, /* TAINT_LIVEPATCH */ + { 'X', ' ', true }, /* TAINT_AUX */ }; /** @@ -345,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { * 'E' - Unsigned module has been loaded. * 'L' - A soft lockup has previously occurred. * 'K' - Kernel has been live patched. + * 'X' - Auxiliary taint, for distros' use. * * The string is overwritten by the next call to print_tainted(). */ From fcf4edac049a8bca41658970292e2dfdbc9d5f62 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 17 Nov 2017 15:30:42 -0800 Subject: [PATCH 76/94] kcov: remove pointless current != NULL check __sanitizer_cov_trace_pc() is a hot code, so it's worth to remove pointless '!current' check. Current is never NULL. Link: http://lkml.kernel.org/r/20170929162221.32500-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Dmitry Vyukov Acked-by: Mark Rutland Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index fc6af9e1308b7a..d9f9fa9cacc629 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -62,7 +62,7 @@ void notrace __sanitizer_cov_trace_pc(void) * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. */ - if (!t || !in_task()) + if (!in_task()) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { From ded97d2c2b2c5f1dcced0bc57133f7753b037dfc Mon Sep 17 00:00:00 2001 From: Victor Chibotaru Date: Fri, 17 Nov 2017 15:30:46 -0800 Subject: [PATCH 77/94] kcov: support comparison operands collection Enables kcov to collect comparison operands from instrumented code. This is done by using Clang's -fsanitize=trace-cmp instrumentation (currently not available for GCC). The comparison operands help a lot in fuzz testing. E.g. they are used in Syzkaller to cover the interiors of conditional statements with way less attempts and thus make previously unreachable code reachable. To allow separate collection of coverage and comparison operands two different work modes are implemented. Mode selection is now done via a KCOV_ENABLE ioctl call with corresponding argument value. Link: http://lkml.kernel.org/r/20171011095459.70721-1-glider@google.com Signed-off-by: Victor Chibotaru Signed-off-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Mark Rutland Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Kees Cook Cc: Vegard Nossum Cc: Quentin Casasnovas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kcov.h | 12 ++- include/uapi/linux/kcov.h | 24 +++++ kernel/kcov.c | 214 +++++++++++++++++++++++++++++++------- 3 files changed, 211 insertions(+), 39 deletions(-) diff --git a/include/linux/kcov.h b/include/linux/kcov.h index f5d8ce4f4f8667..3ecf6f5e3a5f01 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -8,19 +8,23 @@ struct task_struct; #ifdef CONFIG_KCOV -void kcov_task_init(struct task_struct *t); -void kcov_task_exit(struct task_struct *t); - enum kcov_mode { /* Coverage collection is not enabled yet. */ KCOV_MODE_DISABLED = 0, + /* KCOV was initialized, but tracing mode hasn't been chosen yet. */ + KCOV_MODE_INIT = 1, /* * Tracing coverage collection mode. * Covered PCs are collected in a per-task buffer. */ - KCOV_MODE_TRACE = 1, + KCOV_MODE_TRACE_PC = 2, + /* Collecting comparison operands mode. */ + KCOV_MODE_TRACE_CMP = 3, }; +void kcov_task_init(struct task_struct *t); +void kcov_task_exit(struct task_struct *t); + #else static inline void kcov_task_init(struct task_struct *t) {} diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h index 33eabbb8ada11e..9529867717a866 100644 --- a/include/uapi/linux/kcov.h +++ b/include/uapi/linux/kcov.h @@ -8,4 +8,28 @@ #define KCOV_ENABLE _IO('c', 100) #define KCOV_DISABLE _IO('c', 101) +enum { + /* + * Tracing coverage collection mode. + * Covered PCs are collected in a per-task buffer. + * In new KCOV version the mode is chosen by calling + * ioctl(fd, KCOV_ENABLE, mode). In older versions the mode argument + * was supposed to be 0 in such a call. So, for reasons of backward + * compatibility, we have chosen the value KCOV_TRACE_PC to be 0. + */ + KCOV_TRACE_PC = 0, + /* Collecting comparison operands mode. */ + KCOV_TRACE_CMP = 1, +}; + +/* + * The format for the types of collected comparisons. + * + * Bit 0 shows whether one of the arguments is a compile-time constant. + * Bits 1 & 2 contain log2 of the argument size, up to 8 bytes. + */ +#define KCOV_CMP_CONST (1 << 0) +#define KCOV_CMP_SIZE(n) ((n) << 1) +#define KCOV_CMP_MASK KCOV_CMP_SIZE(3) + #endif /* _LINUX_KCOV_IOCTLS_H */ diff --git a/kernel/kcov.c b/kernel/kcov.c index d9f9fa9cacc629..15f33faf4013bd 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -22,13 +22,21 @@ #include #include +/* Number of 64-bit words written per one comparison: */ +#define KCOV_WORDS_PER_CMP 4 + /* * kcov descriptor (one per opened debugfs file). * State transitions of the descriptor: * - initial state after open() * - then there must be a single ioctl(KCOV_INIT_TRACE) call * - then, mmap() call (several calls are allowed but not useful) - * - then, repeated enable/disable for a task (only one task a time allowed) + * - then, ioctl(KCOV_ENABLE, arg), where arg is + * KCOV_TRACE_PC - to trace only the PCs + * or + * KCOV_TRACE_CMP - to trace only the comparison operands + * - then, ioctl(KCOV_DISABLE) to disable the task. + * Enabling/disabling ioctls can be repeated (only one task a time allowed). */ struct kcov { /* @@ -48,51 +56,176 @@ struct kcov { struct task_struct *t; }; -/* - * Entry point from instrumented code. - * This is called once per basic-block/edge. - */ -void notrace __sanitizer_cov_trace_pc(void) +static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { - struct task_struct *t; enum kcov_mode mode; - t = current; /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. */ if (!in_task()) - return; + return false; mode = READ_ONCE(t->kcov_mode); - if (mode == KCOV_MODE_TRACE) { - unsigned long *area; - unsigned long pos; - unsigned long ip = _RET_IP_; + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + return mode == needed_mode; +} +static unsigned long canonicalize_ip(unsigned long ip) +{ #ifdef CONFIG_RANDOMIZE_BASE - ip -= kaslr_offset(); + ip -= kaslr_offset(); #endif + return ip; +} - /* - * There is some code that runs in interrupts but for which - * in_interrupt() returns false (e.g. preempt_schedule_irq()). - * READ_ONCE()/barrier() effectively provides load-acquire wrt - * interrupts, there are paired barrier()/WRITE_ONCE() in - * kcov_ioctl_locked(). - */ - barrier(); - area = t->kcov_area; - /* The first word is number of subsequent PCs. */ - pos = READ_ONCE(area[0]) + 1; - if (likely(pos < t->kcov_size)) { - area[pos] = ip; - WRITE_ONCE(area[0], pos); - } +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. + */ +void notrace __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + unsigned long *area; + unsigned long ip = canonicalize_ip(_RET_IP_); + unsigned long pos; + + t = current; + if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t)) + return; + + area = t->kcov_area; + /* The first 64-bit word is the number of subsequent PCs. */ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = ip; + WRITE_ONCE(area[0], pos); } } EXPORT_SYMBOL(__sanitizer_cov_trace_pc); +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS +static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) +{ + struct task_struct *t; + u64 *area; + u64 count, start_index, end_pos, max_pos; + + t = current; + if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t)) + return; + + ip = canonicalize_ip(ip); + + /* + * We write all comparison arguments and types as u64. + * The buffer was allocated for t->kcov_size unsigned longs. + */ + area = (u64 *)t->kcov_area; + max_pos = t->kcov_size * sizeof(unsigned long); + + count = READ_ONCE(area[0]); + + /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */ + start_index = 1 + count * KCOV_WORDS_PER_CMP; + end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64); + if (likely(end_pos <= max_pos)) { + area[start_index] = type; + area[start_index + 1] = arg1; + area[start_index + 2] = arg2; + area[start_index + 3] = ip; + WRITE_ONCE(area[0], count + 1); + } +} + +void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1); + +void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); + +void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4); + +void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8); + +void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1); + +void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); + +void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4); + +void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8); + +void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) +{ + u64 i; + u64 count = cases[0]; + u64 size = cases[1]; + u64 type = KCOV_CMP_CONST; + + switch (size) { + case 8: + type |= KCOV_CMP_SIZE(0); + break; + case 16: + type |= KCOV_CMP_SIZE(1); + break; + case 32: + type |= KCOV_CMP_SIZE(2); + break; + case 64: + type |= KCOV_CMP_SIZE(3); + break; + default: + return; + } + for (i = 0; i < count; i++) + write_comp_data(type, cases[i + 2], val, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_switch); +#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ + static void kcov_get(struct kcov *kcov) { atomic_inc(&kcov->refcount); @@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t) /* Just to not leave dangling references behind. */ kcov_task_init(t); kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; spin_unlock(&kcov->lock); kcov_put(kcov); } @@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) spin_lock(&kcov->lock); size = kcov->size * sizeof(unsigned long); - if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { res = -EINVAL; goto exit; @@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep) kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); if (!kcov) return -ENOMEM; + kcov->mode = KCOV_MODE_DISABLED; atomic_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; @@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, if (size < 2 || size > INT_MAX / sizeof(unsigned long)) return -EINVAL; kcov->size = size; - kcov->mode = KCOV_MODE_TRACE; + kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: /* @@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, * at task exit or voluntary by KCOV_DISABLE. After that it can * be enabled for another task. */ - unused = arg; - if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || - kcov->area == NULL) + if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; if (kcov->t != NULL) return -EBUSY; + if (arg == KCOV_TRACE_PC) + kcov->mode = KCOV_MODE_TRACE_PC; + else if (arg == KCOV_TRACE_CMP) +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS + kcov->mode = KCOV_MODE_TRACE_CMP; +#else + return -ENOTSUPP; +#endif + else + return -EINVAL; t = current; /* Cache in task struct for performance. */ t->kcov_size = kcov->size; t->kcov_area = kcov->area; - /* See comment in __sanitizer_cov_trace_pc(). */ + /* See comment in check_kcov_mode(). */ barrier(); WRITE_ONCE(t->kcov_mode, kcov->mode); t->kcov = kcov; @@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return -EINVAL; kcov_task_init(t); kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; kcov_put(kcov); return 0; default: From d677a4d6019385488e794cc47bd3d6f9c2aab874 Mon Sep 17 00:00:00 2001 From: Victor Chibotaru Date: Fri, 17 Nov 2017 15:30:50 -0800 Subject: [PATCH 78/94] Makefile: support flag -fsanitizer-coverage=trace-cmp The flag enables Clang instrumentation of comparison operations (currently not supported by GCC). This instrumentation is needed by the new KCOV device to collect comparison operands. Link: http://lkml.kernel.org/r/20171011095459.70721-2-glider@google.com Signed-off-by: Victor Chibotaru Signed-off-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Mark Rutland Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Kees Cook Cc: Vegard Nossum Cc: Quentin Casasnovas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Makefile | 3 +-- lib/Kconfig.debug | 10 ++++++++++ scripts/Makefile.kcov | 7 +++++++ 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 scripts/Makefile.kcov diff --git a/Makefile b/Makefile index 763ab35df12acd..ccb7d5b2fbf5f9 100644 --- a/Makefile +++ b/Makefile @@ -375,8 +375,6 @@ CFLAGS_KERNEL = AFLAGS_KERNEL = LDFLAGS_vmlinux = CFLAGS_GCOV := -fprofile-arcs -ftest-coverage -fno-tree-loop-im $(call cc-disable-warning,maybe-uninitialized,) -CFLAGS_KCOV := $(call cc-option,-fsanitize-coverage=trace-pc,) - # Use USERINCLUDE when you must reference the UAPI directories only. USERINCLUDE := \ @@ -659,6 +657,7 @@ ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLA KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO endif +include scripts/Makefile.kcov include scripts/Makefile.gcc-plugins ifdef CONFIG_READABLE_ASM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ce813731269f6c..947d3e2ed5c2f1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -756,6 +756,16 @@ config KCOV For more details, see Documentation/dev-tools/kcov.rst. +config KCOV_ENABLE_COMPARISONS + bool "Enable comparison operands collection by KCOV" + depends on KCOV + default n + help + KCOV also exposes operands of every comparison in the instrumented + code along with operand sizes and PCs of the comparison instructions. + These operands can be used by fuzzing engines to improve the quality + of fuzzing coverage. + config KCOV_INSTRUMENT_ALL bool "Instrument all code by default" depends on KCOV diff --git a/scripts/Makefile.kcov b/scripts/Makefile.kcov new file mode 100644 index 00000000000000..5cc72037e42345 --- /dev/null +++ b/scripts/Makefile.kcov @@ -0,0 +1,7 @@ +ifdef CONFIG_KCOV +CFLAGS_KCOV := $(call cc-option,-fsanitize-coverage=trace-pc,) +ifeq ($(CONFIG_KCOV_ENABLE_COMPARISONS),y) +CFLAGS_KCOV += $(call cc-option,-fsanitize-coverage=trace-cmp,) +endif + +endif From c512ac01d8a841033da8ec538a83f80fb0b4d1fe Mon Sep 17 00:00:00 2001 From: Victor Chibotaru Date: Fri, 17 Nov 2017 15:30:53 -0800 Subject: [PATCH 79/94] kcov: update documentation The updated documentation describes new KCOV mode for collecting comparison operands. Link: http://lkml.kernel.org/r/20171011095459.70721-3-glider@google.com Signed-off-by: Victor Chibotaru Signed-off-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Mark Rutland Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Kees Cook Cc: Vegard Nossum Cc: Quentin Casasnovas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kcov.rst | 99 ++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 4 deletions(-) diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index 44886c91e112d4..c2f6452e38ed00 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -12,19 +12,30 @@ To achieve this goal it does not collect coverage in soft/hard interrupts and instrumentation of some inherently non-deterministic parts of kernel is disabled (e.g. scheduler, locking). -Usage ------ +kcov is also able to collect comparison operands from the instrumented code +(this feature currently requires that the kernel is compiled with clang). + +Prerequisites +------------- Configure the kernel with:: CONFIG_KCOV=y CONFIG_KCOV requires gcc built on revision 231296 or later. + +If the comparison operands need to be collected, set:: + + CONFIG_KCOV_ENABLE_COMPARISONS=y + Profiling data will only become accessible once debugfs has been mounted:: mount -t debugfs none /sys/kernel/debug -The following program demonstrates kcov usage from within a test program: +Coverage collection +------------------- +The following program demonstrates coverage collection from within a test +program using kcov: .. code-block:: c @@ -44,6 +55,9 @@ The following program demonstrates kcov usage from within a test program: #define KCOV_DISABLE _IO('c', 101) #define COVER_SIZE (64<<10) + #define KCOV_TRACE_PC 0 + #define KCOV_TRACE_CMP 1 + int main(int argc, char **argv) { int fd; @@ -64,7 +78,7 @@ The following program demonstrates kcov usage from within a test program: if ((void*)cover == MAP_FAILED) perror("mmap"), exit(1); /* Enable coverage collection on the current thread. */ - if (ioctl(fd, KCOV_ENABLE, 0)) + if (ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC)) perror("ioctl"), exit(1); /* Reset coverage from the tail of the ioctl() call. */ __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED); @@ -111,3 +125,80 @@ The interface is fine-grained to allow efficient forking of test processes. That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode, mmaps coverage buffer and then forks child processes in a loop. Child processes only need to enable coverage (disable happens automatically on thread end). + +Comparison operands collection +------------------------------ +Comparison operands collection is similar to coverage collection: + +.. code-block:: c + + /* Same includes and defines as above. */ + + /* Number of 64-bit words per record. */ + #define KCOV_WORDS_PER_CMP 4 + + /* + * The format for the types of collected comparisons. + * + * Bit 0 shows whether one of the arguments is a compile-time constant. + * Bits 1 & 2 contain log2 of the argument size, up to 8 bytes. + */ + + #define KCOV_CMP_CONST (1 << 0) + #define KCOV_CMP_SIZE(n) ((n) << 1) + #define KCOV_CMP_MASK KCOV_CMP_SIZE(3) + + int main(int argc, char **argv) + { + int fd; + uint64_t *cover, type, arg1, arg2, is_const, size; + unsigned long n, i; + + fd = open("/sys/kernel/debug/kcov", O_RDWR); + if (fd == -1) + perror("open"), exit(1); + if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE)) + perror("ioctl"), exit(1); + /* + * Note that the buffer pointer is of type uint64_t*, because all + * the comparison operands are promoted to uint64_t. + */ + cover = (uint64_t *)mmap(NULL, COVER_SIZE * sizeof(unsigned long), + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if ((void*)cover == MAP_FAILED) + perror("mmap"), exit(1); + /* Note KCOV_TRACE_CMP instead of KCOV_TRACE_PC. */ + if (ioctl(fd, KCOV_ENABLE, KCOV_TRACE_CMP)) + perror("ioctl"), exit(1); + __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED); + read(-1, NULL, 0); + /* Read number of comparisons collected. */ + n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); + for (i = 0; i < n; i++) { + type = cover[i * KCOV_WORDS_PER_CMP + 1]; + /* arg1 and arg2 - operands of the comparison. */ + arg1 = cover[i * KCOV_WORDS_PER_CMP + 2]; + arg2 = cover[i * KCOV_WORDS_PER_CMP + 3]; + /* ip - caller address. */ + ip = cover[i * KCOV_WORDS_PER_CMP + 4]; + /* size of the operands. */ + size = 1 << ((type & KCOV_CMP_MASK) >> 1); + /* is_const - true if either operand is a compile-time constant.*/ + is_const = type & KCOV_CMP_CONST; + printf("ip: 0x%lx type: 0x%lx, arg1: 0x%lx, arg2: 0x%lx, " + "size: %lu, %s\n", + ip, type, arg1, arg2, size, + is_const ? "const" : "non-const"); + } + if (ioctl(fd, KCOV_DISABLE, 0)) + perror("ioctl"), exit(1); + /* Free resources. */ + if (munmap(cover, COVER_SIZE * sizeof(unsigned long))) + perror("munmap"), exit(1); + if (close(fd)) + perror("close"), exit(1); + return 0; + } + +Note that the kcov modes (coverage collection or comparison operands) are +mutually exclusive. From 2d8364bae4db144df75ba85e92d2b8619ba8eedc Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Fri, 17 Nov 2017 15:30:57 -0800 Subject: [PATCH 80/94] kernel/reboot.c: add devm_register_reboot_notifier() Add devm_* wrapper around register_reboot_notifier to simplify device specific reboot notifier registration/unregistration. [akpm@linux-foundation.org: move `struct device' forward decl to top-of-file] Link: http://lkml.kernel.org/r/20170320171753.1705-1-andrew.smirnov@gmail.com Signed-off-by: Andrey Smirnov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 4 ++++ kernel/reboot.c | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index d03da0eb95ca77..e63799a6e89515 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -6,6 +6,8 @@ #include #include +struct device; + #define SYS_DOWN 0x0001 /* Notify of system down */ #define SYS_RESTART SYS_DOWN #define SYS_HALT 0x0002 /* Notify of system halt */ @@ -39,6 +41,8 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); +extern int devm_register_reboot_notifier(struct device *, struct notifier_block *); + extern int register_restart_handler(struct notifier_block *); extern int unregister_restart_handler(struct notifier_block *); extern void do_kernel_restart(char *cmd); diff --git a/kernel/reboot.c b/kernel/reboot.c index bd30a973fe946b..e4ced883d8de1c 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +static void devm_unregister_reboot_notifier(struct device *dev, void *res) +{ + WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res)); +} + +int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb) +{ + struct notifier_block **rcnb; + int ret; + + rcnb = devres_alloc(devm_unregister_reboot_notifier, + sizeof(*rcnb), GFP_KERNEL); + if (!rcnb) + return -ENOMEM; + + ret = register_reboot_notifier(nb); + if (!ret) { + *rcnb = nb; + devres_add(dev, rcnb); + } else { + devres_free(rcnb); + } + + return ret; +} +EXPORT_SYMBOL(devm_register_reboot_notifier); + /* * Notifier list for kernel code which wants to be called * to restart the system. From 44ea39420fc95e7432ddc91de4eb58c7470ab897 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Fri, 17 Nov 2017 15:31:01 -0800 Subject: [PATCH 81/94] drivers/watchdog: make use of devm_register_reboot_notifier() Save a bit of cleanup code by leveraging newly added devm_register_reboot_notifier(). [akpm@linux-foundation.org: small cleanup: avoid 80-col tricks] Link: http://lkml.kernel.org/r/20170411160615.9784-1-andrew.smirnov@gmail.com Signed-off-by: Andrey Smirnov Acked-by: Guenter Roeck Cc: Chris Healy Cc: Wim Van Sebroeck Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/watchdog/watchdog_core.c | 35 -------------------------------- drivers/watchdog/watchdog_dev.c | 32 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c index 74265b2f806ca2..8a8d952f8df96c 100644 --- a/drivers/watchdog/watchdog_core.c +++ b/drivers/watchdog/watchdog_core.c @@ -137,25 +137,6 @@ int watchdog_init_timeout(struct watchdog_device *wdd, } EXPORT_SYMBOL_GPL(watchdog_init_timeout); -static int watchdog_reboot_notifier(struct notifier_block *nb, - unsigned long code, void *data) -{ - struct watchdog_device *wdd = container_of(nb, struct watchdog_device, - reboot_nb); - - if (code == SYS_DOWN || code == SYS_HALT) { - if (watchdog_active(wdd)) { - int ret; - - ret = wdd->ops->stop(wdd); - if (ret) - return NOTIFY_BAD; - } - } - - return NOTIFY_DONE; -} - static int watchdog_restart_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -244,19 +225,6 @@ static int __watchdog_register_device(struct watchdog_device *wdd) } } - if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) { - wdd->reboot_nb.notifier_call = watchdog_reboot_notifier; - - ret = register_reboot_notifier(&wdd->reboot_nb); - if (ret) { - pr_err("watchdog%d: Cannot register reboot notifier (%d)\n", - wdd->id, ret); - watchdog_dev_unregister(wdd); - ida_simple_remove(&watchdog_ida, wdd->id); - return ret; - } - } - if (wdd->ops->restart) { wdd->restart_nb.notifier_call = watchdog_restart_notifier; @@ -302,9 +270,6 @@ static void __watchdog_unregister_device(struct watchdog_device *wdd) if (wdd->ops->restart) unregister_restart_handler(&wdd->restart_nb); - if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) - unregister_reboot_notifier(&wdd->reboot_nb); - watchdog_dev_unregister(wdd); ida_simple_remove(&watchdog_ida, wdd->id); } diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 0826e663bd5a3c..1e971a50d7fb74 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -42,6 +42,7 @@ #include /* For handling misc devices */ #include /* For module stuff/... */ #include /* For mutexes */ +#include /* For reboot notifier */ #include /* For memory functions */ #include /* For standard types (like size_t) */ #include /* For watchdog specific items */ @@ -1016,6 +1017,25 @@ static struct class watchdog_class = { .dev_groups = wdt_groups, }; +static int watchdog_reboot_notifier(struct notifier_block *nb, + unsigned long code, void *data) +{ + struct watchdog_device *wdd; + + wdd = container_of(nb, struct watchdog_device, reboot_nb); + if (code == SYS_DOWN || code == SYS_HALT) { + if (watchdog_active(wdd)) { + int ret; + + ret = wdd->ops->stop(wdd); + if (ret) + return NOTIFY_BAD; + } + } + + return NOTIFY_DONE; +} + /* * watchdog_dev_register: register a watchdog device * @wdd: watchdog device @@ -1049,6 +1069,18 @@ int watchdog_dev_register(struct watchdog_device *wdd) if (ret) { device_destroy(&watchdog_class, devno); watchdog_cdev_unregister(wdd); + return ret; + } + + if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) { + wdd->reboot_nb.notifier_call = watchdog_reboot_notifier; + + ret = devm_register_reboot_notifier(dev, &wdd->reboot_nb); + if (ret) { + pr_err("watchdog%d: Cannot register reboot notifier (%d)\n", + wdd->id, ret); + watchdog_dev_unregister(wdd); + } } return ret; From e35c4c64fe492b212f9c7d9e046626e48e89f863 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Nov 2017 15:31:04 -0800 Subject: [PATCH 82/94] initramfs: use time64_t timestamps The cpio format uses a 32-bit number to encode file timestamps, which breaks initramfs support in 2038. This reinterprets the timestamp as unsigned, to give us another 68 years and avoids breaking until 2106. Link: http://lkml.kernel.org/r/20171019095536.801199-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Deepa Dinamani Cc: Arnd Bergmann Cc: Daniel Thompson Cc: Lokesh Vutla Cc: Stafford Horne Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 7046feffef6b58..7e99a00389429d 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -109,7 +109,7 @@ static void __init free_hash(void) } } -static long __init do_utime(char *filename, time_t mtime) +static long __init do_utime(char *filename, time64_t mtime) { struct timespec64 t[2]; @@ -125,10 +125,10 @@ static __initdata LIST_HEAD(dir_list); struct dir_entry { struct list_head list; char *name; - time_t mtime; + time64_t mtime; }; -static void __init dir_add(const char *name, time_t mtime) +static void __init dir_add(const char *name, time64_t mtime) { struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); if (!de) @@ -150,7 +150,7 @@ static void __init dir_utime(void) } } -static __initdata time_t mtime; +static __initdata time64_t mtime; /* cpio header parsing */ @@ -177,7 +177,7 @@ static void __init parse_header(char *s) uid = parsed[2]; gid = parsed[3]; nlink = parsed[4]; - mtime = parsed[5]; + mtime = parsed[5]; /* breaks in y2106 */ body_len = parsed[6]; major = parsed[7]; minor = parsed[8]; From b8fd99838435f9b420c3e848192bd43abc648b7f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:31:08 -0800 Subject: [PATCH 83/94] sysvipc: unteach ids->next_id for !CHECKPOINT_RESTORE Patch series "sysvipc: ipc-key management improvements". Here are a few improvements I spotted while eyeballing Guillaume's rhashtable implementation for ipc keys. The first and fourth patches are the interesting ones, the middle two are trivial. This patch (of 4): The next_id object-allocation functionality was introduced in commit 03f595668017 ("ipc: add sysctl to specify desired next object id"). Given that these new entries are _only_ exported under the CONFIG_CHECKPOINT_RESTORE option, there is no point for the common case to even know about ->next_id. As such rewrite ipc_buildid() such that it can do away with the field as well as unnecessary branches when adding a new identifier. The end result also better differentiates both cases, so the code ends up being cleaner; albeit the small duplications regarding the default case. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170831172049.14576-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 ++ ipc/util.c | 60 ++++++++++++++++++++++++++--------- ipc/util.h | 5 --- 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 474812abe7731a..d7cf3a85085396 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -19,7 +19,9 @@ struct ipc_ids { bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; +#ifdef CONFIG_CHECKPOINT_RESTORE int next_id; +#endif struct rhashtable key_ht; }; diff --git a/ipc/util.c b/ipc/util.c index 79b30eee32cd85..429c06bdb8ef79 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -116,13 +116,15 @@ int ipc_init_ids(struct ipc_ids *ids) int err; ids->in_use = 0; ids->seq = 0; - ids->next_id = -1; init_rwsem(&ids->rwsem); err = rhashtable_init(&ids->key_ht, &ipc_kht_params); if (err) return err; idr_init(&ids->ipcs_idr); ids->tables_initialized = true; +#ifdef CONFIG_CHECKPOINT_RESTORE + ids->next_id = -1; +#endif return 0; } @@ -216,6 +218,46 @@ int ipc_get_maxid(struct ipc_ids *ids) return max_id; } +#ifdef CONFIG_CHECKPOINT_RESTORE +/* + * Specify desired id for next allocated IPC object. + */ +#define ipc_idr_alloc(ids, new) \ + idr_alloc(&(ids)->ipcs_idr, (new), \ + (ids)->next_id < 0 ? 0 : ipcid_to_idx((ids)->next_id),\ + 0, GFP_NOWAIT) + +static inline int ipc_buildid(int id, struct ipc_ids *ids, + struct kern_ipc_perm *new) +{ + if (ids->next_id < 0) { /* default, behave as !CHECKPOINT_RESTORE */ + new->seq = ids->seq++; + if (ids->seq > IPCID_SEQ_MAX) + ids->seq = 0; + } else { + new->seq = ipcid_to_seqx(ids->next_id); + ids->next_id = -1; + } + + return SEQ_MULTIPLIER * new->seq + id; +} + +#else +#define ipc_idr_alloc(ids, new) \ + idr_alloc(&(ids)->ipcs_idr, (new), 0, 0, GFP_NOWAIT) + +static inline int ipc_buildid(int id, struct ipc_ids *ids, + struct kern_ipc_perm *new) +{ + new->seq = ids->seq++; + if (ids->seq > IPCID_SEQ_MAX) + ids->seq = 0; + + return SEQ_MULTIPLIER * new->seq + id; +} + +#endif /* CONFIG_CHECKPOINT_RESTORE */ + /** * ipc_addid - add an ipc identifier * @ids: ipc identifier set @@ -234,7 +276,6 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) kuid_t euid; kgid_t egid; int id, err; - int next_id = ids->next_id; if (size > IPCMNI) size = IPCMNI; @@ -254,9 +295,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) new->cuid = new->uid = euid; new->gid = new->cgid = egid; - id = idr_alloc(&ids->ipcs_idr, new, - (next_id < 0) ? 0 : ipcid_to_idx(next_id), 0, - GFP_NOWAIT); + id = ipc_idr_alloc(ids, new); idr_preload_end(); if (id >= 0 && new->key != IPC_PRIVATE) { @@ -274,17 +313,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) } ids->in_use++; + new->id = ipc_buildid(id, ids, new); - if (next_id < 0) { - new->seq = ids->seq++; - if (ids->seq > IPCID_SEQ_MAX) - ids->seq = 0; - } else { - new->seq = ipcid_to_seqx(next_id); - ids->next_id = -1; - } - - new->id = ipc_buildid(id, new->seq); return id; } diff --git a/ipc/util.h b/ipc/util.h index 579112d90016ce..0cd6201fe63a4a 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -146,11 +146,6 @@ extern struct msg_msg *load_msg(const void __user *src, size_t len); extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst); extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len); -static inline int ipc_buildid(int id, int seq) -{ - return SEQ_MULTIPLIER * seq + id; -} - static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid) { return uid / SEQ_MULTIPLIER != ipcp->seq; From 39c96a1b96a5991b1c9e79b85a8d74ef93b36026 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:31:11 -0800 Subject: [PATCH 84/94] sysvipc: duplicate lock comments wrt ipc_addid() The comment in msgqueues when using ipc_addid() is quite useful imo. Duplicate it for shm and semaphores. Link: http://lkml.kernel.org/r/20170831172049.14576-3-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 1 + ipc/shm.c | 1 + 2 files changed, 2 insertions(+) diff --git a/ipc/sem.c b/ipc/sem.c index b2698ebdcb31e6..28a5c9f0be876b 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -515,6 +515,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) sma->sem_nsems = nsems; sma->sem_ctime = ktime_get_real_seconds(); + /* ipc_addid() locks sma upon success. */ retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); if (retval < 0) { call_rcu(&sma->sem_perm.rcu, sem_rcu_free); diff --git a/ipc/shm.c b/ipc/shm.c index bd652755d32cc6..378c929194ce68 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -601,6 +601,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_file = file; shp->shm_creator = current; + /* ipc_addid() locks shp upon success. */ error = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (error < 0) goto no_id; From ebf66799acfb5f52ada4ff96ecc9579867941ea9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:31:15 -0800 Subject: [PATCH 85/94] sysvipc: properly name ipc_addid() limit parameter This is better understood as a limit, instead of size; exactly like the function comment indicates. Rename it. Link: http://lkml.kernel.org/r/20170831172049.14576-4-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index 429c06bdb8ef79..e09bf76610efb3 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -262,7 +262,7 @@ static inline int ipc_buildid(int id, struct ipc_ids *ids, * ipc_addid - add an ipc identifier * @ids: ipc identifier set * @new: new ipc permission set - * @size: limit for the number of used ids + * @limit: limit for the number of used ids * * Add an entry 'new' to the ipc ids idr. The permissions object is * initialised and the first free entry is set up and the id assigned @@ -271,16 +271,16 @@ static inline int ipc_buildid(int id, struct ipc_ids *ids, * * Called with writer ipc_ids.rwsem held. */ -int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) +int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) { kuid_t euid; kgid_t egid; int id, err; - if (size > IPCMNI) - size = IPCMNI; + if (limit > IPCMNI) + limit = IPCMNI; - if (!ids->tables_initialized || ids->in_use >= size) + if (!ids->tables_initialized || ids->in_use >= limit) return -ENOSPC; idr_preload(GFP_KERNEL); From 15df03c87983660a4d1eedb4541778592bd97684 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 17 Nov 2017 15:31:18 -0800 Subject: [PATCH 86/94] sysvipc: make get_maxid O(1) again For a custom microbenchmark on a 3.30GHz Xeon SandyBridge, which calls IPC_STAT over and over, it was calculated that, on avg the cost of ipc_get_maxid() for increasing amounts of keys was: 10 keys: ~900 cycles 100 keys: ~15000 cycles 1000 keys: ~150000 cycles 10000 keys: ~2100000 cycles This is unsurprising as maxid is currently O(n). By having the max_id available in O(1) we save all those cycles for each semctl(_STAT) command, the idr_find can be expensive -- which some real (customer) workloads actually poll on. Note that this used to be the case, until commit 7ca7e564e04 ("ipc: store ipcs into IDRs"). The cost is the extra idr_find when doing RMIDs, but we simply go backwards, and should not take too many iterations to find the new value. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170831172049.14576-5-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 1 + ipc/util.c | 43 +++++++++++------------------------ ipc/util.h | 21 ++++++++++++++--- 3 files changed, 32 insertions(+), 33 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index d7cf3a85085396..b5630c8eb2f3a9 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -19,6 +19,7 @@ struct ipc_ids { bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; + int max_id; #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif diff --git a/ipc/util.c b/ipc/util.c index e09bf76610efb3..ff045fec8d8355 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -122,6 +122,7 @@ int ipc_init_ids(struct ipc_ids *ids) return err; idr_init(&ids->ipcs_idr); ids->tables_initialized = true; + ids->max_id = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif @@ -188,36 +189,6 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) return NULL; } -/** - * ipc_get_maxid - get the last assigned id - * @ids: ipc identifier set - * - * Called with ipc_ids.rwsem held. - */ -int ipc_get_maxid(struct ipc_ids *ids) -{ - struct kern_ipc_perm *ipc; - int max_id = -1; - int total, id; - - if (ids->in_use == 0) - return -1; - - if (ids->in_use == IPCMNI) - return IPCMNI - 1; - - /* Look for the last assigned id */ - total = 0; - for (id = 0; id < IPCMNI && total < ids->in_use; id++) { - ipc = idr_find(&ids->ipcs_idr, id); - if (ipc != NULL) { - max_id = id; - total++; - } - } - return max_id; -} - #ifdef CONFIG_CHECKPOINT_RESTORE /* * Specify desired id for next allocated IPC object. @@ -313,6 +284,9 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) } ids->in_use++; + if (id > ids->max_id) + ids->max_id = id; + new->id = ipc_buildid(id, ids, new); return id; @@ -459,6 +433,15 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; + + if (unlikely(lid == ids->max_id)) { + do { + lid--; + if (lid == -1) + break; + } while (!idr_find(&ids->ipcs_idr, lid)); + ids->max_id = lid; + } } /** diff --git a/ipc/util.h b/ipc/util.h index 0cd6201fe63a4a..89b8ec176fc4c4 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -13,6 +13,7 @@ #include #include +#include #define SEQ_MULTIPLIER (IPCMNI) @@ -99,9 +100,6 @@ void __init ipc_init_proc_interface(const char *path, const char *header, /* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); -/* must be called with ids->rwsem acquired for reading */ -int ipc_get_maxid(struct ipc_ids *); - /* must be called with both locks acquired. */ void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *); @@ -111,6 +109,23 @@ void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *); /* must be called with ipcp locked */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); +/** + * ipc_get_maxid - get the last assigned id + * @ids: ipc identifier set + * + * Called with ipc_ids.rwsem held for reading. + */ +static inline int ipc_get_maxid(struct ipc_ids *ids) +{ + if (ids->in_use == 0) + return -1; + + if (ids->in_use == IPCMNI) + return IPCMNI - 1; + + return ids->max_id; +} + /* * For allocation that need to be freed by RCU. * Objects are reference counted, they start with reference count 1. From 64c349f4ae78723248c474531d0cbc524fc5ba77 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Nov 2017 15:31:22 -0800 Subject: [PATCH 87/94] mm: add infrastructure for get_user_pages_fast() benchmarking Performance of get_user_pages_fast() is critical for some workloads, but it's tricky to test it directly. This patch provides /sys/kernel/debug/gup_benchmark that helps with testing performance of it. See tools/testing/selftests/vm/gup_benchmark.c for userspace counterpart. Link: http://lkml.kernel.org/r/20170908215603.9189-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Shuah Khan Cc: Ingo Molnar Cc: Thorsten Leemhuis Cc: Jonathan Corbet Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 9 ++ mm/Makefile | 1 + mm/gup_benchmark.c | 100 +++++++++++++++++++++ tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/gup_benchmark.c | 91 +++++++++++++++++++ 5 files changed, 202 insertions(+) create mode 100644 mm/gup_benchmark.c create mode 100644 tools/testing/selftests/vm/gup_benchmark.c diff --git a/mm/Kconfig b/mm/Kconfig index 9c4bdddd80c212..03ff7703d3228e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -756,3 +756,12 @@ config PERCPU_STATS This feature collects and exposes statistics via debugfs. The information includes global and per chunk statistics, which can be used to help understand percpu memory usage. + +config GUP_BENCHMARK + bool "Enable infrastructure for get_user_pages_fast() benchmarking" + default n + help + Provides /sys/kernel/debug/gup_benchmark that helps with testing + performance of get_user_pages_fast(). + + See tools/testing/selftests/vm/gup_benchmark.c diff --git a/mm/Makefile b/mm/Makefile index e7ebd176fb935e..e669f02c5a5425 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -80,6 +80,7 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o +obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c new file mode 100644 index 00000000000000..5c8e2abeaa1598 --- /dev/null +++ b/mm/gup_benchmark.c @@ -0,0 +1,100 @@ +#include +#include +#include +#include +#include +#include + +#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) + +struct gup_benchmark { + __u64 delta_usec; + __u64 addr; + __u64 size; + __u32 nr_pages_per_call; + __u32 flags; +}; + +static int __gup_benchmark_ioctl(unsigned int cmd, + struct gup_benchmark *gup) +{ + ktime_t start_time, end_time; + unsigned long i, nr, nr_pages, addr, next; + struct page **pages; + + nr_pages = gup->size / PAGE_SIZE; + pages = kvmalloc(sizeof(void *) * nr_pages, GFP_KERNEL); + if (!pages) + return -ENOMEM; + + i = 0; + nr = gup->nr_pages_per_call; + start_time = ktime_get(); + for (addr = gup->addr; addr < gup->addr + gup->size; addr = next) { + if (nr != gup->nr_pages_per_call) + break; + + next = addr + nr * PAGE_SIZE; + if (next > gup->addr + gup->size) { + next = gup->addr + gup->size; + nr = (next - addr) / PAGE_SIZE; + } + + nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i); + i += nr; + } + end_time = ktime_get(); + + gup->delta_usec = ktime_us_delta(end_time, start_time); + gup->size = addr - gup->addr; + + for (i = 0; i < nr_pages; i++) { + if (!pages[i]) + break; + put_page(pages[i]); + } + + kvfree(pages); + return 0; +} + +static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + struct gup_benchmark gup; + int ret; + + if (cmd != GUP_FAST_BENCHMARK) + return -EINVAL; + + if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) + return -EFAULT; + + ret = __gup_benchmark_ioctl(cmd, &gup); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &gup, sizeof(gup))) + return -EFAULT; + + return 0; +} + +static const struct file_operations gup_benchmark_fops = { + .open = nonseekable_open, + .unlocked_ioctl = gup_benchmark_ioctl, +}; + +static int gup_benchmark_init(void) +{ + void *ret; + + ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, + &gup_benchmark_fops); + if (!ret) + pr_warn("Failed to create gup_benchmark in debugfs"); + + return 0; +} + +late_initcall(gup_benchmark_init); diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index e49eca1915f8ca..7f45806bd86324 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -18,6 +18,7 @@ TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += virtual_address_range +TEST_GEN_FILES += gup_benchmark TEST_PROGS := run_vmtests diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c new file mode 100644 index 00000000000000..36df55132036fb --- /dev/null +++ b/tools/testing/selftests/vm/gup_benchmark.c @@ -0,0 +1,91 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#define MB (1UL << 20) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) + +#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) + +struct gup_benchmark { + __u64 delta_usec; + __u64 addr; + __u64 size; + __u32 nr_pages_per_call; + __u32 flags; +}; + +int main(int argc, char **argv) +{ + struct gup_benchmark gup; + unsigned long size = 128 * MB; + int i, fd, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; + char *p; + + while ((opt = getopt(argc, argv, "m:r:n:tT")) != -1) { + switch (opt) { + case 'm': + size = atoi(optarg) * MB; + break; + case 'r': + repeats = atoi(optarg); + break; + case 'n': + nr_pages = atoi(optarg); + break; + case 't': + thp = 1; + break; + case 'T': + thp = 0; + break; + case 'w': + write = 1; + default: + return -1; + } + } + + gup.nr_pages_per_call = nr_pages; + gup.flags = write; + + fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR); + if (fd == -1) + perror("open"), exit(1); + + p = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (p == MAP_FAILED) + perror("mmap"), exit(1); + gup.addr = (unsigned long)p; + + if (thp == 1) + madvise(p, size, MADV_HUGEPAGE); + else if (thp == 0) + madvise(p, size, MADV_NOHUGEPAGE); + + for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) + p[0] = 0; + + for (i = 0; i < repeats; i++) { + gup.size = size; + if (ioctl(fd, GUP_FAST_BENCHMARK, &gup)) + perror("ioctl"), exit(1); + + printf("Time: %lld us", gup.delta_usec); + if (gup.size != size) + printf(", truncated (size: %lld)", gup.size); + printf("\n"); + } + + return 0; +} From d4258247d9057c848cc1c1ad9581400b5124dedd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Nov 2017 15:31:26 -0800 Subject: [PATCH 88/94] drivers/pcmcia/sa1111_badge4.c: avoid unused function warning pcmv_setup() is only used when the badge4 driver is built-in, but not when it is a loadable module: drivers/pcmcia/sa1111_badge4.c:153:122: error: 'pcmv_setup' defined but not used [-Werror=unused-function] This adds an #ifdef to avoid the definition of the unused function in the modular case. Link: http://lkml.kernel.org/r/20170911201133.3421636-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pcmcia/sa1111_badge4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pcmcia/sa1111_badge4.c b/drivers/pcmcia/sa1111_badge4.c index 2f490930430d46..93a5c7423d8093 100644 --- a/drivers/pcmcia/sa1111_badge4.c +++ b/drivers/pcmcia/sa1111_badge4.c @@ -144,6 +144,7 @@ int pcmcia_badge4_init(struct sa1111_dev *dev) sa11xx_drv_pcmcia_add_one); } +#ifndef MODULE static int __init pcmv_setup(char *s) { int v[4]; @@ -158,3 +159,4 @@ static int __init pcmv_setup(char *s) } __setup("pcmv=", pcmv_setup); +#endif From 5eb9e8ac9a8f8ae98ae4386357683a0b5684bb48 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 17 Nov 2017 15:31:29 -0800 Subject: [PATCH 89/94] arch/ia64/include/asm/topology.h: remove unused parent_node() macro Commit a7be6e5a7f8d ("mm: drop useless local parameters of __register_one_node()") removed the last user of parent_node(). The parent_node() macro in IA64(Itanium) platform is unnecessary. Remove it for cleanup. Link: http://lkml.kernel.org/r/1504234599-29533-2-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reported-by: Michael Ellerman Cc: Tony Luck Cc: Fenghua Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/topology.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 3ad8f698836346..82f9bf702804f1 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -33,13 +33,6 @@ cpu_all_mask : \ &node_to_cpu_mask[node]) -/* - * Returns the number of the node containing Node 'nid'. - * Not implemented here. Multi-level hierarchies detected with - * the help of node_distance(). - */ -#define parent_node(nid) (nid) - /* * Determines the node for a given pci bus */ From ece15787e72442550f33aaa43dacae033e8628b1 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 17 Nov 2017 15:31:33 -0800 Subject: [PATCH 90/94] arch/sh/include/asm/topology.h: remove unused parent_node() macro Commit a7be6e5a7f8d ("mm: drop useless local parameters of __register_one_node()") removed the last user of parent_node(). The parent_node() macro in SUPERH platform is unnecessary. Remove it for cleanup. Link: http://lkml.kernel.org/r/1504234599-29533-5-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reported-by: Michael Ellerman Cc: Yoshinori Sato Cc: Rich Felker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/include/asm/topology.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index 9a32eb4098dfcf..1db470e024565b 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -5,7 +5,6 @@ #ifdef CONFIG_NUMA #define cpu_to_node(cpu) ((void)(cpu),0) -#define parent_node(node) ((void)(node),0) #define cpumask_of_node(node) ((void)node, cpu_online_mask) From 5f4cdac6bce420ba86c3ab5ec30037822d726ce5 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 17 Nov 2017 15:31:36 -0800 Subject: [PATCH 91/94] arch/sparc/include/asm/topology_64.h: remove unused parent_node() macro Commit a7be6e5a7f8d ("mm: drop useless local parameters of __register_one_node()") removed the last user of parent_node(). The parent_node() macro in SPARC64 platform is unnecessary. Remove it for cleanup. Link: http://lkml.kernel.org/r/1504234599-29533-6-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reported-by: Michael Ellerman Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/topology_64.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 3831b1911a19a3..34c628a22ea5b5 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -11,8 +11,6 @@ static inline int cpu_to_node(int cpu) return numa_cpu_lookup_table[cpu]; } -#define parent_node(node) (node) - #define cpumask_of_node(node) ((node) == -1 ? \ cpu_all_mask : \ &numa_cpumask_lookup_table[node]) From 52563d05f2500ec9c8d715f3c753b0ef415f9cbb Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 17 Nov 2017 15:31:40 -0800 Subject: [PATCH 92/94] arch/tile/include/asm/topology.h: remove unused parent_node() macro Commit a7be6e5a7f8d ("mm: drop useless local parameters of __register_one_node()") removed the last user of parent_node(). The parent_node() macro in tile platform is unnecessary. Remove it for cleanup. Link: http://lkml.kernel.org/r/1504234599-29533-7-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reported-by: Michael Ellerman Acked-by: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/include/asm/topology.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h index b11d5fcd2c4121..635a0a4596f00c 100644 --- a/arch/tile/include/asm/topology.h +++ b/arch/tile/include/asm/topology.h @@ -29,12 +29,6 @@ static inline int cpu_to_node(int cpu) return cpu_2_node[cpu]; } -/* - * Returns the number of the node containing Node 'node'. - * This architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) - /* Returns a bitmask of CPUs on Node 'node'. */ static inline const struct cpumask *cpumask_of_node(int node) { From 7016383b44e855209aff47e56f11c59b8aa7b642 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 17 Nov 2017 15:31:43 -0800 Subject: [PATCH 93/94] include/asm-generic/topology.h: remove unused parent_node() macro Commit a7be6e5a7f8d ("mm: drop useless local parameters of __register_one_node()") removed the last user of parent_node(). The parent_node() macro in generic situation is unnecessary. Remove it for cleanup. Link: http://lkml.kernel.org/r/1504234599-29533-8-git-send-email-douly.fnst@cn.fujitsu.com Signed-off-by: Dou Liyang Reported-by: Michael Ellerman Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/topology.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index 5d2add1a6c9648..23887373955021 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -44,9 +44,6 @@ #define cpu_to_mem(cpu) ((void)(cpu),0) #endif -#ifndef parent_node -#define parent_node(node) ((void)(node),0) -#endif #ifndef cpumask_of_node #ifdef CONFIG_NEED_MULTIPLE_NODES #define cpumask_of_node(node) ((node) == 0 ? cpu_online_mask : cpu_none_mask) From d1b069f5febc850556cf49e9bb9092d3179c5be5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Nov 2017 15:31:47 -0800 Subject: [PATCH 94/94] EXPERT Kconfig menu: fix broken EXPERT menu Clean up the EXPERT menu (yet again). Move FHANDLE and CHECKPOINT_RESTORE into the primary EXPERT menu since they already depend on EXPERT. Move BPF_SYSCALL and USERFAULTFD out of the EXPERT Kconfig symbols menu list since they do not depend on EXPERT and were breaking the continuity of that menu list. Move all of the KALLSYMS Kconfig symbols to the end of the EXPERT menu. This separates the kernel services from the build options. This patch depends on [PATCH] pci: move PCI_QUIRKS to the PCI bus menu (https://lkml.org/lkml/2017/11/2/907). Link: http://lkml.kernel.org/r/72e4465a-a5ff-cb3c-1a90-11aa4861b161@infradead.org Signed-off-by: Randy Dunlap Acked-by: Daniel Borkmann [BPF] Cc: Andrea Arcangeli Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 184 ++++++++++++++++++++++++++------------------------- 1 file changed, 93 insertions(+), 91 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 7d5a6fbac56a0f..2934249fba4674 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -283,19 +283,6 @@ config CROSS_MEMORY_ATTACH to directly read from or write to another process' address space. See the man page for more details. -config FHANDLE - bool "open by fhandle syscalls" if EXPERT - select EXPORTFS - default y - help - If you say Y here, a user level program will be able to map - file names to handle and then later use the handle for - different file system operations. This is useful in implementing - userspace file servers, which now track files using handles instead - of names. The handle would remain the same even if file names - get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) - syscalls. - config USELIB bool "uselib syscall" def_bool ALPHA || M68K || SPARC || X86_32 || IA32_EMULATION @@ -883,18 +870,6 @@ config SOCK_CGROUP_DATA endif # CGROUPS -config CHECKPOINT_RESTORE - bool "Checkpoint/restore support" if EXPERT - select PROC_CHILDREN - default n - help - Enables additional kernel features in a sake of checkpoint/restore. - In particular it adds auxiliary prctl codes to setup process text, - data and heap segment sizes, and a few additional /proc filesystem - entries. - - If unsure, say N here. - menuconfig NAMESPACES bool "Namespaces support" if EXPERT depends on MULTIUSER @@ -1163,6 +1138,19 @@ config SYSCTL_SYSCALL If unsure say N here. +config FHANDLE + bool "open by fhandle syscalls" if EXPERT + select EXPORTFS + default y + help + If you say Y here, a user level program will be able to map + file names to handle and then later use the handle for + different file system operations. This is useful in implementing + userspace file servers, which now track files using handles instead + of names. The handle would remain the same even if file names + get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) + syscalls. + config POSIX_TIMERS bool "Posix Clocks & timers" if EXPERT default y @@ -1180,54 +1168,6 @@ config POSIX_TIMERS If unsure say y. -config KALLSYMS - bool "Load all symbols for debugging/ksymoops" if EXPERT - default y - help - Say Y here to let the kernel print out symbolic crash information and - symbolic stack backtraces. This increases the size of the kernel - somewhat, as all symbols have to be loaded into the kernel image. - -config KALLSYMS_ALL - bool "Include all symbols in kallsyms" - depends on DEBUG_KERNEL && KALLSYMS - help - Normally kallsyms only contains the symbols of functions for nicer - OOPS messages and backtraces (i.e., symbols from the text and inittext - sections). This is sufficient for most cases. And only in very rare - cases (e.g., when a debugger is used) all symbols are required (e.g., - names of variables from the data sections, etc). - - This option makes sure that all symbols are loaded into the kernel - image (i.e., symbols from all sections) in cost of increased kernel - size (depending on the kernel configuration, it may be 300KiB or - something like this). - - Say N unless you really need all symbols. - -config KALLSYMS_ABSOLUTE_PERCPU - bool - depends on KALLSYMS - default X86_64 && SMP - -config KALLSYMS_BASE_RELATIVE - bool - depends on KALLSYMS - default !IA64 && !(TILE && 64BIT) - help - Instead of emitting them as absolute values in the native word size, - emit the symbol references in the kallsyms table as 32-bit entries, - each containing a relative value in the range [base, base + U32_MAX] - or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either - an absolute value in the range [0, S32_MAX] or a relative value in the - range [base, base + S32_MAX], where base is the lowest relative symbol - address encountered in the image. - - On 64-bit builds, this reduces the size of the address table by 50%, - but more importantly, it results in entries whose values are build - time constants, and no relocation pass is required at runtime to fix - up the entries based on the runtime load address of the kernel. - config PRINTK default y bool "Enable support for printk" if EXPERT @@ -1339,16 +1279,6 @@ config EVENTFD If unsure, say Y. -# syscall, maps, verifier -config BPF_SYSCALL - bool "Enable bpf() system call" - select ANON_INODES - select BPF - default n - help - Enable the bpf() system call that allows to manipulate eBPF - programs and maps via file descriptors. - config SHMEM bool "Use full shmem filesystem" if EXPERT default y @@ -1378,14 +1308,6 @@ config ADVISE_SYSCALLS applications use these syscalls, you can disable this option to save space. -config USERFAULTFD - bool "Enable userfaultfd() system call" - select ANON_INODES - depends on MMU - help - Enable the userfaultfd() system call that allows to intercept and - handle page faults in userland. - config MEMBARRIER bool "Enable membarrier() system call" if EXPERT default y @@ -1398,6 +1320,86 @@ config MEMBARRIER If unsure, say Y. +config CHECKPOINT_RESTORE + bool "Checkpoint/restore support" if EXPERT + select PROC_CHILDREN + default n + help + Enables additional kernel features in a sake of checkpoint/restore. + In particular it adds auxiliary prctl codes to setup process text, + data and heap segment sizes, and a few additional /proc filesystem + entries. + + If unsure, say N here. + +config KALLSYMS + bool "Load all symbols for debugging/ksymoops" if EXPERT + default y + help + Say Y here to let the kernel print out symbolic crash information and + symbolic stack backtraces. This increases the size of the kernel + somewhat, as all symbols have to be loaded into the kernel image. + +config KALLSYMS_ALL + bool "Include all symbols in kallsyms" + depends on DEBUG_KERNEL && KALLSYMS + help + Normally kallsyms only contains the symbols of functions for nicer + OOPS messages and backtraces (i.e., symbols from the text and inittext + sections). This is sufficient for most cases. And only in very rare + cases (e.g., when a debugger is used) all symbols are required (e.g., + names of variables from the data sections, etc). + + This option makes sure that all symbols are loaded into the kernel + image (i.e., symbols from all sections) in cost of increased kernel + size (depending on the kernel configuration, it may be 300KiB or + something like this). + + Say N unless you really need all symbols. + +config KALLSYMS_ABSOLUTE_PERCPU + bool + depends on KALLSYMS + default X86_64 && SMP + +config KALLSYMS_BASE_RELATIVE + bool + depends on KALLSYMS + default !IA64 && !(TILE && 64BIT) + help + Instead of emitting them as absolute values in the native word size, + emit the symbol references in the kallsyms table as 32-bit entries, + each containing a relative value in the range [base, base + U32_MAX] + or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either + an absolute value in the range [0, S32_MAX] or a relative value in the + range [base, base + S32_MAX], where base is the lowest relative symbol + address encountered in the image. + + On 64-bit builds, this reduces the size of the address table by 50%, + but more importantly, it results in entries whose values are build + time constants, and no relocation pass is required at runtime to fix + up the entries based on the runtime load address of the kernel. + +# end of the "standard kernel features (expert users)" menu + +# syscall, maps, verifier +config BPF_SYSCALL + bool "Enable bpf() system call" + select ANON_INODES + select BPF + default n + help + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + +config USERFAULTFD + bool "Enable userfaultfd() system call" + select ANON_INODES + depends on MMU + help + Enable the userfaultfd() system call that allows to intercept and + handle page faults in userland. + config EMBEDDED bool "Embedded system" option allnoconfig_y