// SPDX-License-Identifier: GPL-2.0
/*
 * Slab allocator functions that are independent of the allocator strategy
 *
 * (C) 2012 Christoph Lameter <cl@linux.com>
 */
#include <linux/slab.h>

#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/interrupt.h>
#include <linux/memory.h>
#include <linux/cache.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>

#define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>

#include "slab.h"

enum slab_state slab_state;
LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;

#ifdef CONFIG_HARDENED_USERCOPY
bool usercopy_fallback __ro_after_init =
		IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK);
module_param(usercopy_fallback, bool, 0400);
MODULE_PARM_DESC(usercopy_fallback,
		"WARN instead of reject usercopy whitelist violations");
#endif

static LIST_HEAD(slab_caches_to_rcu_destroy);
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
		    slab_caches_to_rcu_destroy_workfn);

/*
 * Set of flags that will prevent slab merging
 */
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
		SLAB_FAILSLAB | SLAB_KASAN)

#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
			 SLAB_ACCOUNT)

/*
 * Merge control. If this is set then no merging of slab caches will occur.
 */
static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);

static int __init setup_slab_nomerge(char *str)
{
	slab_nomerge = true;
	return 1;
}

#ifdef CONFIG_SLUB
__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
#endif

__setup("slab_nomerge", setup_slab_nomerge);

/*
 * Determine the size of a slab object
 */
unsigned int kmem_cache_size(struct kmem_cache *s)
{
	return s->object_size;
}
EXPORT_SYMBOL(kmem_cache_size);

#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
	if (!name || in_interrupt() || size < sizeof(void *) ||
		size > KMALLOC_MAX_SIZE) {
		pr_err("kmem_cache_create(%s) integrity check failed\n", name);
		return -EINVAL;
	}

	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	return 0;
}
#else
static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
{
	return 0;
}
#endif

void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
{
	size_t i;

	for (i = 0; i < nr; i++) {
		if (s)
			kmem_cache_free(s, p[i]);
		else
			kfree(p[i]);
	}
}

int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
								void **p)
{
	size_t i;

	for (i = 0; i < nr; i++) {
		void *x = p[i] = kmem_cache_alloc(s, flags);
		if (!x) {
			__kmem_cache_free_bulk(s, i, p);
			return 0;
		}
	}
	return i;
}
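
/*
 * Example (illustrative, with a hypothetical cache "foo_cachep"): callers
 * normally use the public kmem_cache_alloc_bulk()/kmem_cache_free_bulk()
 * wrappers rather than these generic fallbacks, and must check the returned
 * count because a bulk allocation can fail as a whole:
 *
 *	void *objs[16];
 *
 *	if (!kmem_cache_alloc_bulk(foo_cachep, GFP_KERNEL,
 *				   ARRAY_SIZE(objs), objs))
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free_bulk(foo_cachep, ARRAY_SIZE(objs), objs);
 */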

#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)

LIST_HEAD(slab_root_caches);

void slab_init_memcg_params(struct kmem_cache *s)
{
	s->memcg_params.root_cache = NULL;
	RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
	INIT_LIST_HEAD(&s->memcg_params.children);
	s->memcg_params.dying = false;
}

static int init_memcg_params(struct kmem_cache *s,
		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	struct memcg_cache_array *arr;

	if (root_cache) {
		s->memcg_params.root_cache = root_cache;
		s->memcg_params.memcg = memcg;
		INIT_LIST_HEAD(&s->memcg_params.children_node);
		INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
		return 0;
	}

	slab_init_memcg_params(s);

	if (!memcg_nr_cache_ids)
		return 0;

	arr = kvzalloc(sizeof(struct memcg_cache_array) +
		       memcg_nr_cache_ids * sizeof(void *),
		       GFP_KERNEL);
	if (!arr)
		return -ENOMEM;

	RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
	return 0;
}

static void destroy_memcg_params(struct kmem_cache *s)
{
	if (is_root_cache(s))
		kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
}

static void free_memcg_params(struct rcu_head *rcu)
{
	struct memcg_cache_array *old;

	old = container_of(rcu, struct memcg_cache_array, rcu);
	kvfree(old);
}

static int update_memcg_params(struct kmem_cache *s, int new_array_size)
{
	struct memcg_cache_array *old, *new;

	new = kvzalloc(sizeof(struct memcg_cache_array) +
		       new_array_size * sizeof(void *), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	old = rcu_dereference_protected(s->memcg_params.memcg_caches,
					lockdep_is_held(&slab_mutex));
	if (old)
		memcpy(new->entries, old->entries,
		       memcg_nr_cache_ids * sizeof(void *));

	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
	if (old)
		call_rcu(&old->rcu, free_memcg_params);
	return 0;
}

int memcg_update_all_caches(int num_memcgs)
{
	struct kmem_cache *s;
	int ret = 0;

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_root_caches, root_caches_node) {
		ret = update_memcg_params(s, num_memcgs);
		/*
		 * Instead of freeing the memory, we'll just leave the caches
		 * up to this point in an updated state.
		 */
		if (ret)
			break;
	}
	mutex_unlock(&slab_mutex);
	return ret;
}

void memcg_link_cache(struct kmem_cache *s)
{
	if (is_root_cache(s)) {
		list_add(&s->root_caches_node, &slab_root_caches);
	} else {
		list_add(&s->memcg_params.children_node,
			 &s->memcg_params.root_cache->memcg_params.children);
		list_add(&s->memcg_params.kmem_caches_node,
			 &s->memcg_params.memcg->kmem_caches);
	}
}

static void memcg_unlink_cache(struct kmem_cache *s)
{
	if (is_root_cache(s)) {
		list_del(&s->root_caches_node);
	} else {
		list_del(&s->memcg_params.children_node);
		list_del(&s->memcg_params.kmem_caches_node);
	}
}
#else
static inline int init_memcg_params(struct kmem_cache *s,
		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	return 0;
}

static inline void destroy_memcg_params(struct kmem_cache *s)
{
}

static inline void memcg_unlink_cache(struct kmem_cache *s)
{
}
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */

/*
 * Figure out what the alignment of the objects will be given a set of
 * flags, a user specified alignment and the size of the objects.
 */
static unsigned int calculate_alignment(slab_flags_t flags,
		unsigned int align, unsigned int size)
{
	/*
	 * If the user wants hardware cache aligned objects then follow that
	 * suggestion if the object is sufficiently large.
	 *
	 * The hardware cache alignment cannot override the specified
	 * alignment though. If that is greater, then use it.
	 */
	if (flags & SLAB_HWCACHE_ALIGN) {
		unsigned int ralign;

		ralign = cache_line_size();
		while (size <= ralign / 2)
			ralign /= 2;
		align = max(align, ralign);
	}

	if (align < ARCH_SLAB_MINALIGN)
		align = ARCH_SLAB_MINALIGN;

	return ALIGN(align, sizeof(void *));
}
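
/*
 * Worked example (illustrative): with SLAB_HWCACHE_ALIGN, a 64-byte cache
 * line and a 24-byte object, ralign is halved from 64 to 32 (24 <= 32 but
 * 24 > 16), so two objects share a cache line instead of each being padded
 * out to a full line.  The result is still clamped to at least
 * ARCH_SLAB_MINALIGN and rounded up to a multiple of sizeof(void *).
 */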

/*
 * Find a mergeable slab cache
 */
int slab_unmergeable(struct kmem_cache *s)
{
	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
		return 1;

	if (!is_root_cache(s))
		return 1;

	if (s->ctor)
		return 1;

	if (s->usersize)
		return 1;

	/*
	 * We may have set a slab to be unmergeable during bootstrap.
	 */
	if (s->refcount < 0)
		return 1;

	return 0;
}

struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
		slab_flags_t flags, const char *name, void (*ctor)(void *))
{
	struct kmem_cache *s;

	if (slab_nomerge)
		return NULL;

	if (ctor)
		return NULL;

	size = ALIGN(size, sizeof(void *));
	align = calculate_alignment(flags, align, size);
	size = ALIGN(size, align);
	flags = kmem_cache_flags(size, flags, name, NULL);

	if (flags & SLAB_NEVER_MERGE)
		return NULL;

	list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) {
		if (slab_unmergeable(s))
			continue;

		if (size > s->size)
			continue;

		if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
			continue;
		/*
		 * Check if alignment is compatible.
		 * Courtesy of Adrian Drzewiecki
		 */
		if ((s->size & ~(align - 1)) != s->size)
			continue;

		if (s->size - size >= sizeof(void *))
			continue;

		if (IS_ENABLED(CONFIG_SLAB) && align &&
			(align > s->align || s->align % align))
			continue;

		return s;
	}
	return NULL;
}
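
/*
 * Example (illustrative): two caches of 56-byte objects with no constructor,
 * no usercopy region and identical SLAB_MERGE_SAME flags will typically be
 * served by one underlying kmem_cache; with SLUB the extra names show up as
 * sysfs aliases of the merged cache.
 */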

static struct kmem_cache *create_cache(const char *name,
		unsigned int object_size, unsigned int align,
		slab_flags_t flags, unsigned int useroffset,
		unsigned int usersize, void (*ctor)(void *),
		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	struct kmem_cache *s;
	int err;

	if (WARN_ON(useroffset + usersize > object_size))
		useroffset = usersize = 0;

	err = -ENOMEM;
	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
	if (!s)
		goto out;

	s->name = name;
	s->size = s->object_size = object_size;
	s->align = align;
	s->ctor = ctor;
	s->useroffset = useroffset;
	s->usersize = usersize;

	err = init_memcg_params(s, memcg, root_cache);
	if (err)
		goto out_free_cache;

	err = __kmem_cache_create(s, flags);
	if (err)
		goto out_free_cache;

	s->refcount = 1;
	list_add(&s->list, &slab_caches);
	memcg_link_cache(s);
out:
	if (err)
		return ERR_PTR(err);
	return s;

out_free_cache:
	destroy_memcg_params(s);
	kmem_cache_free(kmem_cache, s);
	goto out;
}

/*
 * kmem_cache_create_usercopy - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @useroffset: Usercopy region offset
 * @usersize: Usercopy region size
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create_usercopy(const char *name,
		  unsigned int size, unsigned int align,
		  slab_flags_t flags,
		  unsigned int useroffset, unsigned int usersize,
		  void (*ctor)(void *))
{
	struct kmem_cache *s = NULL;
	const char *cache_name;
	int err;

	get_online_cpus();
	get_online_mems();
	memcg_get_cache_ids();

	mutex_lock(&slab_mutex);

	err = kmem_cache_sanity_check(name, size);
	if (err) {
		goto out_unlock;
	}

	/* Refuse requests with allocator specific flags */
	if (flags & ~SLAB_FLAGS_PERMITTED) {
		err = -EINVAL;
		goto out_unlock;
	}

	/*
	 * Some allocators will constraint the set of valid flags to a subset
	 * of all flags. We expect them to define CACHE_CREATE_MASK in this
	 * case, and we'll just provide them with a sanitized version of the
	 * passed flags.
	 */
	flags &= CACHE_CREATE_MASK;

	/* Fail closed on bad usersize or useroffset values. */
	if (WARN_ON(!usersize && useroffset) ||
	    WARN_ON(size < usersize || size - usersize < useroffset))
		usersize = useroffset = 0;

	if (!usersize)
		s = __kmem_cache_alias(name, size, align, flags, ctor);
	if (s)
		goto out_unlock;

	cache_name = kstrdup_const(name, GFP_KERNEL);
	if (!cache_name) {
		err = -ENOMEM;
		goto out_unlock;
	}

	s = create_cache(cache_name, size,
			 calculate_alignment(flags, align, size),
			 flags, useroffset, usersize, ctor, NULL, NULL);
	if (IS_ERR(s)) {
		err = PTR_ERR(s);
		kfree_const(cache_name);
	}

out_unlock:
	mutex_unlock(&slab_mutex);

	memcg_put_cache_ids();
	put_online_mems();
	put_online_cpus();

	if (err) {
		if (flags & SLAB_PANIC)
			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
				name, err);
		else {
			pr_warn("kmem_cache_create(%s) failed with error %d\n",
				name, err);
			dump_stack();
		}
		return NULL;
	}
	return s;
}
EXPORT_SYMBOL(kmem_cache_create_usercopy);

struct kmem_cache *
kmem_cache_create(const char *name, unsigned int size, unsigned int align,
		slab_flags_t flags, void (*ctor)(void *))
{
	return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
					  ctor);
}
EXPORT_SYMBOL(kmem_cache_create);
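
/*
 * Example (illustrative, with a hypothetical "struct foo"): a typical caller
 * creates a cache once and then allocates and frees objects from it:
 *
 *	struct kmem_cache *foo_cachep;
 *
 *	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, NULL);
 *	obj = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, obj);
 *	kmem_cache_destroy(foo_cachep);
 */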

static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
{
	LIST_HEAD(to_destroy);
	struct kmem_cache *s, *s2;

	/*
	 * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
	 * @slab_caches_to_rcu_destroy list.  The slab pages are freed
	 * through RCU and the associated kmem_cache is dereferenced
	 * while freeing the pages, so the kmem_caches should be freed only
	 * after the pending RCU operations are finished.  As rcu_barrier()
	 * is a pretty slow operation, we batch all pending destructions
	 * asynchronously.
	 */
	mutex_lock(&slab_mutex);
	list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy);
	mutex_unlock(&slab_mutex);

	if (list_empty(&to_destroy))
		return;

	rcu_barrier();

	list_for_each_entry_safe(s, s2, &to_destroy, list) {
#ifdef SLAB_SUPPORTS_SYSFS
		sysfs_slab_release(s);
#else
		slab_kmem_cache_release(s);
#endif
	}
}

static int shutdown_cache(struct kmem_cache *s)
{
	/* free asan quarantined objects */
	kasan_cache_shutdown(s);

	if (__kmem_cache_shutdown(s) != 0)
		return -EBUSY;

	memcg_unlink_cache(s);
	list_del(&s->list);

	if (s->flags & SLAB_TYPESAFE_BY_RCU) {
		list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
		schedule_work(&slab_caches_to_rcu_destroy_work);
	} else {
#ifdef SLAB_SUPPORTS_SYSFS
		sysfs_slab_release(s);
#else
		slab_kmem_cache_release(s);
#endif
	}

	return 0;
}

#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
/*
 * memcg_create_kmem_cache - Create a cache for a memory cgroup.
 * @memcg: The memory cgroup the new cache is for.
 * @root_cache: The parent of the new cache.
 *
 * This function attempts to create a kmem cache that will serve allocation
 * requests going from @memcg to @root_cache. The new cache inherits properties
 * from its parent.
 */
void memcg_create_kmem_cache(struct mem_cgroup *memcg,
			     struct kmem_cache *root_cache)
{
	static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
	struct cgroup_subsys_state *css = &memcg->css;
	struct memcg_cache_array *arr;
	struct kmem_cache *s = NULL;
	char *cache_name;
	int idx;

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);

	/*
	 * The memory cgroup could have been offlined while the cache
	 * creation work was pending.
	 */
	if (memcg->kmem_state != KMEM_ONLINE || root_cache->memcg_params.dying)
		goto out_unlock;

	idx = memcg_cache_id(memcg);
	arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
					lockdep_is_held(&slab_mutex));

	/*
	 * Since per-memcg caches are created asynchronously on first
	 * allocation (see memcg_kmem_get_cache()), several threads can try to
	 * create the same cache, but only one of them may succeed.
	 */
	if (arr->entries[idx])
		goto out_unlock;

	cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
	cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name,
			       css->serial_nr, memcg_name_buf);
	if (!cache_name)
		goto out_unlock;

	s = create_cache(cache_name, root_cache->object_size,
			 root_cache->align,
			 root_cache->flags & CACHE_CREATE_MASK,
			 root_cache->useroffset, root_cache->usersize,
			 root_cache->ctor, memcg, root_cache);
	/*
	 * If we could not create a memcg cache, do not complain, because
	 * that's not critical at all as we can always proceed with the root
	 * cache.
	 */
	if (IS_ERR(s)) {
		kfree(cache_name);
		goto out_unlock;
	}

	/*
	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
	 * barrier here to ensure nobody will see the kmem_cache partially
	 * initialized.
	 */
	smp_wmb();
	arr->entries[idx] = s;

out_unlock:
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}

static void kmemcg_deactivate_workfn(struct work_struct *work)
{
	struct kmem_cache *s = container_of(work, struct kmem_cache,
					    memcg_params.deact_work);

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);

	s->memcg_params.deact_fn(s);

	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();

	/* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
	css_put(&s->memcg_params.memcg->css);
}

static void kmemcg_deactivate_rcufn(struct rcu_head *head)
{
	struct kmem_cache *s = container_of(head, struct kmem_cache,
					    memcg_params.deact_rcu_head);

	/*
	 * We need to grab blocking locks.  Bounce to ->deact_work.  The
	 * work item shares the space with the RCU head and can't be
	 * initialized earlier.
	 */
	INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
	queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work);
}

/**
 * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
 *					   sched RCU grace period
 * @s: target kmem_cache
 * @deact_fn: deactivation function to call
 *
 * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
 * held after a sched RCU grace period.  The slab is guaranteed to stay
 * alive until @deact_fn is finished.  This is to be used from
 * __kmemcg_cache_deactivate().
 */
void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
					   void (*deact_fn)(struct kmem_cache *))
{
	if (WARN_ON_ONCE(is_root_cache(s)) ||
	    WARN_ON_ONCE(s->memcg_params.deact_fn))
		return;

	if (s->memcg_params.root_cache->memcg_params.dying)
		return;

	/* pin memcg so that @s doesn't get destroyed in the middle */
	css_get(&s->memcg_params.memcg->css);

	s->memcg_params.deact_fn = deact_fn;
	call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
}

void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
{
	int idx;
	struct memcg_cache_array *arr;
	struct kmem_cache *s, *c;

	idx = memcg_cache_id(memcg);

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_root_caches, root_caches_node) {
		arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
						lockdep_is_held(&slab_mutex));
		c = arr->entries[idx];
		if (!c)
			continue;

		__kmemcg_cache_deactivate(c);
		arr->entries[idx] = NULL;
	}
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}

void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
	struct kmem_cache *s, *s2;

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);
	list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
				 memcg_params.kmem_caches_node) {
		/*
		 * The cgroup is about to be freed and therefore has no charges
		 * left. Hence, all its caches must be empty by now.
		 */
		BUG_ON(shutdown_cache(s));
	}
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}

static int shutdown_memcg_caches(struct kmem_cache *s)
{
	struct memcg_cache_array *arr;
	struct kmem_cache *c, *c2;
	LIST_HEAD(busy);
	int i;

	BUG_ON(!is_root_cache(s));

	/*
	 * First, shutdown active caches, i.e. caches that belong to online
	 * memory cgroups.
	 */
	arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
					lockdep_is_held(&slab_mutex));
	for_each_memcg_cache_index(i) {
		c = arr->entries[i];
		if (!c)
			continue;
		if (shutdown_cache(c))
			/*
			 * The cache still has objects. Move it to a temporary
			 * list so as not to try to destroy it for a second
			 * time while iterating over inactive caches below.
			 */
			list_move(&c->memcg_params.children_node, &busy);
		else
			/*
			 * The cache is empty and will be destroyed soon. Clear
			 * the pointer to it in the memcg_caches array so that
			 * it will never be accessed even if the root cache
			 * stays alive.
			 */
			arr->entries[i] = NULL;
	}

	/*
	 * Second, shutdown all caches left from memory cgroups that are now
	 * offline.
	 */
	list_for_each_entry_safe(c, c2, &s->memcg_params.children,
				 memcg_params.children_node)
		shutdown_cache(c);

	list_splice(&busy, &s->memcg_params.children);

	/*
	 * A cache being destroyed must be empty. In particular, this means
	 * that all per memcg caches attached to it must be empty too.
	 */
	if (!list_empty(&s->memcg_params.children))
		return -EBUSY;
	return 0;
}

static void flush_memcg_workqueue(struct kmem_cache *s)
{
	mutex_lock(&slab_mutex);
	s->memcg_params.dying = true;
	mutex_unlock(&slab_mutex);

	/*
	 * SLUB deactivates the kmem_caches through call_rcu_sched. Make
	 * sure all registered rcu callbacks have been invoked.
	 */
	if (IS_ENABLED(CONFIG_SLUB))
		rcu_barrier_sched();

	/*
	 * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB
	 * deactivates the memcg kmem_caches through workqueue. Make sure all
	 * previous workitems on workqueue are processed.
	 */
	flush_workqueue(memcg_kmem_cache_wq);
}
#else
static inline int shutdown_memcg_caches(struct kmem_cache *s)
{
	return 0;
}

static inline void flush_memcg_workqueue(struct kmem_cache *s)
{
}
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */

void slab_kmem_cache_release(struct kmem_cache *s)
{
	__kmem_cache_release(s);
	destroy_memcg_params(s);
	kfree_const(s->name);
	kmem_cache_free(kmem_cache, s);
}

void kmem_cache_destroy(struct kmem_cache *s)
{
	int err;

	if (unlikely(!s))
		return;

	flush_memcg_workqueue(s);

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);

	s->refcount--;
	if (s->refcount)
		goto out_unlock;

	err = shutdown_memcg_caches(s);
	if (!err)
		err = shutdown_cache(s);

	if (err) {
		pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
		       s->name);
		dump_stack();
	}
out_unlock:
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);

/**
 * kmem_cache_shrink - Shrink a cache.
 * @cachep: The cache to shrink.
 *
 * Releases as many slabs as possible for a cache.
 * To help debugging, a zero exit status indicates all slabs were released.
 */
int kmem_cache_shrink(struct kmem_cache *cachep)
{
	int ret;

	get_online_cpus();
	get_online_mems();
	kasan_cache_shrink(cachep);
	ret = __kmem_cache_shrink(cachep);
	put_online_mems();
	put_online_cpus();
	return ret;
}
EXPORT_SYMBOL(kmem_cache_shrink);

bool slab_is_available(void)
{
	return slab_state >= UP;
}

#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name,
		unsigned int size, slab_flags_t flags,
		unsigned int useroffset, unsigned int usersize)
{
	int err;

	s->name = name;
	s->size = s->object_size = size;
	s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
	s->useroffset = useroffset;
	s->usersize = usersize;

	slab_init_memcg_params(s);

	err = __kmem_cache_create(s, flags);

	if (err)
		panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
					name, size, err);

	s->refcount = -1;	/* Exempt from merging for now */
}

struct kmem_cache *__init create_kmalloc_cache(const char *name,
		unsigned int size, slab_flags_t flags,
		unsigned int useroffset, unsigned int usersize)
{
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);

	if (!s)
		panic("Out of memory when creating slab %s\n", name);

	create_boot_cache(s, name, size, flags, useroffset, usersize);
	list_add(&s->list, &slab_caches);
	memcg_link_cache(s);
	s->refcount = 1;
	return s;
}

struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
EXPORT_SYMBOL(kmalloc_caches);

#ifdef CONFIG_ZONE_DMA
struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
EXPORT_SYMBOL(kmalloc_dma_caches);
#endif

/*
 * Conversion table for small slabs sizes / 8 to the index in the
 * kmalloc array. This is necessary for slabs < 192 since we have non power
 * of two cache sizes there. The size of larger slabs can be determined using
 * fls.
 */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	4,	/* 16 */
	5,	/* 24 */
	5,	/* 32 */
	6,	/* 40 */
	6,	/* 48 */
	6,	/* 56 */
	6,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	7,	/* 104 */
	7,	/* 112 */
	7,	/* 120 */
	7,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static inline unsigned int size_index_elem(unsigned int bytes)
{
	return (bytes - 1) / 8;
}

/*
 * Find the kmem_cache structure that serves a given size of
 * allocation
 */
struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
	unsigned int index;

	if (unlikely(size > KMALLOC_MAX_SIZE)) {
		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
		return NULL;
	}

	if (size <= 192) {
		if (!size)
			return ZERO_SIZE_PTR;

		index = size_index[size_index_elem(size)];
	} else
		index = fls(size - 1);

#ifdef CONFIG_ZONE_DMA
	if (unlikely((flags & GFP_DMA)))
		return kmalloc_dma_caches[index];

#endif
	return kmalloc_caches[index];
}
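
/*
 * Worked example (illustrative): a 100-byte request takes the size <= 192
 * path, size_index[(100 - 1) / 8] == size_index[12] == 7, i.e. kmalloc-128;
 * a 300-byte request uses fls(300 - 1) == 9, i.e. kmalloc-512.
 */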

/*
 * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
 * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
 * kmalloc-67108864.
 */
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
	{NULL,                      0},		{"kmalloc-96",             96},
	{"kmalloc-192",           192},		{"kmalloc-8",               8},
	{"kmalloc-16",             16},		{"kmalloc-32",             32},
	{"kmalloc-64",             64},		{"kmalloc-128",           128},
	{"kmalloc-256",           256},		{"kmalloc-512",           512},
	{"kmalloc-1024",         1024},		{"kmalloc-2048",         2048},
	{"kmalloc-4096",         4096},		{"kmalloc-8192",         8192},
	{"kmalloc-16384",       16384},		{"kmalloc-32768",       32768},
	{"kmalloc-65536",       65536},		{"kmalloc-131072",     131072},
	{"kmalloc-262144",     262144},		{"kmalloc-524288",     524288},
	{"kmalloc-1048576",   1048576},		{"kmalloc-2097152",   2097152},
	{"kmalloc-4194304",   4194304},		{"kmalloc-8388608",   8388608},
	{"kmalloc-16777216", 16777216},		{"kmalloc-33554432", 33554432},
	{"kmalloc-67108864", 67108864}
};

/*
 * Patch up the size_index table if we have strange large alignment
 * requirements for the kmalloc array. This is only the case for
 * MIPS it seems. The standard arches will not generate any code here.
 *
 * Largest permitted alignment is 256 bytes due to the way we
 * handle the index determination for the smaller caches.
 *
 * Make sure that nothing crazy happens if someone starts tinkering
 * around with ARCH_KMALLOC_MINALIGN
 */
void __init setup_kmalloc_cache_index_table(void)
{
	unsigned int i;

	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));

	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
		unsigned int elem = size_index_elem(i);