// SPDX-License-Identifier: GPL-2.0
/*
 * Slab allocator functions that are independent of the allocator strategy
 *
 * (C) 2012 Christoph Lameter <cl@linux.com>
 */
#include <linux/slab.h>

#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/interrupt.h>
#include <linux/memory.h>
#include <linux/cache.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>

#define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>

#include "slab.h"

enum slab_state slab_state;
LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
#ifdef CONFIG_HARDENED_USERCOPY
bool usercopy_fallback __ro_after_init =
		IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK);
module_param(usercopy_fallback, bool, 0400);
MODULE_PARM_DESC(usercopy_fallback,
		"WARN instead of reject usercopy whitelist violations");
#endif

static LIST_HEAD(slab_caches_to_rcu_destroy);
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
		    slab_caches_to_rcu_destroy_workfn);

/*
 * Set of flags that will prevent slab merging
 */
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
		SLAB_FAILSLAB | SLAB_KASAN)

#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
			 SLAB_ACCOUNT)

/*
 * Merge control. If this is set then no merging of slab caches will occur.
 */
static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);

static int __init setup_slab_nomerge(char *str)
{
	slab_nomerge = true;
	return 1;
}

#ifdef CONFIG_SLUB
__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
#endif

__setup("slab_nomerge", setup_slab_nomerge);

/*
 * Determine the size of a slab object
 */
unsigned int kmem_cache_size(struct kmem_cache *s)
{
	return s->object_size;
}
EXPORT_SYMBOL(kmem_cache_size);

#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
	if (!name || in_interrupt() || size < sizeof(void *) ||
		size > KMALLOC_MAX_SIZE) {
		pr_err("kmem_cache_create(%s) integrity check failed\n", name);
		return -EINVAL;
	}

	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	return 0;
}
#else
static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
{
	return 0;
}
#endif

void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
{
	size_t i;

	for (i = 0; i < nr; i++) {
		if (s)
			kmem_cache_free(s, p[i]);
		else
			kfree(p[i]);
	}
}

int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
								void **p)
{
	size_t i;

	for (i = 0; i < nr; i++) {
		void *x = p[i] = kmem_cache_alloc(s, flags);
		if (!x) {
			__kmem_cache_free_bulk(s, i, p);
			return 0;
		}
	}
	return i;
}
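
/*
 * Usage sketch added by the editor (illustrative only): these are the
 * generic fallbacks behind the public bulk API, which a caller would use
 * roughly as follows (cachep is a hypothetical cache, error handling
 * elided):
 *
 *	void *objs[16];
 *
 *	if (kmem_cache_alloc_bulk(cachep, GFP_KERNEL, 16, objs) != 16)
 *		return -ENOMEM;	// nothing is left allocated on failure
 *	...
 *	kmem_cache_free_bulk(cachep, 16, objs);
 *
 * Note the all-or-nothing semantics above: on failure the partial
 * allocation is rolled back via __kmem_cache_free_bulk() and 0 is returned.
 */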

#ifdef CONFIG_MEMCG_KMEM

LIST_HEAD(slab_root_caches);

void slab_init_memcg_params(struct kmem_cache *s)
{
	s->memcg_params.root_cache = NULL;
	RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
	INIT_LIST_HEAD(&s->memcg_params.children);
	s->memcg_params.dying = false;
}

static int init_memcg_params(struct kmem_cache *s,
		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	struct memcg_cache_array *arr;

	if (root_cache) {
		s->memcg_params.root_cache = root_cache;
		s->memcg_params.memcg = memcg;
		INIT_LIST_HEAD(&s->memcg_params.children_node);
		INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
		return 0;
	}

	slab_init_memcg_params(s);

	if (!memcg_nr_cache_ids)
		return 0;

	arr = kvzalloc(sizeof(struct memcg_cache_array) +
		       memcg_nr_cache_ids * sizeof(void *),
		       GFP_KERNEL);
	if (!arr)
		return -ENOMEM;

	RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
	return 0;
}

static void destroy_memcg_params(struct kmem_cache *s)
{
	if (is_root_cache(s))
		kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
}

static void free_memcg_params(struct rcu_head *rcu)
{
	struct memcg_cache_array *old;

	old = container_of(rcu, struct memcg_cache_array, rcu);
	kvfree(old);
}

static int update_memcg_params(struct kmem_cache *s, int new_array_size)
{
	struct memcg_cache_array *old, *new;

	new = kvzalloc(sizeof(struct memcg_cache_array) +
		       new_array_size * sizeof(void *), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	old = rcu_dereference_protected(s->memcg_params.memcg_caches,
					lockdep_is_held(&slab_mutex));
	if (old)
		memcpy(new->entries, old->entries,
		       memcg_nr_cache_ids * sizeof(void *));

	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
	if (old)
		call_rcu(&old->rcu, free_memcg_params);
	return 0;
}

int memcg_update_all_caches(int num_memcgs)
{
	struct kmem_cache *s;
	int ret = 0;

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_root_caches, root_caches_node) {
		ret = update_memcg_params(s, num_memcgs);
		/*
		 * Instead of freeing the memory, we'll just leave the caches
		 * up to this point in an updated state.
		 */
		if (ret)
			break;
	}
	mutex_unlock(&slab_mutex);
	return ret;
}

void memcg_link_cache(struct kmem_cache *s)
{
	if (is_root_cache(s)) {
		list_add(&s->root_caches_node, &slab_root_caches);
	} else {
		list_add(&s->memcg_params.children_node,
			 &s->memcg_params.root_cache->memcg_params.children);
		list_add(&s->memcg_params.kmem_caches_node,
			 &s->memcg_params.memcg->kmem_caches);
	}
}

static void memcg_unlink_cache(struct kmem_cache *s)
{
	if (is_root_cache(s)) {
		list_del(&s->root_caches_node);
	} else {
		list_del(&s->memcg_params.children_node);
		list_del(&s->memcg_params.kmem_caches_node);
	}
}
#else
static inline int init_memcg_params(struct kmem_cache *s,
		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	return 0;
}

static inline void destroy_memcg_params(struct kmem_cache *s)
{
}

static inline void memcg_unlink_cache(struct kmem_cache *s)
{
}
#endif /* CONFIG_MEMCG_KMEM */

/*
 * Figure out what the alignment of the objects will be given a set of
 * flags, a user specified alignment and the size of the objects.
 */
static unsigned int calculate_alignment(slab_flags_t flags,
		unsigned int align, unsigned int size)
{
	/*
	 * If the user wants hardware cache aligned objects then follow that
	 * suggestion if the object is sufficiently large.
	 *
	 * The hardware cache alignment cannot override the specified
	 * alignment though. If that is greater, then use it.
	 */
	if (flags & SLAB_HWCACHE_ALIGN) {
		unsigned int ralign;

		ralign = cache_line_size();
		while (size <= ralign / 2)
			ralign /= 2;
		align = max(align, ralign);
	}

	if (align < ARCH_SLAB_MINALIGN)
		align = ARCH_SLAB_MINALIGN;

	return ALIGN(align, sizeof(void *));
}
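
/*
 * Worked example added by the editor (illustrative only, assuming a
 * 64-byte cache line and ARCH_SLAB_MINALIGN <= 8): for a caller passing
 * SLAB_HWCACHE_ALIGN with align = 8 and size = 24,
 *
 *	ralign = 64;		// cache_line_size()
 *	// 24 <= 32 halves ralign once: 64 -> 32; 24 <= 16 is false, stop
 *	align = max(8, 32);	// 32
 *	return ALIGN(32, sizeof(void *));	// 32
 *
 * i.e. small objects are padded only to the smallest power-of-two fraction
 * of the cache line that still holds them, not to a full cache line.
 */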

/*
 * Find a mergeable slab cache
 */
int slab_unmergeable(struct kmem_cache *s)
{
	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
		return 1;

	if (!is_root_cache(s))
		return 1;

	if (s->ctor)
		return 1;

	if (s->usersize)
		return 1;

	/*
	 * We may have set a slab to be unmergeable during bootstrap.
	 */
	if (s->refcount < 0)
		return 1;

	return 0;
}

struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
		slab_flags_t flags, const char *name, void (*ctor)(void *))
{
	struct kmem_cache *s;

	if (slab_nomerge)
		return NULL;

	if (ctor)
		return NULL;

	size = ALIGN(size, sizeof(void *));
	align = calculate_alignment(flags, align, size);
	size = ALIGN(size, align);
	flags = kmem_cache_flags(size, flags, name, NULL);

	if (flags & SLAB_NEVER_MERGE)
		return NULL;

	list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) {
		if (slab_unmergeable(s))
			continue;

		if (size > s->size)
			continue;

		if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
			continue;
		/*
		 * Check if alignment is compatible.
		 * Courtesy of Adrian Drzewiecki
		 */
		if ((s->size & ~(align - 1)) != s->size)
			continue;

		if (s->size - size >= sizeof(void *))
			continue;

		if (IS_ENABLED(CONFIG_SLAB) && align &&
			(align > s->align || s->align % align))
			continue;

		return s;
	}
	return NULL;
}
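
/*
 * Illustrative example (editor's sketch, not part of the original file):
 * suppose two hypothetical callers each create a cache with
 *
 *	kmem_cache_create("foo_a", 60, 0, 0, NULL);
 *	kmem_cache_create("foo_b", 64, 0, 0, NULL);
 *
 * Both sizes round up to 64 with identical flags, no constructor and no
 * usercopy region, so the second request is normally satisfied by aliasing
 * the first cache rather than creating a new one: all the checks above
 * pass and less than sizeof(void *) per object would be wasted.  Booting
 * with slab_nomerge (or slub_nomerge) disables this behaviour.
 */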

static struct kmem_cache *create_cache(const char *name,
		unsigned int object_size, unsigned int align,
		slab_flags_t flags, unsigned int useroffset,
		unsigned int usersize, void (*ctor)(void *),
		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	struct kmem_cache *s;
	int err;

	if (WARN_ON(useroffset + usersize > object_size))
		useroffset = usersize = 0;

	err = -ENOMEM;
	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
	if (!s)
		goto out;

	s->name = name;
	s->size = s->object_size = object_size;
	s->align = align;
	s->ctor = ctor;
	s->useroffset = useroffset;
	s->usersize = usersize;

	err = init_memcg_params(s, memcg, root_cache);
	if (err)
		goto out_free_cache;

	err = __kmem_cache_create(s, flags);
	if (err)
		goto out_free_cache;

	s->refcount = 1;
	list_add(&s->list, &slab_caches);
	memcg_link_cache(s);
out:
	if (err)
		return ERR_PTR(err);
	return s;

out_free_cache:
	destroy_memcg_params(s);
	kmem_cache_free(kmem_cache, s);
	goto out;
}

/*
 * kmem_cache_create_usercopy - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @useroffset: Usercopy region offset
 * @usersize: Usercopy region size
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create_usercopy(const char *name,
		  unsigned int size, unsigned int align,
		  slab_flags_t flags,
		  unsigned int useroffset, unsigned int usersize,
		  void (*ctor)(void *))
{
	struct kmem_cache *s = NULL;
	const char *cache_name;
	int err;

	get_online_cpus();
	get_online_mems();
	memcg_get_cache_ids();

	mutex_lock(&slab_mutex);

	err = kmem_cache_sanity_check(name, size);
	if (err) {
		goto out_unlock;
	}

	/* Refuse requests with allocator specific flags */
	if (flags & ~SLAB_FLAGS_PERMITTED) {
		err = -EINVAL;
		goto out_unlock;
	}

	/*
	 * Some allocators will constrain the set of valid flags to a subset
	 * of all flags. We expect them to define CACHE_CREATE_MASK in this
	 * case, and we'll just provide them with a sanitized version of the
	 * passed flags.
	 */
	flags &= CACHE_CREATE_MASK;

	/* Fail closed on bad usersize or useroffset values. */
	if (WARN_ON(!usersize && useroffset) ||
	    WARN_ON(size < usersize || size - usersize < useroffset))
		usersize = useroffset = 0;

	if (!usersize)
		s = __kmem_cache_alias(name, size, align, flags, ctor);
	if (s)
		goto out_unlock;

	cache_name = kstrdup_const(name, GFP_KERNEL);
	if (!cache_name) {
		err = -ENOMEM;
		goto out_unlock;
	}

	s = create_cache(cache_name, size,
			 calculate_alignment(flags, align, size),
			 flags, useroffset, usersize, ctor, NULL, NULL);
	if (IS_ERR(s)) {
		err = PTR_ERR(s);
		kfree_const(cache_name);
	}

out_unlock:
	mutex_unlock(&slab_mutex);

	memcg_put_cache_ids();
	put_online_mems();
	put_online_cpus();

	if (err) {
		if (flags & SLAB_PANIC)
			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
				name, err);
		else {
			pr_warn("kmem_cache_create(%s) failed with error %d\n",
				name, err);
			dump_stack();
		}
		return NULL;
	}
	return s;
}
EXPORT_SYMBOL(kmem_cache_create_usercopy);

struct kmem_cache *
kmem_cache_create(const char *name, unsigned int size, unsigned int align,
		slab_flags_t flags, void (*ctor)(void *))
{
	return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
					  ctor);
}
EXPORT_SYMBOL(kmem_cache_create);
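
/*
 * Illustrative usage sketch added by the editor (not part of the original
 * file).  A hypothetical driver keeping "struct foo" objects would
 * typically create and use a cache like this:
 *
 *	static struct kmem_cache *foo_cachep;
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
 *				       0, SLAB_HWCACHE_ALIGN, NULL);
 *	if (!foo_cachep)
 *		return -ENOMEM;
 *
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, f);
 *	kmem_cache_destroy(foo_cachep);
 *
 * kmem_cache_create_usercopy() is used instead when, under
 * CONFIG_HARDENED_USERCOPY, only a sub-region of the object may
 * legitimately be copied to/from user space; useroffset/usersize bound
 * that region.
 */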

static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
{
	LIST_HEAD(to_destroy);
	struct kmem_cache *s, *s2;

	/*
	 * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
	 * @slab_caches_to_rcu_destroy list.  The slab pages are freed
	 * through RCU and the associated kmem_caches are dereferenced
	 * while freeing the pages, so the kmem_caches should be freed only
	 * after the pending RCU operations are finished.  As rcu_barrier()
	 * is a pretty slow operation, we batch all pending destructions
	 * asynchronously.
	 */
	mutex_lock(&slab_mutex);
	list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy);
	mutex_unlock(&slab_mutex);

	if (list_empty(&to_destroy))
		return;

	rcu_barrier();

	list_for_each_entry_safe(s, s2, &to_destroy, list) {
#ifdef SLAB_SUPPORTS_SYSFS
		sysfs_slab_release(s);
#else
		slab_kmem_cache_release(s);
#endif
	}
}

static int shutdown_cache(struct kmem_cache *s)
{
	/* free asan quarantined objects */
	kasan_cache_shutdown(s);

	if (__kmem_cache_shutdown(s) != 0)
		return -EBUSY;

	memcg_unlink_cache(s);
	list_del(&s->list);

	if (s->flags & SLAB_TYPESAFE_BY_RCU) {
#ifdef SLAB_SUPPORTS_SYSFS
		sysfs_slab_unlink(s);
#endif
		list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
		schedule_work(&slab_caches_to_rcu_destroy_work);
	} else {
#ifdef SLAB_SUPPORTS_SYSFS
		sysfs_slab_unlink(s);
		sysfs_slab_release(s);
#else
		slab_kmem_cache_release(s);
#endif
	}

	return 0;
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * memcg_create_kmem_cache - Create a cache for a memory cgroup.
 * @memcg: The memory cgroup the new cache is for.
 * @root_cache: The parent of the new cache.
 *
 * This function attempts to create a kmem cache that will serve allocation
 * requests going from @memcg to @root_cache. The new cache inherits properties
 * from its parent.
 */
void memcg_create_kmem_cache(struct mem_cgroup *memcg,
			     struct kmem_cache *root_cache)
{
	static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
	struct cgroup_subsys_state *css = &memcg->css;
	struct memcg_cache_array *arr;
	struct kmem_cache *s = NULL;
	char *cache_name;
	int idx;

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);

	/*
	 * The memory cgroup could have been offlined while the cache
	 * creation work was pending.
	 */
	if (memcg->kmem_state != KMEM_ONLINE || root_cache->memcg_params.dying)
		goto out_unlock;

	idx = memcg_cache_id(memcg);
	arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
					lockdep_is_held(&slab_mutex));

	/*
	 * Since per-memcg caches are created asynchronously on first
	 * allocation (see memcg_kmem_get_cache()), several threads can try to
	 * create the same cache, but only one of them may succeed.
	 */
	if (arr->entries[idx])
		goto out_unlock;

	cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
	cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name,
			       css->serial_nr, memcg_name_buf);
	if (!cache_name)
		goto out_unlock;

	s = create_cache(cache_name, root_cache->object_size,
			 root_cache->align,
			 root_cache->flags & CACHE_CREATE_MASK,
			 root_cache->useroffset, root_cache->usersize,
			 root_cache->ctor, memcg, root_cache);
	/*
	 * If we could not create a memcg cache, do not complain, because
	 * that's not critical at all as we can always proceed with the root
	 * cache.
	 */
	if (IS_ERR(s)) {
		kfree(cache_name);
		goto out_unlock;
	}

	/*
	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
	 * barrier here to ensure nobody will see the kmem_cache partially
	 * initialized.
	 */
	smp_wmb();
	arr->entries[idx] = s;

out_unlock:
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}

static void kmemcg_deactivate_workfn(struct work_struct *work)
{
	struct kmem_cache *s = container_of(work, struct kmem_cache,
					    memcg_params.deact_work);

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);

	s->memcg_params.deact_fn(s);

	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();

	/* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
	css_put(&s->memcg_params.memcg->css);
}

static void kmemcg_deactivate_rcufn(struct rcu_head *head)
{
	struct kmem_cache *s = container_of(head, struct kmem_cache,
					    memcg_params.deact_rcu_head);

	/*
	 * We need to grab blocking locks.  Bounce to ->deact_work.  The
	 * work item shares the space with the RCU head and can't be
	 * initialized earlier.
	 */
	INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
	queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work);
}

/**
 * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
 *					   sched RCU grace period
 * @s: target kmem_cache
 * @deact_fn: deactivation function to call
 *
 * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
 * held after a sched RCU grace period.  The slab is guaranteed to stay
 * alive until @deact_fn is finished.  This is to be used from
 * __kmemcg_cache_deactivate().
 */
void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
					   void (*deact_fn)(struct kmem_cache *))
{
	if (WARN_ON_ONCE(is_root_cache(s)) ||
	    WARN_ON_ONCE(s->memcg_params.deact_fn))
		return;

	if (s->memcg_params.root_cache->memcg_params.dying)
		return;

	/* pin memcg so that @s doesn't get destroyed in the middle */
	css_get(&s->memcg_params.memcg->css);

	s->memcg_params.deact_fn = deact_fn;
	call_rcu(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
}

void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
{
	int idx;
	struct memcg_cache_array *arr;
	struct kmem_cache *s, *c;

	idx = memcg_cache_id(memcg);

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_root_caches, root_caches_node) {
		arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
						lockdep_is_held(&slab_mutex));
		c = arr->entries[idx];
		if (!c)
			continue;

		__kmemcg_cache_deactivate(c);
		arr->entries[idx] = NULL;
	}
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}

void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
	struct kmem_cache *s, *s2;

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);
	list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
				 memcg_params.kmem_caches_node) {
		/*
		 * The cgroup is about to be freed and therefore has no charges
		 * left. Hence, all its caches must be empty by now.
		 */
		BUG_ON(shutdown_cache(s));
	}
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}

static int shutdown_memcg_caches(struct kmem_cache *s)
{
	struct memcg_cache_array *arr;
	struct kmem_cache *c, *c2;
	LIST_HEAD(busy);
	int i;

	BUG_ON(!is_root_cache(s));

	/*
	 * First, shutdown active caches, i.e. caches that belong to online
	 * memory cgroups.
	 */
	arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
					lockdep_is_held(&slab_mutex));
	for_each_memcg_cache_index(i) {
		c = arr->entries[i];
		if (!c)
			continue;
		if (shutdown_cache(c))
			/*
			 * The cache still has objects. Move it to a temporary
			 * list so as not to try to destroy it for a second
			 * time while iterating over inactive caches below.
			 */
			list_move(&c->memcg_params.children_node, &busy);
		else
			/*
			 * The cache is empty and will be destroyed soon. Clear
			 * the pointer to it in the memcg_caches array so that
			 * it will never be accessed even if the root cache
			 * stays alive.
			 */
			arr->entries[i] = NULL;
	}

	/*
	 * Second, shutdown all caches left from memory cgroups that are now
	 * offline.
	 */
	list_for_each_entry_safe(c, c2, &s->memcg_params.children,
				 memcg_params.children_node)
		shutdown_cache(c);

	list_splice(&busy, &s->memcg_params.children);

	/*
	 * A cache being destroyed must be empty. In particular, this means
	 * that all per memcg caches attached to it must be empty too.
	 */
	if (!list_empty(&s->memcg_params.children))
		return -EBUSY;
	return 0;
}

static void flush_memcg_workqueue(struct kmem_cache *s)
{
	mutex_lock(&slab_mutex);
	s->memcg_params.dying = true;
	mutex_unlock(&slab_mutex);

	/*
	 * SLUB deactivates the kmem_caches through call_rcu. Make
	 * sure all registered rcu callbacks have been invoked.
	 */
	if (IS_ENABLED(CONFIG_SLUB))
		rcu_barrier();

	/*
	 * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB
	 * deactivates the memcg kmem_caches through workqueue. Make sure all
	 * previous workitems on workqueue are processed.
	 */
	flush_workqueue(memcg_kmem_cache_wq);
}
#else
static inline int shutdown_memcg_caches(struct kmem_cache *s)
{
	return 0;
}

static inline void flush_memcg_workqueue(struct kmem_cache *s)
{
}
#endif /* CONFIG_MEMCG_KMEM */

void slab_kmem_cache_release(struct kmem_cache *s)
{
	__kmem_cache_release(s);
	destroy_memcg_params(s);
	kfree_const(s->name);
	kmem_cache_free(kmem_cache, s);
}

void kmem_cache_destroy(struct kmem_cache *s)
{
	int err;

	if (unlikely(!s))
		return;

	flush_memcg_workqueue(s);

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);

	s->refcount--;
	if (s->refcount)
		goto out_unlock;

	err = shutdown_memcg_caches(s);
	if (!err)
		err = shutdown_cache(s);

	if (err) {
		pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
		       s->name);
		dump_stack();
	}
out_unlock:
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);

/**
 * kmem_cache_shrink - Shrink a cache.
 * @cachep: The cache to shrink.
 *
 * Releases as many slabs as possible for a cache.
 * To help debugging, a zero exit status indicates all slabs were released.
 */
int kmem_cache_shrink(struct kmem_cache *cachep)
{
	int ret;

	get_online_cpus();
	get_online_mems();
	kasan_cache_shrink(cachep);
	ret = __kmem_cache_shrink(cachep);
	put_online_mems();
	put_online_cpus();
	return ret;
}
EXPORT_SYMBOL(kmem_cache_shrink);

bool slab_is_available(void)
{
	return slab_state >= UP;
}

#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name,
		unsigned int size, slab_flags_t flags,
		unsigned int useroffset, unsigned int usersize)
{
	int err;

	s->name = name;
	s->size = s->object_size = size;
	s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
	s->useroffset = useroffset;
	s->usersize = usersize;

	slab_init_memcg_params(s);

	err = __kmem_cache_create(s, flags);

	if (err)
		panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
					name, size, err);

	s->refcount = -1;	/* Exempt from merging for now */
}

struct kmem_cache *__init create_kmalloc_cache(const char *name,
		unsigned int size, slab_flags_t flags,
		unsigned int useroffset, unsigned int usersize)
{
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);

	if (!s)
		panic("Out of memory when creating slab %s\n", name);

	create_boot_cache(s, name, size, flags, useroffset, usersize);
	list_add(&s->list, &slab_caches);
	memcg_link_cache(s);
	s->refcount = 1;
	return s;
}

struct kmem_cache *
kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
EXPORT_SYMBOL(kmalloc_caches);

/*
 * Conversion table for small slabs sizes / 8 to the index in the
 * kmalloc array. This is necessary for slabs < 192 since we have non power
 * of two cache sizes there. The size of larger slabs can be determined using
 * fls.
 */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	4,	/* 16 */
	5,	/* 24 */
	5,	/* 32 */
	6,	/* 40 */
	6,	/* 48 */
	6,	/* 56 */
	6,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	7,	/* 104 */
	7,	/* 112 */
	7,	/* 120 */
	7,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static inline unsigned int size_index_elem(unsigned int bytes)
{
	return (bytes - 1) / 8;
}

/*
 * Find the kmem_cache structure that serves a given size of
 * allocation
 */
struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
	unsigned int index;

	if (size <= 192) {
		if (!size)
			return ZERO_SIZE_PTR;

		index = size_index[size_index_elem(size)];
	} else {
		if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
			WARN_ON(1);
			return NULL;
		}
		index = fls(size - 1);
	}

	return kmalloc_caches[kmalloc_type(flags)][index];
}
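
/*
 * Worked example added by the editor (illustrative only): for
 * kmalloc(100, GFP_KERNEL) the small-size path is taken, since 100 <= 192:
 *
 *	size_index_elem(100) = (100 - 1) / 8 = 12
 *	size_index[12]       = 7		// the "104" bucket below 128
 *	-> kmalloc_caches[KMALLOC_NORMAL][7]	// i.e. "kmalloc-128"
 *
 * For kmalloc(1000, GFP_KERNEL) the fls() path is taken instead:
 *	index = fls(999) = 10 -> "kmalloc-1k".
 * kmalloc_type() selects the NORMAL, RECLAIM or DMA variant of the array
 * based on the gfp flags.
 */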

/*
 * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
 * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
 * kmalloc-67108864.
 */
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
	{NULL,                      0},		{"kmalloc-96",             96},
	{"kmalloc-192",           192},		{"kmalloc-8",               8},
	{"kmalloc-16",             16},		{"kmalloc-32",             32},
	{"kmalloc-64",             64},		{"kmalloc-128",           128},
	{"kmalloc-256",           256},		{"kmalloc-512",           512},
	{"kmalloc-1k",           1024},		{"kmalloc-2k",           2048},
	{"kmalloc-4k",           4096},		{"kmalloc-8k",           8192},
	{"kmalloc-16k",         16384},		{"kmalloc-32k",         32768},
	{"kmalloc-64k",         65536},		{"kmalloc-128k",       131072},
	{"kmalloc-256k",       262144},		{"kmalloc-512k",       524288},
	{"kmalloc-1M",        1048576},		{"kmalloc-2M",        2097152},
	{"kmalloc-4M",        4194304},		{"kmalloc-8M",        8388608},
	{"kmalloc-16M",      16777216},		{"kmalloc-32M",      33554432},
	{"kmalloc-64M",      67108864}
};

/*
 * Patch up the size_index table if we have strange large alignment
 * requirements for the kmalloc array. This is only the case for
 * MIPS it seems. The standard arches will not generate any code here.
 *
 * Largest permitted alignment is 256 bytes due to the way we
 * handle the index determination for the smaller caches.
 *
 * Make sure that nothing crazy happens if someone starts tinkering
 * around with ARCH_KMALLOC_MINALIGN
 */
void __init setup_kmalloc_cache_index_table(void)
{
	unsigned int i;

	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));

	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
		unsigned int elem = size_index_elem(i);

		if (elem >= ARRAY_SIZE(size_index))