// SPDX-License-Identifier: GPL-2.0
/*
 * Slab allocator functions that are independent of the allocator strategy
 *
 * (C) 2012 Christoph Lameter <cl@linux.com>
 */
#include <linux/slab.h>

#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/interrupt.h>
#include <linux/memory.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>

#define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>

#include "slab.h"

enum slab_state slab_state;
LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;

static LIST_HEAD(slab_caches_to_rcu_destroy);
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
		    slab_caches_to_rcu_destroy_workfn);

/*
 * Set of flags that will prevent slab merging
 */
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
		SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
		SLAB_FAILSLAB | SLAB_KASAN)

#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
			 SLAB_NOTRACK | SLAB_ACCOUNT)

/*
 * Merge control. If this is set then no merging of slab caches will occur.
 */
static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);

static int __init setup_slab_nomerge(char *str)
{
	slab_nomerge = true;
	return 1;
}

#ifdef CONFIG_SLUB
__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
#endif

__setup("slab_nomerge", setup_slab_nomerge);

/*
 * Determine the size of a slab object
 */
unsigned int kmem_cache_size(struct kmem_cache *s)
{
	return s->object_size;
}
EXPORT_SYMBOL(kmem_cache_size);

#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, size_t size)
{
	struct kmem_cache *s = NULL;

	if (!name || in_interrupt() || size < sizeof(void *) ||
		size > KMALLOC_MAX_SIZE) {
		pr_err("kmem_cache_create(%s) integrity check failed\n", name);
		return -EINVAL;
	}

	list_for_each_entry(s, &slab_caches, list) {
		char tmp;
		int res;

		/*
		 * This happens when the module gets unloaded and doesn't
		 * destroy its slab cache and no-one else reuses the vmalloc
		 * area of the module.  Print a warning.
		 */
		res = probe_kernel_address(s->name, tmp);
		if (res) {
			pr_err("Slab cache with size %d has lost its name\n",
			       s->object_size);
			continue;
		}
	}

	WARN_ON(strchr(name, ' '));	/* It confuses parsers */
	return 0;
}
#else
static inline int kmem_cache_sanity_check(const char *name, size_t size)
{
	return 0;
}
#endif

void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
{
	size_t i;

	for (i = 0; i < nr; i++) {
		if (s)
			kmem_cache_free(s, p[i]);
		else
			kfree(p[i]);
	}
}

int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
								void **p)
{
	size_t i;

	for (i = 0; i < nr; i++) {
		void *x = p[i] = kmem_cache_alloc(s, flags);
		if (!x) {
			__kmem_cache_free_bulk(s, i, p);
			return 0;
		}
	}
	return i;
}
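
/*
 * Editorial example, not part of the original file: the two generic bulk
 * helpers above back kmem_cache_alloc_bulk()/kmem_cache_free_bulk() when an
 * allocator does not supply an optimized version.  A caller might use the
 * bulk API roughly like this (the cache and array names are hypothetical):
 *
 *	void *objs[16];
 *
 *	if (!kmem_cache_alloc_bulk(my_cachep, GFP_KERNEL, ARRAY_SIZE(objs),
 *				   objs))
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free_bulk(my_cachep, ARRAY_SIZE(objs), objs);
 */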

#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)

LIST_HEAD(slab_root_caches);

void slab_init_memcg_params(struct kmem_cache *s)
{
	s->memcg_params.root_cache = NULL;
	RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
	INIT_LIST_HEAD(&s->memcg_params.children);
}

static int init_memcg_params(struct kmem_cache *s,
		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	struct memcg_cache_array *arr;

	if (root_cache) {
		s->memcg_params.root_cache = root_cache;
		s->memcg_params.memcg = memcg;
		INIT_LIST_HEAD(&s->memcg_params.children_node);
		INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
		return 0;
	}

	slab_init_memcg_params(s);

	if (!memcg_nr_cache_ids)
		return 0;

	arr = kvzalloc(sizeof(struct memcg_cache_array) +
		       memcg_nr_cache_ids * sizeof(void *),
		       GFP_KERNEL);
	if (!arr)
		return -ENOMEM;

	RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
	return 0;
}

static void destroy_memcg_params(struct kmem_cache *s)
{
	if (is_root_cache(s))
		kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
}

static void free_memcg_params(struct rcu_head *rcu)
{
	struct memcg_cache_array *old;

	old = container_of(rcu, struct memcg_cache_array, rcu);
	kvfree(old);
}

static int update_memcg_params(struct kmem_cache *s, int new_array_size)
{
	struct memcg_cache_array *old, *new;

	new = kvzalloc(sizeof(struct memcg_cache_array) +
		       new_array_size * sizeof(void *), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	old = rcu_dereference_protected(s->memcg_params.memcg_caches,
					lockdep_is_held(&slab_mutex));
	if (old)
		memcpy(new->entries, old->entries,
		       memcg_nr_cache_ids * sizeof(void *));

	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
	if (old)
		call_rcu(&old->rcu, free_memcg_params);
	return 0;
}

int memcg_update_all_caches(int num_memcgs)
{
	struct kmem_cache *s;
	int ret = 0;

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_root_caches, root_caches_node) {
		ret = update_memcg_params(s, num_memcgs);
		/*
		 * Instead of freeing the memory, we'll just leave the caches
		 * up to this point in an updated state.
		 */
		if (ret)
			break;
	}
	mutex_unlock(&slab_mutex);
	return ret;
}

void memcg_link_cache(struct kmem_cache *s)
{
	if (is_root_cache(s)) {
		list_add(&s->root_caches_node, &slab_root_caches);
	} else {
		list_add(&s->memcg_params.children_node,
			 &s->memcg_params.root_cache->memcg_params.children);
		list_add(&s->memcg_params.kmem_caches_node,
			 &s->memcg_params.memcg->kmem_caches);
	}
}

static void memcg_unlink_cache(struct kmem_cache *s)
{
	if (is_root_cache(s)) {
		list_del(&s->root_caches_node);
	} else {
		list_del(&s->memcg_params.children_node);
		list_del(&s->memcg_params.kmem_caches_node);
	}
}
#else
static inline int init_memcg_params(struct kmem_cache *s,
		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	return 0;
}

static inline void destroy_memcg_params(struct kmem_cache *s)
{
}

static inline void memcg_unlink_cache(struct kmem_cache *s)
{
}
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */

/*
 * Find a mergeable slab cache
 */
int slab_unmergeable(struct kmem_cache *s)
{
	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
		return 1;

	if (!is_root_cache(s))
		return 1;

	if (s->ctor)
		return 1;

	/*
	 * We may have set a slab to be unmergeable during bootstrap.
	 */
	if (s->refcount < 0)
		return 1;

	return 0;
}

struct kmem_cache *find_mergeable(size_t size, size_t align,
		slab_flags_t flags, const char *name, void (*ctor)(void *))
{
	struct kmem_cache *s;

	if (slab_nomerge)
		return NULL;

	if (ctor)
		return NULL;

	size = ALIGN(size, sizeof(void *));
	align = calculate_alignment(flags, align, size);
	size = ALIGN(size, align);
	flags = kmem_cache_flags(size, flags, name, NULL);

	if (flags & SLAB_NEVER_MERGE)
		return NULL;

	list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) {
		if (slab_unmergeable(s))
			continue;

		if (size > s->size)
			continue;

		if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
			continue;
		/*
		 * Check if alignment is compatible.
		 * Courtesy of Adrian Drzewiecki
		 */
		if ((s->size & ~(align - 1)) != s->size)
			continue;

		if (s->size - size >= sizeof(void *))
			continue;

		if (IS_ENABLED(CONFIG_SLAB) && align &&
			(align > s->align || s->align % align))
			continue;

		return s;
	}
	return NULL;
}

/*
 * Figure out what the alignment of the objects will be given a set of
 * flags, a user specified alignment and the size of the objects.
 */
unsigned long calculate_alignment(slab_flags_t flags,
		unsigned long align, unsigned long size)
{
	/*
	 * If the user wants hardware cache aligned objects then follow that
	 * suggestion if the object is sufficiently large.
	 *
	 * The hardware cache alignment cannot override the specified
	 * alignment though. If that is greater, then use it.
	 */
	if (flags & SLAB_HWCACHE_ALIGN) {
		unsigned long ralign = cache_line_size();
		while (size <= ralign / 2)
			ralign /= 2;
		align = max(align, ralign);
	}

	if (align < ARCH_SLAB_MINALIGN)
		align = ARCH_SLAB_MINALIGN;

	return ALIGN(align, sizeof(void *));
}
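
/*
 * Editorial worked example, not part of the original file: with a 64-byte
 * cache line, SLAB_HWCACHE_ALIGN and a 24-byte object, ralign starts at 64
 * and is halved once (24 <= 32), giving a 32-byte object alignment; a
 * caller-supplied alignment larger than that would be used instead.
 */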

static struct kmem_cache *create_cache(const char *name,
		size_t object_size, size_t size, size_t align,
		slab_flags_t flags, void (*ctor)(void *),
		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	struct kmem_cache *s;
	int err;

	err = -ENOMEM;
	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
	if (!s)
		goto out;

	s->name = name;
	s->object_size = object_size;
	s->size = size;
	s->align = align;
	s->ctor = ctor;

	err = init_memcg_params(s, memcg, root_cache);
	if (err)
		goto out_free_cache;

	err = __kmem_cache_create(s, flags);
	if (err)
		goto out_free_cache;

	s->refcount = 1;
	list_add(&s->list, &slab_caches);
	memcg_link_cache(s);
out:
	if (err)
		return ERR_PTR(err);
	return s;

out_free_cache:
	destroy_memcg_params(s);
	kmem_cache_free(kmem_cache, s);
	goto out;
}

/*
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
		  slab_flags_t flags, void (*ctor)(void *))
{
	struct kmem_cache *s = NULL;
	const char *cache_name;
	int err;

	get_online_cpus();
	get_online_mems();
	memcg_get_cache_ids();

	mutex_lock(&slab_mutex);

	err = kmem_cache_sanity_check(name, size);
	if (err) {
		goto out_unlock;
	}

	/* Refuse requests with allocator specific flags */
	if (flags & ~SLAB_FLAGS_PERMITTED) {
		err = -EINVAL;
		goto out_unlock;
	}

	/*
	 * Some allocators will constrain the set of valid flags to a subset
	 * of all flags. We expect them to define CACHE_CREATE_MASK in this
	 * case, and we'll just provide them with a sanitized version of the
	 * passed flags.
	 */
	flags &= CACHE_CREATE_MASK;

	s = __kmem_cache_alias(name, size, align, flags, ctor);
	if (s)
		goto out_unlock;

	cache_name = kstrdup_const(name, GFP_KERNEL);
	if (!cache_name) {
		err = -ENOMEM;
		goto out_unlock;
	}

	s = create_cache(cache_name, size, size,
			 calculate_alignment(flags, align, size),
			 flags, ctor, NULL, NULL);
	if (IS_ERR(s)) {
		err = PTR_ERR(s);
		kfree_const(cache_name);
	}

out_unlock:
	mutex_unlock(&slab_mutex);

	memcg_put_cache_ids();
	put_online_mems();
	put_online_cpus();

	if (err) {
		if (flags & SLAB_PANIC)
			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
				name, err);
		else {
			pr_warn("kmem_cache_create(%s) failed with error %d\n",
				name, err);
			dump_stack();
		}
		return NULL;
	}
	return s;
}
EXPORT_SYMBOL(kmem_cache_create);
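
/*
 * Editorial example, not part of the original file: a minimal sketch of a
 * typical kmem_cache_create() user (the struct, variable and cache names are
 * hypothetical):
 *
 *	static struct kmem_cache *foo_cachep;
 *
 *	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, NULL);
 *	if (!foo_cachep)
 *		return -ENOMEM;
 *
 *	obj = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, obj);
 *	kmem_cache_destroy(foo_cachep);
 */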

static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
{
	LIST_HEAD(to_destroy);
	struct kmem_cache *s, *s2;

	/*
	 * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
	 * @slab_caches_to_rcu_destroy list.  The slab pages are freed
	 * through RCU and the associated kmem_cache is dereferenced
	 * while freeing the pages, so the kmem_caches should be freed only
	 * after the pending RCU operations are finished.  As rcu_barrier()
	 * is a pretty slow operation, we batch all pending destructions
	 * asynchronously.
	 */
	mutex_lock(&slab_mutex);
	list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy);
	mutex_unlock(&slab_mutex);

	if (list_empty(&to_destroy))
		return;

	rcu_barrier();

	list_for_each_entry_safe(s, s2, &to_destroy, list) {
#ifdef SLAB_SUPPORTS_SYSFS
		sysfs_slab_release(s);
#else
		slab_kmem_cache_release(s);
#endif
	}
}

static int shutdown_cache(struct kmem_cache *s)
{
	/* free asan quarantined objects */
	kasan_cache_shutdown(s);

	if (__kmem_cache_shutdown(s) != 0)
		return -EBUSY;

	memcg_unlink_cache(s);
	list_del(&s->list);

	if (s->flags & SLAB_TYPESAFE_BY_RCU) {
		list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
		schedule_work(&slab_caches_to_rcu_destroy_work);
	} else {
#ifdef SLAB_SUPPORTS_SYSFS
		sysfs_slab_release(s);
#else
		slab_kmem_cache_release(s);
#endif
	}

	return 0;
}

#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
/*
 * memcg_create_kmem_cache - Create a cache for a memory cgroup.
 * @memcg: The memory cgroup the new cache is for.
 * @root_cache: The parent of the new cache.
 *
 * This function attempts to create a kmem cache that will serve allocation
 * requests going from @memcg to @root_cache. The new cache inherits properties
 * from its parent.
 */
void memcg_create_kmem_cache(struct mem_cgroup *memcg,
			     struct kmem_cache *root_cache)
{
	static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
	struct cgroup_subsys_state *css = &memcg->css;
	struct memcg_cache_array *arr;
	struct kmem_cache *s = NULL;
	char *cache_name;
	int idx;

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);

	/*
	 * The memory cgroup could have been offlined while the cache
	 * creation work was pending.
	 */
	if (memcg->kmem_state != KMEM_ONLINE)
		goto out_unlock;

	idx = memcg_cache_id(memcg);
	arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
					lockdep_is_held(&slab_mutex));

	/*
	 * Since per-memcg caches are created asynchronously on first
	 * allocation (see memcg_kmem_get_cache()), several threads can try to
	 * create the same cache, but only one of them may succeed.
	 */
	if (arr->entries[idx])
		goto out_unlock;

	cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
	cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name,
			       css->serial_nr, memcg_name_buf);
	if (!cache_name)
		goto out_unlock;

	s = create_cache(cache_name, root_cache->object_size,
			 root_cache->size, root_cache->align,
			 root_cache->flags & CACHE_CREATE_MASK,
			 root_cache->ctor, memcg, root_cache);
	/*
	 * If we could not create a memcg cache, do not complain, because
	 * that's not critical at all as we can always proceed with the root
	 * cache.
	 */
	if (IS_ERR(s)) {
		kfree(cache_name);
		goto out_unlock;
	}

	/*
	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
	 * barrier here to ensure nobody will see the kmem_cache partially
	 * initialized.
	 */
	smp_wmb();
	arr->entries[idx] = s;

out_unlock:
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}

static void kmemcg_deactivate_workfn(struct work_struct *work)
{
	struct kmem_cache *s = container_of(work, struct kmem_cache,
					    memcg_params.deact_work);

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);

	s->memcg_params.deact_fn(s);

	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();

	/* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
	css_put(&s->memcg_params.memcg->css);
}

static void kmemcg_deactivate_rcufn(struct rcu_head *head)
{
	struct kmem_cache *s = container_of(head, struct kmem_cache,
					    memcg_params.deact_rcu_head);

	/*
	 * We need to grab blocking locks.  Bounce to ->deact_work.  The
	 * work item shares the space with the RCU head and can't be
	 * initialized earlier.
	 */
	INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
	queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work);
}

/**
 * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
 *					   sched RCU grace period
 * @s: target kmem_cache
 * @deact_fn: deactivation function to call
 *
 * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
 * held after a sched RCU grace period.  The slab is guaranteed to stay
 * alive until @deact_fn is finished.  This is to be used from
 * __kmemcg_cache_deactivate().
 */
void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
					   void (*deact_fn)(struct kmem_cache *))
{
	if (WARN_ON_ONCE(is_root_cache(s)) ||
	    WARN_ON_ONCE(s->memcg_params.deact_fn))
		return;

	/* pin memcg so that @s doesn't get destroyed in the middle */
	css_get(&s->memcg_params.memcg->css);

	s->memcg_params.deact_fn = deact_fn;
	call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
}

void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
{
	int idx;
	struct memcg_cache_array *arr;
	struct kmem_cache *s, *c;

	idx = memcg_cache_id(memcg);

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);
	list_for_each_entry(s, &slab_root_caches, root_caches_node) {
		arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
						lockdep_is_held(&slab_mutex));
		c = arr->entries[idx];
		if (!c)
			continue;

		__kmemcg_cache_deactivate(c);
		arr->entries[idx] = NULL;
	}
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}

void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
	struct kmem_cache *s, *s2;

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);
	list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
				 memcg_params.kmem_caches_node) {
		/*
		 * The cgroup is about to be freed and therefore has no charges
		 * left. Hence, all its caches must be empty by now.
		 */
		BUG_ON(shutdown_cache(s));
	}
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}

static int shutdown_memcg_caches(struct kmem_cache *s)
{
	struct memcg_cache_array *arr;
	struct kmem_cache *c, *c2;
	LIST_HEAD(busy);
	int i;

	BUG_ON(!is_root_cache(s));

	/*
	 * First, shutdown active caches, i.e. caches that belong to online
	 * memory cgroups.
	 */
	arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
					lockdep_is_held(&slab_mutex));
	for_each_memcg_cache_index(i) {
		c = arr->entries[i];
		if (!c)
			continue;
		if (shutdown_cache(c))
			/*
			 * The cache still has objects. Move it to a temporary
			 * list so as not to try to destroy it for a second
			 * time while iterating over inactive caches below.
			 */
			list_move(&c->memcg_params.children_node, &busy);
		else
			/*
			 * The cache is empty and will be destroyed soon. Clear
			 * the pointer to it in the memcg_caches array so that
			 * it will never be accessed even if the root cache
			 * stays alive.
			 */
			arr->entries[i] = NULL;
	}

	/*
	 * Second, shutdown all caches left from memory cgroups that are now
	 * offline.
	 */
	list_for_each_entry_safe(c, c2, &s->memcg_params.children,
				 memcg_params.children_node)
		shutdown_cache(c);

	list_splice(&busy, &s->memcg_params.children);

	/*
	 * A cache being destroyed must be empty. In particular, this means
	 * that all per memcg caches attached to it must be empty too.
	 */
	if (!list_empty(&s->memcg_params.children))
		return -EBUSY;
	return 0;
}
#else
static inline int shutdown_memcg_caches(struct kmem_cache *s)
{
	return 0;
}
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */

void slab_kmem_cache_release(struct kmem_cache *s)
{
	__kmem_cache_release(s);
	destroy_memcg_params(s);
	kfree_const(s->name);
	kmem_cache_free(kmem_cache, s);
}

void kmem_cache_destroy(struct kmem_cache *s)
{
	int err;

	if (unlikely(!s))
		return;

	get_online_cpus();
	get_online_mems();

	mutex_lock(&slab_mutex);

	s->refcount--;
	if (s->refcount)
		goto out_unlock;

	err = shutdown_memcg_caches(s);
	if (!err)
		err = shutdown_cache(s);

	if (err) {
		pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
		       s->name);
		dump_stack();
	}
out_unlock:
	mutex_unlock(&slab_mutex);

	put_online_mems();
	put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);

/**
 * kmem_cache_shrink - Shrink a cache.
 * @cachep: The cache to shrink.
 *
 * Releases as many slabs as possible for a cache.
 * To help debugging, a zero exit status indicates all slabs were released.
 */
int kmem_cache_shrink(struct kmem_cache *cachep)
{
	int ret;

	get_online_cpus();
	get_online_mems();
	kasan_cache_shrink(cachep);
	ret = __kmem_cache_shrink(cachep);
	put_online_mems();
	put_online_cpus();
	return ret;
}
EXPORT_SYMBOL(kmem_cache_shrink);

bool slab_is_available(void)
{
	return slab_state >= UP;
}

#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
		slab_flags_t flags)
{
	int err;

	s->name = name;
	s->size = s->object_size = size;
	s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);

	slab_init_memcg_params(s);

	err = __kmem_cache_create(s, flags);

	if (err)
		panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n",
					name, size, err);

	s->refcount = -1;	/* Exempt from merging for now */
}

struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
				slab_flags_t flags)
{
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);

	if (!s)
		panic("Out of memory when creating slab %s\n", name);

	create_boot_cache(s, name, size, flags);
	list_add(&s->list, &slab_caches);
	memcg_link_cache(s);
	s->refcount = 1;
	return s;
}

struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
EXPORT_SYMBOL(kmalloc_caches);

#ifdef CONFIG_ZONE_DMA
struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
EXPORT_SYMBOL(kmalloc_dma_caches);
#endif

/*
 * Conversion table for small slab sizes / 8 to the index in the
 * kmalloc array. This is necessary for slabs < 192 since we have non power
 * of two cache sizes there. The size of larger slabs can be determined using
 * fls.
 */
static s8 size_index[24] = {
	3,	/* 8 */
	4,	/* 16 */
	5,	/* 24 */
	5,	/* 32 */
	6,	/* 40 */
	6,	/* 48 */
	6,	/* 56 */
	6,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	7,	/* 104 */
	7,	/* 112 */
	7,	/* 120 */
	7,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static inline int size_index_elem(size_t bytes)
{
	return (bytes - 1) / 8;
}
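
/*
 * Editorial worked example, not part of the original file: for kmalloc(40),
 * size_index_elem(40) = (40 - 1) / 8 = 4 and size_index[4] == 6, which
 * selects the kmalloc-64 cache; requests above 192 bytes use fls() instead,
 * e.g. fls(300 - 1) = 9 selects kmalloc-512.
 */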

/*
 * Find the kmem_cache structure that serves a given size of
 * allocation
 */
struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
	int index;

	if (unlikely(size > KMALLOC_MAX_SIZE)) {
		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
		return NULL;
	}

	if (size <= 192) {
		if (!size)
			return ZERO_SIZE_PTR;

		index = size_index[size_index_elem(size)];
	} else
		index = fls(size - 1);

#ifdef CONFIG_ZONE_DMA
	if (unlikely((flags & GFP_DMA)))
		return kmalloc_dma_caches[index];

#endif
	return kmalloc_caches[index];
}

/*
 * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
 * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
 * kmalloc-67108864.
 */
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
	{NULL,                      0},		{"kmalloc-96",             96},
	{"kmalloc-192",           192},		{"kmalloc-8",               8},
	{"kmalloc-16",             16},		{"kmalloc-32",             32},
	{"kmalloc-64",             64},		{"kmalloc-128",           128},
	{"kmalloc-256",           256},		{"kmalloc-512",           512},
	{"kmalloc-1024",         1024},		{"kmalloc-2048",         2048},
	{"kmalloc-4096",         4096},		{"kmalloc-8192",         8192},
	{"kmalloc-16384",       16384},		{"kmalloc-32768",       32768},
	{"kmalloc-65536",       65536},		{"kmalloc-131072",     131072},
	{"kmalloc-262144",     262144},		{"kmalloc-524288",     524288},
	{"kmalloc-1048576",   1048576},		{"kmalloc-2097152",   2097152},
	{"kmalloc-4194304",   4194304},		{"kmalloc-8388608",   8388608},
	{"kmalloc-16777216", 16777216},		{"kmalloc-33554432", 33554432},
	{"kmalloc-67108864", 67108864}
};

/*
 * Patch up the size_index table if we have strange large alignment
 * requirements for the kmalloc array. This is only the case for
 * MIPS, it seems. The standard arches will not generate any code here.
 *
 * Largest permitted alignment is 256 bytes due to the way we
 * handle the index determination for the smaller caches.
 *
 * Make sure that nothing crazy happens if someone starts tinkering
 * around with ARCH_KMALLOC_MINALIGN
 */
void __init setup_kmalloc_cache_index_table(void)
{
	int i;

	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));

	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
		int elem = size_index_elem(i);

		if (elem >= ARRAY_SIZE(size_index))
			break;
		size_index[elem] = KMALLOC_SHIFT_LOW;
	}

	if (KMALLOC_MIN_SIZE >= 64) {
		/*
		 * The 96 byte size cache is not used if the alignment
		 * is 64 byte.
		 */
		for (i = 64 + 8; i <= 96; i += 8)
			size_index[size_index_elem(i)] = 7;

	}

	if (KMALLOC_MIN_SIZE >= 128) {
		/*
		 * The 192 byte sized cache is not used if the alignment
		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
		 * instead.
		 */
		for (i = 128 + 8; i <= 192; i += 8)
			size_index[size_index_elem(i)] = 8;
	}
}

static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
{
	kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
					kmalloc_info[idx].size, flags);
}

/*
 * Create the kmalloc array. Some of the regular kmalloc arrays
 * may already have been created because they were needed to
 * enable allocations for slab creation.
 */
void __init create_kmalloc_caches(slab_flags_t flags)
{
	int i;

	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
		if (!kmalloc_caches[i])
			new_kmalloc_cache(i, flags);

		/*
		 * Caches that are not of the two-to-the-power-of size.
		 * These have to be created immediately after the
		 * earlier power of two caches
		 */
		if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
			new_kmalloc_cache(1, flags);
		if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
			new_kmalloc_cache(2, flags);
	}

	/* Kmalloc array is now usable */
	slab_state = UP;

#ifdef CONFIG_ZONE_DMA
	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
		struct kmem_cache *s = kmalloc_caches[i];

		if (s) {
			int size = kmalloc_size(i);
			char *n = kasprintf(GFP_NOWAIT,
				 "dma-kmalloc-%d", size);

			BUG_ON(!n);
			kmalloc_dma_caches[i] = create_kmalloc_cache(n,
				size, SLAB_CACHE_DMA | flags);
		}
	}
#endif
}
#endif /* !CONFIG_SLOB */

/*
 * To avoid unnecessary overhead, we pass through large allocation requests
 * directly to the page allocator. We use __GFP_COMP, because we will need to
 * know the allocation order to free the pages properly in kfree.
 */
void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
{
	void *ret;
	struct page *page;

	flags |= __GFP_COMP;
	page = alloc_pages(flags, order);
	ret = page ? page_address(page) : NULL;
	kmemleak_alloc(ret, size, 1, flags);
	kasan_kmalloc_large(ret, size, flags);
	return ret;
}
EXPORT_SYMBOL(kmalloc_order);
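
/*
 * Editorial note, not part of the original file: kfree() recovers the
 * allocation order from the compound head page, which is why __GFP_COMP is
 * forced above.  For example, with 4K pages a 70000-byte request is rounded
 * up to an order-5 (128K) page allocation.
 */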

#ifdef CONFIG_TRACING
void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
	void *ret = kmalloc_order(size, flags, order);
	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
	return ret;
}
EXPORT_SYMBOL(kmalloc_order_trace);
#endif

#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Randomize a generic freelist */
static void freelist_randomize(struct rnd_state *state, unsigned int *list,
			size_t count)
{
	size_t i;
	unsigned int rand;

	for (i = 0; i < count; i++)
		list[i] = i;

	/* Fisher-Yates shuffle */
	for (i = count - 1; i > 0; i--) {
		rand = prandom_u32_state(state);
		rand %= (i + 1);
		swap(list[i], list[rand]);
	}
}
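
/*
 * Editorial worked example, not part of the original file: for count = 4 the
 * list starts as {0, 1, 2, 3}; each step i swaps list[i] with list[rand] for
 * a uniformly chosen rand in [0, i], so any permutation, e.g. {2, 0, 3, 1},
 * can result.
 */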

/* Create a random sequence per cache */
int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
				    gfp_t gfp)
{
	struct rnd_state state;

	if (count < 2 || cachep->random_seq)
		return 0;

	cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
	if (!cachep->random_seq)
		return -ENOMEM;

	/* Get best entropy at this stage of boot */
	prandom_seed_state(&state, get_random_long());

	freelist_randomize(&state, cachep->random_seq, count);
	return 0;
}

/* Destroy the per-cache random freelist sequence */
void cache_random_seq_destroy(struct kmem_cache *cachep)
{
	kfree(cachep->random_seq);
	cachep->random_seq = NULL;
}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */

#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
#ifdef CONFIG_SLAB
#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
#else
#define SLABINFO_RIGHTS S_IRUSR
#endif

static void print_slabinfo_header(struct seq_file *m)
{
	/*
	 * Output format version, so at least we can change it
	 * without _too_ many complaints.
	 */
#ifdef CONFIG_DEBUG_SLAB
	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
#else
	seq_puts(m, "slabinfo - version: 2.1\n");
#endif
	seq_puts(m, "# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
	seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
	seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#ifdef CONFIG_DEBUG_SLAB
	seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
	seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
	seq_putc(m, '\n');
}

void *slab_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&slab_mutex);