/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011  Nitin Gupta
 * Copyright (C) 2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 */

/*
 * Following is how we use various fields and flags of underlying
 * struct page(s) to form a zspage.
 *
 * Usage of struct page fields:
 *	page->private: points to zspage
 *	page->freelist(index): links together all component pages of a zspage
 *		For the huge page, this is always 0, so we use this field
 *		to store handle.
 *	page->units: first object offset in a subpage of zspage
 *
 * Usage of struct page flags:
 *	PG_private: identifies the first component page
 *	PG_owner_priv_1: identifies the huge component page
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/magic.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/shrinker.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
#include <linux/mount.h>
#include <linux/migrate.h>
#include <linux/pagemap.h>
#include <linux/fs.h>

#define ZSPAGE_MAGIC	0x58

/*
 * This must be a power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page which avoids complex case of mapping 2 pages simply
 * to restore link_free pointer values.
 */
#define ZS_ALIGN		8

/*
 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
 */
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
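
/*
 * Worked example (illustrative note, not part of the original source): with
 * ZS_MAX_ZSPAGE_ORDER == 2 above, ZS_MAX_PAGES_PER_ZSPAGE is 1UL << 2 == 4,
 * so a single zspage chains together at most four order-0 pages.
 */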

#define ZS_HANDLE_SIZE (sizeof(unsigned long))

/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (unsigned long) handle value.
 *
 * Note that object index <obj_idx> starts from 0.
 *
 * This is made more complicated by various memory models and PAE.
 */

#ifndef MAX_POSSIBLE_PHYSMEM_BITS
#ifdef MAX_PHYSMEM_BITS
#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
#else
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT
 */
#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif

#define _PFN_BITS		(MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)

/*
 * The memory allocated for a handle keeps the object position by
 * encoding <page, obj_idx>, and the encoded value has room
 * in its least significant bit (ie, look at obj_to_location).
 * We use that bit to synchronize object access between the
 * user and migration.
 */
#define HANDLE_PIN_BIT	0

/*
 * The head of an allocated object should have OBJ_ALLOCATED_TAG
 * to identify whether the object was allocated or not.
 * It's okay to keep the status bit in the least significant bit
 * because the header keeps a handle, which is a 4-byte-aligned
 * address, so we have room for at least two bits.
 */
#define OBJ_ALLOCATED_TAG 1
#define OBJ_TAG_BITS 1
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
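
/*
 * Worked example (illustrative, assuming a 64-bit kernel with 4K pages and
 * MAX_POSSIBLE_PHYSMEM_BITS == 46, which is only an example value):
 *	_PFN_BITS      = 46 - 12 = 34
 *	OBJ_INDEX_BITS = 64 - 34 - 1 = 29
 * so an encoded value carries the PFN in the upper bits, a 29-bit object
 * index below it, and the single OBJ_TAG_BITS tag bit at the bottom.
 */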

#define FULLNESS_BITS	2
#define CLASS_BITS	8
#define ISOLATED_BITS	3
#define MAGIC_VAL_BITS	8

#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
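
/*
 * Worked example (illustrative, assuming 4K pages and the 64-bit layout
 * sketched above): ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS
 * is (4 << 12) >> 29 == 0, so ZS_MIN_ALLOC_SIZE falls back to the MAX()
 * floor of 32 bytes, while ZS_MAX_ALLOC_SIZE is one full page (4096 bytes).
 */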

/*
 * On systems with 4K page size, this gives 255 size classes! There is a
 * trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - Probably it's better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 *  (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> CLASS_BITS)
#define ZS_SIZE_CLASSES	(DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
				      ZS_SIZE_CLASS_DELTA) + 1)
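
/*
 * Worked example (illustrative, assuming 4K pages and CLASS_BITS == 8):
 *	ZS_SIZE_CLASS_DELTA = 4096 >> 8 = 16 bytes
 *	ZS_SIZE_CLASSES     = DIV_ROUND_UP(4096 - 32, 16) + 1 = 254 + 1 = 255
 * which matches the "255 size classes" figure mentioned above.
 */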

enum fullness_group {
	ZS_EMPTY,
	ZS_ALMOST_EMPTY,
	ZS_ALMOST_FULL,
	ZS_FULL,
	NR_ZS_FULLNESS,
};

enum zs_stat_type {
	CLASS_EMPTY,
	CLASS_ALMOST_EMPTY,
	CLASS_ALMOST_FULL,
	CLASS_FULL,
	OBJ_ALLOCATED,
	OBJ_USED,
	NR_ZS_STAT_TYPE,
};

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif

#ifdef CONFIG_COMPACTION
static struct vfsmount *zsmalloc_mnt;
#endif

/*
 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
 *	n <= N / f, where
 * n = number of allocated objects
 * N = total number of objects zspage can store
 * f = fullness_threshold_frac
 *
 * Similarly, we assign zspage to:
 *	ZS_ALMOST_FULL	when n > N / f
 *	ZS_EMPTY	when n == 0
 *	ZS_FULL		when n == N
 *
 * (see: fix_fullness_group())
 */
static const int fullness_threshold_frac = 4;
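
/*
 * Worked example (illustrative): for a size class whose zspage holds
 * N = 8 objects, get_fullness_group() below treats the zspage as
 * ZS_ALMOST_EMPTY while n <= 3 * 8 / fullness_threshold_frac = 6 live
 * objects, ZS_ALMOST_FULL for n = 7, ZS_FULL for n = 8 and ZS_EMPTY
 * for n = 0.
 */
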
static size_t huge_class_size;

struct size_class {
	spinlock_t lock;
	struct list_head fullness_list[NR_ZS_FULLNESS];
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
	 */
	int size;
	int objs_per_zspage;
	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;

	unsigned int index;
	struct zs_size_stat stats;
};

/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
static void SetPageHugeObject(struct page *page)
{
	SetPageOwnerPriv1(page);
}

static void ClearPageHugeObject(struct page *page)
{
	ClearPageOwnerPriv1(page);
}

static int PageHugeObject(struct page *page)
{
	return PageOwnerPriv1(page);
}

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, zspage->freeobj gives head of this list.
 *
 * This must be a power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	union {
		/*
		 * Free object index;
		 * It's valid for a non-allocated object
		 */
		unsigned long next;
		/*
		 * Handle of allocated object.
		 */
		unsigned long handle;
	};
};

struct zs_pool {
	const char *name;

	struct size_class *size_class[ZS_SIZE_CLASSES];
	struct kmem_cache *handle_cachep;
	struct kmem_cache *zspage_cachep;

	atomic_long_t pages_allocated;

	struct zs_pool_stats stats;

	/* Compact classes */
	struct shrinker shrinker;

#ifdef CONFIG_ZSMALLOC_STAT
	struct dentry *stat_dentry;
#endif
#ifdef CONFIG_COMPACTION
	struct inode *inode;
	struct work_struct free_work;
#endif
};

struct zspage {
	struct {
		unsigned int fullness:FULLNESS_BITS;
		unsigned int class:CLASS_BITS + 1;
		unsigned int isolated:ISOLATED_BITS;
		unsigned int magic:MAGIC_VAL_BITS;
	};
	unsigned int inuse;
	unsigned int freeobj;
	struct page *first_page;
	struct list_head list; /* fullness list */
#ifdef CONFIG_COMPACTION
	rwlock_t lock;
#endif
};

struct mapping_area {
#ifdef CONFIG_PGTABLE_MAPPING
	struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
	char *vm_buf; /* copy buffer for objects that span pages */
#endif
	char *vm_addr; /* address of kmap_atomic()'ed pages */
	enum zs_mapmode vm_mm; /* mapping mode */
};

#ifdef CONFIG_COMPACTION
static int zs_register_migration(struct zs_pool *pool);
static void zs_unregister_migration(struct zs_pool *pool);
static void migrate_lock_init(struct zspage *zspage);
static void migrate_read_lock(struct zspage *zspage);
static void migrate_read_unlock(struct zspage *zspage);
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
#else
static int zsmalloc_mount(void) { return 0; }
static void zsmalloc_unmount(void) {}
static int zs_register_migration(struct zs_pool *pool) { return 0; }
static void zs_unregister_migration(struct zs_pool *pool) {}
static void migrate_lock_init(struct zspage *zspage) {}
static void migrate_read_lock(struct zspage *zspage) {}
static void migrate_read_unlock(struct zspage *zspage) {}
static void kick_deferred_free(struct zs_pool *pool) {}
static void init_deferred_free(struct zs_pool *pool) {}
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
#endif

static int create_cache(struct zs_pool *pool)
{
	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
					0, 0, NULL);
	if (!pool->handle_cachep)
		return 1;

	pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
					0, 0, NULL);
	if (!pool->zspage_cachep) {
		kmem_cache_destroy(pool->handle_cachep);
		pool->handle_cachep = NULL;
		return 1;
	}

	return 0;
}

static void destroy_cache(struct zs_pool *pool)
{
	kmem_cache_destroy(pool->handle_cachep);
	kmem_cache_destroy(pool->zspage_cachep);
}

static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
			gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}

static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
{
	kmem_cache_free(pool->handle_cachep, (void *)handle);
}

static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
{
	return kmem_cache_alloc(pool->zspage_cachep,
			flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}

static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
{
	kmem_cache_free(pool->zspage_cachep, zspage);
}

static void record_obj(unsigned long handle, unsigned long obj)
{
	/*
	 * The lsb of @obj represents the handle lock while the other bits
	 * represent the object value the handle points to, so this
	 * update must not do store tearing.
	 */
	WRITE_ONCE(*(unsigned long *)handle, obj);
}

/* zpool driver */

#ifdef CONFIG_ZPOOL

static void *zs_zpool_create(const char *name, gfp_t gfp,
			     const struct zpool_ops *zpool_ops,
			     struct zpool *zpool)
{
	/*
	 * Ignore global gfp flags: zs_malloc() may be invoked from
	 * different contexts and its caller must provide a valid
	 * gfp mask.
	 */
	return zs_create_pool(name);
}

static void zs_zpool_destroy(void *pool)
{
	zs_destroy_pool(pool);
}

static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	*handle = zs_malloc(pool, size, gfp);
	return *handle ? 0 : -1;
}
static void zs_zpool_free(void *pool, unsigned long handle)
{
	zs_free(pool, handle);
}

static void *zs_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	enum zs_mapmode zs_mm;

	switch (mm) {
	case ZPOOL_MM_RO:
		zs_mm = ZS_MM_RO;
		break;
	case ZPOOL_MM_WO:
		zs_mm = ZS_MM_WO;
		break;
	case ZPOOL_MM_RW: /* fallthru */
	default:
		zs_mm = ZS_MM_RW;
		break;
	}

	return zs_map_object(pool, handle, zs_mm);
}
static void zs_zpool_unmap(void *pool, unsigned long handle)
{
	zs_unmap_object(pool, handle);
}

static u64 zs_zpool_total_size(void *pool)
{
	return zs_get_total_pages(pool) << PAGE_SHIFT;
}

static struct zpool_driver zs_zpool_driver = {
	.type =		"zsmalloc",
	.owner =	THIS_MODULE,
	.create =	zs_zpool_create,
	.destroy =	zs_zpool_destroy,
	.malloc =	zs_zpool_malloc,
	.free =		zs_zpool_free,
	.map =		zs_zpool_map,
	.unmap =	zs_zpool_unmap,
	.total_size =	zs_zpool_total_size,
};

MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */

/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);

static bool is_zspage_isolated(struct zspage *zspage)
{
	return zspage->isolated;
}

static __maybe_unused int is_first_page(struct page *page)
{
	return PagePrivate(page);
}

/* Protected by class->lock */
static inline int get_zspage_inuse(struct zspage *zspage)
{
	return zspage->inuse;
}

static inline void set_zspage_inuse(struct zspage *zspage, int val)
{
	zspage->inuse = val;
}

static inline void mod_zspage_inuse(struct zspage *zspage, int val)
{
	zspage->inuse += val;
}

static inline struct page *get_first_page(struct zspage *zspage)
{
	struct page *first_page = zspage->first_page;

	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
	return first_page;
}

static inline int get_first_obj_offset(struct page *page)
{
	return page->units;
}

static inline void set_first_obj_offset(struct page *page, int offset)
{
	page->units = offset;
}

static inline unsigned int get_freeobj(struct zspage *zspage)
{
	return zspage->freeobj;
}

static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
{
	zspage->freeobj = obj;
}

static void get_zspage_mapping(struct zspage *zspage,
				unsigned int *class_idx,
				enum fullness_group *fullness)
{
	BUG_ON(zspage->magic != ZSPAGE_MAGIC);

	*fullness = zspage->fullness;
	*class_idx = zspage->class;
}

static void set_zspage_mapping(struct zspage *zspage,
				unsigned int class_idx,
				enum fullness_group fullness)
{
	zspage->class = class_idx;
	zspage->fullness = fullness;
}

/*
 * zsmalloc divides the pool into various size classes where each
 * class maintains a list of zspages where each zspage is divided
 * into equal sized chunks. Each allocation falls into one of these
 * classes depending on its size. This function returns the index of the
 * size class whose chunk size is big enough to hold the given size.
 */
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}
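
/*
 * Worked example (illustrative, assuming the 16-byte class delta from the
 * 4K-page example above): for a 100-byte allocation,
 *	idx = DIV_ROUND_UP(100 - 32, 16) = 5
 * which, given how the pool sizes its classes when it is created, selects
 * the class holding 32 + 5 * 16 = 112-byte chunks, the smallest class able
 * to hold the request.
 */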

/* type can be of enum type zs_stat_type or fullness_group */
static inline void zs_stat_inc(struct size_class *class,
				int type, unsigned long cnt)
{
	class->stats.objs[type] += cnt;
}

/* type can be of enum type zs_stat_type or fullness_group */
static inline void zs_stat_dec(struct size_class *class,
				int type, unsigned long cnt)
{
	class->stats.objs[type] -= cnt;
}

/* type can be of enum type zs_stat_type or fullness_group */
static inline unsigned long zs_stat_get(struct size_class *class,
				int type)
{
	return class->stats.objs[type];
}

#ifdef CONFIG_ZSMALLOC_STAT

static void __init zs_stat_init(void)
{
	if (!debugfs_initialized()) {
		pr_warn("debugfs not available, stat dir not created\n");
		return;
	}

	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
	if (!zs_stat_root)
		pr_warn("debugfs 'zsmalloc' stat dir creation failed\n");
}

static void __exit zs_stat_exit(void)
{
	debugfs_remove_recursive(zs_stat_root);
}

static unsigned long zs_can_compact(struct size_class *class);

static int zs_stats_size_show(struct seq_file *s, void *v)
{
	int i;
	struct zs_pool *pool = s->private;
	struct size_class *class;
	int objs_per_zspage;
	unsigned long class_almost_full, class_almost_empty;
	unsigned long obj_allocated, obj_used, pages_used, freeable;
	unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
	unsigned long total_freeable = 0;

	seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
			"class", "size", "almost_full", "almost_empty",
			"obj_allocated", "obj_used", "pages_used",
			"pages_per_zspage", "freeable");

	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
		class = pool->size_class[i];

		if (class->index != i)
			continue;

		spin_lock(&class->lock);
		class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
		class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
		obj_used = zs_stat_get(class, OBJ_USED);
		freeable = zs_can_compact(class);
		spin_unlock(&class->lock);

		objs_per_zspage = class->objs_per_zspage;
		pages_used = obj_allocated / objs_per_zspage *
				class->pages_per_zspage;

		seq_printf(s, " %5u %5u %11lu %12lu %13lu"
				" %10lu %10lu %16d %8lu\n",
			i, class->size, class_almost_full, class_almost_empty,
			obj_allocated, obj_used, pages_used,
			class->pages_per_zspage, freeable);

		total_class_almost_full += class_almost_full;
		total_class_almost_empty += class_almost_empty;
		total_objs += obj_allocated;
		total_used_objs += obj_used;
		total_pages += pages_used;
		total_freeable += freeable;
	}

	seq_puts(s, "\n");
	seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
			"Total", "", total_class_almost_full,
			total_class_almost_empty, total_objs,
			total_used_objs, total_pages, "", total_freeable);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(zs_stats_size);

static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
	struct dentry *entry;

	if (!zs_stat_root) {
		pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
		return;
	}

	entry = debugfs_create_dir(name, zs_stat_root);
	if (!entry) {
		pr_warn("debugfs dir <%s> creation failed\n", name);
		return;
	}
	pool->stat_dentry = entry;

	entry = debugfs_create_file("classes", S_IFREG | 0444,
				    pool->stat_dentry, pool,
				    &zs_stats_size_fops);
	if (!entry) {
		pr_warn("%s: debugfs file entry <%s> creation failed\n",
				name, "classes");
		debugfs_remove_recursive(pool->stat_dentry);
		pool->stat_dentry = NULL;
	}
}

static void zs_pool_stat_destroy(struct zs_pool *pool)
{
	debugfs_remove_recursive(pool->stat_dentry);
}

#else /* CONFIG_ZSMALLOC_STAT */
static void __init zs_stat_init(void)
{
}

static void __exit zs_stat_exit(void)
{
}

static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
}

static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
#endif


/*
 * For each size class, zspages are divided into different groups
 * depending on how "full" they are. This was done so that we could
 * easily find empty or nearly empty zspages when we try to shrink
 * the pool (not yet implemented). This function returns the fullness
 * status of the given zspage.
 */
static enum fullness_group get_fullness_group(struct size_class *class,
						struct zspage *zspage)
{
	int inuse, objs_per_zspage;
	enum fullness_group fg;

	inuse = get_zspage_inuse(zspage);
	objs_per_zspage = class->objs_per_zspage;

	if (inuse == 0)
		fg = ZS_EMPTY;
	else if (inuse == objs_per_zspage)
		fg = ZS_FULL;
	else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
		fg = ZS_ALMOST_EMPTY;
	else
		fg = ZS_ALMOST_FULL;

	return fg;
}

/*
 * Each size class maintains various freelists and zspages are assigned
 * to one of these freelists based on the number of live objects they
 * have. This function inserts the given zspage into the freelist
 * identified by <class, fullness_group>.
 */
static void insert_zspage(struct size_class *class,
				struct zspage *zspage,
				enum fullness_group fullness)
{
	struct zspage *head;

	zs_stat_inc(class, fullness, 1);
	head = list_first_entry_or_null(&class->fullness_list[fullness],
					struct zspage, list);
	/*
	 * We want to see more ZS_FULL pages and fewer almost empty/full.
	 * Put pages with higher ->inuse first.
	 */
	if (head) {
		if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) {
			list_add(&zspage->list, &head->list);
			return;
		}
	}
	list_add(&zspage->list, &class->fullness_list[fullness]);
}

/*
 * This function removes the given zspage from the freelist identified
 * by <class, fullness_group>.
 */
static void remove_zspage(struct size_class *class,
				struct zspage *zspage,
				enum fullness_group fullness)
{
	VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
	VM_BUG_ON(is_zspage_isolated(zspage));

	list_del_init(&zspage->list);
	zs_stat_dec(class, fullness, 1);
}

/*
 * Each size class maintains zspages in different fullness groups depending
 * on the number of live objects they contain. When allocating or freeing
 * objects, the fullness status of the page can change, say, from ALMOST_FULL
 * to ALMOST_EMPTY when freeing an object. This function checks if such
 * a status change has occurred for the given page and accordingly moves the
 * page from the freelist of the old fullness group to that of the new
 * fullness group.
 */
static enum fullness_group fix_fullness_group(struct size_class *class,
						struct zspage *zspage)
{
	int class_idx;
	enum fullness_group currfg, newfg;

	get_zspage_mapping(zspage, &class_idx, &currfg);
	newfg = get_fullness_group(class, zspage);
	if (newfg == currfg)
		goto out;

	if (!is_zspage_isolated(zspage)) {
		remove_zspage(class, zspage, currfg);
		insert_zspage(class, zspage, newfg);
	}

	set_zspage_mapping(zspage, class_idx, newfg);

out:
	return newfg;
}

/*
 * We have to decide on how many pages to link together
 * to form a zspage for each size class. This is important
 * to reduce wastage due to unusable space left at end of
 * each zspage which is given as:
 *     wastage = Zp % class_size
 *     usage = Zp - wastage
 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
 *
 * For example, for size class of 3/8 * PAGE_SIZE, we should
 * link together 3 PAGE_SIZE sized pages to form a zspage
 * since then we can perfectly fit in 8 such objects.
 */
static int get_pages_per_zspage(int class_size)
{
	int i, max_usedpc = 0;
	/* zspage order which gives maximum used size per KB */
	int max_usedpc_order = 1;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int zspage_size;
		int waste, usedpc;

		zspage_size = i * PAGE_SIZE;
		waste = zspage_size % class_size;
		usedpc = (zspage_size - waste) * 100 / zspage_size;

		if (usedpc > max_usedpc) {
			max_usedpc = usedpc;
			max_usedpc_order = i;
		}
	}

	return max_usedpc_order;
}
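
/*
 * Worked example (illustrative, assuming 4K pages): for class_size ==
 * 3 * PAGE_SIZE / 8 == 1536 bytes, the loop above finds
 *	i = 1:  4096 % 1536 = 1024 wasted ->  75% used
 *	i = 2:  8192 % 1536 =  512 wasted ->  93% used
 *	i = 3: 12288 % 1536 =    0 wasted -> 100% used
 * so get_pages_per_zspage() returns 3, matching the example in the
 * comment above the function.
 */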

static struct zspage *get_zspage(struct page *page)
{
	struct zspage *zspage = (struct zspage *)page->private;

	BUG_ON(zspage->magic != ZSPAGE_MAGIC);
	return zspage;
}

static struct page *get_next_page(struct page *page)
{
Minchan Kim's avatar
	if (unlikely(PageHugeObject(page)))
		return NULL;

	return page->freelist;
}

854
/**
 * obj_to_location - get (<page>, <obj_idx>) from encoded object value
 * @obj: the encoded object value
 * @page: page the object resides in
 * @obj_idx: object index
 */
static void obj_to_location(unsigned long obj, struct page **page,
				unsigned int *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
	*obj_idx = (obj & OBJ_INDEX_MASK);
}

/**
 * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
 * @page: page the object resides in
 * @obj_idx: object index
 */
static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
{
	unsigned long obj;

	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
	obj |= obj_idx & OBJ_INDEX_MASK;
	obj <<= OBJ_TAG_BITS;

	return obj;
}
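
/*
 * Worked example (illustrative, using the example 64-bit layout above):
 * for a page at PFN 0x1234 and obj_idx 5, location_to_obj() computes
 *	obj = ((0x1234 << OBJ_INDEX_BITS) | 5) << OBJ_TAG_BITS
 * and obj_to_location() reverses it by shifting out the tag bit, masking
 * with OBJ_INDEX_MASK to recover the index, and shifting down by
 * OBJ_INDEX_BITS to recover the PFN.
 */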

static unsigned long handle_to_obj(unsigned long handle)
{
	return *(unsigned long *)handle;
}

static unsigned long obj_to_head(struct page *page, void *obj)
{
	if (unlikely(PageHugeObject(page))) {
		VM_BUG_ON_PAGE(!is_first_page(page), page);
		return page->index;
	} else
		return *(unsigned long *)obj;
}

static inline int testpin_tag(unsigned long handle)
{
	return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
}

static inline int trypin_tag(unsigned long handle)
{
	return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
}

static void pin_tag(unsigned long handle)
{
	bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
}

static void unpin_tag(unsigned long handle)
{
	bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
}

static void reset_page(struct page *page)
{
	__ClearPageMovable(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page_mapcount_reset(page);
	ClearPageHugeObject(page);
	page->freelist = NULL;
}

/*
 * To prevent zspage destroy during migration, zspage freeing should
 * hold locks of all pages in the zspage.
 */
void lock_zspage(struct zspage *zspage)
{
	struct page *page = get_first_page(zspage);

	do {
		lock_page(page);
	} while ((page = get_next_page(page)) != NULL);
}

int trylock_zspage(struct zspage *zspage)
{
	struct page *cursor, *fail;

	for (cursor = get_first_page(zspage); cursor != NULL; cursor =
					get_next_page(cursor)) {
		if (!trylock_page(cursor)) {
			fail = cursor;
			goto unlock;
		}
	}

	return 1;
unlock:
	for (cursor = get_first_page(zspage); cursor != fail; cursor =
					get_next_page(cursor))
		unlock_page(cursor);

	return 0;
}

static void __free_zspage(struct zs_pool *pool, struct size_class *class,
				struct zspage *zspage)
{
	struct page *page, *next;
	enum fullness_group fg;
	unsigned int class_idx;

	get_zspage_mapping(zspage, &class_idx, &fg);

	assert_spin_locked(&class->lock);

	VM_BUG_ON(get_zspage_inuse(zspage));
	VM_BUG_ON(fg != ZS_EMPTY);

	next = page = get_first_page(zspage);
	do {
		VM_BUG_ON_PAGE(!PageLocked(page), page);
		next = get_next_page(page);
		reset_page(page);
		unlock_page(page);
		dec_zone_page_state(page, NR_ZSPAGES);
		put_page(page);
		page = next;
	} while (page != NULL);

	cache_free_zspage(pool, zspage);

	zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
	atomic_long_sub(class->pages_per_zspage,
					&pool->pages_allocated);
}

static void free_zspage(struct zs_pool *pool, struct size_class *class,
				struct zspage *zspage)
{
	VM_BUG_ON(get_zspage_inuse(zspage));
	VM_BUG_ON(list_empty(&zspage->list));

	if (!trylock_zspage(zspage)) {
		kick_deferred_free(pool);
		return;
	}

	remove_zspage(class, zspage, ZS_EMPTY);
	__free_zspage(pool, class, zspage);
}

/* Initialize a newly allocated zspage */
static void init_zspage(struct size_class *class, struct zspage *zspage)
{
	unsigned int freeobj = 1;
	unsigned long off = 0;
	struct page *page = get_first_page(zspage);

	while (page) {
		struct page *next_page;
		struct link_free *link;
		void *vaddr;

		set_first_obj_offset(page, off);

		vaddr = kmap_atomic(page);
		link = (struct link_free *)vaddr + off / sizeof(*link);

		while ((off += class->size) < PAGE_SIZE) {
			link->next = freeobj++ << OBJ_TAG_BITS;
			link += class->size / sizeof(*link);
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present)
		 */
		next_page = get_next_page(page);
		if (next_page) {
			link->next = freeobj++ << OBJ_TAG_BITS;
		} else {
			/*
			 * Reset OBJ_TAG_BITS bit in the last link to tell
			 * whether the object is allocated or not.
			 */
			link->next = -1UL << OBJ_TAG_BITS;
		}
		kunmap_atomic(vaddr);
		page = next_page;
		off %= PAGE_SIZE;
	}

	set_freeobj(zspage, 0);
}
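
/*
 * Worked example (illustrative, assuming 4K pages and class->size == 1536):
 * on the first page objects start at offsets 0, 1536 and 3072; the object
 * at 3072 spills 512 bytes into the second page, so init_zspage() records
 * a first-object offset of 4608 % 4096 == 512 for that second page and
 * continues threading the freelist from there.
 */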

static void create_page_chain(struct size_class *class, struct zspage *zspage,
				struct page *pages[])
{
	int i;
	struct page *page;
	struct page *prev_page = NULL;
	int nr_pages = class->pages_per_zspage;

	/*
	 * Allocate individual pages and link them together as:
	 * 1. all pages are linked together using page->freelist
	 * 2. each sub-page points to zspage using page->private
	 *
	 * we set PG_private to identify the first page (i.e. no other sub-page
	 * has this flag set).
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pages[i];
		set_page_private(page, (unsigned long)zspage);
		page->freelist = NULL;
		if (i == 0) {
			zspage->first_page = page;
			SetPagePrivate(page);