struct sbitmap - scalable bitmap
A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This trades off higher memory usage for better scalability; the most space-wasteful case is 1 bit per cacheline.
There are depth / bits_per_word full words and depth % bits_per_word bits left over. In bitwise arithmetic:
bits_per_word = 1 << shift
depth / bits_per_word = depth >> shift
depth % bits_per_word = depth & ((1 << shift) - 1)
Round-robin works at the cacheline level: each cacheline is searched only once, starting from the alloc hint, without wrapping around, so every cacheline gets a chance to supply a bit to the round-robin.
struct sbitmap {
/* Number of bits used in the whole bitmap. */
unsigned int depth;
/* log2(number of bits used per word) */
unsigned int shift;
/* Number of words (cachelines) being used for the bitmap. */
unsigned int map_nr;
/* Allocated bitmap. */
struct sbitmap_word *map;
};
#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
The cacheline index corresponding to @bitnr.
#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
The bit offset within that cacheline corresponding to @bitnr.
static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr)
return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word;
static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr)
set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr))
static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr))
static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb, unsigned int bitnr)
clear_bit_unlock(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr))
static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
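A minimal usage sketch of these helpers, assuming GFP_KERNEL allocation and NUMA_NO_NODE; example_basic() is a hypothetical function, not part of the API:

#include <linux/sbitmap.h>

static int example_basic(void)
{
        struct sbitmap sb;
        int ret;

        /* 128 bits; a negative shift lets the implementation pick bits_per_word. */
        ret = sbitmap_init_node(&sb, 128, -1, GFP_KERNEL, NUMA_NO_NODE);
        if (ret)
                return ret;

        sbitmap_set_bit(&sb, 5);                /* mark bit 5 as used */
        WARN_ON(!sbitmap_test_bit(&sb, 5));     /* bit 5 is now set */
        sbitmap_clear_bit(&sb, 5);              /* free it again */

        sbitmap_free(&sb);
        return 0;
}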
static inline void __sbitmap_for_each_set(struct sbitmap *sb, unsigned int start, sb_for_each_fn fn, void *data)
Iterate over each set bit in a &struct sbitmap.
@start: Where to start the iteration.
@sb: Bitmap to iterate over.
@fn: Callback. Should return true to continue or false to break early.
@data: Pointer to pass to callback.
This is inline even though it's non-trivial so that the function calls to the
callback will hopefully get optimized away.
static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, void *data)
Iterate over each set bit in a &struct sbitmap.
@sb: Bitmap to iterate over.
@fn: Callback. Should return true to continue or false to break early.
@data: Pointer to pass to callback.
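A hedged iteration example: count the set bits via a callback. The sb_for_each_fn signature is assumed to be bool fn(struct sbitmap *sb, unsigned int bitnr, void *data); count_set_bit() and example_count() are hypothetical:

static bool count_set_bit(struct sbitmap *sb, unsigned int bitnr, void *data)
{
        unsigned int *count = data;

        (*count)++;
        return true;    /* true: keep iterating; false would stop early */
}

static unsigned int example_count(struct sbitmap *sb)
{
        unsigned int count = 0;

        sbitmap_for_each_set(sb, count_set_bit, &count);
        return count;
}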
/*
* This one is special, since it doesn't actually clear the bit, rather it
* sets the corresponding bit in the ->cleared mask instead. Paired with
* the caller doing sbitmap_deferred_clear() if a given index is full, which
* will clear the previously freed entries in the corresponding ->word.
*/
static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
The update to ->word is deferred.
struct sbitmap_word - CACHELINE - word in a &struct sbitmap
struct sbitmap_word {
/* Number of bits being used in @word/@cleared */
unsigned long depth; Words in the same sbitmap do not necessarily share the same depth:
the last word may hold fewer bits than the preceding full words,
but the depths of all words add up to sbitmap->depth.
/* word holding free bits */
unsigned long word ____cacheline_aligned_in_smp;
Updates to ->word and ->cleared are not synchronized; hence sbitmap_deferred_clear(),
which folds ->cleared back into ->word (see the sketch after this struct).
/* word holding cleared bits */
unsigned long cleared ____cacheline_aligned_in_smp;
/* Held while swapping word <-> cleared */
spinlock_t swap_lock;
} ____cacheline_aligned_in_smp;
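A hedged sketch of how ->cleared might be folded back into ->word under swap_lock; the actual sbitmap_deferred_clear() in lib/sbitmap.c may differ in detail across kernel versions:

#include <linux/sbitmap.h>

static bool example_deferred_clear(struct sbitmap *sb, int index)
{
        struct sbitmap_word *map = &sb->map[index];
        unsigned long mask, val, flags;
        bool ret = false;

        spin_lock_irqsave(&map->swap_lock, flags);

        if (!map->cleared)
                goto out_unlock;

        /* Atomically grab the deferred-clear mask and reset it to 0. */
        mask = xchg(&map->cleared, 0);

        /* Clear the masked bits in ->word, i.e. mark them free again. */
        do {
                val = map->word;
        } while (cmpxchg(&map->word, val, val & ~mask) != val);

        ret = true;
out_unlock:
        spin_unlock_irqrestore(&map->swap_lock, flags);
        return ret;
}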
sbitmap API
- int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node)
Initialize a &struct sbitmap on a specific memory node.
@sb: Bitmap to initialize.
@depth: Number of bits to allocate.
@shift: Use 2^@shift bits per word in the bitmap; if a negative
number is given, a good default is chosen.
@flags: Allocation flags.
@node: Memory node to allocate on.
Return: Zero on success or negative errno on failure.
-ENOMEM if allocating the map fails.
If @depth == 0, sb->map is set to NULL and 0 is still returned.
Possible adjustment of @shift:
if (shift < 0) {
shift = ilog2(BITS_PER_LONG);
/*
* If the bitmap is small, shrink the number of bits per word so
* we spread over a few cachelines, at least. If less than 4
* bits, just forget about it, it's not going to work optimally
* anyway.
*/
if (depth >= 4) {
while ((4U << shift) > depth)
shift--;
}
}
Suppose the adjustment ends with (4U << shift) == depth; since bits_per_word = 1U << shift,
4 sbitmap_words (cachelines) are needed.
The most space-wasteful case is 1 bit per word.
For depth < 4, only a single cacheline is used.
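A small sketch of this adjustment, assuming a 64-bit machine; example_shift() is hypothetical. For depth = 16 it ends with shift = 2, bits_per_word = 4 and map_nr = 4 cachelines; for depth < 4 the loop is skipped and everything fits in one word:

#include <linux/log2.h>
#include <linux/kernel.h>

static void example_shift(unsigned int depth)
{
        int shift = ilog2(BITS_PER_LONG);       /* start with one full word per cacheline */
        unsigned int map_nr;

        if (depth >= 4) {
                while ((4U << shift) > depth)
                        shift--;                /* spread small bitmaps over at least 4 words */
        }
        map_nr = DIV_ROUND_UP(depth, 1U << shift);
        pr_info("depth=%u shift=%d map_nr=%u\n", depth, shift, map_nr);
}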
-
static inline void sbitmap_free(struct sbitmap *sb) Free memory used by a &struct sbitmap.
-
void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
Resize a &struct sbitmap.
@sb: Bitmap to resize.
@depth: New number of bits to resize to.
Doesn't reallocate anything. It's up to the caller to ensure that the new
depth doesn't exceed the depth that the sb was initialized with.
The new depth is not checked against the original one; the caller must guarantee it.
shift does not change, so the depth stored in the leading words stays the same; only
the depths of the trailing words change.
-
bool sbitmap_any_bit_set(const struct sbitmap *sb) Return: true if any bit in the bitmap is set, false otherwise.
-
bool sbitmap_any_bit_clear(const struct sbitmap *sb) Return: true if any bit in the bitmap is clear, false otherwise.
-
int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
Try to allocate a free bit from a &struct sbitmap.
@sb: Bitmap to allocate from.
@alloc_hint: Hint for where to start searching for a free bit.
@round_robin: If true, be stricter about allocation order; always allocate
starting from the last allocated bit. This is less efficient
than the default behavior (false).
Round-robin works at the cacheline level: each cacheline is searched only once, starting
from the alloc hint, without wrapping around, so every cacheline gets a chance to supply a bit to the round-robin.
This operation provides acquire barrier semantics if it succeeds.
Return: Non-negative allocated bit number if successful, -1 otherwise.
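A hedged round-trip sketch: allocate a free bit and release it with sbitmap_clear_bit(); example_alloc() is hypothetical and error handling is minimal:

static int example_alloc(struct sbitmap *sb)
{
        int nr;

        /* Start searching at bit 0, without strict round-robin ordering. */
        nr = sbitmap_get(sb, 0, false);
        if (nr < 0)
                return -EBUSY;          /* no free bit available right now */

        /* ... use bit @nr as a tag ... */

        sbitmap_clear_bit(sb, nr);      /* return the bit to the pool */
        return 0;
}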
- int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, unsigned long shallow_depth)
Try to allocate a free bit from a &struct sbitmap, limiting the depth used from each word.
@sb: Bitmap to allocate from.
@alloc_hint: Hint for where to start searching for a free bit.
@shallow_depth: The maximum number of bits to allocate from a single word.
This rather specific operation allows for having multiple users with
different allocation limits. E.g., there can be a high-priority class that
uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
class can only allocate half of the total bits in the bitmap, preventing it
from starving out the high-priority class.
Return: Non-negative allocated bit number if successful, -1 otherwise.
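A hedged sketch of the two-class scheme described above; example_prio_alloc() is hypothetical:

static int example_prio_alloc(struct sbitmap *sb, bool high_prio)
{
        unsigned long half_word = 1UL << (sb->shift - 1);

        if (high_prio)
                return sbitmap_get(sb, 0, false);

        /* Low priority: may use at most half of the bits in each word. */
        return sbitmap_get_shallow(sb, 0, half_word);
}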
- void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) Write a hex dump of a &struct sbitmap to a &struct seq_file. This is intended for debugging. The output isn't guaranteed to be internally consistent.
struct sbitmap_queue - scalable bitmap with the added ability to wait on free bits
A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to avoid contention on the wait queue spinlock. This ensures that we don't hit a scalability wall when we run out of free bits and have to start putting tasks to sleep.
#define SBQ_WAIT_QUEUES 8
#define SBQ_WAKE_BATCH 8
struct sbitmap_queue {
/* Scalable bitmap. */
struct sbitmap sb;
/*
* Cache of last successfully allocated or freed bit.
* This is per-cpu, which allows multiple users to stick to different
* cachelines until the map is exhausted.
*/
unsigned int __percpu *alloc_hint;
/* Number of bits which must be freed before we wake up any waiters. */
unsigned int wake_batch; initialized to sbq_calc_wake_batch(sbq, depth);
if a resize changes wake_batch, each ws[i].wait_cnt is reset to 1
/* Next wait queue in @ws to wake up. */
atomic_t wake_index; initialized to 0
/* Wait queues. */
struct sbq_wait_state *ws; SBQ_WAIT_QUEUES wait queues, each initialized with
atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch)
/* count of currently active ws waitqueues */
atomic_t ws_active; initialized to 0;
the number of sbq_wait entries currently attached to this queue
/* Allocate bits in strict round-robin order. */
bool round_robin;
/**
* minimum shallow depth which may be passed to
* sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
*/
unsigned int min_shallow_depth; initialized to UINT_MAX
};
struct sbq_wait_state - wait queue in a &struct sbitmap_queue
struct sbq_wait_state {
/* Number of frees remaining before we wake up. */
atomic_t wait_cnt; initialized with atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch)
/* Wait queue. */
wait_queue_head_t wait;
} ____cacheline_aligned_in_smp;
sbitmap_queue API
- int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node)
Initialize a &struct sbitmap_queue on a specific memory node.
@sbq: Bitmap queue to initialize.
@depth: See sbitmap_init_node().
@shift: See sbitmap_init_node().
@round_robin: See sbitmap_get().
@flags: Allocation flags.
@node: Memory node to allocate on.
Return: Zero on success or negative errno on failure.
sbq->wake_batch = sbq_calc_wake_batch(sbq, depth)
if (depth && !round_robin) { /* randomize the initial per-CPU alloc hint */
for_each_possible_cpu(i)
*per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth;
}
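A hedged init/teardown sketch; example_queue_setup() is hypothetical:

#include <linux/sbitmap.h>

static int example_queue_setup(struct sbitmap_queue *sbq)
{
        int ret;

        /* 256 tags, default shift, no strict round-robin. */
        ret = sbitmap_queue_init_node(sbq, 256, -1, false, GFP_KERNEL, NUMA_NO_NODE);
        if (ret)
                return ret;

        /* ... allocate and free bits ... */

        sbitmap_queue_free(sbq);
        return 0;
}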
- static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, unsigned int depth)
/*
* For each batch, we wake up one queue. We need to make sure that our
* batch size is small enough that the full depth of the bitmap,
* potentially limited by a shallow depth, is enough to wake up all of
* the queues.
*
* Each full word of the bitmap has bits_per_word bits, and there might
* be a partial word. There are depth / bits_per_word full words and
* depth % bits_per_word bits left over. In bitwise arithmetic:
*
* bits_per_word = 1 << shift
* depth / bits_per_word = depth >> shift
* depth % bits_per_word = depth & ((1 << shift) - 1)
*
* Each word can be limited to sbq->min_shallow_depth bits.
*/
shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth);
depth = ((depth >> sbq->sb.shift) * shallow_depth +
min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth));
wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1,
SBQ_WAKE_BATCH);
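Worked example: with depth = 256 and shift = 6 (64 bits per word), the default min_shallow_depth of UINT_MAX gives shallow_depth = 64, the effective depth stays (256 >> 6) * 64 + 0 = 256, and wake_batch = clamp(256 / 8, 1, 8) = 8. With depth = 16 and shift = 2, the effective depth is 16 and wake_batch = clamp(16 / 8, 1, 8) = 2.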
-
static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) Free memory used by a &struct sbitmap_queue
-
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
Resize a &struct sbitmap_queue.
@sbq: Bitmap queue to resize.
@depth: New number of bits to resize to.
Like sbitmap_resize(), this doesn't reallocate anything. It has to do
some extra work on the &struct sbitmap_queue, so it's not safe to just
resize the underlying &struct sbitmap.
If this changes sbq->wake_batch, each ws[i].wait_cnt is set via
atomic_set(&sbq->ws[i].wait_cnt, 1), i.e. to 1 rather than to the updated wake_batch.
- static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, unsigned int *cpu) Try to allocate a free bit from a &struct sbitmap_queue.
- int __sbitmap_queue_get(struct sbitmap_queue *sbq)
Try to allocate a free bit from a &struct sbitmap_queue with preemption already disabled.
@sbq: Bitmap queue to allocate from.
Return: Non-negative allocated bit number if successful, -1 otherwise.
- static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int *cpu, unsigned int shallow_depth)
- int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth)
Try to allocate a free bit from a &struct sbitmap_queue, limiting the depth
used from each word, with preemption already disabled.
@sbq: Bitmap queue to allocate from.
@shallow_depth: The maximum number of bits to allocate from a single word.
See sbitmap_get_shallow().
If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
initializing @sbq (sbq->min_shallow_depth is initialized to UINT_MAX).
Return: Non-negative allocated bit number if successful, -1 otherwise.
- void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth)
Inform a &struct sbitmap_queue of the minimum shallow depth that will be used.
@sbq: Bitmap queue in question.
@min_shallow_depth: The minimum shallow depth that will be passed to
sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
sbitmap_queue_clear() batches wakeups as an optimization. The batch size
depends on the depth of the bitmap. Since the shallow allocation functions
effectively operate with a different depth, the shallow depth must be taken
into account when calculating the batch size. This function must be called
with the minimum shallow depth that will be used. Failure to do so can result
in missed wakeups.
sbq->min_shallow_depth is initialized to UINT_MAX.
sbq->min_shallow_depth = min_shallow_depth;
sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth);
- void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
Dump &struct sbitmap_queue information to a &struct seq_file.
This is intended for debugging. The format may change at any time.
- void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu)
Free an allocated bit and wake up waiters on a &struct sbitmap_queue.
@sbq: Bitmap to free from.
@nr: Bit number to free.
@cpu: CPU the bit was allocated on.
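A hedged non-blocking round-trip sketch, assuming sbitmap_queue_get() reports the allocating CPU through @cpu; example_queue_roundtrip() is hypothetical:

static void example_queue_roundtrip(struct sbitmap_queue *sbq)
{
        unsigned int cpu;
        int nr;

        nr = sbitmap_queue_get(sbq, &cpu);
        if (nr < 0)
                return;         /* no free bit; a blocking caller would wait instead */

        /* ... use tag @nr ... */

        sbitmap_queue_clear(sbq, nr, cpu);      /* free the bit and possibly wake waiters */
}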
- static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, atomic_t *wait_index)
Get the next wait queue to use for a &struct sbitmap_queue.
@sbq: Bitmap queue to wait on.
@wait_index: A counter per "user" of @sbq.
Also increments @wait_index.
- void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
Wake up everything waiting on a &struct sbitmap_queue.
- void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
Wake up some of the waiters in one waitqueue on a &struct sbitmap_queue.
struct sbq_wait
struct sbq_wait {
struct sbitmap_queue *sbq; /* if set, sbq_wait is accounted */
struct wait_queue_entry wait;
};
#define DEFINE_SBQ_WAIT(name) \
struct sbq_wait name = { \
.sbq = NULL, \
.wait = { \
.private = current, \
.func = autoremove_wake_function, \
.entry = LIST_HEAD_INIT((name).wait.entry), \
} \
}
- void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state)
Wrapper around prepare_to_wait_exclusive(), which maintains some extra internal state.
if (!sbq_wait->sbq) {
atomic_inc(&sbq->ws_active);
sbq_wait->sbq = sbq;
}
prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state);
- void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait)
Must be paired with sbitmap_prepare_to_wait().
finish_wait(&ws->wait, &sbq_wait->wait);
if (sbq_wait->sbq) {
atomic_dec(&sbq->ws_active);
sbq_wait->sbq = NULL;
}
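A hedged sketch of a blocking allocation loop built from these helpers, in the spirit of blk_mq_get_tag(); example_get_blocking() is hypothetical, and a real caller would keep @cpu for the matching sbitmap_queue_clear():

#include <linux/sbitmap.h>
#include <linux/sched.h>

static int example_get_blocking(struct sbitmap_queue *sbq)
{
        DEFINE_SBQ_WAIT(wait);
        atomic_t wait_index = ATOMIC_INIT(0);
        struct sbq_wait_state *ws;
        unsigned int cpu;
        int nr;

        for (;;) {
                nr = sbitmap_queue_get(sbq, &cpu);      /* fast path */
                if (nr >= 0)
                        break;

                /* Register on a wait queue before re-checking, so a concurrent
                 * sbitmap_queue_clear() wakeup cannot be missed. */
                ws = sbq_wait_ptr(sbq, &wait_index);
                sbitmap_prepare_to_wait(sbq, ws, &wait, TASK_UNINTERRUPTIBLE);

                nr = sbitmap_queue_get(sbq, &cpu);
                if (nr >= 0) {
                        sbitmap_finish_wait(sbq, ws, &wait);
                        break;
                }

                io_schedule();          /* sleep until enough frees accumulate */
                sbitmap_finish_wait(sbq, ws, &wait);
        }

        return nr;
}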
- void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait)
Wrapper around add_wait_queue(), which maintains some extra internal state
if (!sbq_wait->sbq) {
sbq_wait->sbq = sbq;
atomic_inc(&sbq->ws_active);
}
add_wait_queue(&ws->wait, &sbq_wait->wait);
- void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait)
Must be paired with sbitmap_add_wait_queue()
list_del_init(&sbq_wait->wait.entry);
if (sbq_wait->sbq) {
atomic_dec(&sbq_wait->sbq->ws_active);
sbq_wait->sbq = NULL;
}