/*
 * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
 * over multiple cachelines to avoid ping-pong between multiple submitters
 * or submitter and completer. Uses rolling wakeups to avoid falling off
 * the scaling cliff when we run out of tags and have to start putting
 * submitters to sleep.
 *
 * Uses active queue tracking to support fairer distribution of tags
 * between multiple submitters when a shared tag map is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/random.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
{
	int i;

	for (i = 0; i < bt->map_nr; i++) {
		struct blk_align_bitmap *bm = &bt->map[i];
		int ret;

		ret = find_first_zero_bit(&bm->word, bm->depth);
		if (ret < bm->depth)
			return true;
	}

	return false;
}

bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
	if (!tags)
		return true;

	return bt_has_free_tags(&tags->bitmap_tags);
}

static inline int bt_index_inc(int index)
{
	return (index + 1) & (BT_WAIT_QUEUES - 1);
}

static inline void bt_index_atomic_inc(atomic_t *index)
{
	int old = atomic_read(index);
	int new = bt_index_inc(old);
	atomic_cmpxchg(index, old, new);
}
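/*
 * Note on the two helpers above: the wait queues form a small ring that is
 * walked round-robin, and bt_index_inc() relies on BT_WAIT_QUEUES (defined
 * in blk-mq-tag.h, presumably a power of two such as 8) so the mask wraps
 * the index, e.g. 7 + 1 -> 0 with 8 queues.
 */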
/*
 * If a previously inactive queue goes active, bump the active user count.
 */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
		atomic_inc(&hctx->tags->active_queues);

	return true;
}

/*
 * Wake up all tasks potentially sleeping on normal (non-reserved) tags
 */
static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
{
	struct blk_mq_bitmap_tags *bt;
	int i, wake_index;

	bt = &tags->bitmap_tags;
	wake_index = atomic_read(&bt->wake_index);
	for (i = 0; i < BT_WAIT_QUEUES; i++) {
		struct bt_wait_state *bs = &bt->bs[wake_index];

		if (waitqueue_active(&bs->wait))
			wake_up(&bs->wait);

		wake_index = bt_index_inc(wake_index);
	}
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
	struct blk_mq_tags *tags = hctx->tags;

	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
		return;

	atomic_dec(&tags->active_queues);

	blk_mq_tag_wakeup_all(tags);
}
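/*
 * Worked example for the fair-share check in hctx_may_queue() below: with a
 * normal tag depth of 128 and three active shared users, each hardware queue
 * is allowed up to (128 + 3 - 1) / 3 = 43 tags, and never fewer than 4.
 */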
/*
 * For shared tag users, we track the number of currently active users
 * and attempt to provide a fair share of the tag depth for each of them.
 */
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_bitmap_tags *bt)
{
	unsigned int depth, users;

	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
		return true;
	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
		return true;

	/*
	 * Don't try dividing an ant
	 */
	if (bt->depth == 1)
		return true;

	users = atomic_read(&hctx->tags->active_queues);
	if (!users)
		return true;

	/*
	 * Allow at least some tags
	 */
	depth = max((bt->depth + users - 1) / users, 4U);
	return atomic_read(&hctx->nr_active) < depth;
}

static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
{
	int tag, org_last_tag, end;
	bool wrap = last_tag != 0;

	org_last_tag = last_tag;
	end = bm->depth;
	do {
restart:
		tag = find_next_zero_bit(&bm->word, end, last_tag);
		if (unlikely(tag >= end)) {
			/*
			 * We started with an offset, start from 0 to
			 * exhaust the map.
			 */
			if (wrap) {
				wrap = false;
				end = org_last_tag;
				last_tag = 0;
				goto restart;
			}
			return -1;
		}
		last_tag = tag + 1;
	} while (test_and_set_bit(tag, &bm->word));

	return tag;
}
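/*
 * Example of the wrap-around in __bt_get_word() above: with a 32-tag word
 * and last_tag == 5, bits 5..31 are scanned first; if they are all busy the
 * scan restarts at bit 0 with end limited to 5, so each bit is examined at
 * most once per call.
 */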
/*
 * Straightforward bitmap tag implementation, where each bit is a tag
 * (cleared == free, and set == busy). The small twist is using per-cpu
 * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
 * contexts. This enables us to drastically limit the space searched,
 * without dirtying an extra shared cacheline like we would if we stored
 * the cache value inside the shared blk_mq_bitmap_tags structure. On top
 * of that, each word of tags is in a separate cacheline. This means that
 * multiple users will tend to stick to different cachelines, at least
 * until the map is exhausted.
 */
static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
		    unsigned int *tag_cache)
{
	unsigned int last_tag, org_last_tag;
	int index, i, tag;

	if (!hctx_may_queue(hctx, bt))
		return -1;

	last_tag = org_last_tag = *tag_cache;
	index = TAG_TO_INDEX(bt, last_tag);

	for (i = 0; i < bt->map_nr; i++) {
		tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
		if (tag != -1) {
			tag += (index << bt->bits_per_word);
			goto done;
		}

		last_tag = 0;
		if (++index >= bt->map_nr)
			index = 0;
	}

	*tag_cache = 0;
	return -1;

	/*
	 * Only update the cache from the allocation path, if we ended
	 * up using the specific cached tag.
	 */
done:
	if (tag == org_last_tag) {
		last_tag = tag + 1;
		if (last_tag >= bt->depth - 1)
			last_tag = 0;

		*tag_cache = last_tag;
	}

	return tag;
}
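/*
 * A tag is split into a word index and a bit inside that word. Assuming the
 * TAG_TO_INDEX()/TAG_TO_BIT() helpers in blk-mq-tag.h shift and mask by
 * bits_per_word, a tag of 70 with bits_per_word == 6 lands in word 1, bit 6;
 * __bt_get() above rebuilds the tag as (index << bits_per_word) + bit.
 */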
static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
					 struct blk_mq_hw_ctx *hctx)
{
	struct bt_wait_state *bs;
	int wait_index;

	if (!hctx)
		return &bt->bs[0];

	wait_index = atomic_read(&hctx->wait_index);
	bs = &bt->bs[wait_index];
	bt_index_atomic_inc(&hctx->wait_index);
	return bs;
}

static int bt_get(struct blk_mq_alloc_data *data,
		  struct blk_mq_bitmap_tags *bt,
		  struct blk_mq_hw_ctx *hctx,
		  unsigned int *last_tag)
{
	struct bt_wait_state *bs;
	DEFINE_WAIT(wait);
	int tag;

	tag = __bt_get(hctx, bt, last_tag);
	if (tag != -1)
		return tag;

	if (!(data->gfp & __GFP_WAIT))
		return -1;

	bs = bt_wait_ptr(bt, hctx);
	do {
		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);

		tag = __bt_get(hctx, bt, last_tag);
		if (tag != -1)
			break;

		/*
		 * We're out of tags on this hardware queue, kick any
		 * pending IO submits before going to sleep waiting for
		 * some to complete.
		 */
		blk_mq_run_hw_queue(hctx, false);

		/*
		 * Retry tag allocation after running the hardware queue,
		 * as running the queue may also have found completions.
		 */
		tag = __bt_get(hctx, bt, last_tag);
		if (tag != -1)
			break;

		blk_mq_put_ctx(data->ctx);

		io_schedule();

		data->ctx = blk_mq_get_ctx(data->q);
		data->hctx = data->q->mq_ops->map_queue(data->q,
				data->ctx->cpu);
		if (data->reserved) {
			bt = &data->hctx->tags->breserved_tags;
		} else {
			last_tag = &data->ctx->last_tag;
			hctx = data->hctx;
			bt = &hctx->tags->bitmap_tags;
		}
		finish_wait(&bs->wait, &wait);
		bs = bt_wait_ptr(bt, hctx);
	} while (1);

	finish_wait(&bs->wait, &wait);
	return tag;
}

static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
	int tag;

	tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
			&data->ctx->last_tag);
	if (tag >= 0)
		return tag + data->hctx->tags->nr_reserved_tags;

	return BLK_MQ_TAG_FAIL;
}

static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
{
	int tag, zero = 0;

	if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
		WARN_ON_ONCE(1);
		return BLK_MQ_TAG_FAIL;
	}

	tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero);
	if (tag < 0)
		return BLK_MQ_TAG_FAIL;

	return tag;
}

unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
	if (!data->reserved)
		return __blk_mq_get_tag(data);

	return __blk_mq_get_reserved_tag(data);
}
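/*
 * Sketch of a typical allocation call, under the assumption that the caller
 * (blk-mq.c) fills struct blk_mq_alloc_data roughly like this:
 *
 *	struct blk_mq_alloc_data data;
 *
 *	data.q = q;
 *	data.gfp = GFP_KERNEL;		(__GFP_WAIT set: may sleep for a tag)
 *	data.reserved = false;
 *	data.ctx = blk_mq_get_ctx(q);
 *	data.hctx = q->mq_ops->map_queue(q, data.ctx->cpu);
 *
 *	tag = blk_mq_get_tag(&data);
 *	if (tag == BLK_MQ_TAG_FAIL)
 *		... no tag available and the gfp mask did not allow sleeping
 */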
static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
{
	int i, wake_index;

	wake_index = atomic_read(&bt->wake_index);
	for (i = 0; i < BT_WAIT_QUEUES; i++) {
		struct bt_wait_state *bs = &bt->bs[wake_index];

		if (waitqueue_active(&bs->wait)) {
			int o = atomic_read(&bt->wake_index);
			if (wake_index != o)
				atomic_cmpxchg(&bt->wake_index, o, wake_index);

			return bs;
		}

		wake_index = bt_index_inc(wake_index);
	}

	return NULL;
}

static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
{
	const int index = TAG_TO_INDEX(bt, tag);
	struct bt_wait_state *bs;
	int wait_cnt;

	clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);

	/* Ensure that the wait list checks occur after clear_bit(). */
	smp_mb();

	bs = bt_wake_ptr(bt);
	if (!bs)
		return;

	wait_cnt = atomic_dec_return(&bs->wait_cnt);
	if (unlikely(wait_cnt < 0))
		wait_cnt = atomic_inc_return(&bs->wait_cnt);
	if (wait_cnt == 0) {
		atomic_add(bt->wake_cnt, &bs->wait_cnt);
		bt_index_atomic_inc(&bt->wake_index);
		wake_up(&bs->wait);
	}
}

void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
		    unsigned int *last_tag)
{
	struct blk_mq_tags *tags = hctx->tags;

	if (tag >= tags->nr_reserved_tags) {
		const int real_tag = tag - tags->nr_reserved_tags;

		BUG_ON(real_tag >= tags->nr_tags);
		bt_clear_tag(&tags->bitmap_tags, real_tag);
		*last_tag = real_tag;
	} else {
		BUG_ON(tag >= tags->nr_reserved_tags);
		bt_clear_tag(&tags->breserved_tags, tag);
	}
}
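/*
 * The "rolling wakeups" mentioned at the top of the file are implemented by
 * bt_clear_tag() above: each wait queue has a wait_cnt that is decremented
 * per freed tag, and only when it reaches zero is that queue woken and
 * wake_index advanced. As a rough example, with a depth of 256 and the
 * BT_WAIT_QUEUES/BT_WAIT_BATCH defaults from blk-mq-tag.h (assumed to be 8),
 * bt_update_count() below makes sleepers wake in batches of 8 instead of on
 * every single completion.
 */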
static void bt_for_each(struct blk_mq_hw_ctx *hctx,
		struct blk_mq_bitmap_tags *bt, unsigned int off,
		busy_iter_fn *fn, void *data, bool reserved)
{
	struct request *rq;
	int bit, i;

	for (i = 0; i < bt->map_nr; i++) {
		struct blk_align_bitmap *bm = &bt->map[i];

		for (bit = find_first_bit(&bm->word, bm->depth);
		     bit < bm->depth;
		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
			rq = blk_mq_tag_to_rq(hctx->tags, off + bit);
			if (rq->q == hctx->queue)
				fn(hctx, rq, data, reserved);
		}

		off += (1 << bt->bits_per_word);
	}
}

void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
		void *priv)
{
	struct blk_mq_tags *tags = hctx->tags;

	if (tags->nr_reserved_tags)
		bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
	bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
			false);
}
EXPORT_SYMBOL(blk_mq_tag_busy_iter);

static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
{
	unsigned int i, used;

	for (i = 0, used = 0; i < bt->map_nr; i++) {
		struct blk_align_bitmap *bm = &bt->map[i];

		used += bitmap_weight(&bm->word, bm->depth);
	}

	return bt->depth - used;
}

static void bt_update_count(struct blk_mq_bitmap_tags *bt,
			    unsigned int depth)
{
	unsigned int tags_per_word = 1U << bt->bits_per_word;
	unsigned int map_depth = depth;

	if (depth) {
		int i;

		for (i = 0; i < bt->map_nr; i++) {
			bt->map[i].depth = min(map_depth, tags_per_word);
			map_depth -= bt->map[i].depth;
		}
	}

	bt->wake_cnt = BT_WAIT_BATCH;
	if (bt->wake_cnt > depth / BT_WAIT_QUEUES)
		bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES);

	bt->depth = depth;
}
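/*
 * Example of the word sizing done in bt_alloc() below, on a 64-bit machine:
 * for a depth of 32, bits_per_word starts at 6 (64 tags per word) and is
 * shrunk until tags_per_word * 4 <= depth, ending at 3. That yields 4 words
 * of 8 tags each, so even a small map is spread over several cachelines.
 */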
static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
			int node, bool reserved)
{
	int i;

	bt->bits_per_word = ilog2(BITS_PER_LONG);

	/*
	 * Depth can be zero for reserved tags, that's not a failure
	 * condition.
	 */
	if (depth) {
		unsigned int nr, tags_per_word;

		tags_per_word = (1 << bt->bits_per_word);

		/*
		 * If the tag space is small, shrink the number of tags
		 * per word so we spread over a few cachelines, at least.
		 * If less than 4 tags, just forget about it, it's not
		 * going to work optimally anyway.
		 */
		if (depth >= 4) {
			while (tags_per_word * 4 > depth) {
				bt->bits_per_word--;
				tags_per_word = (1 << bt->bits_per_word);
			}
		}

		nr = ALIGN(depth, tags_per_word) / tags_per_word;
		bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
				GFP_KERNEL, node);
		if (!bt->map)
			return -ENOMEM;

		bt->map_nr = nr;
	}

	bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
	if (!bt->bs) {
		kfree(bt->map);
		return -ENOMEM;
	}

	bt_update_count(bt, depth);

	for (i = 0; i < BT_WAIT_QUEUES; i++) {
		init_waitqueue_head(&bt->bs[i].wait);
		atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
	}

	return 0;
}

static void bt_free(struct blk_mq_bitmap_tags *bt)
{
	kfree(bt->map);
	kfree(bt->bs);
}

static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
						   int node)
{
	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;

	if (bt_alloc(&tags->bitmap_tags, depth, node, false))
		goto enomem;
	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
		goto enomem;

	return tags;
enomem:
	bt_free(&tags->bitmap_tags);
	kfree(tags);
	return NULL;
}

struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
				     unsigned int reserved_tags, int node)
{
	struct blk_mq_tags *tags;

	if (total_tags > BLK_MQ_TAG_MAX) {
		pr_err("blk-mq: tag depth too large\n");
		return NULL;
	}

	tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
	if (!tags)
		return NULL;

	tags->nr_tags = total_tags;
	tags->nr_reserved_tags = reserved_tags;

	return blk_mq_init_bitmap_tags(tags, node);
}

void blk_mq_free_tags(struct blk_mq_tags *tags)
{
	bt_free(&tags->bitmap_tags);
	bt_free(&tags->breserved_tags);
	kfree(tags);
}
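/*
 * Typical lifecycle, as a sketch: a driver-sized map is created with e.g.
 * blk_mq_init_tags(128, 1, NUMA_NO_NODE), giving a 127-tag normal pool plus
 * a single reserved tag, and is torn down again with blk_mq_free_tags().
 */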
void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
{
	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;

	*tag = prandom_u32() % depth;
}

int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
{
	tdepth -= tags->nr_reserved_tags;
	if (tdepth > tags->nr_tags)
		return -EINVAL;

	/*
	 * Don't need (or can't) update reserved tags here, they remain
	 * static and should never need resizing.
	 */
	bt_update_count(&tags->bitmap_tags, tdepth);
	blk_mq_tag_wakeup_all(tags);
	return 0;
}
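/*
 * Example for blk_mq_tag_update_depth() above: a map created with 128 total
 * tags (1 reserved) can later be shrunk to an effective depth of 64 by
 * passing tdepth == 64; only the per-word depths and the wake batching are
 * recomputed, the bitmap allocation itself stays untouched.
 */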
/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq: request for which to compute a unique tag
 *
 * The tag field in struct request is unique per hardware queue but not over
 * all hardware queues. Hence this function, which returns a tag with the
 * hardware context index in the upper bits and the per-hardware-queue tag in
 * the lower bits.
 *
 * Note: When called for a request that is queued on a non-multiqueue request
 * queue, the hardware context index is set to zero.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx;
	int hwq = 0;

	if (q->mq_ops) {
		hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
		hwq = hctx->queue_num;
	}

	return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
		(rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);

ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
{
	char *orig_page = page;
	unsigned int free, res;

	if (!tags)
		return 0;

	page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
			"bits_per_word=%u\n",
			tags->nr_tags, tags->nr_reserved_tags,
			tags->bitmap_tags.bits_per_word);

	free = bt_unused_tags(&tags->bitmap_tags);
	res = bt_unused_tags(&tags->breserved_tags);

	page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
	page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));

	return page - orig_page;
}
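/*
 * Rough example of what blk_mq_tag_sysfs_show() prints for an idle map of
 * 128 tags (1 reserved) on a 64-bit machine, assuming the sizing above:
 *
 *	nr_tags=128, reserved_tags=1, bits_per_word=4
 *	nr_free=127, nr_reserved=1
 *	active_queues=0
 */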