| 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
| 2 | #ifndef BLK_MQ_H |
| 3 | #define BLK_MQ_H |
| 4 | |
| 5 | #include <linux/blkdev.h> |
| 6 | #include <linux/sbitmap.h> |
| 7 | #include <linux/lockdep.h> |
| 8 | #include <linux/scatterlist.h> |
| 9 | #include <linux/prefetch.h> |
| 10 | #include <linux/srcu.h> |
| 11 | #include <linux/rw_hint.h> |
| 12 | #include <linux/rwsem.h> |
| 13 | |
| 14 | struct blk_mq_tags; |
| 15 | struct blk_flush_queue; |
| 16 | |
| 17 | #define BLKDEV_MIN_RQ 4 |
| 18 | #define BLKDEV_DEFAULT_RQ 128 |
| 19 | |
| 20 | enum rq_end_io_ret { |
| 21 | RQ_END_IO_NONE, |
| 22 | RQ_END_IO_FREE, |
| 23 | }; |
| 24 | |
| 25 | typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); |
| 26 | |
| 27 | /* request flags */ |
| 29 | typedef __u32 __bitwise req_flags_t; |
| 30 | |
| 31 | /* Keep rqf_name[] in sync with the definitions below */ |
| 32 | enum rqf_flags { |
| 33 | /* drive already may have started this one */ |
| 34 | __RQF_STARTED, |
| 35 | /* request for flush sequence */ |
| 36 | __RQF_FLUSH_SEQ, |
| 37 | /* merge of different types, fail separately */ |
| 38 | __RQF_MIXED_MERGE, |
| 39 | /* don't call prep for this one */ |
| 40 | __RQF_DONTPREP, |
| 41 | /* use hctx->sched_tags */ |
| 42 | __RQF_SCHED_TAGS, |
| 43 | /* use an I/O scheduler for this request */ |
| 44 | __RQF_USE_SCHED, |
| 45 | /* vaguely specified driver internal error. Ignored by block layer */ |
| 46 | __RQF_FAILED, |
| 47 | /* don't warn about errors */ |
| 48 | __RQF_QUIET, |
| 49 | /* account into disk and partition IO statistics */ |
| 50 | __RQF_IO_STAT, |
| 51 | /* runtime pm request */ |
| 52 | __RQF_PM, |
| 53 | /* on IO scheduler merge hash */ |
| 54 | __RQF_HASHED, |
| 55 | /* track IO completion time */ |
| 56 | __RQF_STATS, |
| 57 | /* Look at ->special_vec for the actual data payload instead of the |
| 58 | bio chain. */ |
| 59 | __RQF_SPECIAL_PAYLOAD, |
| 60 | /* request completion needs to be signaled to zone write plugging. */ |
| 61 | __RQF_ZONE_WRITE_PLUGGING, |
| 62 | /* ->timeout has been called, don't expire again */ |
| 63 | __RQF_TIMED_OUT, |
| 64 | __RQF_RESV, |
| 65 | __RQF_BITS |
| 66 | }; |
| 67 | |
| 68 | #define RQF_STARTED ((__force req_flags_t)(1 << __RQF_STARTED)) |
| 69 | #define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << __RQF_FLUSH_SEQ)) |
| 70 | #define RQF_MIXED_MERGE ((__force req_flags_t)(1 << __RQF_MIXED_MERGE)) |
| 71 | #define RQF_DONTPREP ((__force req_flags_t)(1 << __RQF_DONTPREP)) |
| 72 | #define RQF_SCHED_TAGS ((__force req_flags_t)(1 << __RQF_SCHED_TAGS)) |
| 73 | #define RQF_USE_SCHED ((__force req_flags_t)(1 << __RQF_USE_SCHED)) |
| 74 | #define RQF_FAILED ((__force req_flags_t)(1 << __RQF_FAILED)) |
| 75 | #define RQF_QUIET ((__force req_flags_t)(1 << __RQF_QUIET)) |
| 76 | #define RQF_IO_STAT ((__force req_flags_t)(1 << __RQF_IO_STAT)) |
| 77 | #define RQF_PM ((__force req_flags_t)(1 << __RQF_PM)) |
| 78 | #define RQF_HASHED ((__force req_flags_t)(1 << __RQF_HASHED)) |
| 79 | #define RQF_STATS ((__force req_flags_t)(1 << __RQF_STATS)) |
| 80 | #define RQF_SPECIAL_PAYLOAD \ |
| 81 | ((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD)) |
| 82 | #define RQF_ZONE_WRITE_PLUGGING \ |
| 83 | ((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING)) |
| 84 | #define RQF_TIMED_OUT ((__force req_flags_t)(1 << __RQF_TIMED_OUT)) |
| 85 | #define RQF_RESV ((__force req_flags_t)(1 << __RQF_RESV)) |
| 86 | |
| 87 | /* flags that prevent us from merging requests: */ |
| 88 | #define RQF_NOMERGE_FLAGS \ |
| 89 | (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) |
| 90 | |
| 91 | enum mq_rq_state { |
| 92 | MQ_RQ_IDLE = 0, |
| 93 | MQ_RQ_IN_FLIGHT = 1, |
| 94 | MQ_RQ_COMPLETE = 2, |
| 95 | }; |
| 96 | |
| 97 | /* |
| 98 | * Try to put the fields that are referenced together in the same cacheline. |
| 99 | * |
| 100 | * If you modify this structure, make sure to update blk_rq_init() and |
| 101 | * especially blk_mq_rq_ctx_init() to take care of the added fields. |
| 102 | */ |
| 103 | struct request { |
| 104 | struct request_queue *q; |
| 105 | struct blk_mq_ctx *mq_ctx; |
| 106 | struct blk_mq_hw_ctx *mq_hctx; |
| 107 | |
| 108 | blk_opf_t cmd_flags; /* op and common flags */ |
| 109 | req_flags_t rq_flags; |
| 110 | |
| 111 | int tag; |
| 112 | int internal_tag; |
| 113 | |
| 114 | unsigned int timeout; |
| 115 | |
| 116 | /* the following two fields are internal, NEVER access directly */ |
| 117 | unsigned int __data_len; /* total data len */ |
| 118 | sector_t __sector; /* sector cursor */ |
| 119 | |
| 120 | struct bio *bio; |
| 121 | struct bio *biotail; |
| 122 | |
| 123 | union { |
| 124 | struct list_head queuelist; |
| 125 | struct request *rq_next; |
| 126 | }; |
| 127 | |
| 128 | struct block_device *part; |
| 129 | #ifdef CONFIG_BLK_RQ_ALLOC_TIME |
| 130 | /* Time that the first bio started allocating this request. */ |
| 131 | u64 alloc_time_ns; |
| 132 | #endif |
| 133 | /* Time that this request was allocated for this IO. */ |
| 134 | u64 start_time_ns; |
| 135 | /* Time that I/O was submitted to the device. */ |
| 136 | u64 io_start_time_ns; |
| 137 | |
| 138 | #ifdef CONFIG_BLK_WBT |
| 139 | unsigned short wbt_flags; |
| 140 | #endif |
| 141 | /* |
| 142 | * rq sectors used for blk stats. It has the same value as |
| 143 | * blk_rq_sectors(rq), except that it is never zeroed |
| 144 | * by completion. |
| 145 | */ |
| 146 | unsigned short stats_sectors; |
| 147 | |
| 148 | /* |
| 149 | * Number of scatter-gather DMA addr+len pairs after |
| 150 | * physical address coalescing is performed. |
| 151 | */ |
| 152 | unsigned short nr_phys_segments; |
| 153 | unsigned short nr_integrity_segments; |
| 154 | |
| 155 | /* |
| 156 | * The lowest set bit for address gaps between physical segments. This |
| 157 | * provides information necessary for DMA optimization opportunities, |
| 158 | * like for testing if the segments can be coalesced against the |
| 159 | * device's iommu granule. |
| 160 | */ |
| 161 | unsigned char phys_gap_bit; |
| 162 | |
| 163 | #ifdef CONFIG_BLK_INLINE_ENCRYPTION |
| 164 | struct bio_crypt_ctx *crypt_ctx; |
| 165 | struct blk_crypto_keyslot *crypt_keyslot; |
| 166 | #endif |
| 167 | |
| 168 | enum mq_rq_state state; |
| 169 | atomic_t ref; |
| 170 | |
| 171 | unsigned long deadline; |
| 172 | |
| 173 | /* |
| 174 | * The hash is used inside the scheduler, and killed once the |
| 175 | * request reaches the dispatch list. The ipi_list is only used |
| 176 | * to queue the request for softirq completion, which is long |
| 177 | * after the request has been unhashed (and even removed from |
| 178 | * the dispatch list). |
| 179 | */ |
| 180 | union { |
| 181 | struct hlist_node hash; /* merge hash */ |
| 182 | struct llist_node ipi_list; |
| 183 | }; |
| 184 | |
| 185 | /* |
| 186 | * The rb_node is only used inside the io scheduler, requests |
| 187 | * are pruned when moved to the dispatch queue. special_vec must |
| 188 | * only be used if RQF_SPECIAL_PAYLOAD is set, and those cannot be |
| 189 | * inserted into an IO scheduler. |
| 190 | */ |
| 191 | union { |
| 192 | struct rb_node rb_node; /* sort/lookup */ |
| 193 | struct bio_vec special_vec; |
| 194 | }; |
| 195 | |
| 196 | /* |
| 197 | * Three pointers are available for the IO schedulers, if they need |
| 198 | * more they have to dynamically allocate it. |
| 199 | */ |
| 200 | struct { |
| 201 | struct io_cq *icq; |
| 202 | void *priv[2]; |
| 203 | } elv; |
| 204 | |
| 205 | struct { |
| 206 | unsigned int seq; |
| 207 | rq_end_io_fn *saved_end_io; |
| 208 | } flush; |
| 209 | |
| 210 | u64 fifo_time; |
| 211 | |
| 212 | /* |
| 213 | * completion callback. |
| 214 | */ |
| 215 | rq_end_io_fn *end_io; |
| 216 | void *end_io_data; |
| 217 | }; |
| 218 | |
| 219 | /* |
| 220 | * Returns a mask with all bits starting at req->phys_gap_bit set to 1. |
| 221 | */ |
| 222 | static inline unsigned long req_phys_gap_mask(const struct request *req) |
| 223 | { |
| 224 | return ~(((1 << req->phys_gap_bit) >> 1) - 1); |
| 225 | } |
| 226 | |
| 227 | static inline enum req_op req_op(const struct request *req) |
| 228 | { |
| 229 | return req->cmd_flags & REQ_OP_MASK; |
| 230 | } |
| 231 | |
| 232 | static inline bool blk_rq_is_passthrough(struct request *rq) |
| 233 | { |
| 234 | return blk_op_is_passthrough(rq->cmd_flags); |
| 235 | } |
| 236 | |
| 237 | static inline unsigned short req_get_ioprio(struct request *req) |
| 238 | { |
| 239 | if (req->bio) |
| 240 | return req->bio->bi_ioprio; |
| 241 | return 0; |
| 242 | } |
| 243 | |
| 244 | #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) |
| 245 | |
| 246 | #define rq_dma_dir(rq) \ |
| 247 | (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) |
| 248 | |
| 249 | static inline int rq_list_empty(const struct rq_list *rl) |
| 250 | { |
| 251 | return rl->head == NULL; |
| 252 | } |
| 253 | |
| 254 | static inline void rq_list_init(struct rq_list *rl) |
| 255 | { |
| 256 | rl->head = NULL; |
| 257 | rl->tail = NULL; |
| 258 | } |
| 259 | |
| 260 | static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq) |
| 261 | { |
| 262 | rq->rq_next = NULL; |
| 263 | if (rl->tail) |
| 264 | rl->tail->rq_next = rq; |
| 265 | else |
| 266 | rl->head = rq; |
| 267 | rl->tail = rq; |
| 268 | } |
| 269 | |
| 270 | static inline void rq_list_add_head(struct rq_list *rl, struct request *rq) |
| 271 | { |
| 272 | rq->rq_next = rl->head; |
| 273 | rl->head = rq; |
| 274 | if (!rl->tail) |
| 275 | rl->tail = rq; |
| 276 | } |
| 277 | |
| 278 | static inline struct request *rq_list_pop(struct rq_list *rl) |
| 279 | { |
| 280 | struct request *rq = rl->head; |
| 281 | |
| 282 | if (rq) { |
| 283 | rl->head = rl->head->rq_next; |
| 284 | if (!rl->head) |
| 285 | rl->tail = NULL; |
| 286 | rq->rq_next = NULL; |
| 287 | } |
| 288 | |
| 289 | return rq; |
| 290 | } |
| 291 | |
| 292 | static inline struct request *rq_list_peek(struct rq_list *rl) |
| 293 | { |
| 294 | return rl->head; |
| 295 | } |
| 296 | |
| 297 | #define rq_list_for_each(rl, pos) \ |
| 298 | for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next) |
| 299 | |
| 300 | #define rq_list_for_each_safe(rl, pos, nxt) \ |
| 301 | for (pos = rq_list_peek((rl)), nxt = pos->rq_next; \ |
| 302 | pos; pos = nxt, nxt = pos ? pos->rq_next : NULL) |
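These helpers implement a simple singly linked request list, as used for plug lists and for the ->queue_rqs() batch path further below. A minimal usage sketch, assuming a hypothetical per-request submit helper mydrv_issue_one():

```c
/* Illustrative only: drain a driver-local rq_list, submitting one request at a time. */
static void mydrv_issue_list(struct rq_list *rl)
{
	struct request *rq;

	while ((rq = rq_list_pop(rl)) != NULL)
		mydrv_issue_one(rq);	/* hypothetical per-request submit */
}
```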
| 303 | |
| 304 | /** |
| 305 | * enum blk_eh_timer_return - How the timeout handler should proceed |
| 306 | * @BLK_EH_DONE: The block driver completed the command or will complete it at |
| 307 | * a later time. |
| 308 | * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the |
| 309 | * request to complete. |
| 310 | */ |
| 311 | enum blk_eh_timer_return { |
| 312 | BLK_EH_DONE, |
| 313 | BLK_EH_RESET_TIMER, |
| 314 | }; |
| 315 | |
| 316 | /** |
| 317 | * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware |
| 318 | * block device |
| 319 | */ |
| 320 | struct blk_mq_hw_ctx { |
| 321 | struct { |
| 322 | /** @lock: Protects the dispatch list. */ |
| 323 | spinlock_t lock; |
| 324 | /** |
| 325 | * @dispatch: Used for requests that are ready to be |
| 326 | * dispatched to the hardware but for some reason (e.g. lack of |
| 327 | * resources) could not be sent to the hardware. As soon as the |
| 328 | * driver can send new requests, requests in this list will |
| 329 | * be sent first for a fairer dispatch. |
| 330 | */ |
| 331 | struct list_head dispatch; |
| 332 | /** |
| 333 | * @state: BLK_MQ_S_* flags. Defines the state of the hw |
| 334 | * queue (active, scheduled to restart, stopped). |
| 335 | */ |
| 336 | unsigned long state; |
| 337 | } ____cacheline_aligned_in_smp; |
| 338 | |
| 339 | /** |
| 340 | * @run_work: Used for scheduling a hardware queue run at a later time. |
| 341 | */ |
| 342 | struct delayed_work run_work; |
| 343 | /** @cpumask: Map of available CPUs where this hctx can run. */ |
| 344 | cpumask_var_t cpumask; |
| 345 | /** |
| 346 | * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU |
| 347 | * selection from @cpumask. |
| 348 | */ |
| 349 | int next_cpu; |
| 350 | /** |
| 351 | * @next_cpu_batch: Counter of how many works left in the batch before |
| 352 | * changing to the next CPU. |
| 353 | */ |
| 354 | int next_cpu_batch; |
| 355 | |
| 356 | /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ |
| 357 | unsigned long flags; |
| 358 | |
| 359 | /** |
| 360 | * @sched_data: Pointer owned by the IO scheduler attached to a request |
| 361 | * queue. It's up to the IO scheduler how to use this pointer. |
| 362 | */ |
| 363 | void *sched_data; |
| 364 | /** |
| 365 | * @queue: Pointer to the request queue that owns this hardware context. |
| 366 | */ |
| 367 | struct request_queue *queue; |
| 368 | /** @fq: Queue of requests that need to perform a flush operation. */ |
| 369 | struct blk_flush_queue *fq; |
| 370 | |
| 371 | /** |
| 372 | * @driver_data: Pointer to data owned by the block driver that created |
| 373 | * this hctx |
| 374 | */ |
| 375 | void *driver_data; |
| 376 | |
| 377 | /** |
| 378 | * @ctx_map: Bitmap for each software queue. If bit is on, there is a |
| 379 | * pending request in that software queue. |
| 380 | */ |
| 381 | struct sbitmap ctx_map; |
| 382 | |
| 383 | /** |
| 384 | * @dispatch_from: Software queue to be used when no scheduler was |
| 385 | * selected. |
| 386 | */ |
| 387 | struct blk_mq_ctx *dispatch_from; |
| 388 | /** |
| 389 | * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to |
| 390 | * decide if the hw_queue is busy using Exponential Weighted Moving |
| 391 | * Average algorithm. |
| 392 | */ |
| 393 | unsigned int dispatch_busy; |
| 394 | |
| 395 | /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ |
| 396 | unsigned short type; |
| 397 | /** @nr_ctx: Number of software queues. */ |
| 398 | unsigned short nr_ctx; |
| 399 | /** @ctxs: Array of software queues. */ |
| 400 | struct blk_mq_ctx **ctxs; |
| 401 | |
| 402 | /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ |
| 403 | spinlock_t dispatch_wait_lock; |
| 404 | /** |
| 405 | * @dispatch_wait: Waitqueue to put requests when there is no tag |
| 406 | * available at the moment, to wait for another try in the future. |
| 407 | */ |
| 408 | wait_queue_entry_t dispatch_wait; |
| 409 | |
| 410 | /** |
| 411 | * @wait_index: Index of next available dispatch_wait queue to insert |
| 412 | * requests. |
| 413 | */ |
| 414 | atomic_t wait_index; |
| 415 | |
| 416 | /** |
| 417 | * @tags: Tags owned by the block driver. A tag in this set is only |
| 418 | * assigned when a request is dispatched from a hardware queue. |
| 419 | */ |
| 420 | struct blk_mq_tags *tags; |
| 421 | /** |
| 422 | * @sched_tags: Tags owned by I/O scheduler. If there is an I/O |
| 423 | * scheduler associated with a request queue, a tag is assigned when |
| 424 | * that request is allocated. Else, this member is not used. |
| 425 | */ |
| 426 | struct blk_mq_tags *sched_tags; |
| 427 | |
| 428 | /** @numa_node: NUMA node the storage adapter has been connected to. */ |
| 429 | unsigned int numa_node; |
| 430 | /** @queue_num: Index of this hardware queue. */ |
| 431 | unsigned int queue_num; |
| 432 | |
| 433 | /** |
| 434 | * @nr_active: Number of active requests. Only used when a tag set is |
| 435 | * shared across request queues. |
| 436 | */ |
| 437 | atomic_t nr_active; |
| 438 | |
| 439 | /** @cpuhp_online: List to store requests if a CPU is going to die */ |
| 440 | struct hlist_node cpuhp_online; |
| 441 | /** @cpuhp_dead: List to store requests if some CPU dies. */ |
| 442 | struct hlist_node cpuhp_dead; |
| 443 | /** @kobj: Kernel object for sysfs. */ |
| 444 | struct kobject kobj; |
| 445 | |
| 446 | #ifdef CONFIG_BLK_DEBUG_FS |
| 447 | /** |
| 448 | * @debugfs_dir: debugfs directory for this hardware queue. Named |
| 449 | * as cpu<cpu_number>. |
| 450 | */ |
| 451 | struct dentry *debugfs_dir; |
| 452 | /** @sched_debugfs_dir: debugfs directory for the scheduler. */ |
| 453 | struct dentry *sched_debugfs_dir; |
| 454 | #endif |
| 455 | |
| 456 | /** |
| 457 | * @hctx_list: if this hctx is not in use, this is an entry in |
| 458 | * q->unused_hctx_list. |
| 459 | */ |
| 460 | struct list_head hctx_list; |
| 461 | }; |
| 462 | |
| 463 | /** |
| 464 | * struct blk_mq_queue_map - Map software queues to hardware queues |
| 465 | * @mq_map: CPU ID to hardware queue index map. This is an array |
| 466 | * with nr_cpu_ids elements. Each element has a value in the range |
| 467 | * [@queue_offset, @queue_offset + @nr_queues). |
| 468 | * @nr_queues: Number of hardware queues to map CPU IDs onto. |
| 469 | * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe |
| 470 | * driver to map each hardware queue type (enum hctx_type) onto a distinct |
| 471 | * set of hardware queues. |
| 472 | */ |
| 473 | struct blk_mq_queue_map { |
| 474 | unsigned int *mq_map; |
| 475 | unsigned int nr_queues; |
| 476 | unsigned int queue_offset; |
| 477 | }; |
| 478 | |
| 479 | /** |
| 480 | * enum hctx_type - Type of hardware queue |
| 481 | * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. |
| 482 | * @HCTX_TYPE_READ: Just for READ I/O. |
| 483 | * @HCTX_TYPE_POLL: Polled I/O of any kind. |
| 484 | * @HCTX_MAX_TYPES: Number of types of hctx. |
| 485 | */ |
| 486 | enum hctx_type { |
| 487 | HCTX_TYPE_DEFAULT, |
| 488 | HCTX_TYPE_READ, |
| 489 | HCTX_TYPE_POLL, |
| 490 | |
| 491 | HCTX_MAX_TYPES, |
| 492 | }; |
| 493 | |
| 494 | /** |
| 495 | * struct blk_mq_tag_set - tag set that can be shared between request queues |
| 496 | * @ops: Pointers to functions that implement block driver behavior. |
| 497 | * @map: One or more ctx -> hctx mappings. One map exists for each |
| 498 | * hardware queue type (enum hctx_type) that the driver wishes |
| 499 | * to support. There are no restrictions on maps being of the |
| 500 | * same size, and it's perfectly legal to share maps between |
| 501 | * types. |
| 502 | * @nr_maps: Number of elements in the @map array. A number in the range |
| 503 | * [1, HCTX_MAX_TYPES]. |
| 504 | * @nr_hw_queues: Number of hardware queues supported by the block driver that |
| 505 | * owns this data structure. |
| 506 | * @queue_depth: Number of tags per hardware queue, reserved tags included. |
| 507 | * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag |
| 508 | * allocations. |
| 509 | * @cmd_size: Number of additional bytes to allocate per request. The block |
| 510 | * driver owns these additional bytes. |
| 511 | * @numa_node: NUMA node the storage adapter has been connected to. |
| 512 | * @timeout: Request processing timeout in jiffies. |
| 513 | * @flags: Zero or more BLK_MQ_F_* flags. |
| 514 | * @driver_data: Pointer to data owned by the block driver that created this |
| 515 | * tag set. |
| 516 | * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues |
| 517 | * elements. |
| 518 | * @shared_tags: |
| 519 | * Shared set of tags. Has @nr_hw_queues elements. If set, |
| 520 | * shared by all @tags. |
| 521 | * @tag_list_lock: Serializes tag_list accesses. |
| 522 | * @tag_list: List of the request queues that use this tag set. See also |
| 523 | * request_queue.tag_set_list. |
| 524 | * @srcu: Use as lock when type of the request queue is blocking |
| 525 | * (BLK_MQ_F_BLOCKING). |
| 526 | * @tags_srcu: SRCU used to defer freeing of tags page_list to prevent |
| 527 | * use-after-free when iterating tags. |
| 528 | * @update_nr_hwq_lock: |
| 529 | * Synchronize updating nr_hw_queues with add/del disk & |
| 530 | * switching elevator. |
| 531 | */ |
| 532 | struct blk_mq_tag_set { |
| 533 | const struct blk_mq_ops *ops; |
| 534 | struct blk_mq_queue_map map[HCTX_MAX_TYPES]; |
| 535 | unsigned int nr_maps; |
| 536 | unsigned int nr_hw_queues; |
| 537 | unsigned int queue_depth; |
| 538 | unsigned int reserved_tags; |
| 539 | unsigned int cmd_size; |
| 540 | int numa_node; |
| 541 | unsigned int timeout; |
| 542 | unsigned int flags; |
| 543 | void *driver_data; |
| 544 | |
| 545 | struct blk_mq_tags **tags; |
| 546 | |
| 547 | struct blk_mq_tags *shared_tags; |
| 548 | |
| 549 | struct mutex tag_list_lock; |
| 550 | struct list_head tag_list; |
| 551 | struct srcu_struct *srcu; |
| 552 | struct srcu_struct tags_srcu; |
| 553 | |
| 554 | struct rw_semaphore update_nr_hwq_lock; |
| 555 | }; |
| 556 | |
| 557 | /** |
| 558 | * struct blk_mq_queue_data - Data about a request inserted in a queue |
| 559 | * |
| 560 | * @rq: Request pointer. |
| 561 | * @last: If it is the last request in the queue. |
| 562 | */ |
| 563 | struct blk_mq_queue_data { |
| 564 | struct request *rq; |
| 565 | bool last; |
| 566 | }; |
| 567 | |
| 568 | typedef bool (busy_tag_iter_fn)(struct request *, void *); |
| 569 | |
| 570 | /** |
| 571 | * struct blk_mq_ops - Callback functions that implement block driver |
| 572 | * behaviour. |
| 573 | */ |
| 574 | struct blk_mq_ops { |
| 575 | /** |
| 576 | * @queue_rq: Queue a new request from block IO. |
| 577 | */ |
| 578 | blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *, |
| 579 | const struct blk_mq_queue_data *); |
| 580 | |
| 581 | /** |
| 582 | * @commit_rqs: If a driver uses bd->last to judge when to submit |
| 583 | * requests to hardware, it must define this function. In case of errors |
| 584 | * that make us stop issuing further requests, this hook serves the |
| 585 | * purpose of kicking the hardware (which the last request otherwise |
| 586 | * would have done). |
| 587 | */ |
| 588 | void (*commit_rqs)(struct blk_mq_hw_ctx *); |
| 589 | |
| 590 | /** |
| 591 | * @queue_rqs: Queue a list of new requests. Driver is guaranteed |
| 592 | * that each request belongs to the same queue. If the driver doesn't |
| 593 | * empty the @rqlist completely, then the rest will be queued |
| 594 | * individually by the block layer upon return. |
| 595 | */ |
| 596 | void (*queue_rqs)(struct rq_list *rqlist); |
| 597 | |
| 598 | /** |
| 599 | * @get_budget: Reserve a budget before queueing a request. Once .queue_rq |
| 600 | * has run, it is the driver's responsibility to release the |
| 601 | * reserved budget. The failure case of .get_budget also has to |
| 602 | * be handled to avoid I/O deadlock. |
| 603 | */ |
| 604 | int (*get_budget)(struct request_queue *); |
| 605 | |
| 606 | /** |
| 607 | * @put_budget: Release the reserved budget. |
| 608 | */ |
| 609 | void (*put_budget)(struct request_queue *, int); |
| 610 | |
| 611 | /** |
| 612 | * @set_rq_budget_token: store rq's budget token |
| 613 | */ |
| 614 | void (*set_rq_budget_token)(struct request *, int); |
| 615 | /** |
| 616 | * @get_rq_budget_token: retrieve rq's budget token |
| 617 | */ |
| 618 | int (*get_rq_budget_token)(struct request *); |
| 619 | |
| 620 | /** |
| 621 | * @timeout: Called on request timeout. |
| 622 | */ |
| 623 | enum blk_eh_timer_return (*timeout)(struct request *); |
| 624 | |
| 625 | /** |
| 626 | * @poll: Called to poll for completion of a specific tag. |
| 627 | */ |
| 628 | int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *); |
| 629 | |
| 630 | /** |
| 631 | * @complete: Mark the request as complete. |
| 632 | */ |
| 633 | void (*complete)(struct request *); |
| 634 | |
| 635 | /** |
| 636 | * @init_hctx: Called when the block layer side of a hardware queue has |
| 637 | * been set up, allowing the driver to allocate/init matching |
| 638 | * structures. |
| 639 | */ |
| 640 | int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int); |
| 641 | /** |
| 642 | * @exit_hctx: Ditto for exit/teardown. |
| 643 | */ |
| 644 | void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); |
| 645 | |
| 646 | /** |
| 647 | * @init_request: Called for every command allocated by the block layer |
| 648 | * to allow the driver to set up driver specific data. |
| 649 | * |
| 650 | * A tag greater than or equal to queue_depth is used for setting up |
| 651 | * the flush request. |
| 652 | */ |
| 653 | int (*init_request)(struct blk_mq_tag_set *set, struct request *, |
| 654 | unsigned int, unsigned int); |
| 655 | /** |
| 656 | * @exit_request: Ditto for exit/teardown. |
| 657 | */ |
| 658 | void (*exit_request)(struct blk_mq_tag_set *set, struct request *, |
| 659 | unsigned int); |
| 660 | |
| 661 | /** |
| 662 | * @cleanup_rq: Called before freeing a request which hasn't completed |
| 663 | * yet, usually to free the driver private data. |
| 664 | */ |
| 665 | void (*cleanup_rq)(struct request *); |
| 666 | |
| 667 | /** |
| 668 | * @busy: If set, returns whether or not this queue currently is busy. |
| 669 | */ |
| 670 | bool (*busy)(struct request_queue *); |
| 671 | |
| 672 | /** |
| 673 | * @map_queues: This allows drivers to specify their own queue mapping by |
| 674 | * overriding the setup-time function that builds the mq_map. |
| 675 | */ |
| 676 | void (*map_queues)(struct blk_mq_tag_set *set); |
| 677 | |
| 678 | #ifdef CONFIG_BLK_DEBUG_FS |
| 679 | /** |
| 680 | * @show_rq: Used by the debugfs implementation to show driver-specific |
| 681 | * information about a request. |
| 682 | */ |
| 683 | void (*show_rq)(struct seq_file *m, struct request *rq); |
| 684 | #endif |
| 685 | }; |
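As a rough sketch (not taken from any real driver), a minimal blk_mq_ops only needs ->queue_rq(); all other callbacks are optional. The mydrv_* names below are hypothetical:

```c
static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
				   const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	blk_mq_start_request(rq);
	/* translate @rq and hand it to the hardware here ... */
	return BLK_STS_OK;
}

static const struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq	= mydrv_queue_rq,
};
```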
| 686 | |
| 687 | /* Keep hctx_flag_name[] in sync with the definitions below */ |
| 688 | enum { |
| 689 | BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, |
| 690 | /* |
| 691 | * Set when this device requires underlying blk-mq device for |
| 692 | * completing IO: |
| 693 | */ |
| 694 | BLK_MQ_F_STACKING = 1 << 2, |
| 695 | BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, |
| 696 | BLK_MQ_F_BLOCKING = 1 << 4, |
| 697 | |
| 698 | /* |
| 699 | * Alloc tags on a round-robin base instead of the first available one. |
| 700 | */ |
| 701 | BLK_MQ_F_TAG_RR = 1 << 5, |
| 702 | |
| 703 | /* |
| 704 | * Select 'none' during queue registration in case of a single hwq |
| 705 | * or shared hwqs instead of 'mq-deadline'. |
| 706 | */ |
| 707 | BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6, |
| 708 | |
| 709 | BLK_MQ_F_MAX = 1 << 7, |
| 710 | }; |
| 711 | |
| 712 | #define BLK_MQ_MAX_DEPTH (10240) |
| 713 | #define BLK_MQ_NO_HCTX_IDX (-1U) |
| 714 | |
| 715 | enum { |
| 716 | /* Keep hctx_state_name[] in sync with the definitions below */ |
| 717 | BLK_MQ_S_STOPPED, |
| 718 | BLK_MQ_S_TAG_ACTIVE, |
| 719 | BLK_MQ_S_SCHED_RESTART, |
| 720 | /* hw queue is inactive after all its CPUs become offline */ |
| 721 | BLK_MQ_S_INACTIVE, |
| 722 | BLK_MQ_S_MAX |
| 723 | }; |
| 724 | |
| 725 | struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, |
| 726 | struct queue_limits *lim, void *queuedata, |
| 727 | struct lock_class_key *lkclass); |
| 728 | #define blk_mq_alloc_disk(set, lim, queuedata) \ |
| 729 | ({ \ |
| 730 | static struct lock_class_key __key; \ |
| 731 | \ |
| 732 | __blk_mq_alloc_disk(set, lim, queuedata, &__key); \ |
| 733 | }) |
| 734 | struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, |
| 735 | struct lock_class_key *lkclass); |
| 736 | struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, |
| 737 | struct queue_limits *lim, void *queuedata); |
| 738 | int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, |
| 739 | struct request_queue *q); |
| 740 | void blk_mq_destroy_queue(struct request_queue *); |
| 741 | |
| 742 | int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); |
| 743 | int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, |
| 744 | const struct blk_mq_ops *ops, unsigned int queue_depth, |
| 745 | unsigned int set_flags); |
| 746 | void blk_mq_free_tag_set(struct blk_mq_tag_set *set); |
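A hedged sketch of the usual probe-time sequence, assuming the hypothetical mydrv_mq_ops and a per-command struct mydrv_cmd; error handling is reduced to the essentials:

```c
static int mydrv_init_disk(struct mydrv_dev *dev)
{
	struct blk_mq_tag_set *set = &dev->tag_set;
	struct gendisk *disk;
	int ret;

	memset(set, 0, sizeof(*set));
	set->ops = &mydrv_mq_ops;
	set->nr_hw_queues = 1;
	set->queue_depth = 64;
	set->numa_node = NUMA_NO_NODE;
	set->cmd_size = sizeof(struct mydrv_cmd);	/* per-request PDU */

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	disk = blk_mq_alloc_disk(set, NULL, dev);	/* NULL: default queue limits */
	if (IS_ERR(disk)) {
		blk_mq_free_tag_set(set);
		return PTR_ERR(disk);
	}
	dev->disk = disk;
	return 0;
}
```

The matching teardown would typically delete and put the gendisk first and only then call blk_mq_free_tag_set().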
| 747 | |
| 748 | void blk_mq_free_request(struct request *rq); |
| 749 | int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, |
| 750 | unsigned int poll_flags); |
| 751 | |
| 752 | bool blk_mq_queue_inflight(struct request_queue *q); |
| 753 | |
| 754 | enum { |
| 755 | /* return when out of requests */ |
| 756 | BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), |
| 757 | /* allocate from reserved pool */ |
| 758 | BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), |
| 759 | /* set RQF_PM */ |
| 760 | BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), |
| 761 | }; |
| 762 | |
| 763 | struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, |
| 764 | blk_mq_req_flags_t flags); |
| 765 | struct request *blk_mq_alloc_request_hctx(struct request_queue *q, |
| 766 | blk_opf_t opf, blk_mq_req_flags_t flags, |
| 767 | unsigned int hctx_idx); |
| 768 | |
| 769 | /* |
| 770 | * Tag address space map. |
| 771 | */ |
| 772 | struct blk_mq_tags { |
| 773 | unsigned int nr_tags; |
| 774 | unsigned int nr_reserved_tags; |
| 775 | unsigned int active_queues; |
| 776 | |
| 777 | struct sbitmap_queue bitmap_tags; |
| 778 | struct sbitmap_queue breserved_tags; |
| 779 | |
| 780 | struct request **rqs; |
| 781 | struct request **static_rqs; |
| 782 | struct list_head page_list; |
| 783 | |
| 784 | /* |
| 785 | * used to clear request reference in rqs[] before freeing one |
| 786 | * request pool |
| 787 | */ |
| 788 | spinlock_t lock; |
| 789 | struct rcu_head rcu_head; |
| 790 | }; |
| 791 | |
| 792 | static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, |
| 793 | unsigned int tag) |
| 794 | { |
| 795 | if (tag < tags->nr_tags) { |
| 796 | prefetch(tags->rqs[tag]); |
| 797 | return tags->rqs[tag]; |
| 798 | } |
| 799 | |
| 800 | return NULL; |
| 801 | } |
| 802 | |
| 803 | enum { |
| 804 | BLK_MQ_UNIQUE_TAG_BITS = 16, |
| 805 | BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1, |
| 806 | }; |
| 807 | |
| 808 | u32 blk_mq_unique_tag(struct request *rq); |
| 809 | |
| 810 | static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag) |
| 811 | { |
| 812 | return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS; |
| 813 | } |
| 814 | |
| 815 | static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) |
| 816 | { |
| 817 | return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; |
| 818 | } |
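For example, a driver can recover the hardware queue index and the per-queue tag from the combined value (SCSI LLDs use a similar scheme); purely illustrative:

```c
/* Illustrative only: split the unique tag back into (hwq, tag). */
static void mydrv_trace_rq(struct request *rq)
{
	u32 unique = blk_mq_unique_tag(rq);

	pr_debug("mydrv: hwq %u tag %u\n",
		 blk_mq_unique_tag_to_hwq(unique),
		 blk_mq_unique_tag_to_tag(unique));
}
```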
| 819 | |
| 820 | /** |
| 821 | * blk_mq_rq_state() - read the current MQ_RQ_* state of a request |
| 822 | * @rq: target request. |
| 823 | */ |
| 824 | static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) |
| 825 | { |
| 826 | return READ_ONCE(rq->state); |
| 827 | } |
| 828 | |
| 829 | static inline int blk_mq_request_started(struct request *rq) |
| 830 | { |
| 831 | return blk_mq_rq_state(rq) != MQ_RQ_IDLE; |
| 832 | } |
| 833 | |
| 834 | static inline int blk_mq_request_completed(struct request *rq) |
| 835 | { |
| 836 | return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; |
| 837 | } |
| 838 | |
| 839 | /* |
| 841 | * Set the state to complete when completing a request from inside ->queue_rq. |
| 842 | * This is used by drivers that want to ensure special complete actions that |
| 843 | * need access to the request are called on failure, e.g. by nvme for |
| 844 | * multipathing. |
| 845 | */ |
| 846 | static inline void blk_mq_set_request_complete(struct request *rq) |
| 847 | { |
| 848 | WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); |
| 849 | } |
| 850 | |
| 851 | /* |
| 852 | * Complete the request directly instead of deferring it to softirq or |
| 853 | * completing it on another CPU. Useful in preemptible, non-interrupt context. |
| 854 | */ |
| 855 | static inline void blk_mq_complete_request_direct(struct request *rq, |
| 856 | void (*complete)(struct request *rq)) |
| 857 | { |
| 858 | WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); |
| 859 | complete(rq); |
| 860 | } |
| 861 | |
| 862 | void blk_mq_start_request(struct request *rq); |
| 863 | void blk_mq_end_request(struct request *rq, blk_status_t error); |
| 864 | void __blk_mq_end_request(struct request *rq, blk_status_t error); |
| 865 | void blk_mq_end_request_batch(struct io_comp_batch *ib); |
| 866 | |
| 867 | /* |
| 868 | * Only need start/end time stamping if we have iostat or |
| 869 | * blk stats enabled, or using an IO scheduler. |
| 870 | */ |
| 871 | static inline bool blk_mq_need_time_stamp(struct request *rq) |
| 872 | { |
| 873 | return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED)); |
| 874 | } |
| 875 | |
| 876 | static inline bool blk_mq_is_reserved_rq(struct request *rq) |
| 877 | { |
| 878 | return rq->rq_flags & RQF_RESV; |
| 879 | } |
| 880 | |
| 881 | /** |
| 882 | * blk_mq_add_to_batch() - add a request to the completion batch |
| 883 | * @req: The request to add to batch |
| 884 | * @iob: The batch to add the request to |
| 885 | * @is_error: Specify true if the request failed with an error |
| 886 | * @complete: The completion handler for the request |
| 887 | * |
| 888 | * Batched completions only work when there is no I/O error and no special |
| 889 | * ->end_io handler. |
| 890 | * |
| 891 | * Return: true when the request was added to the batch, otherwise false |
| 892 | */ |
| 893 | static inline bool blk_mq_add_to_batch(struct request *req, |
| 894 | struct io_comp_batch *iob, bool is_error, |
| 895 | void (*complete)(struct io_comp_batch *)) |
| 896 | { |
| 897 | /* |
| 898 | * Check various conditions that exclude batch processing: |
| 899 | * 1) No batch container |
| 900 | * 2) Has scheduler data attached |
| 901 | * 3) Not a passthrough request and end_io set |
| 902 | * 4) Not a passthrough request and failed with an error |
| 903 | */ |
| 904 | if (!iob) |
| 905 | return false; |
| 906 | if (req->rq_flags & RQF_SCHED_TAGS) |
| 907 | return false; |
| 908 | if (!blk_rq_is_passthrough(req)) { |
| 909 | if (req->end_io) |
| 910 | return false; |
| 911 | if (is_error) |
| 912 | return false; |
| 913 | } |
| 914 | |
| 915 | if (!iob->complete) |
| 916 | iob->complete = complete; |
| 917 | else if (iob->complete != complete) |
| 918 | return false; |
| 919 | iob->need_ts |= blk_mq_need_time_stamp(req); |
| 920 | rq_list_add_tail(&iob->req_list, req); |
| 921 | return true; |
| 922 | } |
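A hedged sketch of how a polled completion path typically uses this, modelled on the pattern NVMe follows; mydrv_unmap() and the status handling are hypothetical:

```c
static void mydrv_complete_batch(struct io_comp_batch *iob)
{
	struct request *rq;

	rq_list_for_each(&iob->req_list, rq)
		mydrv_unmap(rq);		/* hypothetical per-request teardown */
	blk_mq_end_request_batch(iob);
}

static void mydrv_complete_one(struct request *rq, struct io_comp_batch *iob,
			       blk_status_t status)
{
	/* fall back to the regular completion path when batching isn't possible */
	if (!blk_mq_add_to_batch(rq, iob, status != BLK_STS_OK,
				 mydrv_complete_batch))
		blk_mq_complete_request(rq);
}
```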
| 923 | |
| 924 | void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); |
| 925 | void blk_mq_kick_requeue_list(struct request_queue *q); |
| 926 | void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); |
| 927 | void blk_mq_complete_request(struct request *rq); |
| 928 | bool blk_mq_complete_request_remote(struct request *rq); |
| 929 | void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); |
| 930 | void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); |
| 931 | void blk_mq_stop_hw_queues(struct request_queue *q); |
| 932 | void blk_mq_start_hw_queues(struct request_queue *q); |
| 933 | void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); |
| 934 | void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); |
| 935 | void blk_mq_quiesce_queue(struct request_queue *q); |
| 936 | void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); |
| 937 | void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); |
| 938 | void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set); |
| 939 | void blk_mq_unquiesce_queue(struct request_queue *q); |
| 940 | void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); |
| 941 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); |
| 942 | void blk_mq_run_hw_queues(struct request_queue *q, bool async); |
| 943 | void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); |
| 944 | void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, |
| 945 | busy_tag_iter_fn *fn, void *priv); |
| 946 | void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); |
| 947 | void blk_mq_freeze_queue_nomemsave(struct request_queue *q); |
| 948 | void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q); |
| 949 | static inline unsigned int __must_check |
| 950 | blk_mq_freeze_queue(struct request_queue *q) |
| 951 | { |
| 952 | unsigned int memflags = memalloc_noio_save(); |
| 953 | |
| 954 | blk_mq_freeze_queue_nomemsave(q); |
| 955 | return memflags; |
| 956 | } |
| 957 | static inline void |
| 958 | blk_mq_unfreeze_queue(struct request_queue *q, unsigned int memflags) |
| 959 | { |
| 960 | blk_mq_unfreeze_queue_nomemrestore(q); |
| 961 | memalloc_noio_restore(memflags); |
| 962 | } |
| 963 | void blk_freeze_queue_start(struct request_queue *q); |
| 964 | void blk_mq_freeze_queue_wait(struct request_queue *q); |
| 965 | int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, |
| 966 | unsigned long timeout); |
| 967 | void blk_mq_unfreeze_queue_non_owner(struct request_queue *q); |
| 968 | void blk_freeze_queue_start_non_owner(struct request_queue *q); |
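Typical usage of the freeze pair defined above, as a minimal sketch: freeze to drain in-flight requests and block new I/O, apply the change, then unfreeze with the returned memalloc flags:

```c
static void mydrv_apply_change(struct request_queue *q)
{
	unsigned int memflags;

	memflags = blk_mq_freeze_queue(q);
	/* no requests are in flight here; update driver/queue state safely */
	blk_mq_unfreeze_queue(q, memflags);
}
```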
| 969 | |
| 970 | unsigned int blk_mq_num_possible_queues(unsigned int max_queues); |
| 971 | unsigned int blk_mq_num_online_queues(unsigned int max_queues); |
| 972 | void blk_mq_map_queues(struct blk_mq_queue_map *qmap); |
| 973 | void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, |
| 974 | struct device *dev, unsigned int offset); |
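A ->map_queues() implementation with no hardware-specific affinity information can simply fall back to blk_mq_map_queues(); a sketch, assuming a single default map:

```c
static void mydrv_map_queues(struct blk_mq_tag_set *set)
{
	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}
```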
| 975 | void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); |
| 976 | |
| 977 | void blk_mq_quiesce_queue_nowait(struct request_queue *q); |
| 978 | |
| 979 | unsigned int blk_mq_rq_cpu(struct request *rq); |
| 980 | |
| 981 | bool __blk_should_fake_timeout(struct request_queue *q); |
| 982 | static inline bool blk_should_fake_timeout(struct request_queue *q) |
| 983 | { |
| 984 | if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) && |
| 985 | test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) |
| 986 | return __blk_should_fake_timeout(q); |
| 987 | return false; |
| 988 | } |
| 989 | |
| 990 | /** |
| 991 | * blk_mq_rq_from_pdu - cast a PDU to a request |
| 992 | * @pdu: the PDU (Protocol Data Unit) to be cast |
| 993 | * |
| 994 | * Return: request |
| 995 | * |
| 996 | * Driver command data is immediately after the request. So subtract request |
| 997 | * size to get back to the original request. |
| 998 | */ |
| 999 | static inline struct request *blk_mq_rq_from_pdu(void *pdu) |
| 1000 | { |
| 1001 | return pdu - sizeof(struct request); |
| 1002 | } |
| 1003 | |
| 1004 | /** |
| 1005 | * blk_mq_rq_to_pdu - cast a request to a PDU |
| 1006 | * @rq: the request to be cast |
| 1007 | * |
| 1008 | * Return: pointer to the PDU |
| 1009 | * |
| 1010 | * Driver command data is immediately after the request. So add request to get |
| 1011 | * the PDU. |
| 1012 | */ |
| 1013 | static inline void *blk_mq_rq_to_pdu(struct request *rq) |
| 1014 | { |
| 1015 | return rq + 1; |
| 1016 | } |
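With cmd_size set in the tag set, the PDU lives directly behind each request, so the two conversions above are just pointer arithmetic. A hedged sketch with a hypothetical struct mydrv_cmd:

```c
struct mydrv_cmd {
	__le64 lba;
	__le32 nr_sectors;
};

static void mydrv_prep_cmd(struct request *rq)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

	cmd->lba = cpu_to_le64(blk_rq_pos(rq));
	cmd->nr_sectors = cpu_to_le32(blk_rq_sectors(rq));
}

static void mydrv_complete_cmd(struct mydrv_cmd *cmd, blk_status_t status)
{
	blk_mq_end_request(blk_mq_rq_from_pdu(cmd), status);
}
```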
| 1017 | |
| 1018 | static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id) |
| 1019 | { |
| 1020 | struct blk_mq_hw_ctx *hctx; |
| 1021 | |
| 1022 | rcu_read_lock(); |
| 1023 | hctx = rcu_dereference(q->queue_hw_ctx)[id]; |
| 1024 | rcu_read_unlock(); |
| 1025 | |
| 1026 | return hctx; |
| 1027 | } |
| 1028 | |
| 1029 | #define queue_for_each_hw_ctx(q, hctx, i) \ |
| 1030 | for ((i) = 0; (i) < (q)->nr_hw_queues && \ |
| 1031 | ({ hctx = queue_hctx((q), i); 1; }); (i)++) |
| 1032 | |
| 1033 | #define hctx_for_each_ctx(hctx, ctx, i) \ |
| 1034 | for ((i) = 0; (i) < (hctx)->nr_ctx && \ |
| 1035 | ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) |
| 1036 | |
| 1037 | static inline void blk_mq_cleanup_rq(struct request *rq) |
| 1038 | { |
| 1039 | if (rq->q->mq_ops->cleanup_rq) |
| 1040 | rq->q->mq_ops->cleanup_rq(rq); |
| 1041 | } |
| 1042 | |
| 1043 | void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, |
| 1044 | struct lock_class_key *key); |
| 1045 | |
| 1046 | static inline bool rq_is_sync(struct request *rq) |
| 1047 | { |
| 1048 | return op_is_sync(rq->cmd_flags); |
| 1049 | } |
| 1050 | |
| 1051 | void blk_rq_init(struct request_queue *q, struct request *rq); |
| 1052 | int blk_rq_prep_clone(struct request *rq, struct request *rq_src, |
| 1053 | struct bio_set *bs, gfp_t gfp_mask, |
| 1054 | int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); |
| 1055 | void blk_rq_unprep_clone(struct request *rq); |
| 1056 | blk_status_t blk_insert_cloned_request(struct request *rq); |
| 1057 | |
| 1058 | struct rq_map_data { |
| 1059 | struct page **pages; |
| 1060 | unsigned long offset; |
| 1061 | unsigned short page_order; |
| 1062 | unsigned short nr_entries; |
| 1063 | bool null_mapped; |
| 1064 | bool from_user; |
| 1065 | }; |
| 1066 | |
| 1067 | int blk_rq_map_user(struct request_queue *, struct request *, |
| 1068 | struct rq_map_data *, void __user *, unsigned long, gfp_t); |
| 1069 | int blk_rq_map_user_io(struct request *, struct rq_map_data *, |
| 1070 | void __user *, unsigned long, gfp_t, bool, int, bool, int); |
| 1071 | int blk_rq_map_user_iov(struct request_queue *, struct request *, |
| 1072 | struct rq_map_data *, const struct iov_iter *, gfp_t); |
| 1073 | int blk_rq_unmap_user(struct bio *); |
| 1074 | int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len, |
| 1075 | gfp_t gfp); |
| 1076 | int blk_rq_append_bio(struct request *rq, struct bio *bio); |
| 1077 | void blk_execute_rq_nowait(struct request *rq, bool at_head); |
| 1078 | blk_status_t blk_execute_rq(struct request *rq, bool at_head); |
| 1079 | bool blk_rq_is_poll(struct request *rq); |
| 1080 | |
| 1081 | struct req_iterator { |
| 1082 | struct bvec_iter iter; |
| 1083 | struct bio *bio; |
| 1084 | }; |
| 1085 | |
| 1086 | #define __rq_for_each_bio(_bio, rq) \ |
| 1087 | if ((rq->bio)) \ |
| 1088 | for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) |
| 1089 | |
| 1090 | #define rq_for_each_segment(bvl, _rq, _iter) \ |
| 1091 | __rq_for_each_bio(_iter.bio, _rq) \ |
| 1092 | bio_for_each_segment(bvl, _iter.bio, _iter.iter) |
| 1093 | |
| 1094 | #define rq_for_each_bvec(bvl, _rq, _iter) \ |
| 1095 | __rq_for_each_bio(_iter.bio, _rq) \ |
| 1096 | bio_for_each_bvec(bvl, _iter.bio, _iter.iter) |
| 1097 | |
| 1098 | #define rq_iter_last(bvec, _iter) \ |
| 1099 | (_iter.bio->bi_next == NULL && \ |
| 1100 | bio_iter_last(bvec, _iter.iter)) |
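For instance, summing the data bytes of a request by walking its segments (for a data-carrying request this matches blk_rq_bytes()); purely illustrative:

```c
static unsigned int mydrv_count_bytes(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec bv;
	unsigned int bytes = 0;

	rq_for_each_segment(bv, rq, iter)
		bytes += bv.bv_len;

	return bytes;
}
```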
| 1101 | |
| 1102 | /* |
| 1103 | * blk_rq_pos() : the current sector |
| 1104 | * blk_rq_bytes() : bytes left in the entire request |
| 1105 | * blk_rq_cur_bytes() : bytes left in the current segment |
| 1106 | * blk_rq_sectors() : sectors left in the entire request |
| 1107 | * blk_rq_cur_sectors() : sectors left in the current segment |
| 1108 | * blk_rq_stats_sectors() : sectors of the entire request used for stats |
| 1109 | */ |
| 1110 | static inline sector_t blk_rq_pos(const struct request *rq) |
| 1111 | { |
| 1112 | return rq->__sector; |
| 1113 | } |
| 1114 | |
| 1115 | static inline unsigned int blk_rq_bytes(const struct request *rq) |
| 1116 | { |
| 1117 | return rq->__data_len; |
| 1118 | } |
| 1119 | |
| 1120 | static inline int blk_rq_cur_bytes(const struct request *rq) |
| 1121 | { |
| 1122 | if (!rq->bio) |
| 1123 | return 0; |
| 1124 | if (!bio_has_data(rq->bio)) /* dataless requests such as discard */ |
| 1125 | return rq->bio->bi_iter.bi_size; |
| 1126 | return bio_iovec(rq->bio).bv_len; |
| 1127 | } |
| 1128 | |
| 1129 | static inline unsigned int blk_rq_sectors(const struct request *rq) |
| 1130 | { |
| 1131 | return blk_rq_bytes(rq) >> SECTOR_SHIFT; |
| 1132 | } |
| 1133 | |
| 1134 | static inline unsigned int blk_rq_cur_sectors(const struct request *rq) |
| 1135 | { |
| 1136 | return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; |
| 1137 | } |
| 1138 | |
| 1139 | static inline unsigned int blk_rq_stats_sectors(const struct request *rq) |
| 1140 | { |
| 1141 | return rq->stats_sectors; |
| 1142 | } |
| 1143 | |
| 1144 | /* |
| 1145 | * Some commands like WRITE SAME have a payload or data transfer size which |
| 1146 | * is different from the size of the request. Any driver that supports such |
| 1147 | * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to |
| 1148 | * calculate the data transfer size. |
| 1149 | */ |
| 1150 | static inline unsigned int blk_rq_payload_bytes(struct request *rq) |
| 1151 | { |
| 1152 | if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) |
| 1153 | return rq->special_vec.bv_len; |
| 1154 | return blk_rq_bytes(rq); |
| 1155 | } |
| 1156 | |
| 1157 | /* |
| 1158 | * Return the first full biovec in the request. The caller needs to check that |
| 1159 | * the request has at least one bvec before calling this helper. |
| 1160 | */ |
| 1161 | static inline struct bio_vec req_bvec(struct request *rq) |
| 1162 | { |
| 1163 | if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) |
| 1164 | return rq->special_vec; |
| 1165 | return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); |
| 1166 | } |
| 1167 | |
| 1168 | static inline unsigned int blk_rq_count_bios(struct request *rq) |
| 1169 | { |
| 1170 | unsigned int nr_bios = 0; |
| 1171 | struct bio *bio; |
| 1172 | |
| 1173 | __rq_for_each_bio(bio, rq) |
| 1174 | nr_bios++; |
| 1175 | |
| 1176 | return nr_bios; |
| 1177 | } |
| 1178 | |
| 1179 | void blk_steal_bios(struct bio_list *list, struct request *rq); |
| 1180 | |
| 1181 | /* |
| 1182 | * Request completion related functions. |
| 1183 | * |
| 1184 | * blk_update_request() completes given number of bytes and updates |
| 1185 | * the request without completing it. |
| 1186 | */ |
| 1187 | bool blk_update_request(struct request *rq, blk_status_t error, |
| 1188 | unsigned int nr_bytes); |
| 1189 | void blk_abort_request(struct request *); |
| 1190 | |
| 1191 | /* |
| 1192 | * Number of physical segments as sent to the device. |
| 1193 | * |
| 1194 | * Normally this is the number of discontiguous data segments sent by the |
| 1195 | * submitter. But for a data-less command like discard we might have no |
| 1196 | * actual data segments submitted, and the driver might have to add its |
| 1197 | * own special payload. In that case we still return 1 here so that this |
| 1198 | * special payload will be mapped. |
| 1199 | */ |
| 1200 | static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) |
| 1201 | { |
| 1202 | if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) |
| 1203 | return 1; |
| 1204 | return rq->nr_phys_segments; |
| 1205 | } |
| 1206 | |
| 1207 | /* |
| 1208 | * Number of discard segments (or ranges) the driver needs to fill in. |
| 1209 | * Each discard bio merged into a request is counted as one segment. |
| 1210 | */ |
| 1211 | static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) |
| 1212 | { |
| 1213 | return max_t(unsigned short, rq->nr_phys_segments, 1); |
| 1214 | } |
| 1215 | |
| 1216 | /** |
| 1217 | * blk_rq_nr_bvec - return number of bvecs in a request |
| 1218 | * @rq: request to calculate bvecs for |
| 1219 | * |
| 1220 | * Returns the number of bvecs. |
| 1221 | */ |
| 1222 | static inline unsigned int blk_rq_nr_bvec(struct request *rq) |
| 1223 | { |
| 1224 | struct req_iterator rq_iter; |
| 1225 | struct bio_vec bv; |
| 1226 | unsigned int nr_bvec = 0; |
| 1227 | |
| 1228 | rq_for_each_bvec(bv, rq, rq_iter) |
| 1229 | nr_bvec++; |
| 1230 | |
| 1231 | return nr_bvec; |
| 1232 | } |
| 1233 | |
| 1234 | int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, |
| 1235 | struct scatterlist **last_sg); |
| 1236 | static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist) |
| 1237 | { |
| 1238 | struct scatterlist *last_sg = NULL; |
| 1239 | |
| 1240 | return __blk_rq_map_sg(rq, sglist, &last_sg); |
| 1241 | } |
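A hedged sketch of how a DMA-capable driver might use this from ->queue_rq(); dev->dma_dev and the sglist sizing are hypothetical, and <linux/dma-mapping.h> would be needed for dma_map_sg():

```c
static int mydrv_map_data(struct mydrv_dev *dev, struct request *rq,
			  struct scatterlist *sgl)
{
	int nents;

	sg_init_table(sgl, blk_rq_nr_phys_segments(rq));
	nents = blk_rq_map_sg(rq, sgl);
	if (nents <= 0)
		return -EIO;

	/* dma_map_sg() returns 0 on mapping failure */
	return dma_map_sg(dev->dma_dev, sgl, nents, rq_dma_dir(rq));
}
```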
| 1242 | void blk_dump_rq_flags(struct request *, char *); |
| 1243 | |
| 1244 | #endif /* BLK_MQ_H */ |
| 1245 | |