zig

fork of https://codeberg.org/ziglang/zig
Log | Files | Refs | README | LICENSE

netmap.h (35558B) - Raw


      1 /*-
      2  * SPDX-License-Identifier: BSD-2-Clause
      3  *
      4  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  *
     10  *   1. Redistributions of source code must retain the above copyright
     11  *      notice, this list of conditions and the following disclaimer.
     12  *   2. Redistributions in binary form must reproduce the above copyright
     13  *      notice, this list of conditions and the following disclaimer in the
     14  *      documentation and/or other materials provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
     17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
     20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     26  * SUCH DAMAGE.
     27  */
     28 
     29 /*
     30  *
     31  * Definitions of constants and the structures used by the netmap
     32  * framework, for the part visible to both kernel and userspace.
     33  * Detailed info on netmap is available with "man netmap" or at
     34  *
     35  *	http://info.iet.unipi.it/~luigi/netmap/
     36  *
     37  * This API is also used to communicate with the VALE software switch
     38  */
     39 
     40 #ifndef _NET_NETMAP_H_
     41 #define _NET_NETMAP_H_
     42 
     43 #define	NETMAP_API	14		/* current API version (see nr_version) */
     44 
     45 #define	NETMAP_MIN_API	14		/* min and max versions accepted */
     46 #define	NETMAP_MAX_API	15
     47 /*
     48  * Some fields should be cache-aligned to reduce contention.
     49  * The alignment is architecture and OS dependent, but rather than
     50  * digging into OS headers to find the exact value we use an estimate
     51  * that should cover most architectures.
     52  */
     53 #define NM_CACHE_ALIGN	128	/* bytes; aligns netmap_ring.sem */
     54 
     55 /*
     56  * --- Netmap data structures ---
     57  *
     58  * The userspace data structures used by netmap are shown below.
     59  * They are allocated by the kernel and mmap()ed by userspace threads.
     60  * Pointers are implemented as memory offsets or indexes,
     61  * so that they can be easily dereferenced in kernel and userspace.
     62 
     63    KERNEL (opaque, obviously)
     64 
     65   ====================================================================
     66                                           |
     67    USERSPACE                              |      struct netmap_ring
     68                                           +---->+---------------+
     69                                               / | head,cur,tail |
     70    struct netmap_if (nifp, 1 per fd)         /  | buf_ofs       |
     71     +----------------+                      /   | other fields  |
     72     | ni_tx_rings    |                     /    +===============+
     73     | ni_rx_rings    |                    /     | buf_idx, len  | slot[0]
     74     |                |                   /      | flags, ptr    |
     75     |                |                  /       +---------------+
     76     +================+                 /        | buf_idx, len  | slot[1]
     77     | txring_ofs[0]  | (rel.to nifp)--'         | flags, ptr    |
     78     | txring_ofs[1]  |                          +---------------+
     79      (tx+htx entries)                           (num_slots entries)
     80     | txring_ofs[t]  |                          | buf_idx, len  | slot[n-1]
     81     +----------------+                          | flags, ptr    |
     82     | rxring_ofs[0]  |                          +---------------+
     83     | rxring_ofs[1]  |
     84      (rx+hrx entries)
     85     | rxring_ofs[r]  |
     86     +----------------+
     87 
     88  * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to
     89  * a file descriptor, the mmap()ed region contains a (logically readonly)
     90  * struct netmap_if pointing to struct netmap_ring's.
     91  *
     92  * There is one netmap_ring per physical NIC ring, plus at least one tx/rx ring
     93  * pair attached to the host stack (these pairs are unused for non-NIC ports).
     94  *
     95  * All physical/host stack ports share the same memory region,
     96  * so that zero-copy can be implemented between them.
     97  * VALE switch ports instead have separate memory regions.
     98  *
     99  * The netmap_ring is the userspace-visible replica of the NIC ring.
    100  * Each slot has the index of a buffer (MTU-sized and residing in the
    101  * mmapped region), its length and some flags. An extra 64-bit pointer
    102  * is provided for user-supplied buffers in the tx path.
    103  *
    104  * In user space, the buffer address is computed as
    105  *	(char *)ring + buf_ofs + index * NETMAP_BUF_SIZE
    106  *
    107  * Added in NETMAP_API 11:
    108  *
    109  * + NIOCREGIF can request the allocation of extra spare buffers from
    110  *   the same memory pool. The desired number of buffers must be in
    111  *   nr_arg3. The ioctl may return fewer buffers, depending on memory
    112  *   availability. nr_arg3 will return the actual value, and, once
    113  *   mapped, nifp->ni_bufs_head will be the index of the first buffer.
    114  *
    115  *   The buffers are linked to each other using the first uint32_t
    116  *   as the index. On close, ni_bufs_head must point to the list of
    117  *   buffers to be released.
    118  *
    119  * + NIOCREGIF can attach to PIPE rings sharing the same memory
    120  *   space with a parent device. The ifname indicates the parent device,
    121  *   which must already exist. Flags in nr_flags indicate if we want to
    122  *   bind the master or slave side, the index (from nr_ringid)
    123  *   is just a cookie and does not need to be sequential.
    124  *
    125  * + NIOCREGIF can also attach to 'monitor' rings that replicate
    126  *   the content of specific rings, also from the same memory space.
    127  *
    128  *   Extra flags in nr_flags support the above functions.
    129  *   Application libraries may use the following naming scheme:
    130  *	netmap:foo			all NIC rings pairs
    131  *	netmap:foo^			only host rings pairs
    132  *	netmap:foo^k			the k-th host rings pair
    133  *	netmap:foo+			all NIC rings + host rings pairs
    134  *	netmap:foo-k			the k-th NIC rings pair
    135  *	netmap:foo{k			PIPE rings pair k, master side
    136  *	netmap:foo}k			PIPE rings pair k, slave side
    137  *
    138  * Some notes about host rings:
    139  *
    140  * + The RX host rings are used to store those packets that the host network
    141  *   stack is trying to transmit through a NIC queue, but only if that queue
    142  *   is currently in netmap mode. Netmap will not intercept host stack mbufs
    143  *   designated to NIC queues that are not in netmap mode. As a consequence,
    144  *   registering a netmap port with netmap:foo^ is not enough to intercept
    145  *   mbufs in the RX host rings; the netmap port should be registered with
    146  *   netmap:foo*, or another registration should be done to open at least a
    147  *   NIC TX queue in netmap mode.
    148  *
    149  * + Netmap is not currently able to deal with intercepted transmit mbufs which
    150  *   require offloadings like TSO, UFO, checksumming offloadings, etc. It is
    151  *   responsibility of the user to disable those offloadings (e.g. using
    152  *   ifconfig on FreeBSD or ethtool -K on Linux) for an interface that is being
    153  *   used in netmap mode. If the offloadings are not disabled, GSO and/or
    154  *   unchecksummed packets may be dropped immediately or end up in the host RX
    155  *   rings, and will be dropped as soon as the packet reaches another netmap
    156  *   adapter.
    157  */
    158 
    159 /*
    160  * struct netmap_slot is a buffer descriptor (one entry of netmap_ring.slot[])
    161  */
    162 struct netmap_slot {
    163 	uint32_t buf_idx;	/* buffer index */
    164 	uint16_t len;		/* length for this slot */
    165 	uint16_t flags;		/* NS_* flags: buf changed, etc. */
    166 	uint64_t ptr;		/* pointer for indirect buffers (NS_INDIRECT); see also offset_mask */
    167 };
    168 
    169 /*
    170  * The following flags (stored in netmap_slot.flags) control how the slot is used
    171  */
    172 
    173 #define	NS_BUF_CHANGED	0x0001	/* buf_idx changed */
    174 	/*
    175 	 * must be set whenever buf_idx is changed (as it might be
    176 	 * necessary to recompute the physical address and mapping)
    177 	 *
    178 	 * It is also set by the kernel whenever the buf_idx is
    179 	 * changed internally (e.g., by pipes). Applications may
    180 	 * use this information to know when they can reuse the
    181 	 * contents of previously prepared buffers.
    182 	 */
    183 
    184 #define	NS_REPORT	0x0002	/* ask the hardware to report results */
    185 	/*
    186 	 * Request notification when slot is used by the hardware.
    187 	 * Normally transmit completions are handled lazily and
    188 	 * may be unreported. This flag lets us know when a slot
    189 	 * has been sent (e.g. to terminate the sender).
    190 	 */
    191 
    192 #define	NS_FORWARD	0x0004	/* pass packet 'forward' */
    193 	/*
    194 	 * (Only for physical ports, rx rings with NR_FORWARD set).
    195 	 * Slots released to the kernel (i.e. before ring->head) with
    196 	 * this flag set are passed to the peer ring (host/NIC),
    197 	 * thus restoring the host-NIC connection for these slots.
    198 	 * This supports efficient traffic monitoring or firewalling.
    199 	 */
    200 
    201 #define	NS_NO_LEARN	0x0008	/* disable bridge learning */
    202  	/*
    203 	 * On a VALE switch, do not 'learn' the source port for
    204  	 * this buffer.
    205 	 */
    206 
    207 #define	NS_INDIRECT	0x0010	/* userspace buffer */
    208  	/*
    209 	 * (VALE tx rings only) data is in a userspace buffer,
    210 	 * whose address is in the 'ptr' field in the slot.
    211 	 */
    212 
    213 #define	NS_MOREFRAG	0x0020	/* packet has more fragments */
    214  	/*
    215 	 * (VALE ports, ptnetmap ports and some NIC ports, e.g.
    216          * ixgbe and i40e on Linux)
    217 	 * Set on all but the last slot of a multi-segment packet.
    218 	 * The 'len' field refers to the individual fragment.
    219 	 */
    220 
    221 #define NS_TXMON	0x0040
    222 	/* (monitor ports only) the packet comes from the TX
    223 	 * ring of the monitored port
    224 	 */
    225 
    226 #define	NS_PORT_SHIFT	8
    227 #define	NS_PORT_MASK	(0xff << NS_PORT_SHIFT)
    228 	/*
    229  	 * The high 8 bits of the flag, if not zero, indicate the
    230 	 * destination port for the VALE switch, overriding
    231  	 * the lookup table.
    232  	 */
    233 
    234 #define	NS_RFRAGS(_slot)	( ((_slot)->flags >> 8) & 0xff)
    235 	/*
    236 	 * (VALE rx rings only) the high 8 bits
    237 	 *  are the number of fragments.
    238 	 */
    239 
    240 #define NETMAP_MAX_FRAGS	64	/* max number of fragments */
    241 
    242 
    243 /*
    244  * struct netmap_ring
    245  *
    246  * Netmap representation of a TX or RX ring (also known as "queue").
    247  * This is a queue implemented as a fixed-size circular array.
    248  * At the software level the important fields are: head, cur, tail.
    249  *
    250  * In TX rings:
    251  *
    252  *	head	first slot available for transmission.
    253  *	cur	wakeup point. select() and poll() will unblock
    254  *		when 'tail' moves past 'cur'
    255  *	tail	(readonly) first slot reserved to the kernel
    256  *
    257  *	[head .. tail-1] can be used for new packets to send;
    258  *	'head' and 'cur' must be incremented as slots are filled
    259  *	    with new packets to be sent;
    260  *	'cur' can be moved further ahead if we need more space
    261  *	for new transmissions. XXX todo (2014-03-12)
    262  *
    263  * In RX rings:
    264  *
    265  *	head	first valid received packet
    266  *	cur	wakeup point. select() and poll() will unblock
    267  *		when 'tail' moves past 'cur'
    268  *	tail	(readonly) first slot reserved to the kernel
    269  *
    270  *	[head .. tail-1] contain received packets;
    271  *	'head' and 'cur' must be incremented as slots are consumed
    272  *		and can be returned to the kernel;
    273  *	'cur' can be moved further ahead if we want to wait for
    274  *		new packets without returning the previous ones.
    275  *
    276  * DATA OWNERSHIP/LOCKING:
    277  *	The netmap_ring, and all slots and buffers in the range
    278  *	[head .. tail-1] are owned by the user program;
    279  *	the kernel only accesses them during a netmap system call
    280  *	and in the user thread context.
    281  *
    282  *	Other slots and buffers are reserved for use by the kernel
    283  */
    284 struct netmap_ring {
    285 	/*
    286 	 * buf_ofs is meant to be used through macros.
    287 	 * It contains the offset of the buffer region from this
    288 	 * descriptor.
    289 	 */
    290 	const int64_t	buf_ofs;
    291 	const uint32_t	num_slots;	/* number of slots in the ring. */
    292 	const uint32_t	nr_buf_size;
    293 	const uint16_t	ringid;
    294 	const uint16_t	dir;		/* 0: tx, 1: rx */
    295 
    296 	uint32_t        head;		/* (u) first user slot */
    297 	uint32_t        cur;		/* (u) wakeup point */
    298 	uint32_t	tail;		/* (k) first kernel slot */
    299 
    300 	uint32_t	flags;		/* NR_* ring flags (NR_TIMESTAMP, NR_FORWARD) */
    301 
    302 	struct timeval	ts;		/* (k) time of last *sync() */
    303 
    304 	/* offset_mask is used to isolate the part of the ptr field
    305 	 * in the slots used to contain an offset in the buffer.
    306 	 * It is zero if the ring has not been opened using the
    307 	 * NETMAP_REQ_OPT_OFFSETS option.
    308 	 */
    309 	const uint64_t	offset_mask;
    310 	/* the alignment requirement, in bytes, for the start
    311 	 * of the packets inside the buffers.
    312 	 * User programs should take this alignment into
    313 	 * account when specifying buffer-offsets in TX slots.
    314 	 */
    315 	const uint64_t	buf_align;
    316 
    317 	/* opaque room for a mutex or similar object */
    318 #if !defined(_WIN32) || defined(__CYGWIN__)
    319 	uint8_t	__attribute__((__aligned__(NM_CACHE_ALIGN))) sem[128];
    320 #else
    321 	uint8_t	__declspec(align(NM_CACHE_ALIGN)) sem[128];
    322 #endif
    323 
    324 	/* the slots follow. This struct has variable size */
    325 	struct netmap_slot slot[0];	/* array of slots. */
    326 };
    327 
    328 
    329 /*
    330  * RING FLAGS (values for netmap_ring.flags)
    331  */
    332 #define	NR_TIMESTAMP	0x0002		/* set timestamp on *sync() */
    333 	/*
    334 	 * updates the 'ts' field on each netmap syscall. This saves
    335 	 * a separate gettimeofday(), and is not much worse than
    336 	 * software timestamps generated in the interrupt handler.
    337 	 */
    338 
    339 #define	NR_FORWARD	0x0004		/* enable NS_FORWARD for ring */
    340  	/*
    341 	 * Enables the NS_FORWARD slot flag for the ring.
    342 	 */
    343 
    344 /*
    345  * Helper functions for kernel and userspace
    346  */
    347 
    348 /*
    349  * Return non-zero when the ring has no slots available to the user
    350  * (ring->head == ring->tail), 0 otherwise. We use ring->head, which
    351  * points to the next netmap slot to be published to netmap, because the
    352  * application may move ring->cur further ahead (e.g., by setting ring->cur
    353  * to ring->tail) when it wants more slots than the ones currently available
    354  * and wants to be notified when more arrive. See netmap(4) for details.
    355  */
    356 static inline int
    357 nm_ring_empty(const struct netmap_ring *ring)
    358 {
    359 	return (ring->head == ring->tail);
    360 }
    361 
    362 /*
    363  * Netmap representation of an interface and its queue(s).
    364  * This is initialized by the kernel when binding a file
    365  * descriptor to a port, and should be considered as readonly
    366  * by user programs. The kernel never uses it.
    367  *
    368  * There is one netmap_if for each file descriptor on which we want
    369  * to select/poll.
    370  * select/poll operates on one or all pairs depending on the value of
    371  * nmr_queueid passed on the ioctl.
    372  */
    373 struct netmap_if {
    374 	char		ni_name[IFNAMSIZ]; /* name of the interface. */
    375 	const uint32_t	ni_version;	/* API version, currently unused */
    376 	const uint32_t	ni_flags;	/* properties */
    377 #define	NI_PRIV_MEM	0x1		/* private memory region */
    378 
    379 	/*
    380 	 * The number of packet rings available in netmap mode.
    381 	 * Physical NICs can have different numbers of tx and rx rings.
    382 	 * Physical NICs also have at least a 'host' rings pair.
    383 	 * Additionally, clients can request additional ring pairs to
    384 	 * be used for internal communication.
    385 	 */
    386 	const uint32_t	ni_tx_rings;	/* number of HW tx rings */
    387 	const uint32_t	ni_rx_rings;	/* number of HW rx rings */
    388 
    389 	uint32_t	ni_bufs_head;	/* head index for extra bufs */
    390 	const uint32_t	ni_host_tx_rings; /* number of SW tx rings */
    391 	const uint32_t	ni_host_rx_rings; /* number of SW rx rings */
    392 	uint32_t	ni_spare1[3];
    393 	/*
    394 	 * The following array contains the offset of each netmap ring
    395 	 * from this structure, in the following order:
    396 	 *     - NIC tx rings (ni_tx_rings);
    397 	 *     - host tx rings (ni_host_tx_rings);
    398 	 *     - NIC rx rings (ni_rx_rings);
    399 	 *     - host rx rings (ni_host_rx_rings);
    400 	 *
    401 	 * The area is filled up by the kernel on NETMAP_REQ_REGISTER,
    402 	 * and then only read by userspace code.
    403 	 */
    404 	const ssize_t	ring_ofs[0];
    405 };
    406 
    407 /* Legacy interface to interact with a netmap control device.
    408  * Included for backward compatibility. The user should not include this
    409  * file directly. */
    410 #include "netmap_legacy.h"
    411 
    412 /*
    413  * New API to control netmap control devices. New applications should only use
    414  * nmreq_xyz structs with the NIOCCTRL ioctl() command.
    415  *
    416  * NIOCCTRL takes a nmreq_header struct, which contains the required
    417  * API version, the name of a netmap port, a command type, and pointers
    418  * to request body and options.
    419  *
    420  *	nr_name	(in)
    421  *		The name of the port (em0, valeXXX:YYY, eth0{pn1 etc.)
    422  *
    423  *	nr_version (in/out)
    424  *		Must match NETMAP_API as used in the kernel, error otherwise.
    425  *		Always returns the desired value on output.
    426  *
    427  *	nr_reqtype (in)
    428  *		One of the NETMAP_REQ_* command types below
    429  *
    430  *	nr_body (in)
    431  *		Pointer to a command-specific struct, described by one
    432  *		of the struct nmreq_xyz below.
    433  *
    434  *	nr_options (in)
    435  *		Command specific options, if any.
    436  *
    437  * A NETMAP_REQ_REGISTER command activates netmap mode on the netmap
    438  * port (e.g. physical interface) specified by nmreq_header.nr_name.
    439  * The request body (struct nmreq_register) has several arguments to
    440  * specify how the port is to be registered.
    441  *
    442  *	nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings,
    443  *	nr_host_tx_rings, nr_host_rx_rings (in/out)
    444  *		On input, non-zero values may be used to reconfigure the port
    445  *		according to the requested values, but this is not guaranteed.
    446  *		On output the actual values in use are reported.
    447  *
    448  *	nr_mode (in)
    449  *		Indicate what set of rings must be bound to the netmap
    450  *		device (e.g. all NIC rings, host rings only, NIC and
    451  *		host rings, ...). Values are in NR_REG_*.
    452  *
    453  *	nr_ringid (in)
    454  *		If nr_mode == NR_REG_ONE_NIC (only a single couple of TX/RX
    455  *		rings), indicate which NIC TX and/or RX ring is to be bound
    456  *		(0..nr_*x_rings-1).
    457  *
    458  *	nr_flags (in)
    459  *		Indicate special options for how to open the port.
    460  *
    461  *		NR_NO_TX_POLL can be OR-ed to make select()/poll() push
    462  *			packets on tx rings only if POLLOUT is set.
    463  *			The default is to push any pending packet.
    464  *
    465  *		NR_DO_RX_POLL can be OR-ed to make select()/poll() release
    466  *			packets on rx rings also when POLLIN is NOT set.
    467  *			The default is to touch the rx ring only with POLLIN.
    468  *			Note that this is the opposite of TX because it
    469  *			reflects the common usage.
    470  *
    471  *		Other options are NR_MONITOR_TX, NR_MONITOR_RX, NR_ZCOPY_MON,
    472  *		NR_EXCLUSIVE, NR_RX_RINGS_ONLY, NR_TX_RINGS_ONLY and
    473  *		NR_ACCEPT_VNET_HDR.
    474  *
    475  *	nr_mem_id (in/out)
    476  *		The identity of the memory region used.
    477  *		On input, 0 means the system decides autonomously,
    478  *		other values may try to select a specific region.
    479  *		On return the actual value is reported.
    480  *		Region '1' is the global allocator, normally shared
    481  *		by all interfaces. Other values are private regions.
    482  *		If two ports share the same region, zero-copy is possible.
    483  *
    484  *	nr_extra_bufs (in/out)
    485  *		Number of extra buffers to be allocated.
    486  *
    487  * The other NETMAP_REQ_* commands are described below.
    488  *
    489  */
    490 
    491 /* maximum size of a request, including all options */
    492 #define NETMAP_REQ_MAXSIZE	4096
    493 
    494 /* Header common to all request options. */
    495 struct nmreq_option {
    496 	/* Pointer to the next option. */
    497 	uint64_t		nro_next;
    498 	/* Option type (one of NETMAP_REQ_OPT_*). */
    499 	uint32_t		nro_reqtype;
    500 	/* (out) status of the option:
    501 	 * 0: recognized and processed
    502 	 * !=0: errno value
    503 	 */
    504 	uint32_t		nro_status;
    505 	/* Option size, used only for options that can have variable size
    506 	 * (e.g. because they contain arrays). For fixed-size options this
    507 	 * field should be set to zero. */
    508 	uint64_t		nro_size;
    509 };
    510 
    511 /* Header common to all requests. Do not reorder these fields, as we need
    512  * the second one (nr_reqtype) to know how much to copy from/to userspace. */
    513 struct nmreq_header {
    514 	uint16_t		nr_version;	/* API version (NETMAP_API) */
    515 	uint16_t		nr_reqtype;	/* nmreq type (NETMAP_REQ_*) */
    516 	uint32_t		nr_reserved;	/* must be zero */
    517 #define NETMAP_REQ_IFNAMSIZ	64
    518 	char			nr_name[NETMAP_REQ_IFNAMSIZ]; /* port name */
    519 	uint64_t		nr_options;	/* command-specific options */
    520 	uint64_t		nr_body;	/* ptr to nmreq_xyz struct */
    521 };
    522 
    523 enum {
    524 	/* Register a netmap port with the device (struct nmreq_register). */
    525 	NETMAP_REQ_REGISTER = 1,
    526 	/* Get information from a netmap port (struct nmreq_port_info_get). */
    527 	NETMAP_REQ_PORT_INFO_GET,
    528 	/* Attach a netmap port to a VALE switch (struct nmreq_vale_attach). */
    529 	NETMAP_REQ_VALE_ATTACH,
    530 	/* Detach a netmap port from a VALE switch (struct nmreq_vale_detach). */
    531 	NETMAP_REQ_VALE_DETACH,
    532 	/* List the ports attached to a VALE switch (struct nmreq_vale_list). */
    533 	NETMAP_REQ_VALE_LIST,
    534 	/* Set the port header length (was virtio-net header length); struct nmreq_port_hdr. */
    535 	NETMAP_REQ_PORT_HDR_SET,
    536 	/* Get the port header length (was virtio-net header length); struct nmreq_port_hdr. */
    537 	NETMAP_REQ_PORT_HDR_GET,
    538 	/* Create a new persistent VALE port (struct nmreq_vale_newif). */
    539 	NETMAP_REQ_VALE_NEWIF,
    540 	/* Delete a persistent VALE port. */
    541 	NETMAP_REQ_VALE_DELIF,
    542 	/* Enable polling kernel thread(s) on an attached VALE port (struct nmreq_vale_polling). */
    543 	NETMAP_REQ_VALE_POLLING_ENABLE,
    544 	/* Disable polling kernel thread(s) on an attached VALE port (struct nmreq_vale_polling). */
    545 	NETMAP_REQ_VALE_POLLING_DISABLE,
    546 	/* Get info about the pools of a memory allocator. */
    547 	NETMAP_REQ_POOLS_INFO_GET,
    548 	/* Start an in-kernel loop that syncs the rings periodically or
    549 	 * on notifications. The loop runs in the context of the ioctl
    550 	 * syscall, and only stops on NETMAP_REQ_SYNC_KLOOP_STOP. */
    551 	NETMAP_REQ_SYNC_KLOOP_START,
    552 	/* Stops the thread executing the in-kernel loop. The thread
    553 	 * returns from the ioctl syscall. */
    554 	NETMAP_REQ_SYNC_KLOOP_STOP,
    555 	/* Enable CSB mode on a registered netmap control device. */
    556 	NETMAP_REQ_CSB_ENABLE,
    557 };
    558 
    559 enum {
    560 	/* On NETMAP_REQ_REGISTER, ask netmap to use memory allocated
    561 	 * from user-space allocated memory pools (e.g. hugepages).
    562 	 */
    563 	NETMAP_REQ_OPT_EXTMEM = 1,
    564 
    565 	/* On NETMAP_REQ_SYNC_KLOOP_START, ask netmap to use eventfd-based
    566 	 * notifications to synchronize the kernel loop with the application.
    567 	 */
    568 	NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS,
    569 
    570 	/* On NETMAP_REQ_REGISTER, ask netmap to work in CSB mode, where
    571 	 * head, cur and tail pointers are not exchanged through the
    572 	 * struct netmap_ring header, but rather using a user-provided
    573 	 * memory area (see struct nm_csb_atok and struct nm_csb_ktoa).
    574 	 */
    575 	NETMAP_REQ_OPT_CSB,
    576 
    577 	/* An extension to NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS, which specifies
    578 	 * if the TX and/or RX rings are synced in the context of the VM exit.
    579 	 * This requires the 'ioeventfd' fields to be valid (cannot be < 0).
    580 	 */
    581 	NETMAP_REQ_OPT_SYNC_KLOOP_MODE,
    582 
    583 	/* On NETMAP_REQ_REGISTER, ask for (part of) the ptr field in the
    584 	 * slots of the registered rings to be used as an offset field
    585 	 * for the start of the packets inside the netmap buffer.
    586 	 */
    587 	NETMAP_REQ_OPT_OFFSETS,
    588 
    589 	/* This is a marker to count the number of available options.
    590 	 * New options must be added above it. */
    591 	NETMAP_REQ_OPT_MAX,
    592 };
    593 
    594 /*
    595  * nr_reqtype: NETMAP_REQ_REGISTER
    596  * Bind (register) a netmap port to this control device.
    597  */
    598 struct nmreq_register {
    599 	uint64_t	nr_offset;	/* nifp offset in the shared region */
    600 	uint64_t	nr_memsize;	/* size of the shared region */
    601 	uint32_t	nr_tx_slots;	/* slots in tx rings */
    602 	uint32_t	nr_rx_slots;	/* slots in rx rings */
    603 	uint16_t	nr_tx_rings;	/* number of tx rings */
    604 	uint16_t	nr_rx_rings;	/* number of rx rings */
    605 	uint16_t	nr_host_tx_rings; /* number of host tx rings */
    606 	uint16_t	nr_host_rx_rings; /* number of host rx rings */
    607 
    608 	uint16_t	nr_mem_id;	/* id of the memory allocator */
    609 	uint16_t	nr_ringid;	/* ring(s) we care about */
    610 	uint32_t	nr_mode;	/* specify NR_REG_* modes */
    611 	uint32_t	nr_extra_bufs;	/* number of requested extra buffers */
    612 
    613 	uint64_t	nr_flags;	/* additional flags (see below) */
    614 /* monitors use nr_ringid and nr_mode to select the rings to monitor */
    615 #define NR_MONITOR_TX	0x100
    616 #define NR_MONITOR_RX	0x200
    617 #define NR_ZCOPY_MON	0x400
    618 /* request exclusive access to the selected rings */
    619 #define NR_EXCLUSIVE	0x800
    620 /* 0x1000 unused */
    621 #define NR_RX_RINGS_ONLY	0x2000
    622 #define NR_TX_RINGS_ONLY	0x4000
    623 /* Applications set this flag if they are able to deal with virtio-net headers,
    624  * that is send/receive frames that start with a virtio-net header.
    625  * If not set, NETMAP_REQ_REGISTER will fail with netmap ports that require
    626  * applications to use those headers. If the flag is set, the application can
    627  * use the NETMAP_REQ_PORT_HDR_GET command to figure out the header length. */
    628 #define NR_ACCEPT_VNET_HDR	0x8000
    629 /* The following two have the same meaning as NETMAP_DO_RX_POLL and
    630  * NETMAP_NO_TX_POLL. */
    631 #define NR_DO_RX_POLL		0x10000
    632 #define NR_NO_TX_POLL		0x20000
    633 };
    634 
    635 /* Valid values for nmreq_register.nr_mode (see above). */
    636 enum {	NR_REG_DEFAULT	= 0,	/* backward compat, should not be used. */
    637 	NR_REG_ALL_NIC	= 1,
    638 	NR_REG_SW	= 2,
    639 	NR_REG_NIC_SW	= 3,
    640 	NR_REG_ONE_NIC	= 4,	/* a single TX/RX ring pair, selected by nr_ringid */
    641 	NR_REG_PIPE_MASTER = 5, /* deprecated, use "x{y" port name syntax */
    642 	NR_REG_PIPE_SLAVE = 6,  /* deprecated, use "x}y" port name syntax */
    643 	NR_REG_NULL     = 7,
    644 	NR_REG_ONE_SW	= 8,
    645 };
    646 
    647 /* A single ioctl number is shared by all the new API commands.
    648  * Demultiplexing is done using the hdr.nr_reqtype field.
    649  * FreeBSD uses the size value embedded in the _IOWR to determine
    650  * how much to copy in/out, so we define the ioctl() command
    651  * specifying only nmreq_header, and copyin/copyout the rest. */
    652 #define NIOCCTRL	_IOWR('i', 151, struct nmreq_header)
    653 
    654 /* The ioctl commands to sync TX/RX netmap rings.
    655  * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
    656  *	whose identity is set in NETMAP_REQ_REGISTER through nr_ringid.
    657  *	These are non blocking and take no argument. */
    658 #define NIOCTXSYNC	_IO('i', 148) /* sync tx queues */
    659 #define NIOCRXSYNC	_IO('i', 149) /* sync rx queues */
    660 
    661 /*
    662  * nr_reqtype: NETMAP_REQ_PORT_INFO_GET
    663  * Get information about a netmap port, including number of rings,
    664  * slots per ring, id of the memory allocator, etc. The netmap
    665  * control device used for this operation does not need to be bound
    666  * to a netmap port.
    667  */
    668 struct nmreq_port_info_get {
    669 	uint64_t	nr_memsize;	/* size of the shared region */
    670 	uint32_t	nr_tx_slots;	/* slots in tx rings */
    671 	uint32_t	nr_rx_slots;	/* slots in rx rings */
    672 	uint16_t	nr_tx_rings;	/* number of tx rings */
    673 	uint16_t	nr_rx_rings;	/* number of rx rings */
    674 	uint16_t	nr_host_tx_rings; /* number of host tx rings */
    675 	uint16_t	nr_host_rx_rings; /* number of host rx rings */
    676 	uint16_t	nr_mem_id;	/* memory allocator id (in/out) */
    677 	uint16_t	pad[3];
    678 };
    679 
    680 #define	NM_BDG_NAME		"vale"	/* prefix for bridge port name (valeXXX:YYY) */
    681 
    682 /*
    683  * nr_reqtype: NETMAP_REQ_VALE_ATTACH
    684  * Attach a netmap port to a VALE switch. Both the name of the netmap
    685  * port and the VALE switch are specified through the nr_name argument.
    686  * The attach operation could need to register a port, so at least
    687  * the same arguments are available.
    688  * port_index will contain the index where the port has been attached.
    689  */
    690 struct nmreq_vale_attach {
    691 	struct nmreq_register reg;	/* same arguments as NETMAP_REQ_REGISTER */
    692 	uint32_t port_index;		/* (out) index where the port has been attached */
    693 	uint32_t pad1;
    694 };
    695 
    696 /*
    697  * nr_reqtype: NETMAP_REQ_VALE_DETACH
    698  * Detach a netmap port from a VALE switch. Both the name of the netmap
    699  * port and the VALE switch are specified through the nr_name argument.
    700  * port_index will contain the index where the port was attached.
    701  */
    702 struct nmreq_vale_detach {
    703 	uint32_t port_index;	/* (out) index where the port was attached */
    704 	uint32_t pad1;
    705 };
    706 
    707 /*
    708  * nr_reqtype: NETMAP_REQ_VALE_LIST
    709  * List the ports of a VALE switch.
    710  */
    711 struct nmreq_vale_list {
    712 	/* Name of the VALE port (valeXXX:YYY) or empty: not a field of this struct, presumably passed in hdr.nr_name -- TODO confirm. */
    713 	uint16_t	nr_bridge_idx;
    714 	uint16_t	pad1;		/* padding */
    715 	uint32_t	nr_port_idx;
    716 };
    717 
    718 /*
    719  * nr_reqtype: NETMAP_REQ_PORT_HDR_SET or NETMAP_REQ_PORT_HDR_GET
    720  * Set or get the port header length of the port identified by hdr.nr_name.
    721  * The control device does not need to be bound to a netmap port.
    722  */
    723 struct nmreq_port_hdr {
    724 	uint32_t	nr_hdr_len;	/* port header length being set or retrieved */
    725 	uint32_t	pad1;		/* padding */
    726 };
    727 
    728 /*
    729  * nr_reqtype: NETMAP_REQ_VALE_NEWIF
    730  * Create a new persistent VALE port.
    731  */
    732 struct nmreq_vale_newif {
    733 	uint32_t	nr_tx_slots;	/* slots in tx rings */
    734 	uint32_t	nr_rx_slots;	/* slots in rx rings */
    735 	uint16_t	nr_tx_rings;	/* number of tx rings */
    736 	uint16_t	nr_rx_rings;	/* number of rx rings */
    737 	uint16_t	nr_mem_id;	/* id of the memory allocator */
    738 	uint16_t	pad1;		/* padding: fields add up to 16 bytes */
    739 };
    740 
    741 /*
    742  * nr_reqtype: NETMAP_REQ_VALE_POLLING_ENABLE or NETMAP_REQ_VALE_POLLING_DISABLE
    743  * Enable or disable polling kthreads on a VALE port.
    744  */
    745 struct nmreq_vale_polling {
    746 	uint32_t	nr_mode;	/* one of the NETMAP_POLLING_MODE_* values below */
    747 #define NETMAP_POLLING_MODE_SINGLE_CPU 1
    748 #define NETMAP_POLLING_MODE_MULTI_CPU 2
    749 	uint32_t	nr_first_cpu_id;	/* presumably the first CPU used by the polling kthreads -- confirm */
    750 	uint32_t	nr_num_polling_cpus;	/* presumably how many CPUs run polling kthreads -- confirm */
    751 	uint32_t	pad1;		/* padding */
    752 };
    753 
    754 /*
    755  * nr_reqtype: NETMAP_REQ_POOLS_INFO_GET
    756  * Get info about the pools of the memory allocator of the netmap
    757  * port specified by hdr.nr_name and nr_mem_id. The netmap control
    758  * device used for this operation does not need to be bound to a netmap
    759  * port.
    760  */
    761 struct nmreq_pools_info {
    762 	uint64_t	nr_memsize;	/* presumably the size of the shared region, as in nmreq_port_info_get -- confirm */
    763 	uint16_t	nr_mem_id; /* in/out argument */
    764 	uint16_t	pad1[3];	/* padding: keeps the next 64-bit field 8-byte aligned */
    765 	uint64_t	nr_if_pool_offset;	/* "if" pool; per the names: offset, object count, object size */
    766 	uint32_t	nr_if_pool_objtotal;
    767 	uint32_t	nr_if_pool_objsize;
    768 	uint64_t	nr_ring_pool_offset;	/* "ring" pool; same three fields */
    769 	uint32_t	nr_ring_pool_objtotal;
    770 	uint32_t	nr_ring_pool_objsize;
    771 	uint64_t	nr_buf_pool_offset;	/* "buf" pool; same three fields */
    772 	uint32_t	nr_buf_pool_objtotal;
    773 	uint32_t	nr_buf_pool_objsize;
    774 };
    775 
    776 /*
    777  * nr_reqtype: NETMAP_REQ_SYNC_KLOOP_START
    778  * Start an in-kernel loop that syncs the rings periodically or on
    779  * notifications. The loop runs in the context of the ioctl syscall,
    780  * and only stops on NETMAP_REQ_SYNC_KLOOP_STOP.
    781  * The registered netmap port must be open in CSB mode.
    782  */
    783 struct nmreq_sync_kloop_start {
    784 	/* Sleeping is the default synchronization method for the kloop.
    785 	 * The 'sleep_us' field specifies how many microseconds to sleep
    786 	 * when there is no work to do, before doing another kloop iteration.
    787 	 */
    788 	uint32_t	sleep_us;
    789 	uint32_t	pad1;		/* padding */
    790 };
    791 
    792 /* A CSB entry for the application --> kernel direction (AW/KR presumably mean application writes / kernel reads -- confirm). */
    793 struct nm_csb_atok {
    794 	uint32_t head;		  /* AW+ KR+ the head of the appl netmap_ring */
    795 	uint32_t cur;		  /* AW+ KR+ the cur of the appl netmap_ring */
    796 	uint32_t appl_need_kick;  /* AW+ KR+ kern --> appl notification enable */
    797 	uint32_t sync_flags;	  /* AW+ KR+ the flags of the appl [tx|rx]sync() */
    798 	uint32_t pad[12];	  /* pad to a 64-byte cacheline (4 + 12 words) */
    799 };
    800 
    801 /* A CSB entry for the application <-- kernel direction (AR/KW presumably mean application reads / kernel writes -- confirm). */
    802 struct nm_csb_ktoa {
    803 	uint32_t hwcur;		  /* AR+ KW+ the hwcur of the kern netmap_kring */
    804 	uint32_t hwtail;	  /* AR+ KW+ the hwtail of the kern netmap_kring */
    805 	uint32_t kern_need_kick;  /* AR+ KW+ appl-->kern notification enable */
    806 	uint32_t pad[13];	  /* pad to 64 bytes (3 + 13 words), same size as nm_csb_atok */
    807 };
    808 
    809 #ifdef __linux__
    810 
    811 #ifdef __KERNEL__
    812 #define nm_stst_barrier smp_wmb
    813 #define nm_ldld_barrier smp_rmb
    814 #define nm_stld_barrier smp_mb
    815 #else  /* !__KERNEL__: userspace, nm_stld_barrier is not provided here */
    816 static inline void nm_stst_barrier(void)
    817 {
    818 	/* A memory barrier with release semantic has the combined
    819 	 * effect of a store-store barrier and a load-store barrier,
    820 	 * which is fine for us. */
    821 	__atomic_thread_fence(__ATOMIC_RELEASE);
    822 }
    823 static inline void nm_ldld_barrier(void)
    824 {
    825 	/* A memory barrier with acquire semantic has the combined
    826 	 * effect of a load-load barrier and a load-store barrier,
    827 	 * which is fine for us. */
    828 	__atomic_thread_fence(__ATOMIC_ACQUIRE);
    829 }
    830 #endif /* !__KERNEL__ */
    831 
    832 #elif defined(__FreeBSD__)
    833 
    834 #ifdef _KERNEL
    835 #define nm_stst_barrier	atomic_thread_fence_rel
    836 #define nm_ldld_barrier	atomic_thread_fence_acq
    837 #define nm_stld_barrier	atomic_thread_fence_seq_cst
    838 #else  /* !_KERNEL: userspace, nm_stld_barrier is not provided here */
    839 
    840 #ifdef __cplusplus
    841 #include <atomic>
    842 using std::memory_order_release;
    843 using std::memory_order_acquire;
    844 
    845 #else /* __cplusplus */
    846 #include <stdatomic.h>
    847 #endif /* __cplusplus */
    848 
    849 static inline void nm_stst_barrier(void)
    850 {
    851 	atomic_thread_fence(memory_order_release);	/* release fence: store-store plus load-store */
    852 }
    853 static inline void nm_ldld_barrier(void)
    854 {
    855 	atomic_thread_fence(memory_order_acquire);	/* acquire fence: load-load plus load-store */
    856 }
    857 #endif /* !_KERNEL */
    858 
    859 #else  /* !__linux__ && !__FreeBSD__ */
    860 #error "OS not supported"
    861 #endif /* !__linux__ && !__FreeBSD__ */
    862 
    863 /* Application side of sync-kloop: Write ring pointers (cur, head) to the CSB
    864  * entry 'atok'. This routine is coupled with sync_kloop_kernel_read(). */
    865 static inline void
    866 nm_sync_kloop_appl_write(struct nm_csb_atok *atok, uint32_t cur,
    867 			 uint32_t head)
    868 {
    869 	/* Issue a first store-store barrier to make sure that writes to the netmap
    870 	 * ring are not reordered after the updates of atok->cur and atok->head. */
    871 	nm_stst_barrier();
    872 
    873 	/*
    874 	 * We need to write cur and head to the CSB but we cannot do it atomically.
    875 	 * There is no way we can prevent the host from reading the updated value
    876 	 * of one of the two and the old value of the other. However, if we make
    877 	 * sure that the host never reads a value of head more recent than the
    878 	 * value of cur we are safe. We can allow the host to read a value of cur
    879 	 * more recent than the value of head, since in the netmap ring cur can be
    880 	 * ahead of head and cur cannot wrap around head because it must be behind
    881 	 * tail. Inverting the order of the writes below could instead lead the
    882 	 * host to think that head went ahead of cur, which would cause the sync
    883 	 * prologue to fail.
    884 	 *
    885 	 * The following memory barrier scheme is used to make this happen:
    886 	 *
    887 	 *          Guest                Host
    888 	 *
    889 	 *          STORE(cur)           LOAD(head)
    890 	 *          wmb() <----------->  rmb()
    891 	 *          STORE(head)          LOAD(cur)
    892 	 *
    893 	 */
    894 	atok->cur = cur;	/* cur goes first: the reader may see a cur newer than head */
    895 	nm_stst_barrier();
    896 	atok->head = head;	/* head goes last: never newer than the cur stored above */
    897 }
    898 
    899 /* Application side of sync-kloop: Read kring pointers (hwcur, hwtail) from
    900  * the CSB entry 'ktoa' into *hwtail and *hwcur. This routine is coupled with sync_kloop_kernel_write(). */
    901 static inline void
    902 nm_sync_kloop_appl_read(struct nm_csb_ktoa *ktoa, uint32_t *hwtail,
    903 			uint32_t *hwcur)
    904 {
    905 	/*
    906 	 * We place a memory barrier to make sure that the load of hwtail is
    907 	 * never reordered after the load of hwcur
    908 	 * (see explanation in sync_kloop_kernel_write).
    909 	 */
    910 	*hwtail = ktoa->hwtail;
    911 	nm_ldld_barrier();
    912 	*hwcur = ktoa->hwcur;
    913 
    914 	/* Make sure that the loads from ktoa->hwtail and ktoa->hwcur are not
    915 	 * delayed after the subsequent loads from the netmap ring. */
    916 	nm_ldld_barrier();
    917 }
    918 
    919 /*
    920  * data for NETMAP_REQ_OPT_* options
    921  */
    922 
    923 struct nmreq_opt_sync_kloop_eventfds {
    924 	struct nmreq_option	nro_opt;	/* common header */
    925 	/* An array of N entries for bidirectional notifications between
    926 	 * the kernel loop and the application. The number of entries and
    927 	 * their order must agree with the CSB arrays passed in the
    928 	 * NETMAP_REQ_OPT_CSB option. Each entry contains file descriptors
    929 	 * backed by an eventfd.
    930 	 *
    931 	 * If any of the 'ioeventfd' entries is < 0, the event loop uses
    932 	 * the sleeping synchronization strategy (according to sleep_us),
    933 	 * and keeps kern_need_kick always disabled.
    934 	 * Each 'irqfd' can be < 0, and in that case the corresponding queue
    935 	 * is never notified.
    936 	 */
    937 	struct {
    938 		/* Notifier for the application --> kernel loop direction. */
    939 		int32_t ioeventfd;
    940 		/* Notifier for the kernel loop --> application direction. */
    941 		int32_t irqfd;
    942 	} eventfds[0];	/* zero-length trailing array: the N entries follow the header */
    943 };
    944 
    945 struct nmreq_opt_sync_kloop_mode {
    946 	struct nmreq_option	nro_opt;	/* common header */
    947 #define NM_OPT_SYNC_KLOOP_DIRECT_TX (1 << 0)
    948 #define NM_OPT_SYNC_KLOOP_DIRECT_RX (1 << 1)
    949 	uint32_t mode;	/* bit flags: NM_OPT_SYNC_KLOOP_DIRECT_TX and/or NM_OPT_SYNC_KLOOP_DIRECT_RX */
    950 };
    951 
    952 struct nmreq_opt_extmem {	/* option carrying a user memory pointer and its pools info */
    953 	struct nmreq_option	nro_opt;	/* common header */
    954 	uint64_t		nro_usrptr;	/* (in) ptr to usr memory */
    955 	struct nmreq_pools_info	nro_info;	/* (in/out) */
    956 };
    957 
    958 struct nmreq_opt_csb {
    959 	struct nmreq_option	nro_opt;	/* common header */
    960 
    961 	/* Array of CSB entries for application --> kernel communication
    962 	 * (N entries); presumably an address of struct nm_csb_atok[] -- confirm. */
    963 	uint64_t		csb_atok;
    964 
    965 	/* Array of CSB entries for kernel --> application communication
    966 	 * (N entries); presumably an address of struct nm_csb_ktoa[] -- confirm. */
    967 	uint64_t		csb_ktoa;
    968 };
    969 
    970 /* option NETMAP_REQ_OPT_OFFSETS */
    971 struct nmreq_opt_offsets {
    972 	struct nmreq_option	nro_opt;	/* common header */
    973 	/* the user must declare the maximum offset value that she is
    974 	 * going to put into the offset slot-fields. Any larger value
    975 	 * found at runtime will be cropped. On output the (possibly
    976 	 * higher) effective max value is returned.
    977 	 */
    978 	uint64_t		nro_max_offset;
    979 	/* optional initial offset value, to be set in all slots. */
    980 	uint64_t		nro_initial_offset;
    981 	/* number of bits in the lower part of the 'ptr' field to be
    982 	 * used as the offset field. On output the (possibly larger)
    983 	 * effective number of bits is returned.
    984 	 * 0 means: use the whole ptr field.
    985 	 */
    986 	uint32_t		nro_offset_bits;
    987 	/* required alignment for the beginning of the packets
    988 	 * (base of the buffer plus offset) in the TX slots.
    989 	 */
    990 	uint32_t		nro_tx_align;
    991 	/* Reserved: must be set to zero. */
    992 	uint64_t		nro_min_gap;
    993 };
    994 
    995 #endif /* _NET_NETMAP_H_ */