/*
 *	NET3	Protocol independent device support routines.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changelog:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call per packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device becomes available.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>

#include "net-sysfs.h"
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct netdev_notifier_info *info);
static struct napi_struct *napi_by_id(unsigned int napi_id);
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
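/* A minimal reader sketch of the locking rules above (illustrative only,
 * not part of this file): a pure reader may walk the device list either
 * under dev_base_lock or inside an RCU read-side section.
 *
 *	struct net_device *dev;
 *
 *	read_lock(&dev_base_lock);
 *	for_each_netdev(&init_net, dev)
 *		pr_info("found %s\n", dev->name);
 *	read_unlock(&dev_base_lock);
 *
 * or, equivalently for a reader that can live with RCU semantics:
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(&init_net, dev)
 *		pr_info("found %s\n", dev->name);
 *	rcu_read_unlock();
 */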
static DEFINE_MUTEX(ifalias_mutex);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;
static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0)
		;
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}
static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
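/* Illustrative sketch (not part of this file): a driver's receive path
 * hands packets to this queueing layer with netif_rx(), setting
 * skb->protocol first, as noted in the changelog above.  "mydrv_rx" is a
 * hypothetical driver function.
 *
 *	static void mydrv_rx(struct net_device *netdev, void *data,
 *			     unsigned int len)
 *	{
 *		struct sk_buff *skb = netdev_alloc_skb(netdev, len);
 *
 *		if (!skb)
 *			return;			// drop on allocation failure
 *		skb_put_data(skb, data, len);
 *		skb->protocol = eth_type_trans(skb, netdev);
 *		netif_rx(skb);			// queue to the local softnet backlog
 *	}
 */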
279 #ifdef CONFIG_LOCKDEP
281 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
282 * according to dev->type
284 static const unsigned short netdev_lock_type[] = {
285 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
286 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
287 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
288 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
289 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
290 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
291 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
292 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
293 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
294 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
295 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
296 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
297 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
298 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
299 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
301 static const char *const netdev_lock_name[] = {
302 "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
303 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
304 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
305 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
306 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
307 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
308 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
309 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
310 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
311 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
312 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
313 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
314 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
315 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
316 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
318 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
319 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
321 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
325 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
326 if (netdev_lock_type[i] == dev_type)
328 /* the last key is used by default */
329 return ARRAY_SIZE(netdev_lock_type) - 1;
332 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
333 unsigned short dev_type)
337 i = netdev_lock_pos(dev_type);
338 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
339 netdev_lock_name[i]);
342 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
346 i = netdev_lock_pos(dev->type);
347 lockdep_set_class_and_name(&dev->addr_list_lock,
348 &netdev_addr_lock_key[i],
349 netdev_lock_name[i]);
352 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
353 unsigned short dev_type)
356 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
/*******************************************************************************
 *
 *		Protocol management and registration routines
 *
 *******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles the packet
 *	were first on the list, it could not tell that the packet is
 *	cloned and should be copied-on-write, so it would change it and
 *	subsequent readers would get a broken packet.
 */
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it can not
 *	guarantee that all CPUs that are in the middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
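/* Illustrative sketch (not part of this file): registering a handler for
 * one EtherType with dev_add_pack().  ETH_P_802_EX1 (0x88B5) is the IEEE
 * "local experimental" EtherType; "my_rcv" and "my_ptype" are hypothetical.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		// skb ownership is passed to the handler; consume it here
 *		consume_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_802_EX1),
 *		.func = my_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);	// start receiving
 *	...
 *	dev_remove_pack(&my_ptype);	// stop; may sleep, see below
 */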
417 * __dev_remove_pack - remove packet handler
418 * @pt: packet type declaration
420 * Remove a protocol handler that was previously added to the kernel
421 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
422 * from the kernel lists and can be freed or reused once this function
425 * The packet type might still be in use by receivers
426 * and must not be freed until after all the CPU's have gone
427 * through a quiescent state.
429 void __dev_remove_pack(struct packet_type *pt)
431 struct list_head *head = ptype_head(pt);
432 struct packet_type *pt1;
434 spin_lock(&ptype_lock);
436 list_for_each_entry(pt1, head, list) {
438 list_del_rcu(&pt->list);
443 pr_warn("dev_remove_pack: %p not found\n", pt);
445 spin_unlock(&ptype_lock);
447 EXPORT_SYMBOL(__dev_remove_pack);
450 * dev_remove_pack - remove packet handler
451 * @pt: packet type declaration
453 * Remove a protocol handler that was previously added to the kernel
454 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
455 * from the kernel lists and can be freed or reused once this function
458 * This call sleeps to guarantee that no CPU is looking at the packet
461 void dev_remove_pack(struct packet_type *pt)
463 __dev_remove_pack(pt);
467 EXPORT_SYMBOL(dev_remove_pack);
471 * dev_add_offload - register offload handlers
472 * @po: protocol offload declaration
474 * Add protocol offload handlers to the networking stack. The passed
475 * &proto_offload is linked into kernel lists and may not be freed until
476 * it has been removed from the kernel lists.
478 * This call does not sleep therefore it can not
479 * guarantee all CPU's that are in middle of receiving packets
480 * will see the new offload handlers (until the next received packet).
482 void dev_add_offload(struct packet_offload *po)
484 struct packet_offload *elem;
486 spin_lock(&offload_lock);
487 list_for_each_entry(elem, &offload_base, list) {
488 if (po->priority < elem->priority)
491 list_add_rcu(&po->list, elem->list.prev);
492 spin_unlock(&offload_lock);
494 EXPORT_SYMBOL(dev_add_offload);
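/* Illustrative sketch (not part of this file; field values are arbitrary):
 * a protocol registers GRO callbacks with dev_add_offload(); entries are
 * kept sorted by ->priority, as the loop above shows.  "my_gro_receive"
 * and "my_gro_complete" are hypothetical handlers implemented elsewhere.
 *
 *	static struct packet_offload my_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_802_EX1),
 *		.priority = 10,
 *		.callbacks = {
 *			.gro_receive  = my_gro_receive,
 *			.gro_complete = my_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&my_offload);
 *	...
 *	dev_remove_offload(&my_offload);
 */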
497 * __dev_remove_offload - remove offload handler
498 * @po: packet offload declaration
500 * Remove a protocol offload handler that was previously added to the
501 * kernel offload handlers by dev_add_offload(). The passed &offload_type
502 * is removed from the kernel lists and can be freed or reused once this
505 * The packet type might still be in use by receivers
506 * and must not be freed until after all the CPU's have gone
507 * through a quiescent state.
509 static void __dev_remove_offload(struct packet_offload *po)
511 struct list_head *head = &offload_base;
512 struct packet_offload *po1;
514 spin_lock(&offload_lock);
516 list_for_each_entry(po1, head, list) {
518 list_del_rcu(&po->list);
523 pr_warn("dev_remove_offload: %p not found\n", po);
525 spin_unlock(&offload_lock);
529 * dev_remove_offload - remove packet offload handler
530 * @po: packet offload declaration
532 * Remove a packet offload handler that was previously added to the kernel
533 * offload handlers by dev_add_offload(). The passed &offload_type is
534 * removed from the kernel lists and can be freed or reused once this
537 * This call sleeps to guarantee that no CPU is looking at the packet
540 void dev_remove_offload(struct packet_offload *po)
542 __dev_remove_offload(po);
546 EXPORT_SYMBOL(dev_remove_offload);
548 /******************************************************************************
550 * Device Boot-time Settings Routines
552 ******************************************************************************/
554 /* Boot time configuration table */
555 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
558 * netdev_boot_setup_add - add new setup entry
559 * @name: name of the device
560 * @map: configured settings for the device
562 * Adds new setup entry to the dev_boot_setup list. The function
563 * returns 0 on error and 1 on success. This is a generic routine to
566 static int netdev_boot_setup_add(char *name, struct ifmap *map)
568 struct netdev_boot_setup *s;
572 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
573 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
574 memset(s[i].name, 0, sizeof(s[i].name));
575 strlcpy(s[i].name, name, IFNAMSIZ);
576 memcpy(&s[i].map, map, sizeof(s[i].map));
581 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
585 * netdev_boot_setup_check - check boot time settings
586 * @dev: the netdevice
588 * Check boot time settings for the device.
589 * The found settings are set for the device to be used
590 * later in the device probing.
591 * Returns 0 if no settings found, 1 if they are.
593 int netdev_boot_setup_check(struct net_device *dev)
595 struct netdev_boot_setup *s = dev_boot_setup;
598 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
599 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
600 !strcmp(dev->name, s[i].name)) {
601 dev->irq = s[i].map.irq;
602 dev->base_addr = s[i].map.base_addr;
603 dev->mem_start = s[i].map.mem_start;
604 dev->mem_end = s[i].map.mem_end;
610 EXPORT_SYMBOL(netdev_boot_setup_check);
614 * netdev_boot_base - get address from boot time settings
615 * @prefix: prefix for network device
616 * @unit: id for network device
618 * Check boot time settings for the base address of device.
619 * The found settings are set for the device to be used
620 * later in the device probing.
621 * Returns 0 if no settings found.
623 unsigned long netdev_boot_base(const char *prefix, int unit)
625 const struct netdev_boot_setup *s = dev_boot_setup;
629 sprintf(name, "%s%d", prefix, unit);
632 * If device already registered then return base of 1
633 * to indicate not to probe for this interface
635 if (__dev_get_by_name(&init_net, name))
638 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
639 if (!strcmp(name, s[i].name))
640 return s[i].map.base_addr;
645 * Saves at boot time configured settings for any netdevice.
647 int __init netdev_boot_setup(char *str)
652 str = get_options(str, ARRAY_SIZE(ints), ints);
657 memset(&map, 0, sizeof(map));
661 map.base_addr = ints[2];
663 map.mem_start = ints[3];
665 map.mem_end = ints[4];
667 /* Add new entry to the list */
668 return netdev_boot_setup_add(str, &map);
671 __setup("netdev=", netdev_boot_setup);
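/* Example of the boot parameter consumed above (illustrative; the values
 * are arbitrary).  get_options() parses the leading integers and the
 * trailing string names the device:
 *
 *	netdev=9,0x300,0xd0000,0xd4000,eth1
 *
 * which records irq=9, base_addr=0x300, mem_start=0xd0000 and
 * mem_end=0xd4000 for "eth1", to be picked up later by
 * netdev_boot_setup_check() when that device probes.
 */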
673 /*******************************************************************************
675 * Device Interface Subroutines
677 *******************************************************************************/
680 * dev_get_iflink - get 'iflink' value of a interface
681 * @dev: targeted interface
683 * Indicates the ifindex the interface is linked to.
684 * Physical interfaces have the same 'ifindex' and 'iflink' values.
687 int dev_get_iflink(const struct net_device *dev)
689 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
690 return dev->netdev_ops->ndo_get_iflink(dev);
694 EXPORT_SYMBOL(dev_get_iflink);
697 * dev_fill_metadata_dst - Retrieve tunnel egress information.
698 * @dev: targeted interface
701 * For better visibility of tunnel traffic OVS needs to retrieve
702 * egress tunnel information for a packet. Following API allows
703 * user to get this info.
705 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
707 struct ip_tunnel_info *info;
709 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
712 info = skb_tunnel_info_unclone(skb);
715 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
718 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
720 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
723 * __dev_get_by_name - find a device by its name
724 * @net: the applicable net namespace
725 * @name: name to find
727 * Find an interface by name. Must be called under RTNL semaphore
728 * or @dev_base_lock. If the name is found a pointer to the device
729 * is returned. If the name is not found then %NULL is returned. The
730 * reference counters are not incremented so the caller must be
731 * careful with locks.
734 struct net_device *__dev_get_by_name(struct net *net, const char *name)
736 struct net_device *dev;
737 struct hlist_head *head = dev_name_hash(net, name);
739 hlist_for_each_entry(dev, head, name_hlist)
740 if (!strncmp(dev->name, name, IFNAMSIZ))
745 EXPORT_SYMBOL(__dev_get_by_name);
748 * dev_get_by_name_rcu - find a device by its name
749 * @net: the applicable net namespace
750 * @name: name to find
752 * Find an interface by name.
753 * If the name is found a pointer to the device is returned.
754 * If the name is not found then %NULL is returned.
755 * The reference counters are not incremented so the caller must be
756 * careful with locks. The caller must hold RCU lock.
759 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
761 struct net_device *dev;
762 struct hlist_head *head = dev_name_hash(net, name);
764 hlist_for_each_entry_rcu(dev, head, name_hlist)
765 if (!strncmp(dev->name, name, IFNAMSIZ))
770 EXPORT_SYMBOL(dev_get_by_name_rcu);
773 * dev_get_by_name - find a device by its name
774 * @net: the applicable net namespace
775 * @name: name to find
777 * Find an interface by name. This can be called from any
778 * context and does its own locking. The returned handle has
779 * the usage count incremented and the caller must use dev_put() to
780 * release it when it is no longer needed. %NULL is returned if no
781 * matching device is found.
784 struct net_device *dev_get_by_name(struct net *net, const char *name)
786 struct net_device *dev;
789 dev = dev_get_by_name_rcu(net, name);
795 EXPORT_SYMBOL(dev_get_by_name);
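/* Illustrative sketch (not part of this file): the refcounted lookup
 * versus the RCU lookup.
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
 *		dev_put(dev);			// drop the reference we took
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(&init_net, "eth0");
 *	if (dev)
 *		pr_info("%s\n", dev->name);	// valid only inside the RCU section
 *	rcu_read_unlock();
 */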
798 * __dev_get_by_index - find a device by its ifindex
799 * @net: the applicable net namespace
800 * @ifindex: index of device
802 * Search for an interface by index. Returns %NULL if the device
803 * is not found or a pointer to the device. The device has not
804 * had its reference counter increased so the caller must be careful
805 * about locking. The caller must hold either the RTNL semaphore
809 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
811 struct net_device *dev;
812 struct hlist_head *head = dev_index_hash(net, ifindex);
814 hlist_for_each_entry(dev, head, index_hlist)
815 if (dev->ifindex == ifindex)
820 EXPORT_SYMBOL(__dev_get_by_index);
823 * dev_get_by_index_rcu - find a device by its ifindex
824 * @net: the applicable net namespace
825 * @ifindex: index of device
827 * Search for an interface by index. Returns %NULL if the device
828 * is not found or a pointer to the device. The device has not
829 * had its reference counter increased so the caller must be careful
830 * about locking. The caller must hold RCU lock.
833 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
835 struct net_device *dev;
836 struct hlist_head *head = dev_index_hash(net, ifindex);
838 hlist_for_each_entry_rcu(dev, head, index_hlist)
839 if (dev->ifindex == ifindex)
844 EXPORT_SYMBOL(dev_get_by_index_rcu);
848 * dev_get_by_index - find a device by its ifindex
849 * @net: the applicable net namespace
850 * @ifindex: index of device
852 * Search for an interface by index. Returns NULL if the device
853 * is not found or a pointer to the device. The device returned has
854 * had a reference added and the pointer is safe until the user calls
855 * dev_put to indicate they have finished with it.
858 struct net_device *dev_get_by_index(struct net *net, int ifindex)
860 struct net_device *dev;
863 dev = dev_get_by_index_rcu(net, ifindex);
869 EXPORT_SYMBOL(dev_get_by_index);
872 * dev_get_by_napi_id - find a device by napi_id
873 * @napi_id: ID of the NAPI struct
875 * Search for an interface by NAPI ID. Returns %NULL if the device
876 * is not found or a pointer to the device. The device has not had
877 * its reference counter increased so the caller must be careful
878 * about locking. The caller must hold RCU lock.
881 struct net_device *dev_get_by_napi_id(unsigned int napi_id)
883 struct napi_struct *napi;
885 WARN_ON_ONCE(!rcu_read_lock_held());
887 if (napi_id < MIN_NAPI_ID)
890 napi = napi_by_id(napi_id);
892 return napi ? napi->dev : NULL;
894 EXPORT_SYMBOL(dev_get_by_napi_id);
897 * netdev_get_name - get a netdevice name, knowing its ifindex.
898 * @net: network namespace
899 * @name: a pointer to the buffer where the name will be stored.
900 * @ifindex: the ifindex of the interface to get the name from.
902 * The use of raw_seqcount_begin() and cond_resched() before
903 * retrying is required as we want to give the writers a chance
904 * to complete when CONFIG_PREEMPT is not set.
906 int netdev_get_name(struct net *net, char *name, int ifindex)
908 struct net_device *dev;
912 seq = raw_seqcount_begin(&devnet_rename_seq);
914 dev = dev_get_by_index_rcu(net, ifindex);
920 strcpy(name, dev->name);
922 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
931 * dev_getbyhwaddr_rcu - find a device by its hardware address
932 * @net: the applicable net namespace
933 * @type: media type of device
934 * @ha: hardware address
936 * Search for an interface by MAC address. Returns NULL if the device
937 * is not found or a pointer to the device.
938 * The caller must hold RCU or RTNL.
939 * The returned device has not had its ref count increased
940 * and the caller must therefore be careful about locking
944 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
947 struct net_device *dev;
949 for_each_netdev_rcu(net, dev)
950 if (dev->type == type &&
951 !memcmp(dev->dev_addr, ha, dev->addr_len))
956 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
958 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
960 struct net_device *dev;
963 for_each_netdev(net, dev)
964 if (dev->type == type)
969 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
971 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
973 struct net_device *dev, *ret = NULL;
976 for_each_netdev_rcu(net, dev)
977 if (dev->type == type) {
985 EXPORT_SYMBOL(dev_getfirstbyhwtype);
988 * __dev_get_by_flags - find any device with given flags
989 * @net: the applicable net namespace
990 * @if_flags: IFF_* values
991 * @mask: bitmask of bits in if_flags to check
993 * Search for any interface with the given flags. Returns NULL if a device
994 * is not found or a pointer to the device. Must be called inside
995 * rtnl_lock(), and result refcount is unchanged.
998 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1001 struct net_device *dev, *ret;
1006 for_each_netdev(net, dev) {
1007 if (((dev->flags ^ if_flags) & mask) == 0) {
1014 EXPORT_SYMBOL(__dev_get_by_flags);
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
1024 bool dev_valid_name(const char *name)
1028 if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1030 if (!strcmp(name, ".") || !strcmp(name, ".."))
1034 if (*name == '/' || *name == ':' || isspace(*name))
1040 EXPORT_SYMBOL(dev_valid_name);
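/* A few illustrative inputs for the check above (not part of this file):
 *
 *	dev_valid_name("eth0")		-> true
 *	dev_valid_name("..")		-> false, reserved path component
 *	dev_valid_name("my/dev")	-> false, '/' would break sysfs
 *	dev_valid_name("a name")	-> false, whitespace is rejected
 */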
1043 * __dev_alloc_name - allocate a name for a device
1044 * @net: network namespace to allocate the device name in
1045 * @name: name format string
1046 * @buf: scratch buffer and result name string
1048 * Passed a format string - eg "lt%d" it will try and find a suitable
1049 * id. It scans list of devices to build up a free map, then chooses
1050 * the first empty slot. The caller must hold the dev_base or rtnl lock
1051 * while allocating the name and adding the device in order to avoid
1053 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1054 * Returns the number of the unit assigned or a negative errno code.
1057 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1061 const int max_netdevices = 8*PAGE_SIZE;
1062 unsigned long *inuse;
1063 struct net_device *d;
1065 if (!dev_valid_name(name))
1068 p = strchr(name, '%');
1071 * Verify the string as this thing may have come from
1072 * the user. There must be either one "%d" and no other "%"
1075 if (p[1] != 'd' || strchr(p + 2, '%'))
1078 /* Use one page as a bit array of possible slots */
1079 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1083 for_each_netdev(net, d) {
1084 if (!sscanf(d->name, name, &i))
1086 if (i < 0 || i >= max_netdevices)
1089 /* avoid cases where sscanf is not exact inverse of printf */
1090 snprintf(buf, IFNAMSIZ, name, i);
1091 if (!strncmp(buf, d->name, IFNAMSIZ))
1095 i = find_first_zero_bit(inuse, max_netdevices);
1096 free_page((unsigned long) inuse);
1099 snprintf(buf, IFNAMSIZ, name, i);
1100 if (!__dev_get_by_name(net, buf))
1103 /* It is possible to run out of possible slots
1104 * when the name is long and there isn't enough space left
1105 * for the digits, or if all bits are used.
1110 static int dev_alloc_name_ns(struct net *net,
1111 struct net_device *dev,
1118 ret = __dev_alloc_name(net, name, buf);
1120 strlcpy(dev->name, buf, IFNAMSIZ);
1125 * dev_alloc_name - allocate a name for a device
1127 * @name: name format string
1129 * Passed a format string - eg "lt%d" it will try and find a suitable
1130 * id. It scans list of devices to build up a free map, then chooses
1131 * the first empty slot. The caller must hold the dev_base or rtnl lock
1132 * while allocating the name and adding the device in order to avoid
1134 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1135 * Returns the number of the unit assigned or a negative errno code.
1138 int dev_alloc_name(struct net_device *dev, const char *name)
1140 return dev_alloc_name_ns(dev_net(dev), dev, name);
1142 EXPORT_SYMBOL(dev_alloc_name);
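/* Illustrative sketch (not part of this file): a driver asks for the next
 * free unit of a name pattern before registering the device.
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		return err;
 *	// dev->name is now e.g. "dummy0"; err holds the unit number chosen
 */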
1144 int dev_get_valid_name(struct net *net, struct net_device *dev,
1149 if (!dev_valid_name(name))
1152 if (strchr(name, '%'))
1153 return dev_alloc_name_ns(net, dev, name);
1154 else if (__dev_get_by_name(net, name))
1156 else if (dev->name != name)
1157 strlcpy(dev->name, name, IFNAMSIZ);
1161 EXPORT_SYMBOL(dev_get_valid_name);
1164 * dev_change_name - change name of a device
1166 * @newname: name (or format string) must be at least IFNAMSIZ
1168 * Change name of a device, can pass format strings "eth%d".
1171 int dev_change_name(struct net_device *dev, const char *newname)
1173 unsigned char old_assign_type;
1174 char oldname[IFNAMSIZ];
1180 BUG_ON(!dev_net(dev));
1183 if (dev->flags & IFF_UP)
1186 write_seqcount_begin(&devnet_rename_seq);
1188 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1189 write_seqcount_end(&devnet_rename_seq);
1193 memcpy(oldname, dev->name, IFNAMSIZ);
1195 err = dev_get_valid_name(net, dev, newname);
1197 write_seqcount_end(&devnet_rename_seq);
1201 if (oldname[0] && !strchr(oldname, '%'))
1202 netdev_info(dev, "renamed from %s\n", oldname);
1204 old_assign_type = dev->name_assign_type;
1205 dev->name_assign_type = NET_NAME_RENAMED;
1208 ret = device_rename(&dev->dev, dev->name);
1210 memcpy(dev->name, oldname, IFNAMSIZ);
1211 dev->name_assign_type = old_assign_type;
1212 write_seqcount_end(&devnet_rename_seq);
1216 write_seqcount_end(&devnet_rename_seq);
1218 netdev_adjacent_rename_links(dev, oldname);
1220 write_lock_bh(&dev_base_lock);
1221 hlist_del_rcu(&dev->name_hlist);
1222 write_unlock_bh(&dev_base_lock);
1226 write_lock_bh(&dev_base_lock);
1227 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1228 write_unlock_bh(&dev_base_lock);
1230 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1231 ret = notifier_to_errno(ret);
1234 /* err >= 0 after dev_alloc_name() or stores the first errno */
1237 write_seqcount_begin(&devnet_rename_seq);
1238 memcpy(dev->name, oldname, IFNAMSIZ);
1239 memcpy(oldname, newname, IFNAMSIZ);
1240 dev->name_assign_type = old_assign_type;
1241 old_assign_type = NET_NAME_RENAMED;
1244 pr_err("%s: name change rollback failed: %d\n",
1253 * dev_set_alias - change ifalias of a device
1255 * @alias: name up to IFALIASZ
1256 * @len: limit of bytes to copy from info
1258 * Set ifalias for a device,
1260 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1262 struct dev_ifalias *new_alias = NULL;
1264 if (len >= IFALIASZ)
1268 new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1272 memcpy(new_alias->ifalias, alias, len);
1273 new_alias->ifalias[len] = 0;
1276 mutex_lock(&ifalias_mutex);
1277 rcu_swap_protected(dev->ifalias, new_alias,
1278 mutex_is_locked(&ifalias_mutex));
1279 mutex_unlock(&ifalias_mutex);
1282 kfree_rcu(new_alias, rcuhead);
1286 EXPORT_SYMBOL(dev_set_alias);
1289 * dev_get_alias - get ifalias of a device
1291 * @name: buffer to store name of ifalias
1292 * @len: size of buffer
1294 * get ifalias for a device. Caller must make sure dev cannot go
1295 * away, e.g. rcu read lock or own a reference count to device.
1297 int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1299 const struct dev_ifalias *alias;
1303 alias = rcu_dereference(dev->ifalias);
1305 ret = snprintf(name, len, "%s", alias->ifalias);
1312 * netdev_features_change - device changes features
1313 * @dev: device to cause notification
1315 * Called to indicate a device has changed features.
1317 void netdev_features_change(struct net_device *dev)
1319 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1321 EXPORT_SYMBOL(netdev_features_change);
1324 * netdev_state_change - device changes state
1325 * @dev: device to cause notification
1327 * Called to indicate a device has changed state. This function calls
1328 * the notifier chains for netdev_chain and sends a NEWLINK message
1329 * to the routing socket.
1331 void netdev_state_change(struct net_device *dev)
1333 if (dev->flags & IFF_UP) {
1334 struct netdev_notifier_change_info change_info = {
1338 call_netdevice_notifiers_info(NETDEV_CHANGE,
1340 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1343 EXPORT_SYMBOL(netdev_state_change);
1346 * netdev_notify_peers - notify network peers about existence of @dev
1347 * @dev: network device
1349 * Generate traffic such that interested network peers are aware of
1350 * @dev, such as by generating a gratuitous ARP. This may be used when
1351 * a device wants to inform the rest of the network about some sort of
1352 * reconfiguration such as a failover event or virtual machine
1355 void netdev_notify_peers(struct net_device *dev)
1358 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1359 call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1362 EXPORT_SYMBOL(netdev_notify_peers);
1364 static int __dev_open(struct net_device *dev)
1366 const struct net_device_ops *ops = dev->netdev_ops;
1371 if (!netif_device_present(dev))
1374 /* Block netpoll from trying to do any rx path servicing.
1375 * If we don't do this there is a chance ndo_poll_controller
1376 * or ndo_poll may be running while we open the device
1378 netpoll_poll_disable(dev);
1380 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1381 ret = notifier_to_errno(ret);
1385 set_bit(__LINK_STATE_START, &dev->state);
1387 if (ops->ndo_validate_addr)
1388 ret = ops->ndo_validate_addr(dev);
1390 if (!ret && ops->ndo_open)
1391 ret = ops->ndo_open(dev);
1393 netpoll_poll_enable(dev);
1396 clear_bit(__LINK_STATE_START, &dev->state);
1398 dev->flags |= IFF_UP;
1399 dev_set_rx_mode(dev);
1401 add_device_randomness(dev->dev_addr, dev->addr_len);
1408 * dev_open - prepare an interface for use.
1409 * @dev: device to open
1411 * Takes a device from down to up state. The device's private open
1412 * function is invoked and then the multicast lists are loaded. Finally
1413 * the device is moved into the up state and a %NETDEV_UP message is
1414 * sent to the netdev notifier chain.
1416 * Calling this function on an active interface is a nop. On a failure
1417 * a negative errno code is returned.
1419 int dev_open(struct net_device *dev)
1423 if (dev->flags & IFF_UP)
1426 ret = __dev_open(dev);
1430 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1431 call_netdevice_notifiers(NETDEV_UP, dev);
1435 EXPORT_SYMBOL(dev_open);
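/* Illustrative sketch (not part of this file): bringing a device up from
 * kernel code.  dev_open() must run under the RTNL lock.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 *	if (err)
 *		pr_err("failed to bring %s up: %d\n", dev->name, err);
 */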
1437 static void __dev_close_many(struct list_head *head)
1439 struct net_device *dev;
1444 list_for_each_entry(dev, head, close_list) {
1445 /* Temporarily disable netpoll until the interface is down */
1446 netpoll_poll_disable(dev);
1448 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1450 clear_bit(__LINK_STATE_START, &dev->state);
		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
1461 dev_deactivate_many(head);
1463 list_for_each_entry(dev, head, close_list) {
1464 const struct net_device_ops *ops = dev->netdev_ops;
1467 * Call the device specific close. This cannot fail.
1468 * Only if device is UP
1470 * We allow it to be called even after a DETACH hot-plug
1476 dev->flags &= ~IFF_UP;
1477 netpoll_poll_enable(dev);
1481 static void __dev_close(struct net_device *dev)
1485 list_add(&dev->close_list, &single);
1486 __dev_close_many(&single);
1490 void dev_close_many(struct list_head *head, bool unlink)
1492 struct net_device *dev, *tmp;
1494 /* Remove the devices that don't need to be closed */
1495 list_for_each_entry_safe(dev, tmp, head, close_list)
1496 if (!(dev->flags & IFF_UP))
1497 list_del_init(&dev->close_list);
1499 __dev_close_many(head);
1501 list_for_each_entry_safe(dev, tmp, head, close_list) {
1502 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1503 call_netdevice_notifiers(NETDEV_DOWN, dev);
1505 list_del_init(&dev->close_list);
1508 EXPORT_SYMBOL(dev_close_many);
1511 * dev_close - shutdown an interface.
1512 * @dev: device to shutdown
1514 * This function moves an active device into down state. A
1515 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1516 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1519 void dev_close(struct net_device *dev)
1521 if (dev->flags & IFF_UP) {
1524 list_add(&dev->close_list, &single);
1525 dev_close_many(&single, true);
1529 EXPORT_SYMBOL(dev_close);
1533 * dev_disable_lro - disable Large Receive Offload on a device
1536 * Disable Large Receive Offload (LRO) on a net device. Must be
1537 * called under RTNL. This is needed if received packets may be
1538 * forwarded to another interface.
1540 void dev_disable_lro(struct net_device *dev)
1542 struct net_device *lower_dev;
1543 struct list_head *iter;
1545 dev->wanted_features &= ~NETIF_F_LRO;
1546 netdev_update_features(dev);
1548 if (unlikely(dev->features & NETIF_F_LRO))
1549 netdev_WARN(dev, "failed to disable LRO!\n");
1551 netdev_for_each_lower_dev(dev, lower_dev, iter)
1552 dev_disable_lro(lower_dev);
1554 EXPORT_SYMBOL(dev_disable_lro);
1557 * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1560 * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
1561 * called under RTNL. This is needed if Generic XDP is installed on
1564 static void dev_disable_gro_hw(struct net_device *dev)
1566 dev->wanted_features &= ~NETIF_F_GRO_HW;
1567 netdev_update_features(dev);
1569 if (unlikely(dev->features & NETIF_F_GRO_HW))
1570 netdev_WARN(dev, "failed to disable GRO_HW!\n");
1573 const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1576 case NETDEV_##val: \
1577 return "NETDEV_" __stringify(val);
1579 N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1580 N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1581 N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1582 N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1583 N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1584 N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1585 N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1586 N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1587 N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1590 return "UNKNOWN_NETDEV_EVENT";
1592 EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1594 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1595 struct net_device *dev)
1597 struct netdev_notifier_info info = {
1601 return nb->notifier_call(nb, val, &info);
1604 static int dev_boot_phase = 1;
1607 * register_netdevice_notifier - register a network notifier block
1610 * Register a notifier to be called when network device events occur.
1611 * The notifier passed is linked into the kernel structures and must
1612 * not be reused until it has been unregistered. A negative errno code
1613 * is returned on a failure.
1615 * When registered all registration and up events are replayed
1616 * to the new notifier to allow device to have a race free
1617 * view of the network device list.
1620 int register_netdevice_notifier(struct notifier_block *nb)
1622 struct net_device *dev;
1623 struct net_device *last;
1627 /* Close race with setup_net() and cleanup_net() */
1628 down_write(&pernet_ops_rwsem);
1630 err = raw_notifier_chain_register(&netdev_chain, nb);
1636 for_each_netdev(net, dev) {
1637 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1638 err = notifier_to_errno(err);
1642 if (!(dev->flags & IFF_UP))
1645 call_netdevice_notifier(nb, NETDEV_UP, dev);
1651 up_write(&pernet_ops_rwsem);
1657 for_each_netdev(net, dev) {
1661 if (dev->flags & IFF_UP) {
1662 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1664 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1666 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1671 raw_notifier_chain_unregister(&netdev_chain, nb);
1674 EXPORT_SYMBOL(register_netdevice_notifier);
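/* Illustrative sketch (not part of this file): a module watching device
 * state through the notifier chain.  "my_netdev_event" and "my_nb" are
 * hypothetical names.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			pr_info("%s is up\n", dev->name);
 *			break;
 *		case NETDEV_GOING_DOWN:
 *			pr_info("%s is going down\n", dev->name);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);	// existing devices are replayed
 *	...
 *	unregister_netdevice_notifier(&my_nb);
 */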
/**
 * unregister_netdevice_notifier - unregister a network notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering, unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */
1690 int unregister_netdevice_notifier(struct notifier_block *nb)
1692 struct net_device *dev;
1696 /* Close race with setup_net() and cleanup_net() */
1697 down_write(&pernet_ops_rwsem);
1699 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1704 for_each_netdev(net, dev) {
1705 if (dev->flags & IFF_UP) {
1706 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1708 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1710 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1715 up_write(&pernet_ops_rwsem);
1718 EXPORT_SYMBOL(unregister_netdevice_notifier);
1721 * call_netdevice_notifiers_info - call all network notifier blocks
1722 * @val: value passed unmodified to notifier function
1723 * @info: notifier information data
1725 * Call all network notifier blocks. Parameters and return value
1726 * are as for raw_notifier_call_chain().
1729 static int call_netdevice_notifiers_info(unsigned long val,
1730 struct netdev_notifier_info *info)
1733 return raw_notifier_call_chain(&netdev_chain, val, info);
1737 * call_netdevice_notifiers - call all network notifier blocks
1738 * @val: value passed unmodified to notifier function
1739 * @dev: net_device pointer passed unmodified to notifier function
1741 * Call all network notifier blocks. Parameters and return value
1742 * are as for raw_notifier_call_chain().
1745 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1747 struct netdev_notifier_info info = {
1751 return call_netdevice_notifiers_info(val, &info);
1753 EXPORT_SYMBOL(call_netdevice_notifiers);
1756 * call_netdevice_notifiers_mtu - call all network notifier blocks
1757 * @val: value passed unmodified to notifier function
1758 * @dev: net_device pointer passed unmodified to notifier function
1759 * @arg: additional u32 argument passed to the notifier function
1761 * Call all network notifier blocks. Parameters and return value
1762 * are as for raw_notifier_call_chain().
1764 static int call_netdevice_notifiers_mtu(unsigned long val,
1765 struct net_device *dev, u32 arg)
1767 struct netdev_notifier_info_ext info = {
1772 BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
1774 return call_netdevice_notifiers_info(val, &info.info);
1777 #ifdef CONFIG_NET_INGRESS
1778 static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
1780 void net_inc_ingress_queue(void)
1782 static_branch_inc(&ingress_needed_key);
1784 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1786 void net_dec_ingress_queue(void)
1788 static_branch_dec(&ingress_needed_key);
1790 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1793 #ifdef CONFIG_NET_EGRESS
1794 static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
1796 void net_inc_egress_queue(void)
1798 static_branch_inc(&egress_needed_key);
1800 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1802 void net_dec_egress_queue(void)
1804 static_branch_dec(&egress_needed_key);
1806 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1809 static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
1810 #ifdef HAVE_JUMP_LABEL
1811 static atomic_t netstamp_needed_deferred;
1812 static atomic_t netstamp_wanted;
1813 static void netstamp_clear(struct work_struct *work)
1815 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1818 wanted = atomic_add_return(deferred, &netstamp_wanted);
1820 static_branch_enable(&netstamp_needed_key);
1822 static_branch_disable(&netstamp_needed_key);
1824 static DECLARE_WORK(netstamp_work, netstamp_clear);
1827 void net_enable_timestamp(void)
1829 #ifdef HAVE_JUMP_LABEL
1833 wanted = atomic_read(&netstamp_wanted);
1836 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1839 atomic_inc(&netstamp_needed_deferred);
1840 schedule_work(&netstamp_work);
1842 static_branch_inc(&netstamp_needed_key);
1845 EXPORT_SYMBOL(net_enable_timestamp);
1847 void net_disable_timestamp(void)
1849 #ifdef HAVE_JUMP_LABEL
1853 wanted = atomic_read(&netstamp_wanted);
1856 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1859 atomic_dec(&netstamp_needed_deferred);
1860 schedule_work(&netstamp_work);
1862 static_branch_dec(&netstamp_needed_key);
1865 EXPORT_SYMBOL(net_disable_timestamp);
1867 static inline void net_timestamp_set(struct sk_buff *skb)
1870 if (static_branch_unlikely(&netstamp_needed_key))
1871 __net_timestamp(skb);
1874 #define net_timestamp_check(COND, SKB) \
1875 if (static_branch_unlikely(&netstamp_needed_key)) { \
1876 if ((COND) && !(SKB)->tstamp) \
1877 __net_timestamp(SKB); \
1880 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1884 if (!(dev->flags & IFF_UP))
1887 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1888 if (skb->len <= len)
1891 /* if TSO is enabled, we don't care about the length as the packet
1892 * could be forwarded without being segmented before
1894 if (skb_is_gso(skb))
1899 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1901 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1903 int ret = ____dev_forward_skb(dev, skb);
1906 skb->protocol = eth_type_trans(skb, dev);
1907 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1912 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1915 * dev_forward_skb - loopback an skb to another netif
1917 * @dev: destination network device
1918 * @skb: buffer to forward
1921 * NET_RX_SUCCESS (no congestion)
1922 * NET_RX_DROP (packet was dropped, but freed)
1924 * dev_forward_skb can be used for injecting an skb from the
1925 * start_xmit function of one device into the receive queue
1926 * of another device.
1928 * The receiving device may be in another namespace, so
1929 * we have to clear all information in the skb that could
1930 * impact namespace isolation.
1932 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1934 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1936 EXPORT_SYMBOL_GPL(dev_forward_skb);
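/* Illustrative sketch (not part of this file): a veth-style driver looping
 * a transmitted skb into its peer's receive path.  "my_priv" and "peer"
 * are hypothetical driver-private details.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_priv(dev)->peer;
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;	// skb already freed on drop
 *		return NETDEV_TX_OK;
 *	}
 */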
1938 static inline int deliver_skb(struct sk_buff *skb,
1939 struct packet_type *pt_prev,
1940 struct net_device *orig_dev)
1942 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
1944 refcount_inc(&skb->users);
1945 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1948 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1949 struct packet_type **pt,
1950 struct net_device *orig_dev,
1952 struct list_head *ptype_list)
1954 struct packet_type *ptype, *pt_prev = *pt;
1956 list_for_each_entry_rcu(ptype, ptype_list, list) {
1957 if (ptype->type != type)
1960 deliver_skb(skb, pt_prev, orig_dev);
1966 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1968 if (!ptype->af_packet_priv || !skb->sk)
1971 if (ptype->id_match)
1972 return ptype->id_match(ptype, skb->sk);
1973 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1980 * dev_nit_active - return true if any network interface taps are in use
1982 * @dev: network device to check for the presence of taps
1984 bool dev_nit_active(struct net_device *dev)
1986 return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
1988 EXPORT_SYMBOL_GPL(dev_nit_active);
1991 * Support routine. Sends outgoing frames to any network
1992 * taps currently in use.
1995 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1997 struct packet_type *ptype;
1998 struct sk_buff *skb2 = NULL;
1999 struct packet_type *pt_prev = NULL;
2000 struct list_head *ptype_list = &ptype_all;
2004 list_for_each_entry_rcu(ptype, ptype_list, list) {
2005 if (ptype->ignore_outgoing)
2008 /* Never send packets back to the socket
2009 * they originated from - MvS (miquels@drinkel.ow.org)
2011 if (skb_loop_sk(ptype, skb))
2015 deliver_skb(skb2, pt_prev, skb->dev);
2020 /* need to clone skb, done only once */
2021 skb2 = skb_clone(skb, GFP_ATOMIC);
2025 net_timestamp_set(skb2);
2027 /* skb->nh should be correctly
2028 * set by sender, so that the second statement is
2029 * just protection against buggy protocols.
2031 skb_reset_mac_header(skb2);
2033 if (skb_network_header(skb2) < skb2->data ||
2034 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2035 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2036 ntohs(skb2->protocol),
2038 skb_reset_network_header(skb2);
2041 skb2->transport_header = skb2->network_header;
2042 skb2->pkt_type = PACKET_OUTGOING;
2046 if (ptype_list == &ptype_all) {
2047 ptype_list = &dev->ptype_all;
2052 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2053 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2059 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
/*
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not, NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid and nothing can be done, so disable priority mappings. It is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
2074 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2077 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2079 /* If TC0 is invalidated disable TC mapping */
2080 if (tc->offset + tc->count > txq) {
2081 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2086 /* Invalidated prio to tc mappings set to TC0 */
2087 for (i = 1; i < TC_BITMASK + 1; i++) {
2088 int q = netdev_get_prio_tc_map(dev, i);
2090 tc = &dev->tc_to_txq[q];
2091 if (tc->offset + tc->count > txq) {
2092 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2094 netdev_set_prio_tc_map(dev, i, 0);
2099 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2102 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2105 /* walk through the TCs and see if it falls into any of them */
2106 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2107 if ((txq - tc->offset) < tc->count)
2111 /* didn't find it, just return -1 to indicate no match */
2117 EXPORT_SYMBOL(netdev_txq_to_tc);
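/* Illustrative example (not part of this file): with two traffic classes
 * configured as TC0 = {offset 0, count 4} and TC1 = {offset 4, count 4},
 *
 *	netdev_txq_to_tc(dev, 2);	// -> 0
 *	netdev_txq_to_tc(dev, 5);	// -> 1
 *	netdev_txq_to_tc(dev, 9);	// -> -1, queue not covered by any TC
 */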
2120 struct static_key xps_needed __read_mostly;
2121 EXPORT_SYMBOL(xps_needed);
2122 struct static_key xps_rxqs_needed __read_mostly;
2123 EXPORT_SYMBOL(xps_rxqs_needed);
2124 static DEFINE_MUTEX(xps_map_mutex);
2125 #define xmap_dereference(P) \
2126 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2128 static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2131 struct xps_map *map = NULL;
2135 map = xmap_dereference(dev_maps->attr_map[tci]);
2139 for (pos = map->len; pos--;) {
2140 if (map->queues[pos] != index)
2144 map->queues[pos] = map->queues[--map->len];
2148 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2149 kfree_rcu(map, rcu);
2156 static bool remove_xps_queue_cpu(struct net_device *dev,
2157 struct xps_dev_maps *dev_maps,
2158 int cpu, u16 offset, u16 count)
2160 int num_tc = dev->num_tc ? : 1;
2161 bool active = false;
2164 for (tci = cpu * num_tc; num_tc--; tci++) {
2167 for (i = count, j = offset; i--; j++) {
2168 if (!remove_xps_queue(dev_maps, tci, j))
2178 static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2179 struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2180 u16 offset, u16 count, bool is_rxqs_map)
2182 bool active = false;
2185 for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2187 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2191 RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2193 RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2195 for (i = offset + (count - 1); count--; i--)
2196 netdev_queue_numa_node_write(
2197 netdev_get_tx_queue(dev, i),
2200 kfree_rcu(dev_maps, rcu);
2204 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2207 const unsigned long *possible_mask = NULL;
2208 struct xps_dev_maps *dev_maps;
2209 unsigned int nr_ids;
2211 if (!static_key_false(&xps_needed))
2215 mutex_lock(&xps_map_mutex);
2217 if (static_key_false(&xps_rxqs_needed)) {
2218 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2220 nr_ids = dev->num_rx_queues;
2221 clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2222 offset, count, true);
2226 dev_maps = xmap_dereference(dev->xps_cpus_map);
2230 if (num_possible_cpus() > 1)
2231 possible_mask = cpumask_bits(cpu_possible_mask);
2232 nr_ids = nr_cpu_ids;
2233 clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2237 if (static_key_enabled(&xps_rxqs_needed))
2238 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2240 static_key_slow_dec_cpuslocked(&xps_needed);
2241 mutex_unlock(&xps_map_mutex);
2245 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2247 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2250 static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2251 u16 index, bool is_rxqs_map)
2253 struct xps_map *new_map;
2254 int alloc_len = XPS_MIN_MAP_ALLOC;
2257 for (pos = 0; map && pos < map->len; pos++) {
2258 if (map->queues[pos] != index)
2263 /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2265 if (pos < map->alloc_len)
2268 alloc_len = map->alloc_len * 2;
2271 /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2275 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2277 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2278 cpu_to_node(attr_index));
2282 for (i = 0; i < pos; i++)
2283 new_map->queues[i] = map->queues[i];
2284 new_map->alloc_len = alloc_len;
2290 /* Must be called under cpus_read_lock */
2291 int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2292 u16 index, bool is_rxqs_map)
2294 const unsigned long *online_mask = NULL, *possible_mask = NULL;
2295 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2296 int i, j, tci, numa_node_id = -2;
2297 int maps_sz, num_tc = 1, tc = 0;
2298 struct xps_map *map, *new_map;
2299 bool active = false;
2300 unsigned int nr_ids;
2303 /* Do not allow XPS on subordinate device directly */
2304 num_tc = dev->num_tc;
2308 /* If queue belongs to subordinate dev use its map */
2309 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2311 tc = netdev_txq_to_tc(dev, index);
2316 mutex_lock(&xps_map_mutex);
2318 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2319 dev_maps = xmap_dereference(dev->xps_rxqs_map);
2320 nr_ids = dev->num_rx_queues;
2322 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2323 if (num_possible_cpus() > 1) {
2324 online_mask = cpumask_bits(cpu_online_mask);
2325 possible_mask = cpumask_bits(cpu_possible_mask);
2327 dev_maps = xmap_dereference(dev->xps_cpus_map);
2328 nr_ids = nr_cpu_ids;
2331 if (maps_sz < L1_CACHE_BYTES)
2332 maps_sz = L1_CACHE_BYTES;
2334 /* allocate memory for queue storage */
2335 for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2338 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2339 if (!new_dev_maps) {
2340 mutex_unlock(&xps_map_mutex);
2344 tci = j * num_tc + tc;
2345 map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2348 map = expand_xps_map(map, j, index, is_rxqs_map);
2352 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2356 goto out_no_new_maps;
2358 static_key_slow_inc_cpuslocked(&xps_needed);
2360 static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2362 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2364 /* copy maps belonging to foreign traffic classes */
2365 for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2366 /* fill in the new device map from the old device map */
2367 map = xmap_dereference(dev_maps->attr_map[tci]);
2368 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2371 /* We need to explicitly update tci as the previous loop
2372 * could break out early if dev_maps is NULL.
2374 tci = j * num_tc + tc;
2376 if (netif_attr_test_mask(j, mask, nr_ids) &&
2377 netif_attr_test_online(j, online_mask, nr_ids)) {
2378 /* add tx-queue to CPU/rx-queue maps */
2381 map = xmap_dereference(new_dev_maps->attr_map[tci]);
2382 while ((pos < map->len) && (map->queues[pos] != index))
2385 if (pos == map->len)
2386 map->queues[map->len++] = index;
2389 if (numa_node_id == -2)
2390 numa_node_id = cpu_to_node(j);
2391 else if (numa_node_id != cpu_to_node(j))
2395 } else if (dev_maps) {
2396 /* fill in the new device map from the old device map */
2397 map = xmap_dereference(dev_maps->attr_map[tci]);
2398 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2401 /* copy maps belonging to foreign traffic classes */
2402 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2403 /* fill in the new device map from the old device map */
2404 map = xmap_dereference(dev_maps->attr_map[tci]);
2405 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2410 rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2412 rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2414 /* Cleanup old maps */
2416 goto out_no_old_maps;
2418 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2420 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2421 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2422 map = xmap_dereference(dev_maps->attr_map[tci]);
2423 if (map && map != new_map)
2424 kfree_rcu(map, rcu);
2428 kfree_rcu(dev_maps, rcu);
2431 dev_maps = new_dev_maps;
2436 /* update Tx queue numa node */
2437 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2438 (numa_node_id >= 0) ?
2439 numa_node_id : NUMA_NO_NODE);
2445 /* removes tx-queue from unused CPUs/rx-queues */
2446 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2448 for (i = tc, tci = j * num_tc; i--; tci++)
2449 active |= remove_xps_queue(dev_maps, tci, index);
2450 if (!netif_attr_test_mask(j, mask, nr_ids) ||
2451 !netif_attr_test_online(j, online_mask, nr_ids))
2452 active |= remove_xps_queue(dev_maps, tci, index);
2453 for (i = num_tc - tc, tci++; --i; tci++)
2454 active |= remove_xps_queue(dev_maps, tci, index);
2457 /* free map if not active */
2460 RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2462 RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2463 kfree_rcu(dev_maps, rcu);
2467 mutex_unlock(&xps_map_mutex);
2471 /* remove any maps that we added */
2472 for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2474 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2475 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2477 xmap_dereference(dev_maps->attr_map[tci]) :
2479 if (new_map && new_map != map)
2484 mutex_unlock(&xps_map_mutex);
2486 kfree(new_dev_maps);
2489 EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2491 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2497 ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2502 EXPORT_SYMBOL(netif_set_xps_queue);
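/* Illustrative sketch, not part of this file: how a multiqueue driver might
 * use netif_set_xps_queue() to pin each TX queue to a single CPU.  The
 * function name and the 1:1 queue<->CPU layout are assumptions for the
 * example only.
 */
#if 0
static void example_setup_xps(struct net_device *dev)
{
	unsigned int i;

	/* Map TX queue i to CPU i; any extra queues are left unmapped. */
	for (i = 0; i < dev->real_num_tx_queues && i < nr_cpu_ids; i++)
		netif_set_xps_queue(dev, cpumask_of(i), i);
}
#endif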
2505 static void netdev_unbind_all_sb_channels(struct net_device *dev)
2507 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2509 /* Unbind any subordinate channels */
2510 while (txq-- != &dev->_tx[0]) {
2512 netdev_unbind_sb_channel(dev, txq->sb_dev);
2516 void netdev_reset_tc(struct net_device *dev)
2519 netif_reset_xps_queues_gt(dev, 0);
2521 netdev_unbind_all_sb_channels(dev);
2523 /* Reset TC configuration of device */
2525 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2526 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2528 EXPORT_SYMBOL(netdev_reset_tc);
2530 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2532 if (tc >= dev->num_tc)
2536 netif_reset_xps_queues(dev, offset, count);
2538 dev->tc_to_txq[tc].count = count;
2539 dev->tc_to_txq[tc].offset = offset;
2542 EXPORT_SYMBOL(netdev_set_tc_queue);
2544 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2546 if (num_tc > TC_MAX_QUEUE)
2550 netif_reset_xps_queues_gt(dev, 0);
2552 netdev_unbind_all_sb_channels(dev);
2554 dev->num_tc = num_tc;
2557 EXPORT_SYMBOL(netdev_set_num_tc);
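/* Illustrative sketch, not part of this file: how a driver might carve its
 * TX queues into two traffic classes with netdev_set_num_tc() and
 * netdev_set_tc_queue().  The function name and the fixed 4+4 queue split
 * are assumptions for the example only.
 */
#if 0
static int example_setup_tc(struct net_device *dev)
{
	int err;

	err = netdev_set_num_tc(dev, 2);
	if (err)
		return err;

	/* tc 0 -> queues 0..3, tc 1 -> queues 4..7 */
	netdev_set_tc_queue(dev, 0, 4, 0);
	netdev_set_tc_queue(dev, 1, 4, 4);
	return 0;
}
#endif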
2559 void netdev_unbind_sb_channel(struct net_device *dev,
2560 struct net_device *sb_dev)
2562 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2565 netif_reset_xps_queues_gt(sb_dev, 0);
2567 memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2568 memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2570 while (txq-- != &dev->_tx[0]) {
2571 if (txq->sb_dev == sb_dev)
2575 EXPORT_SYMBOL(netdev_unbind_sb_channel);
2577 int netdev_bind_sb_channel_queue(struct net_device *dev,
2578 struct net_device *sb_dev,
2579 u8 tc, u16 count, u16 offset)
2581 /* Make certain the sb_dev and dev are already configured */
2582 if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2585 /* We cannot hand out queues we don't have */
2586 if ((offset + count) > dev->real_num_tx_queues)
2589 /* Record the mapping */
2590 sb_dev->tc_to_txq[tc].count = count;
2591 sb_dev->tc_to_txq[tc].offset = offset;
2593 /* Provide a way for Tx queue to find the tc_to_txq map or
2594 * XPS map for itself.
2597 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2601 EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2603 int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2605 /* Do not use a multiqueue device to represent a subordinate channel */
2606 if (netif_is_multiqueue(dev))
2609 /* We allow channels 1 - 32767 to be used for subordinate channels.
2610 * Channel 0 is meant to be "native" mode and used only to represent
2611 * the main root device. We allow writing 0 to reset the device back
2612 * to normal mode after being used as a subordinate channel.
2614 if (channel > S16_MAX)
2617 dev->num_tc = -channel;
2621 EXPORT_SYMBOL(netdev_set_sb_channel);
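/* Illustrative sketch, not part of this file: pairing a single-queue
 * subordinate device (e.g. an offloaded macvlan) with a slice of its lower
 * device's TX queues.  The function name is an assumption, and the example
 * assumes the lower device already has traffic classes configured and at
 * least twelve real TX queues.
 */
#if 0
static int example_bind_sb(struct net_device *lower, struct net_device *sb_dev)
{
	int err;

	/* Claim channel 1 for the subordinate device ... */
	err = netdev_set_sb_channel(sb_dev, 1);
	if (err)
		return err;

	/* ... and hand it TX queues 8..11 of the lower device as its tc 0. */
	return netdev_bind_sb_channel_queue(lower, sb_dev, 0, 4, 8);
}
#endif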
2624 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2625 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2627 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2632 disabling = txq < dev->real_num_tx_queues;
2634 if (txq < 1 || txq > dev->num_tx_queues)
2637 if (dev->reg_state == NETREG_REGISTERED ||
2638 dev->reg_state == NETREG_UNREGISTERING) {
2641 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2647 netif_setup_tc(dev, txq);
2649 dev->real_num_tx_queues = txq;
2653 qdisc_reset_all_tx_gt(dev, txq);
2655 netif_reset_xps_queues_gt(dev, txq);
2659 dev->real_num_tx_queues = txq;
2664 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
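/* Illustrative sketch, not part of this file: a driver applying a new active
 * TX queue count, e.g. from an ethtool channels request.  The function name
 * and the rtnl-held calling context are assumptions for the example only.
 */
#if 0
static int example_set_tx_channels(struct net_device *dev, unsigned int txq)
{
	ASSERT_RTNL();

	/* Shrinking flushes stale qdisc skbs and XPS maps for queues >= txq. */
	return netif_set_real_num_tx_queues(dev, txq);
}
#endif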
2668 * netif_set_real_num_rx_queues - set actual number of RX queues used
2669 * @dev: Network device
2670 * @rxq: Actual number of RX queues
2672 * This must be called either with the rtnl_lock held or before
2673 * registration of the net device. Returns 0 on success, or a
2674 * negative error code. If called before registration, it always succeeds.
2677 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2681 if (rxq < 1 || rxq > dev->num_rx_queues)
2684 if (dev->reg_state == NETREG_REGISTERED) {
2687 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2693 dev->real_num_rx_queues = rxq;
2696 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2700 * netif_get_num_default_rss_queues - default number of RSS queues
2702 * This routine should set an upper limit on the number of RSS queues
2703 * used by default by multiqueue devices.
2705 int netif_get_num_default_rss_queues(void)
2707 return is_kdump_kernel() ?
2708 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2710 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
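/* Illustrative sketch, not part of this file: sizing a device's active queues
 * at probe time, before register_netdev(), from the default RSS upper bound.
 * The function name and the assumption that the device was allocated with
 * enough RX/TX queues (and with sysfs RX queue support built in) are for the
 * example only.
 */
#if 0
static int example_init_queue_counts(struct net_device *dev)
{
	unsigned int n = min_t(unsigned int, dev->num_rx_queues,
			       netif_get_num_default_rss_queues());
	int err;

	/* Called before register_netdev(), so no rtnl_lock is needed yet. */
	err = netif_set_real_num_rx_queues(dev, n);
	if (err)
		return err;

	return netif_set_real_num_tx_queues(dev,
			min_t(unsigned int, n, dev->num_tx_queues));
}
#endif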
2712 static void __netif_reschedule(struct Qdisc *q)
2714 struct softnet_data *sd;
2715 unsigned long flags;
2717 local_irq_save(flags);
2718 sd = this_cpu_ptr(&softnet_data);
2719 q->next_sched = NULL;
2720 *sd->output_queue_tailp = q;
2721 sd->output_queue_tailp = &q->next_sched;
2722 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2723 local_irq_restore(flags);
2726 void __netif_schedule(struct Qdisc *q)
2728 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2729 __netif_reschedule(q);
2731 EXPORT_SYMBOL(__netif_schedule);
2733 struct dev_kfree_skb_cb {
2734 enum skb_free_reason reason;
2737 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2739 return (struct dev_kfree_skb_cb *)skb->cb;
2742 void netif_schedule_queue(struct netdev_queue *txq)
2745 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2746 struct Qdisc *q = rcu_dereference(txq->qdisc);
2748 __netif_schedule(q);
2752 EXPORT_SYMBOL(netif_schedule_queue);
2754 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2756 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2760 q = rcu_dereference(dev_queue->qdisc);
2761 __netif_schedule(q);
2765 EXPORT_SYMBOL(netif_tx_wake_queue);
2767 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2769 unsigned long flags;
2774 if (likely(refcount_read(&skb->users) == 1)) {
2776 refcount_set(&skb->users, 0);
2777 } else if (likely(!refcount_dec_and_test(&skb->users))) {
2780 get_kfree_skb_cb(skb)->reason = reason;
2781 local_irq_save(flags);
2782 skb->next = __this_cpu_read(softnet_data.completion_queue);
2783 __this_cpu_write(softnet_data.completion_queue, skb);
2784 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2785 local_irq_restore(flags);
2787 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2789 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2791 if (in_irq() || irqs_disabled())
2792 __dev_kfree_skb_irq(skb, reason);
2796 EXPORT_SYMBOL(__dev_kfree_skb_any);
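/* Illustrative sketch, not part of this file: freeing skbs from a context
 * that may be hardirq, as a TX-completion handler typically does.  The
 * function and parameter names are assumptions for the example only.
 */
#if 0
static void example_tx_complete(struct sk_buff *skb, bool transmitted)
{
	/* Safe in any context: the free is deferred to the NET_TX softirq
	 * when called from hard-IRQ context or with IRQs disabled.
	 */
	if (transmitted)
		dev_consume_skb_any(skb);	/* accounted as consumed */
	else
		dev_kfree_skb_any(skb);		/* accounted as dropped */
}
#endif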
2800 * netif_device_detach - mark device as removed
2801 * @dev: network device
2803 * Mark device as removed from the system and therefore no longer available.
2805 void netif_device_detach(struct net_device *dev)
2807 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2808 netif_running(dev)) {
2809 netif_tx_stop_all_queues(dev);
2812 EXPORT_SYMBOL(netif_device_detach);
2815 * netif_device_attach - mark device as attached
2816 * @dev: network device
2818 * Mark device as attached to the system and restart it if needed.
2820 void netif_device_attach(struct net_device *dev)
2822 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2823 netif_running(dev)) {
2824 netif_tx_wake_all_queues(dev);
2825 __netdev_watchdog_up(dev);
2828 EXPORT_SYMBOL(netif_device_attach);
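/* Illustrative sketch, not part of this file: the usual suspend/resume
 * pairing of netif_device_detach() and netif_device_attach().  The function
 * names and the hypothetical hardware hooks are assumptions for the example.
 */
#if 0
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if running */
	/* ... power down the hardware here ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... power the hardware back up here ... */
	netif_device_attach(dev);	/* wakes queues and rearms the watchdog */
	return 0;
}
#endif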
2831 * Returns a Tx hash based on the given packet descriptor and the Tx queue count
2832 * to be used as a distribution range.
2834 static u16 skb_tx_hash(const struct net_device *dev,
2835 const struct net_device *sb_dev,
2836 struct sk_buff *skb)
2840 u16 qcount = dev->real_num_tx_queues;
2843 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2845 qoffset = sb_dev->tc_to_txq[tc].offset;
2846 qcount = sb_dev->tc_to_txq[tc].count;
2849 if (skb_rx_queue_recorded(skb)) {
2850 hash = skb_get_rx_queue(skb);
2851 while (unlikely(hash >= qcount))
2853 return hash + qoffset;
2856 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
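/* Illustrative example, not from the original source: with qoffset = 4,
 * qcount = 4 and skb_get_hash() returning 0x90000000, reciprocal_scale()
 * maps the hash into [0, 4) as (0x90000000 * 4) >> 32 = 2, so the packet is
 * steered to queue 4 + 2 = 6 within its traffic class.
 */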
2859 static void skb_warn_bad_offload(const struct sk_buff *skb)
2861 static const netdev_features_t null_features;
2862 struct net_device *dev = skb->dev;
2863 const char *name = "";
2865 if (!net_ratelimit())
2869 if (dev->dev.parent)
2870 name = dev_driver_string(dev->dev.parent);
2872 name = netdev_name(dev);
2874 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2875 "gso_type=%d ip_summed=%d\n",
2876 name, dev ? &dev->features : &null_features,
2877 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2878 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2879 skb_shinfo(skb)->gso_type, skb->ip_summed);
2883 * Invalidate hardware checksum when packet is to be mangled, and
2884 * complete checksum manually on outgoing path.
2886 int skb_checksum_help(struct sk_buff *skb)
2889 int ret = 0, offset;
2891 if (skb->ip_summed == CHECKSUM_COMPLETE)
2892 goto out_set_summed;
2894 if (unlikely(skb_shinfo(skb)->gso_size)) {
2895 skb_warn_bad_offload(skb);
2899 /* Before computing a checksum, we should make sure no frag could
2900 * be modified by an external entity: checksum could be wrong.
2902 if (skb_has_shared_frag(skb)) {
2903 ret = __skb_linearize(skb);
2908 offset = skb_checksum_start_offset(skb);
2909 BUG_ON(offset >= skb_headlen(skb));
2910 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2912 offset += skb->csum_offset;
2913 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2915 if (skb_cloned(skb) &&
2916 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2917 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2922 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2924 skb->ip_summed = CHECKSUM_NONE;
2928 EXPORT_SYMBOL(skb_checksum_help);
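/* Illustrative sketch, not part of this file: the classic driver-side
 * fallback when a packet requests checksum offload the hardware cannot do.
 * The function name and the single NETIF_F_IP_CSUM capability check are
 * assumptions for the example only.
 */
#if 0
static int example_tx_csum(struct net_device *dev, struct sk_buff *skb)
{
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;

	/* The hypothetical hardware only checksums plain IPv4 here; resolve
	 * everything else in software before handing the skb to the NIC.
	 */
	if (!(dev->features & NETIF_F_IP_CSUM) || skb->protocol != htons(ETH_P_IP))
		return skb_checksum_help(skb);

	return 0;
}
#endif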
2930 int skb_crc32c_csum_help(struct sk_buff *skb)
2933 int ret = 0, offset, start;
2935 if (skb->ip_summed != CHECKSUM_PARTIAL)
2938 if (unlikely(skb_is_gso(skb)))
2941 /* Before computing a checksum, we should make sure no frag could
2942 * be modified by an external entity: checksum could be wrong.
2944 if (unlikely(skb_has_shared_frag(skb))) {
2945 ret = __skb_linearize(skb);
2949 start = skb_checksum_start_offset(skb);
2950 offset = start + offsetof(struct sctphdr, checksum);
2951 if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
2955 if (skb_cloned(skb) &&
2956 !skb_clone_writable(skb, offset + sizeof(__le32))) {
2957 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2961 crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
2962 skb->len - start, ~(__u32)0,
2964 *(__le32 *)(skb->data + offset) = crc32c_csum;
2965 skb->ip_summed = CHECKSUM_NONE;
2966 skb->csum_not_inet = 0;
2971 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2973 __be16 type = skb->protocol;
2975 /* Tunnel GSO handlers can set the protocol to Ethernet. */
2976 if (type == htons(ETH_P_TEB)) {
2979 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2982 eth = (struct ethhdr *)skb->data;
2983 type = eth->h_proto;
2986 return __vlan_get_protocol(skb, type, depth);
2990 * skb_mac_gso_segment - mac layer segmentation handler.
2991 * @skb: buffer to segment
2992 * @features: features for the output path (see dev->features)
2994 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2995 netdev_features_t features)
2997 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2998 struct packet_offload *ptype;
2999 int vlan_depth = skb->mac_len;
3000 __be16 type = skb_network_protocol(skb, &vlan_depth);
3002 if (unlikely(!type))
3003 return ERR_PTR(-EINVAL);
3005 __skb_pull(skb, vlan_depth);
3008 list_for_each_entry_rcu(ptype, &offload_base, list) {
3009 if (ptype->type == type && ptype->callbacks.gso_segment) {
3010 segs = ptype->callbacks.gso_segment(skb, features);
3016 __skb_push(skb, skb->data - skb_mac_header(skb));
3020 EXPORT_SYMBOL(skb_mac_gso_segment);
3023 /* openvswitch calls this on rx path, so we need a different check.
3025 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3028 return skb->ip_summed != CHECKSUM_PARTIAL &&
3029 skb->ip_summed != CHECKSUM_UNNECESSARY;
3031 return skb->ip_summed == CHECKSUM_NONE;
3035 * __skb_gso_segment - Perform segmentation on skb.
3036 * @skb: buffer to segment
3037 * @features: features for the output path (see dev->features)
3038 * @tx_path: whether it is called in TX path
3040 * This function segments the given skb and returns a list of segments.
3042 * It may return NULL if the skb requires no segmentation. This is
3043 * only possible when GSO is used for verifying header integrity.
3045 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
3047 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3048 netdev_features_t features, bool tx_path)
3050 struct sk_buff *segs;
3052 if (unlikely(skb_needs_check(skb, tx_path))) {
3055 /* We're going to init ->check field in TCP or UDP header */
3056 err = skb_cow_head(skb, 0);
3058 return ERR_PTR(err);
3061 /* Only report GSO partial support if it will enable us to
3062 * support segmentation on this frame without needing additional resources.
3065 if (features & NETIF_F_GSO_PARTIAL) {
3066 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3067 struct net_device *dev = skb->dev;
3069 partial_features |= dev->features & dev->gso_partial_features;
3070 if (!skb_gso_ok(skb, features | partial_features))
3071 features &= ~NETIF_F_GSO_PARTIAL;
3074 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
3075 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3077 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3078 SKB_GSO_CB(skb)->encap_level = 0;
3080 skb_reset_mac_header(skb);
3081 skb_reset_mac_len(skb);
3083 segs = skb_mac_gso_segment(skb, features);
3085 if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3086 skb_warn_bad_offload(skb);
3090 EXPORT_SYMBOL(__skb_gso_segment);
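/* Illustrative sketch, not part of this file: a software GSO fallback of the
 * kind validate_xmit_skb() performs, segmenting a GSO skb when the device
 * lacks the required feature bits.  The function name is an assumption for
 * the example only.
 */
#if 0
static struct sk_buff *example_sw_gso(struct sk_buff *skb,
				      netdev_features_t features)
{
	struct sk_buff *segs;

	if (!netif_needs_gso(skb, features))
		return skb;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs)) {
		kfree_skb(skb);
		return NULL;
	}
	if (segs) {
		/* The original skb is no longer needed once segments exist;
		 * a NULL return means no segmentation was required.
		 */
		consume_skb(skb);
		skb = segs;
	}
	return skb;
}
#endif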
3092 /* Take action when hardware reception checksum errors are detected. */
3094 void netdev_rx_csum_fault(struct net_device *dev)
3096 if (net_ratelimit()) {
3097 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3101 EXPORT_SYMBOL(netdev_rx_csum_fault);
3104 /* XXX: check that highmem exists at all on the given machine. */
3105 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3107 #ifdef CONFIG_HIGHMEM
3110 if (!(dev->features & NETIF_F_HIGHDMA)) {
3111 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3112 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3114 if (PageHighMem(skb_frag_page(frag)))
3122 /* If this is an MPLS offload request, verify we are testing hardware
3123 * MPLS features instead of the standard features for the netdev.
3125 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3126 static netdev_features_t net_mpls_features(struct sk_buff *skb,
3127 netdev_features_t features,
3130 if (eth_p_mpls(type))
3131 features &= skb->dev->mpls_features;
3136 static netdev_features_t net_mpls_features(struct sk_buff *skb,
3137 netdev_features_t features,
3144 static netdev_features_t harmonize_features(struct sk_buff *skb,
3145 netdev_features_t features)
3150 type = skb_network_protocol(skb, &tmp);
3151 features = net_mpls_features(skb, features, type);
3153 if (skb->ip_summed != CHECKSUM_NONE &&
3154 !can_checksum_protocol(features, type)) {
3155 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3157 if (illegal_highdma(skb->dev, skb))
3158 features &= ~NETIF_F_SG;
3163 netdev_features_t passthru_features_check(struct sk_buff *skb,
3164 struct net_device *dev,
3165 netdev_features_t features)
3169 EXPORT_SYMBOL(passthru_features_check);
3171 static netdev_features_t dflt_features_check(struct sk_buff *skb,
3172 struct net_device *dev,
3173 netdev_features_t features)
3175 return vlan_features_check(skb, features);
3178 static netdev_features_t gso_features_check(const struct sk_buff *skb,
3179 struct net_device *dev,
3180 netdev_features_t features)
3182 u16 gso_segs = skb_shinfo(skb)->gso_segs;
3184 if (gso_segs > dev->gso_max_segs)
3185 return features & ~NETIF_F_GSO_MASK;
3187 /* Support for GSO partial features requires software
3188 * intervention before we can actually process the packets,
3189 * so strip support for any partial features now; they can be
3190 * pulled back in after we have partially segmented the
3191 * frame.
3193 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3194 features &= ~dev->gso_partial_features;
3196 /* Make sure to clear the IPv4 ID mangling feature if the
3197 * IPv4 header has the potential to be fragmented.
3199 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3200 struct iphdr *iph = skb->encapsulation ?
3201 inner_ip_hdr(skb) : ip_hdr(skb);
3203 if (!(iph->frag_off & htons(IP_DF)))
3204 features &= ~NETIF_F_TSO_MANGLEID;
3210 netdev_features_t netif_skb_features(struct sk_buff *skb)
3212 struct net_device *dev = skb->dev;
3213 netdev_features_t features = dev->features;
3215 if (skb_is_gso(skb))
3216 features = gso_features_check(skb, dev, features);
3218 /* If this is an encapsulation offload request, verify we are
3219 * testing hardware encapsulation features instead of the
3220 * standard features for the netdev.
3222 if (skb->encapsulation)
3223 features &= dev->hw_enc_features;
3225 if (skb_vlan_tagged(skb))
3226 features = netdev_intersect_features(features,
3227 dev->vlan_features |
3228 NETIF_F_HW_VLAN_CTAG_TX |
3229 NETIF_F_HW_VLAN_STAG_TX);
3231 if (dev->netdev_ops->ndo_features_check)
3232 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3235 features &= dflt_features_check(skb, dev, features);
3237 return harmonize_features(skb, features);
3239 EXPORT_SYMBOL(netif_skb_features);
3241 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3242 struct netdev_queue *txq, bool more)
3247 if (dev_nit_active(dev))
3248 dev_queue_xmit_nit(skb, dev);
3251 trace_net_dev_start_xmit(skb, dev);
3252 rc = netdev_start_xmit(skb, dev, txq, more);
3253 trace_net_dev_xmit(skb, rc, dev, len);
3258 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3259 struct netdev_queue *txq, int *ret)
3261 struct sk_buff *skb = first;
3262 int rc = NETDEV_TX_OK;
3265 struct sk_buff *next = skb->next;
3267 skb_mark_not_on_list(skb);
3268 rc = xmit_one(skb, dev, txq, next != NULL);
3269 if (unlikely(!dev_xmit_complete(rc))) {
3275 if (netif_xmit_stopped(txq) && skb) {
3276 rc = NETDEV_TX_BUSY;
3286 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3287 netdev_features_t features)
3289 if (skb_vlan_tag_present(skb) &&
3290 !vlan_hw_offload_capable(features, skb->vlan_proto))
3291 skb = __vlan_hwaccel_push_inside(skb);
3295 int skb_csum_hwoffload_help(struct sk_buff *skb,
3296 const netdev_features_t features)
3298 if (unlikely(skb->csum_not_inet))
3299 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3300 skb_crc32c_csum_help(skb);
3302 return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3304 EXPORT_SYMBOL(skb_csum_hwoffload_help);
3306 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3308 netdev_features_t features;
3310 features = netif_skb_features(skb);
3311 skb = validate_xmit_vlan(skb, features);
3315 skb = sk_validate_xmit_skb(skb, dev);
3319 if (netif_needs_gso(skb, features)) {
3320 struct sk_buff *segs;
3322 segs = skb_gso_segment(skb, features);
3330 if (skb_needs_linearize(skb, features) &&
3331 __skb_linearize(skb))
3334 /* If packet is not checksummed and device does not
3335 * support checksumming for this protocol, complete
3336 * checksumming here.
3338 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3339 if (skb->encapsulation)
3340 skb_set_inner_transport_header(skb,
3341 skb_checksum_start_offset(skb));
3343 skb_set_transport_header(skb,
3344 skb_checksum_start_offset(skb));
3345 if (skb_csum_hwoffload_help(skb, features))
3350 skb = validate_xmit_xfrm(skb, features, again);
3357 atomic_long_inc(&dev->tx_dropped);
3361 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3363 struct sk_buff *next, *head = NULL, *tail;
3365 for (; skb != NULL; skb = next) {
3367 skb_mark_not_on_list(skb);
3369 /* in case the skb won't be segmented, point it to itself */
3372 skb = validate_xmit_skb(skb, dev, again);
3380 /* If skb was segmented, skb->prev points to
3381 * the last segment. If not, it still contains skb.
3387 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3389 static void qdisc_pkt_len_init(struct sk_buff *skb)
3391 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3393 qdisc_skb_cb(skb)->pkt_len = skb->len;
3395 /* To get a more precise estimate of bytes sent on the wire,
3396 * we add the header size of every segment to pkt_len.
3398 if (shinfo->gso_size) {
3399 unsigned int hdr_len;
3400 u16 gso_segs = shinfo->gso_segs;
3402 /* mac layer + network layer */
3403 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3405 /* + transport layer */
3406 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3407 const struct tcphdr *th;
3408 struct tcphdr _tcphdr;
3410 th = skb_header_pointer(skb, skb_transport_offset(skb),
3411 sizeof(_tcphdr), &_tcphdr);
3413 hdr_len += __tcp_hdrlen(th);
3415 struct udphdr _udphdr;
3417 if (skb_header_pointer(skb, skb_transport_offset(skb),
3418 sizeof(_udphdr), &_udphdr))
3419 hdr_len += sizeof(struct udphdr);
3422 if (shinfo->gso_type & SKB_GSO_DODGY)
3423 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3426 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
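/* Illustrative example, not from the original source: a TCPv4 GSO skb with
 * skb->len = 65226, gso_size = 1448 and 66 bytes of mac+IP+TCP headers is
 * split into 45 segments, so pkt_len becomes
 * 65226 + (45 - 1) * 66 = 68130 bytes, the total that will hit the wire.
 */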
3430 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3431 struct net_device *dev,
3432 struct netdev_queue *txq)
3434 spinlock_t *root_lock = qdisc_lock(q);
3435 struct sk_buff *to_free = NULL;
3439 qdisc_calculate_pkt_len(skb, q);
3441 if (q->flags & TCQ_F_NOLOCK) {
3442 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3443 __qdisc_drop(skb, &to_free);
3446 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3450 if (unlikely(to_free))
3451 kfree_skb_list(to_free);
3456 * Heuristic to force contended enqueues to serialize on a
3457 * separate lock before trying to get the qdisc main lock.
3458 * This permits the qdisc->running owner to get the lock more
3459 * often and dequeue packets faster.
3461 contended = qdisc_is_running(q);
3462 if (unlikely(contended))
3463 spin_lock(&q->busylock);
3465 spin_lock(root_lock);
3466 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3467 __qdisc_drop(skb, &to_free);
3469 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3470 qdisc_run_begin(q)) {
3472 * This is a work-conserving queue; there are no old skbs
3473 * waiting to be sent out; and the qdisc is not running -
3474 * xmit the skb directly.
3477 qdisc_bstats_update(q, skb);
3479 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3480 if (unlikely(contended)) {
3481 spin_unlock(&q->busylock);
3488 rc = NET_XMIT_SUCCESS;
3490 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3491 if (qdisc_run_begin(q)) {
3492 if (unlikely(contended)) {
3493 spin_unlock(&q->busylock);
3500 spin_unlock(root_lock);
3501 if (unlikely(to_free))
3502 kfree_skb_list(to_free);
3503 if (unlikely(contended))
3504 spin_unlock(&q->busylock);
3508 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3509 static void skb_update_prio(struct sk_buff *skb)
3511 const struct netprio_map *map;
3512 const struct sock *sk;
3513 unsigned int prioidx;
3517 map = rcu_dereference_bh(skb->dev->priomap);
3520 sk = skb_to_full_sk(skb);
3524 prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3526 if (prioidx < map->priomap_len)
3527 skb->priority = map->priomap[prioidx];
3530 #define skb_update_prio(skb)
3533 DEFINE_PER_CPU(int, xmit_recursion);
3534 EXPORT_SYMBOL(xmit_recursion);
3537 * dev_loopback_xmit - loop back @skb
3538 * @net: network namespace this loopback is happening in
3539 * @sk: sk needed to be a netfilter okfn
3540 * @skb: buffer to transmit
3542 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3544 skb_reset_mac_header(skb);
3545 __skb_pull(skb, skb_network_offset(skb));
3546 skb->pkt_type = PACKET_LOOPBACK;
3547 skb->ip_summed = CHECKSUM_UNNECESSARY;
3548 WARN_ON(!skb_dst(skb));
3553 EXPORT_SYMBOL(dev_loopback_xmit);
3555 #ifdef CONFIG_NET_EGRESS
3556 static struct sk_buff *
3557 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3559 struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3560 struct tcf_result cl_res;
3565 /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3566 mini_qdisc_bstats_cpu_update(miniq, skb);
3568 switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3570 case TC_ACT_RECLASSIFY:
3571 skb->tc_index = TC_H_MIN(cl_res.classid);
3574 mini_qdisc_qstats_cpu_drop(miniq);
3575 *ret = NET_XMIT_DROP;
3581 *ret = NET_XMIT_SUCCESS;
3584 case TC_ACT_REDIRECT:
3585 /* No need to push/pop skb's mac_header here on egress! */
3586 skb_do_redirect(skb);
3587 *ret = NET_XMIT_SUCCESS;
3595 #endif /* CONFIG_NET_EGRESS */
3598 static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3599 struct xps_dev_maps *dev_maps, unsigned int tci)
3601 struct xps_map *map;
3602 int queue_index = -1;
3606 tci += netdev_get_prio_tc_map(dev, skb->priority);
3609 map = rcu_dereference(dev_maps->attr_map[tci]);
3612 queue_index = map->queues[0];
3614 queue_index = map->queues[reciprocal_scale(
3615 skb_get_hash(skb), map->len)];
3616 if (unlikely(queue_index >= dev->real_num_tx_queues))
3623 static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3624 struct sk_buff *skb)
3627 struct xps_dev_maps *dev_maps;
3628 struct sock *sk = skb->sk;
3629 int queue_index = -1;