net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/mutex.h>
  84 #include <linux/string.h>
  85 #include <linux/mm.h>
  86 #include <linux/socket.h>
  87 #include <linux/sockios.h>
  88 #include <linux/errno.h>
  89 #include <linux/interrupt.h>
  90 #include <linux/if_ether.h>
  91 #include <linux/netdevice.h>
  92 #include <linux/etherdevice.h>
  93 #include <linux/notifier.h>
  94 #include <linux/skbuff.h>
  95 #include <net/net_namespace.h>
  96 #include <net/sock.h>
  97 #include <linux/rtnetlink.h>
  98 #include <linux/proc_fs.h>
  99 #include <linux/seq_file.h>
 100 #include <linux/stat.h>
 101 #include <linux/if_bridge.h>
 102 #include <linux/if_macvlan.h>
 103 #include <net/dst.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/checksum.h>
 106 #include <linux/highmem.h>
 107 #include <linux/init.h>
 108 #include <linux/kmod.h>
 109 #include <linux/module.h>
 110 #include <linux/kallsyms.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123
 124 #include "net-sysfs.h"
 125
 126 /*
 127  *      The list of packet types we will receive (as opposed to discard)
 128  *      and the routines to invoke.
 129  *
 130  *      Why 16. Because with 16 the only overlap we get on a hash of the
 131  *      low nibble of the protocol value is RARP/SNAP/X.25.
 132  *
 133  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 134  *             sure which should go first, but I bet it won't make much
 135  *             difference if we are running VLANs.  The good news is that
 136  *             this protocol won't be in the list unless compiled in, so
 137  *             the average user (w/out VLANs) will not be adversely affected.
 138  *             --BLG
 139  *
 140  *              0800    IP
 141  *              8100    802.1Q VLAN
 142  *              0001    802.3
 143  *              0002    AX.25
 144  *              0004    802.2
 145  *              8035    RARP
 146  *              0005    SNAP
 147  *              0805    X.25
 148  *              0806    ARP
 149  *              8137    IPX
 150  *              0009    Localtalk
 151  *              86DD    IPv6
 152  */
 153
  154 #define PTYPE_HASH_SIZE (16)    /* number of buckets in ptype_base[] */
  155 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)   /* bucket = ntohs(type) & PTYPE_HASH_MASK */
  156
  157 static DEFINE_SPINLOCK(ptype_lock);     /* held while handlers are added to or removed from the lists below */
  158 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;      /* handlers for one specific protocol, hashed by type */
  159 static struct list_head ptype_all __read_mostly;        /* Taps: handlers registered for ETH_P_ALL */
 160
  161 #ifdef CONFIG_NET_DMA
  /*
   * State for the networking core's DMA-engine client; only built when
   * CONFIG_NET_DMA is set.
   * NOTE(review): lock presumably protects channel_mask and channels, and
   * channels presumably holds one entry per CPU - the users are not in this
   * part of the file, confirm there.
   */
  162 struct net_dma {
  163         struct dma_client client;
  164         spinlock_t lock;
  165         cpumask_t channel_mask;
  166         struct dma_chan **channels;
  167 };
  168
  /* Prototype only: needed by the initializer below; the body is not in this part of the file. */
  169 static enum dma_state_client
  170 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
  171         enum dma_state state);
  172
  /* The single net_dma instance; only the client's event callback is set statically. */
  173 static struct net_dma net_dma = {
  174         .client = {
  175                 .event_callback = netdev_dma_event,
  176         },
  177 };
  178 #endif
 179
 180 /*
 181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182  * semaphore.
 183  *
 184  * Pure readers hold dev_base_lock for reading.
 185  *
 186  * Writers must hold the rtnl semaphore while they loop through the
 187  * dev_base_head list, and hold dev_base_lock for writing when they do the
 188  * actual updates.  This allows pure readers to access the list even
 189  * while a writer is preparing to update it.
 190  *
 191  * To put it another way, dev_base_lock is held for writing only to
 192  * protect against pure readers; the rtnl semaphore provides the
 193  * protection against other writers.
 194  *
 195  * See, for example usages, register_netdevice() and
 196  * unregister_netdevice(), which must be called with the rtnl
 197  * semaphore held.
 198  */
  199 DEFINE_RWLOCK(dev_base_lock);   /* pure readers take it for reading; list updates take it for writing */
  200
  201 EXPORT_SYMBOL(dev_base_lock);
  202
  203 #define NETDEV_HASHBITS 8       /* log2 of the bucket count used by dev_name_hash()/dev_index_hash() */
  204 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)       /* resulting number of buckets */
 205
 206 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 207 {
 208         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 209         return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 210 }
 211
 212 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 213 {
 214         return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 215 }
 216
 217 /* Device list insertion */
 218 static int list_netdevice(struct net_device *dev)
 219 {
 220         struct net *net = dev_net(dev);
 221
 222         ASSERT_RTNL();
 223
 224         write_lock_bh(&dev_base_lock);
 225         list_add_tail(&dev->dev_list, &net->dev_base_head);
 226         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 227         hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 228         write_unlock_bh(&dev_base_lock);
 229         return 0;
 230 }
 231
  232 /*
   * Device list removal.
   *
   * Unlink @dev from the device list and from the name and ifindex hash
   * chains, all inside one dev_base_lock write section.  The caller must
   * hold the rtnl semaphore (checked by ASSERT_RTNL()).
   */
  233 static void unlist_netdevice(struct net_device *dev)
  234 {
  235         ASSERT_RTNL();
  236
  237         /* Unlink dev from the device chain and from both hashes */
  238         write_lock_bh(&dev_base_lock);
  239         list_del(&dev->dev_list);
  240         hlist_del(&dev->name_hlist);
  241         hlist_del(&dev->index_hlist);
  242         write_unlock_bh(&dev_base_lock);
  243 }
 244
 245 /*
 246  *      Our notifier list
 247  */
 248
  249 static RAW_NOTIFIER_HEAD(netdev_chain); /* NOTE(review): raw chain - serialisation is presumably provided by the callers (rtnl); confirm */
 250
 251 /*
 252  *      Device drivers call our routines to queue packets here. We empty the
 253  *      queue in the local softnet handler.
 254  */
 255
  256 DEFINE_PER_CPU(struct softnet_data, softnet_data);      /* one softnet_data instance per CPU */
 257
 258 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 259 /*
 260  * register_netdevice() inits dev->_xmit_lock and sets lockdep class
 261  * according to dev->type
 262  */
 263 static const unsigned short netdev_lock_type[] =
 264         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 265          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 266          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 267          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 268          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 269          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 270          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 271          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 272          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 273          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 274          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 275          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 276          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 277          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
 278          ARPHRD_NONE};
 279
 280 static const char *netdev_lock_name[] =
 281         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 282          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 283          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 284          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 285          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 286          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 287          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 288          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 289          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 290          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 291          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 292          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 293          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 294          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
 295          "_xmit_NONE"};
 296
 297 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 298
 299 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 300 {
 301         int i;
 302
 303         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 304                 if (netdev_lock_type[i] == dev_type)
 305                         return i;
 306         /* the last key is used by default */
 307         return ARRAY_SIZE(netdev_lock_type) - 1;
 308 }
 309
 310 static inline void netdev_set_lockdep_class(spinlock_t *lock,
 311                                             unsigned short dev_type)
 312 {
 313         int i;
 314
 315         i = netdev_lock_pos(dev_type);
 316         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 317                                    netdev_lock_name[i]);
 318 }
 319 #else
  /* Without CONFIG_DEBUG_LOCK_ALLOC there is no lock class to assign: empty stub. */
  320 static inline void netdev_set_lockdep_class(spinlock_t *lock,
  321                                             unsigned short dev_type)
  322 {
  323 }
 324 #endif
 325
 326 /*******************************************************************************
 327
 328                 Protocol management and registration routines
 329
 330 *******************************************************************************/
 331
 332 /*
 333  *      Add a protocol ID to the list. Now that the input handler is
 334  *      smarter we can dispense with all the messy stuff that used to be
 335  *      here.
 336  *
 337  *      BEWARE!!! Protocol handlers, mangling input packets,
 338  *      MUST BE last in hash buckets and checking protocol handlers
 339  *      MUST start from promiscuous ptype_all chain in net_bh.
 340  *      It is true now, do not change it.
 341  *      Explanation follows: if protocol handler, mangling packet, will
 342  *      be the first on list, it is not able to sense, that packet
 343  *      is cloned and should be copied-on-write, so that it will
 344  *      change it and subsequent readers will get broken packet.
 345  *                                                      --ANK (980803)
 346  */
 347
 348 /**
 349  *      dev_add_pack - add packet handler
 350  *      @pt: packet type declaration
 351  *
 352  *      Add a protocol handler to the networking stack. The passed &packet_type
 353  *      is linked into kernel lists and may not be freed until it has been
 354  *      removed from the kernel lists.
 355  *
 356  *      This call does not sleep therefore it can not
 357  *      guarantee all CPU's that are in middle of receiving packets
 358  *      will see the new packet type (until the next received packet).
 359  */
 360
 361 void dev_add_pack(struct packet_type *pt)
 362 {
 363         int hash;
 364
 365         spin_lock_bh(&ptype_lock);
 366         if (pt->type == htons(ETH_P_ALL))
 367                 list_add_rcu(&pt->list, &ptype_all);
 368         else {
 369                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 370                 list_add_rcu(&pt->list, &ptype_base[hash]);
 371         }
 372         spin_unlock_bh(&ptype_lock);
 373 }
 374
 375 /**
 376  *      __dev_remove_pack        - remove packet handler
 377  *      @pt: packet type declaration
 378  *
 379  *      Remove a protocol handler that was previously added to the kernel
 380  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 381  *      from the kernel lists and can be freed or reused once this function
 382  *      returns.
 383  *
 384  *      The packet type might still be in use by receivers
 385  *      and must not be freed until after all the CPU's have gone
 386  *      through a quiescent state.
 387  */
 388 void __dev_remove_pack(struct packet_type *pt)
 389 {
 390         struct list_head *head;
 391         struct packet_type *pt1;
 392
 393         spin_lock_bh(&ptype_lock);
 394
 395         if (pt->type == htons(ETH_P_ALL))
 396                 head = &ptype_all;
 397         else
 398                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 399
 400         list_for_each_entry(pt1, head, list) {
 401                 if (pt == pt1) {
 402                         list_del_rcu(&pt->list);
 403                         goto out;
 404                 }
 405         }
 406
 407         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 408 out:
 409         spin_unlock_bh(&ptype_lock);
 410 }
 411 /**
 412  *      dev_remove_pack  - remove packet handler
 413  *      @pt: packet type declaration
 414  *
 415  *      Remove a protocol handler that was previously added to the kernel
 416  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 417  *      from the kernel lists and can be freed or reused once this function
 418  *      returns.
 419  *
 420  *      This call sleeps to guarantee that no CPU is looking at the packet
 421  *      type after return.
 422  */
 423 void dev_remove_pack(struct packet_type *pt)
 424 {
 425         __dev_remove_pack(pt);
 426
 427         synchronize_net();
 428 }
 429
 430 /******************************************************************************
 431
 432                       Device Boot-time Settings Routines
 433
 434 *******************************************************************************/
 435
 436 /* Boot time configuration table */
  437 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];  /* a slot is free while name[0] is '\0' or ' ' */
 438
 439 /**
 440  *      netdev_boot_setup_add   - add new setup entry
 441  *      @name: name of the device
 442  *      @map: configured settings for the device
 443  *
 444  *      Adds new setup entry to the dev_boot_setup list.  The function
 445  *      returns 0 on error and 1 on success.  This is a generic routine to
 446  *      all netdevices.
 447  */
 448 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 449 {
 450         struct netdev_boot_setup *s;
 451         int i;
 452
 453         s = dev_boot_setup;
 454         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 455                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 456                         memset(s[i].name, 0, sizeof(s[i].name));
 457                         strcpy(s[i].name, name);
 458                         memcpy(&s[i].map, map, sizeof(s[i].map));
 459                         break;
 460                 }
 461         }
 462
 463         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 464 }
 465
 466 /**
 467  *      netdev_boot_setup_check - check boot time settings
 468  *      @dev: the netdevice
 469  *
 470  *      Check boot time settings for the device.
 471  *      The found settings are set for the device to be used
 472  *      later in the device probing.
 473  *      Returns 0 if no settings found, 1 if they are.
 474  */
 475 int netdev_boot_setup_check(struct net_device *dev)
 476 {
 477         struct netdev_boot_setup *s = dev_boot_setup;
 478         int i;
 479
 480         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 481                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 482                     !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
 483                         dev->irq        = s[i].map.irq;
 484                         dev->base_addr  = s[i].map.base_addr;
 485                         dev->mem_start  = s[i].map.mem_start;
 486                         dev->mem_end    = s[i].map.mem_end;
 487                         return 1;
 488                 }
 489         }
 490         return 0;
 491 }
 492
 493
 494 /**
 495  *      netdev_boot_base        - get address from boot time settings
 496  *      @prefix: prefix for network device
 497  *      @unit: id for network device
 498  *
 499  *      Check boot time settings for the base address of device.
 500  *      The found settings are set for the device to be used
 501  *      later in the device probing.
 502  *      Returns 0 if no settings found.
 503  */
 504 unsigned long netdev_boot_base(const char *prefix, int unit)
 505 {
 506         const struct netdev_boot_setup *s = dev_boot_setup;
 507         char name[IFNAMSIZ];
 508         int i;
 509
 510         sprintf(name, "%s%d", prefix, unit);
 511
 512         /*
 513          * If device already registered then return base of 1
 514          * to indicate not to probe for this interface
 515          */
 516         if (__dev_get_by_name(&init_net, name))
 517                 return 1;
 518
 519         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 520                 if (!strcmp(name, s[i].name))
 521                         return s[i].map.base_addr;
 522         return 0;
 523 }
 524
 525 /*
 526  * Saves at boot time configured settings for any netdevice.
 527  */
 528 int __init netdev_boot_setup(char *str)
 529 {
 530         int ints[5];
 531         struct ifmap map;
 532
 533         str = get_options(str, ARRAY_SIZE(ints), ints);
 534         if (!str || !*str)
 535                 return 0;
 536
 537         /* Save settings */
 538         memset(&map, 0, sizeof(map));
 539         if (ints[0] > 0)
 540                 map.irq = ints[1];
 541         if (ints[0] > 1)
 542                 map.base_addr = ints[2];
 543         if (ints[0] > 2)
 544                 map.mem_start = ints[3];
 545         if (ints[0] > 3)
 546                 map.mem_end = ints[4];
 547
 548         /* Add new entry to the list */
 549         return netdev_boot_setup_add(str, &map);
 550 }
 551
 552 __setup("netdev=", netdev_boot_setup);
 553
 554 /*******************************************************************************
 555
 556                             Device Interface Subroutines
 557
 558 *******************************************************************************/
 559
 560 /**
 561  *      __dev_get_by_name       - find a device by its name
 562  *      @net: the applicable net namespace
 563  *      @name: name to find
 564  *
 565  *      Find an interface by name. Must be called under RTNL semaphore
 566  *      or @dev_base_lock. If the name is found a pointer to the device
 567  *      is returned. If the name is not found then %NULL is returned. The
 568  *      reference counters are not incremented so the caller must be
 569  *      careful with locks.
 570  */
 571
 572 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 573 {
 574         struct hlist_node *p;
 575
 576         hlist_for_each(p, dev_name_hash(net, name)) {
 577                 struct net_device *dev
 578                         = hlist_entry(p, struct net_device, name_hlist);
 579                 if (!strncmp(dev->name, name, IFNAMSIZ))
 580                         return dev;
 581         }
 582         return NULL;
 583 }
 584
 585 /**
 586  *      dev_get_by_name         - find a device by its name
 587  *      @net: the applicable net namespace
 588  *      @name: name to find
 589  *
 590  *      Find an interface by name. This can be called from any
 591  *      context and does its own locking. The returned handle has
 592  *      the usage count incremented and the caller must use dev_put() to
 593  *      release it when it is no longer needed. %NULL is returned if no
 594  *      matching device is found.
 595  */
 596
 597 struct net_device *dev_get_by_name(struct net *net, const char *name)
 598 {
 599         struct net_device *dev;
 600
 601         read_lock(&dev_base_lock);
 602         dev = __dev_get_by_name(net, name);
 603         if (dev)
 604                 dev_hold(dev);
 605         read_unlock(&dev_base_lock);
 606         return dev;
 607 }
 608
 609 /**
 610  *      __dev_get_by_index - find a device by its ifindex
 611  *      @net: the applicable net namespace
 612  *      @ifindex: index of device
 613  *
 614  *      Search for an interface by index. Returns %NULL if the device
 615  *      is not found or a pointer to the device. The device has not
 616  *      had its reference counter increased so the caller must be careful
 617  *      about locking. The caller must hold either the RTNL semaphore
 618  *      or @dev_base_lock.
 619  */
 620
 621 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 622 {
 623         struct hlist_node *p;
 624
 625         hlist_for_each(p, dev_index_hash(net, ifindex)) {
 626                 struct net_device *dev
 627                         = hlist_entry(p, struct net_device, index_hlist);
 628                 if (dev->ifindex == ifindex)
 629                         return dev;
 630         }
 631         return NULL;
 632 }
 633
 634
 635 /**
 636  *      dev_get_by_index - find a device by its ifindex
 637  *      @net: the applicable net namespace
 638  *      @ifindex: index of device
 639  *
 640  *      Search for an interface by index. Returns NULL if the device
 641  *      is not found or a pointer to the device. The device returned has
 642  *      had a reference added and the pointer is safe until the user calls
 643  *      dev_put to indicate they have finished with it.
 644  */
 645
 646 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 647 {
 648         struct net_device *dev;
 649
 650         read_lock(&dev_base_lock);
 651         dev = __dev_get_by_index(net, ifindex);
 652         if (dev)
 653                 dev_hold(dev);
 654         read_unlock(&dev_base_lock);
 655         return dev;
 656 }
 657
 658 /**
 659  *      dev_getbyhwaddr - find a device by its hardware address
 660  *      @net: the applicable net namespace
 661  *      @type: media type of device
 662  *      @ha: hardware address
 663  *
 664  *      Search for an interface by MAC address. Returns NULL if the device
 665  *      is not found or a pointer to the device. The caller must hold the
 666  *      rtnl semaphore. The returned device has not had its ref count increased
 667  *      and the caller must therefore be careful about locking
 668  *
 669  *      BUGS:
 670  *      If the API was consistent this would be __dev_get_by_hwaddr
 671  */
 672
 673 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 674 {
 675         struct net_device *dev;
 676
 677         ASSERT_RTNL();
 678
 679         for_each_netdev(net, dev)
 680                 if (dev->type == type &&
 681                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 682                         return dev;
 683
 684         return NULL;
 685 }
 686
 687 EXPORT_SYMBOL(dev_getbyhwaddr);
 688
 689 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 690 {
 691         struct net_device *dev;
 692
 693         ASSERT_RTNL();
 694         for_each_netdev(net, dev)
 695                 if (dev->type == type)
 696                         return dev;
 697
 698         return NULL;
 699 }
 700
 701 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 702
 703 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 704 {
 705         struct net_device *dev;
 706
 707         rtnl_lock();
 708         dev = __dev_getfirstbyhwtype(net, type);
 709         if (dev)
 710                 dev_hold(dev);
 711         rtnl_unlock();
 712         return dev;
 713 }
 714
 715 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 716
 717 /**
 718  *      dev_get_by_flags - find any device with given flags
 719  *      @net: the applicable net namespace
 720  *      @if_flags: IFF_* values
 721  *      @mask: bitmask of bits in if_flags to check
 722  *
 723  *      Search for any interface with the given flags. Returns NULL if a device
 724  *      is not found or a pointer to the device. The device returned has
 725  *      had a reference added and the pointer is safe until the user calls
 726  *      dev_put to indicate they have finished with it.
 727  */
 728
 729 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
 730 {
 731         struct net_device *dev, *ret;
 732
 733         ret = NULL;
 734         read_lock(&dev_base_lock);
 735         for_each_netdev(net, dev) {
 736                 if (((dev->flags ^ if_flags) & mask) == 0) {
 737                         dev_hold(dev);
 738                         ret = dev;
 739                         break;
 740                 }
 741         }
 742         read_unlock(&dev_base_lock);
 743         return ret;
 744 }
 745
 746 /**
 747  *      dev_valid_name - check if name is okay for network device
 748  *      @name: name string
 749  *
 750  *      Network device names need to be valid file names to
 751  *      to allow sysfs to work.  We also disallow any kind of
 752  *      whitespace.
 753  */
 754 int dev_valid_name(const char *name)
 755 {
 756         if (*name == '\0')
 757                 return 0;
 758         if (strlen(name) >= IFNAMSIZ)
 759                 return 0;
 760         if (!strcmp(name, ".") || !strcmp(name, ".."))
 761                 return 0;
 762
 763         while (*name) {
 764                 if (*name == '/' || isspace(*name))
 765                         return 0;
 766                 name++;
 767         }
 768         return 1;
 769 }
 770
 771 /**
 772  *      __dev_alloc_name - allocate a name for a device
 773  *      @net: network namespace to allocate the device name in
 774  *      @name: name format string
 775  *      @buf:  scratch buffer and result name string
 776  *
 777  *      Passed a format string - eg "lt%d" it will try and find a suitable
 778  *      id. It scans list of devices to build up a free map, then chooses
 779  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 780  *      while allocating the name and adding the device in order to avoid
 781  *      duplicates.
 782  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 783  *      Returns the number of the unit assigned or a negative errno code.
 784  */
 785
 786 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 787 {
 788         int i = 0;
 789         const char *p;
 790         const int max_netdevices = 8*PAGE_SIZE;
 791         unsigned long *inuse;
 792         struct net_device *d;
 793
 794         p = strnchr(name, IFNAMSIZ-1, '%');
 795         if (p) {
 796                 /*
 797                  * Verify the string as this thing may have come from
 798                  * the user.  There must be either one "%d" and no other "%"
 799                  * characters.
 800                  */
 801                 if (p[1] != 'd' || strchr(p + 2, '%'))
 802                         return -EINVAL;
 803
 804                 /* Use one page as a bit array of possible slots */
 805                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 806                 if (!inuse)
 807                         return -ENOMEM;
 808
 809                 for_each_netdev(net, d) {
 810                         if (!sscanf(d->name, name, &i))
 811                                 continue;
 812                         if (i < 0 || i >= max_netdevices)
 813                                 continue;
 814
 815                         /*  avoid cases where sscanf is not exact inverse of printf */
 816                         snprintf(buf, IFNAMSIZ, name, i);
 817                         if (!strncmp(buf, d->name, IFNAMSIZ))
 818                                 set_bit(i, inuse);
 819                 }
 820
 821                 i = find_first_zero_bit(inuse, max_netdevices);
 822                 free_page((unsigned long) inuse);
 823         }
 824
 825         snprintf(buf, IFNAMSIZ, name, i);
 826         if (!__dev_get_by_name(net, buf))
 827                 return i;
 828
 829         /* It is possible to run out of possible slots
 830          * when the name is long and there isn't enough space left
 831          * for the digits, or if all bits are used.
 832          */
 833         return -ENFILE;
 834 }
 835
 836 /**
 837  *      dev_alloc_name - allocate a name for a device
 838  *      @dev: device
 839  *      @name: name format string
 840  *
 841  *      Passed a format string - eg "lt%d" it will try and find a suitable
 842  *      id. It scans list of devices to build up a free map, then chooses
 843  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 844  *      while allocating the name and adding the device in order to avoid
 845  *      duplicates.
 846  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 847  *      Returns the number of the unit assigned or a negative errno code.
 848  */
 849
 850 int dev_alloc_name(struct net_device *dev, const char *name)
 851 {
 852         char buf[IFNAMSIZ];
 853         struct net *net;
 854         int ret;
 855
 856         BUG_ON(!dev_net(dev));
 857         net = dev_net(dev);
 858         ret = __dev_alloc_name(net, name, buf);
 859         if (ret >= 0)
 860                 strlcpy(dev->name, buf, IFNAMSIZ);
 861         return ret;
 862 }
 863
 864
 865 /**
 866  *      dev_change_name - change name of a device
 867  *      @dev: device
 868  *      @newname: name (or format string) must be at least IFNAMSIZ
 869  *
 870  *      Change name of a device, can pass format strings "eth%d".
 871  *      for wildcarding.
 872  */
 873 int dev_change_name(struct net_device *dev, char *newname)
 874 {
 875         char oldname[IFNAMSIZ];
 876         int err = 0;
 877         int ret;
 878         struct net *net;
 879
 880         ASSERT_RTNL();
 881         BUG_ON(!dev_net(dev));
 882
 883         net = dev_net(dev);
 884         if (dev->flags & IFF_UP)
 885                 return -EBUSY;
 886
 887         if (!dev_valid_name(newname))
 888                 return -EINVAL;
 889
 890         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 891                 return 0;
 892
 893         memcpy(oldname, dev->name, IFNAMSIZ);
 894
 895         if (strchr(newname, '%')) {
 896                 err = dev_alloc_name(dev, newname);
 897                 if (err < 0)
 898                         return err;
 899                 strcpy(newname, dev->name);
 900         }
 901         else if (__dev_get_by_name(net, newname))
 902                 return -EEXIST;
 903         else
 904                 strlcpy(dev->name, newname, IFNAMSIZ);
 905
 906 rollback:
 907         err = device_rename(&dev->dev, dev->name);
 908         if (err) {
 909                 memcpy(dev->name, oldname, IFNAMSIZ);
 910                 return err;
 911         }
 912
 913         write_lock_bh(&dev_base_lock);
 914         hlist_del(&dev->name_hlist);
 915         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 916         write_unlock_bh(&dev_base_lock);
 917
 918         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 919         ret = notifier_to_errno(ret);
 920
 921         if (ret) {
 922                 if (err) {
 923                         printk(KERN_ERR
 924                                "%s: name change rollback failed: %d.\n",
 925                                dev->name, ret);
 926                 } else {
 927                         err = ret;
 928                         memcpy(dev->name, oldname, IFNAMSIZ);
 929                         goto rollback;
 930                 }
 931         }
 932
 933         return err;
 934 }
 935
 936 /**
 937  *      netdev_features_change - device changes features
 938  *      @dev: device to cause notification
 939  *
 940  *      Called to indicate a device has changed features.
 941  */
 942 void netdev_features_change(struct net_device *dev)
 943 {
 944         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 945 }
 946 EXPORT_SYMBOL(netdev_features_change);
 947
 948 /**
 949  *      netdev_state_change - device changes state
 950  *      @dev: device to cause notification
 951  *
 952  *      Called to indicate a device has changed state. This function calls
 953  *      the notifier chains for netdev_chain and sends a NEWLINK message
 954  *      to the routing socket.
 955  */
 956 void netdev_state_change(struct net_device *dev)
 957 {
 958         if (dev->flags & IFF_UP) {
 959                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
 960                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 961         }
 962 }
 963
 964 /**
 965  *      dev_load        - load a network module
 966  *      @net: the applicable net namespace
 967  *      @name: name of interface
 968  *
 969  *      If a network interface is not present and the process has suitable
 970  *      privileges this function loads the module. If module loading is not
 971  *      available in this kernel then it becomes a nop.
 972  */
 973
 974 void dev_load(struct net *net, const char *name)
 975 {
 976         struct net_device *dev;
 977
 978         read_lock(&dev_base_lock);
 979         dev = __dev_get_by_name(net, name);
 980         read_unlock(&dev_base_lock);
 981
 982         if (!dev && capable(CAP_SYS_MODULE))
 983                 request_module("%s", name);
 984 }
 985
 986 /**
 987  *      dev_open        - prepare an interface for use.
 988  *      @dev:   device to open
 989  *
 990  *      Takes a device from down to up state. The device's private open
 991  *      function is invoked and then the multicast lists are loaded. Finally
 992  *      the device is moved into the up state and a %NETDEV_UP message is
 993  *      sent to the netdev notifier chain.
 994  *
 995  *      Calling this function on an active interface is a nop. On a failure
 996  *      a negative errno code is returned.
 997  */
 998 int dev_open(struct net_device *dev)
 999 {
1000         int ret = 0;
1001
1002         ASSERT_RTNL();
1003
1004         /*
1005          *      Is it already up?
1006          */
1007
1008         if (dev->flags & IFF_UP)
1009                 return 0;
1010
1011         /*
1012          *      Is it even present?
1013          */
1014         if (!netif_device_present(dev))
1015                 return -ENODEV;
1016
1017         /*
1018          *      Call device private open method
1019          */
1020         set_bit(__LINK_STATE_START, &dev->state);
1021
1022         if (dev->validate_addr)
1023                 ret = dev->validate_addr(dev);
1024
1025         if (!ret && dev->open)
1026                 ret = dev->open(dev);
1027
1028         /*
1029          *      If it went open OK then:
1030          */
1031
1032         if (ret)
1033                 clear_bit(__LINK_STATE_START, &dev->state);
1034         else {
1035                 /*
1036                  *      Set the flags.
1037                  */
1038                 dev->flags |= IFF_UP;
1039
1040                 /*
1041                  *      Initialize multicasting status
1042                  */
1043                 dev_set_rx_mode(dev);
1044
1045                 /*
1046                  *      Wakeup transmit queue engine
1047                  */
1048                 dev_activate(dev);
1049
1050                 /*
1051                  *      ... and announce new interface.
1052                  */
1053                 call_netdevice_notifiers(NETDEV_UP, dev);
1054         }
1055
1056         return ret;
1057 }
1058
1059 /**
1060  *      dev_close - shutdown an interface.
1061  *      @dev: device to shutdown
1062  *
1063  *      This function moves an active device into down state. A
1064  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1065  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1066  *      chain.
1067  */
1068 int dev_close(struct net_device *dev)
1069 {
1070         ASSERT_RTNL();
1071
1072         might_sleep();
1073
1074         if (!(dev->flags & IFF_UP))
1075                 return 0;
1076
1077         /*
1078          *      Tell people we are going down, so that they can
1079          *      prepare to death, when device is still operating.
1080          */
1081         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1082
1083         clear_bit(__LINK_STATE_START, &dev->state);
1084
1085         /* Synchronize to scheduled poll. We cannot touch poll list,
1086          * it can be even on different cpu. So just clear netif_running().
1087          *
1088          * dev->stop() will invoke napi_disable() on all of it's
1089          * napi_struct instances on this device.
1090          */
1091         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1092
1093         dev_deactivate(dev);
1094
1095         /*
1096          *      Call the device specific close. This cannot fail.
1097          *      Only if device is UP
1098          *
1099          *      We allow it to be called even after a DETACH hot-plug
1100          *      event.
1101          */
1102         if (dev->stop)
1103                 dev->stop(dev);
1104
1105         /*
1106          *      Device is now down.
1107          */
1108
1109         dev->flags &= ~IFF_UP;
1110
1111         /*
1112          * Tell people we are down
1113          */
1114         call_netdevice_notifiers(NETDEV_DOWN, dev);
1115
1116         return 0;
1117 }
1118
1119
1120 static int dev_boot_phase = 1;
1121
1122 /*
1123  *      Device change register/unregister. These are not inline or static
1124  *      as we export them to the world.
1125  */
1126
1127 /**
1128  *      register_netdevice_notifier - register a network notifier block
1129  *      @nb: notifier
1130  *
1131  *      Register a notifier to be called when network device events occur.
1132  *      The notifier passed is linked into the kernel structures and must
1133  *      not be reused until it has been unregistered. A negative errno code
1134  *      is returned on a failure.
1135  *
1136  *      When registered all registration and up events are replayed
1137  *      to the new notifier to allow device to have a race free
1138  *      view of the network device list.
1139  */
1140
1141 int register_netdevice_notifier(struct notifier_block *nb)
1142 {
1143         struct net_device *dev;
1144         struct net_device *last;
1145         struct net *net;
1146         int err;
1147
1148         rtnl_lock();
1149         err = raw_notifier_chain_register(&netdev_chain, nb);
1150         if (err)
1151                 goto unlock;
1152         if (dev_boot_phase)
1153                 goto unlock;
1154         for_each_net(net) {
1155                 for_each_netdev(net, dev) {
1156                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1157                         err = notifier_to_errno(err);
1158                         if (err)
1159                                 goto rollback;
1160
1161                         if (!(dev->flags & IFF_UP))
1162                                 continue;
1163
1164                         nb->notifier_call(nb, NETDEV_UP, dev);
1165                 }
1166         }
1167
1168 unlock:
1169         rtnl_unlock();
1170         return err;
1171
1172 rollback:
1173         last = dev;
1174         for_each_net(net) {
1175                 for_each_netdev(net, dev) {
1176                         if (dev == last)
1177                                 break;
1178
1179                         if (dev->flags & IFF_UP) {
1180                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1181                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1182                         }
1183                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1184                 }
1185         }
1186
1187         raw_notifier_chain_unregister(&netdev_chain, nb);
1188         goto unlock;
1189 }
1190
1191 /**
1192  *      unregister_netdevice_notifier - unregister a network notifier block
1193  *      @nb: notifier
1194  *
1195  *      Unregister a notifier previously registered by
1196  *      register_netdevice_notifier(). The notifier is unlinked into the
1197  *      kernel structures and may then be reused. A negative errno code
1198  *      is returned on a failure.
1199  */
1200
1201 int unregister_netdevice_notifier(struct notifier_block *nb)
1202 {
1203         int err;
1204
1205         rtnl_lock();
1206         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1207         rtnl_unlock();
1208         return err;
1209 }
1210
1211 /**
1212  *      call_netdevice_notifiers - call all network notifier blocks
1213  *      @val: value passed unmodified to notifier function
1214  *      @dev: net_device pointer passed unmodified to notifier function
1215  *
1216  *      Call all network notifier blocks.  Parameters and return value
1217  *      are as for raw_notifier_call_chain().
1218  */
1219
1220 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1221 {
1222         return raw_notifier_call_chain(&netdev_chain, val, dev);
1223 }
1224
1225 /* When > 0 there are consumers of rx skb time stamps */
1226 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1227
1228 void net_enable_timestamp(void)
1229 {
1230         atomic_inc(&netstamp_needed);
1231 }
1232
1233 void net_disable_timestamp(void)
1234 {
1235         atomic_dec(&netstamp_needed);
1236 }
1237
1238 static inline void net_timestamp(struct sk_buff *skb)
1239 {
1240         if (atomic_read(&netstamp_needed))
1241                 __net_timestamp(skb);
1242         else
1243                 skb->tstamp.tv64 = 0;
1244 }
1245
1246 /*
1247  *      Support routine. Sends outgoing frames to any network
1248  *      taps currently in use.
1249  */
1250
1251 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1252 {
1253         struct packet_type *ptype;
1254
1255         net_timestamp(skb);
1256
1257         rcu_read_lock();
1258         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1259                 /* Never send packets back to the socket
1260                  * they originated from - MvS (miquels@drinkel.ow.org)
1261                  */
1262                 if ((ptype->dev == dev || !ptype->dev) &&
1263                     (ptype->af_packet_priv == NULL ||
1264                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1265                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1266                         if (!skb2)
1267                                 break;
1268
1269                         /* skb->nh should be correctly
1270                            set by sender, so that the second statement is
1271                            just protection against buggy protocols.
1272                          */
1273                         skb_reset_mac_header(skb2);
1274
1275                         if (skb_network_header(skb2) < skb2->data ||
1276                             skb2->network_header > skb2->tail) {
1277                                 if (net_ratelimit())
1278                                         printk(KERN_CRIT "protocol %04x is "
1279                                                "buggy, dev %s\n",
1280                                                skb2->protocol, dev->name);
1281                                 skb_reset_network_header(skb2);
1282                         }
1283
1284                         skb2->transport_header = skb2->network_header;
1285                         skb2->pkt_type = PACKET_OUTGOING;
1286                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1287                 }
1288         }
1289         rcu_read_unlock();
1290 }
1291
1292
1293 void __netif_schedule(struct net_device *dev)
1294 {
1295         if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1296                 unsigned long flags;
1297                 struct softnet_data *sd;
1298
1299                 local_irq_save(flags);
1300                 sd = &__get_cpu_var(softnet_data);
1301                 dev->next_sched = sd->output_queue;
1302                 sd->output_queue = dev;
1303                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1304                 local_irq_restore(flags);
1305         }
1306 }
1307 EXPORT_SYMBOL(__netif_schedule);
1308
1309 void dev_kfree_skb_irq(struct sk_buff *skb)
1310 {
1311         if (atomic_dec_and_test(&skb->users)) {
1312                 struct softnet_data *sd;
1313                 unsigned long flags;
1314
1315                 local_irq_save(flags);
1316                 sd = &__get_cpu_var(softnet_data);
1317                 skb->next = sd->completion_queue;
1318                 sd->completion_queue = skb;
1319                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1320                 local_irq_restore(flags);
1321         }
1322 }
1323 EXPORT_SYMBOL(dev_kfree_skb_irq);
1324
/*
 *	Release @skb from any context: use dev_kfree_skb_irq() when in
 *	hard interrupt context or with interrupts disabled, and
 *	dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1333
1334
1335 /**
1336  * netif_device_detach - mark device as removed
1337  * @dev: network device
1338  *
1339  * Mark device as removed from system and therefore no longer available.
1340  */
1341 void netif_device_detach(struct net_device *dev)
1342 {
1343         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1344             netif_running(dev)) {
1345                 netif_stop_queue(dev);
1346         }
1347 }
1348 EXPORT_SYMBOL(netif_device_detach);
1349
1350 /**
1351  * netif_device_attach - mark device as attached
1352  * @dev: network device
1353  *
1354  * Mark device as attached from system and restart if needed.
1355  */
1356 void netif_device_attach(struct net_device *dev)
1357 {
1358         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1359             netif_running(dev)) {
1360                 netif_wake_queue(dev);
1361                 __netdev_watchdog_up(dev);
1362         }
1363 }
1364 EXPORT_SYMBOL(netif_device_attach);
1365
1366 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1367 {
1368         return ((features & NETIF_F_GEN_CSUM) ||
1369                 ((features & NETIF_F_IP_CSUM) &&
1370                  protocol == htons(ETH_P_IP)) ||
1371                 ((features & NETIF_F_IPV6_CSUM) &&
1372                  protocol == htons(ETH_P_IPV6)));
1373 }
1374
1375 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1376 {
1377         if (can_checksum_protocol(dev->features, skb->protocol))
1378                 return true;
1379
1380         if (skb->protocol == htons(ETH_P_8021Q)) {
1381                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1382                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1383                                           veh->h_vlan_encapsulated_proto))
1384                         return true;
1385         }
1386
1387         return false;
1388 }
1389
1390 /*
1391  * Invalidate hardware checksum when packet is to be mangled, and
1392  * complete checksum manually on outgoing path.
1393  */
1394 int skb_checksum_help(struct sk_buff *skb)
1395 {
1396         __wsum csum;
1397         int ret = 0, offset;
1398
1399         if (skb->ip_summed == CHECKSUM_COMPLETE)
1400                 goto out_set_summed;
1401
1402         if (unlikely(skb_shinfo(skb)->gso_size)) {
1403                 /* Let GSO fix up the checksum. */
1404                 goto out_set_summed;
1405         }
1406
1407         offset = skb->csum_start - skb_headroom(skb);
1408         BUG_ON(offset >= skb_headlen(skb));
1409         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1410
1411         offset += skb->csum_offset;
1412         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1413
1414         if (skb_cloned(skb) &&
1415             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1416                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1417                 if (ret)
1418                         goto out;
1419         }
1420
1421         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1422 out_set_summed:
1423         skb->ip_summed = CHECKSUM_NONE;
1424 out:
1425         return ret;
1426 }
1427
1428 /**
1429  *      skb_gso_segment - Perform segmentation on skb.
1430  *      @skb: buffer to segment
1431  *      @features: features for the output path (see dev->features)
1432  *
1433  *      This function segments the given skb and returns a list of segments.
1434  *
1435  *      It may return NULL if the skb requires no segmentation.  This is
1436  *      only possible when GSO is used for verifying header integrity.
1437  */
1438 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1439 {
1440         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1441         struct packet_type *ptype;
1442         __be16 type = skb->protocol;
1443         int err;
1444
1445         BUG_ON(skb_shinfo(skb)->frag_list);
1446
1447         skb_reset_mac_header(skb);
1448         skb->mac_len = skb->network_header - skb->mac_header;
1449         __skb_pull(skb, skb->mac_len);
1450
1451         if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
1452                 if (skb_header_cloned(skb) &&
1453                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1454                         return ERR_PTR(err);
1455         }
1456
1457         rcu_read_lock();
1458         list_for_each_entry_rcu(ptype,
1459                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1460                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1461                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1462                                 err = ptype->gso_send_check(skb);
1463                                 segs = ERR_PTR(err);
1464                                 if (err || skb_gso_ok(skb, features))
1465                                         break;
1466                                 __skb_push(skb, (skb->data -
1467                                                  skb_network_header(skb)));
1468                         }
1469                         segs = ptype->gso_segment(skb, features);
1470                         break;
1471                 }
1472         }
1473         rcu_read_unlock();
1474
1475         __skb_push(skb, skb->data - skb_mac_header(skb));
1476
1477         return segs;
1478 }
1479
1480 EXPORT_SYMBOL(skb_gso_segment);
1481
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1494
/*
 * Returns 1 when @skb has a page fragment in high memory and @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise. Without CONFIG_HIGHMEM
 * the answer is always 0.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int frag;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
			if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
				return 1;
		}
	}

#endif
	return 0;
}
1515
1516 struct dev_gso_cb {
1517         void (*destructor)(struct sk_buff *skb);
1518 };
1519
1520 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1521
1522 static void dev_gso_skb_destructor(struct sk_buff *skb)
1523 {
1524         struct dev_gso_cb *cb;
1525
1526         do {
1527                 struct sk_buff *nskb = skb->next;
1528
1529                 skb->next = nskb->next;
1530                 nskb->next = NULL;
1531                 kfree_skb(nskb);
1532         } while (skb->next);
1533
1534         cb = DEV_GSO_CB(skb);
1535         if (cb->destructor)
1536                 cb->destructor(skb);
1537 }
1538
1539 /**
1540  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1541  *      @skb: buffer to segment
1542  *
1543  *      This function segments the given skb and stores the list of segments
1544  *      in skb->next.
1545  */
1546 static int dev_gso_segment(struct sk_buff *skb)
1547 {
1548         struct net_device *dev = skb->dev;
1549         struct sk_buff *segs;
1550         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1551                                          NETIF_F_SG : 0);
1552
1553         segs = skb_gso_segment(skb, features);
1554
1555         /* Verifying header integrity only. */
1556         if (!segs)
1557                 return 0;
1558
1559         if (IS_ERR(segs))
1560                 return PTR_ERR(segs);
1561
1562         skb->next = segs;
1563         DEV_GSO_CB(skb)->destructor = skb->destructor;
1564         skb->destructor = dev_gso_skb_destructor;
1565
1566         return 0;
1567 }
1568
1569 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
1570 {
1571         if (likely(!skb->next)) {
1572                 if (!list_empty(&ptype_all))
1573                         dev_queue_xmit_nit(skb, dev);
1574
1575                 if (netif_needs_gso(dev, skb)) {
1576                         if (unlikely(dev_gso_segment(skb)))
1577                                 goto out_kfree_skb;
1578                         if (skb->next)
1579                                 goto gso;
1580                 }
1581
1582                 return dev->hard_start_xmit(skb, dev);
1583         }
1584
1585 gso:
1586         do {
1587                 struct sk_buff *nskb = skb->next;
1588                 int rc;
1589
1590                 skb->next = nskb->next;
1591                 nskb->next = NULL;
1592                 rc = dev->hard_start_xmit(nskb, dev);
1593                 if (unlikely(rc)) {
1594                         nskb->next = skb->next;
1595                         skb->next = nskb;
1596                         return rc;
1597                 }
1598                 if (unlikely((netif_queue_stopped(dev) ||
1599                              netif_subqueue_stopped(dev, skb)) &&
1600                              skb->next))
1601                         return NETDEV_TX_BUSY;
1602         } while (skb->next);
1603
1604         skb->destructor = DEV_GSO_CB(skb)->destructor;
1605
1606 out_kfree_skb:
1607         kfree_skb(skb);
1608         return 0;
1609 }
1610
1611 /**
1612  *      dev_queue_xmit - transmit a buffer
1613  *      @skb: buffer to transmit
1614  *
1615  *      Queue a buffer for transmission to a network device. The caller must
1616  *      have set the device and priority and built the buffer before calling
1617  *      this function. The function can be called from an interrupt.
1618  *
1619  *      A negative errno code is returned on a failure. A success does not
1620  *      guarantee the frame will be transmitted as it may be dropped due
1621  *      to congestion or traffic shaping.
1622  *
1623  * -----------------------------------------------------------------------------------
1624  *      I notice this method can also return errors from the queue disciplines,
1625  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1626  *      be positive.
1627  *
1628  *      Regardless of the return value, the skb is consumed, so it is currently
1629  *      difficult to retry a send to this method.  (You can bump the ref count
1630  *      before sending to hold a reference for retry if you are careful.)
1631  *
1632  *      When calling this method, interrupts MUST be enabled.  This is because
1633  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1634  *          --BLG
1635  */
1636
1637 int dev_queue_xmit(struct sk_buff *skb)
1638 {
1639         struct net_device *dev = skb->dev;
1640         struct Qdisc *q;
1641         int rc = -ENOMEM;
1642
1643         /* GSO will handle the following emulations directly. */
1644         if (netif_needs_gso(dev, skb))
1645                 goto gso;
1646
1647         if (skb_shinfo(skb)->frag_list &&
1648             !(dev->features & NETIF_F_FRAGLIST) &&
1649             __skb_linearize(skb))
1650                 goto out_kfree_skb;
1651
1652         /* Fragmented skb is linearized if device does not support SG,
1653          * or if at least one of fragments is in highmem and device
1654          * does not support DMA from it.
1655          */
1656         if (skb_shinfo(skb)->nr_frags &&
1657             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1658             __skb_linearize(skb))
1659                 goto out_kfree_skb;
1660
1661         /* If packet is not checksummed and device does not support
1662          * checksumming for this protocol, complete checksumming here.
1663          */
1664         if (skb->ip_summed == CHECKSUM_PARTIAL) {
1665                 skb_set_transport_header(skb, skb->csum_start -
1666                                               skb_headroom(skb));
1667                 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1668                         goto out_kfree_skb;
1669         }
1670
1671 gso:
1672         spin_lock_prefetch(&dev->queue_lock);
1673
1674         /* Disable soft irqs for various locks below. Also
1675          * stops preemption for RCU.
1676          */
1677         rcu_read_lock_bh();
1678
1679         /* Updates of qdisc are serialized by queue_lock.
1680          * The struct Qdisc which is pointed to by qdisc is now a
1681          * rcu structure - it may be accessed without acquiring
1682          * a lock (but the structure may be stale.) The freeing of the
1683          * qdisc will be deferred until it's known that there are no
1684          * more references to it.
1685          *
1686          * If the qdisc has an enqueue function, we still need to
1687          * hold the queue_lock before calling it, since queue_lock
1688          * also serializes access to the device queue.
1689          */
1690
1691         q = rcu_dereference(dev->qdisc);
1692 #ifdef CONFIG_NET_CLS_ACT
1693         skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1694 #endif
1695         if (q->enqueue) {
1696                 /* Grab device queue */
1697                 spin_lock(&dev->queue_lock);
1698                 q = dev->qdisc;
1699                 if (q->enqueue) {
1700                         /* reset queue_mapping to zero */
1701                         skb_set_queue_mapping(skb, 0);
1702                         rc = q->enqueue(skb, q);
1703                         qdisc_run(dev);
1704                         spin_unlock(&dev->queue_lock);
1705
1706                         rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1707                         goto out;
1708                 }
1709                 spin_unlock(&dev->queue_lock);
1710         }
1711
1712         /* The device has no queue. Common case for software devices:
1713            loopback, all the sorts of tunnels...
1714
1715            Really, it is unlikely that netif_tx_lock protection is necessary
1716            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1717            counters.)
1718            However, it is possible, that they rely on protection
1719            made by us here.
1720
1721            Check this and shot the lock. It is not prone from deadlocks.
1722            Either shot noqueue qdisc, it is even simpler 8)
1723          */
1724         if (dev->flags & IFF_UP) {
1725                 int cpu = smp_processor_id(); /* ok because BHs are off */
1726
1727                 if (dev->xmit_lock_owner != cpu) {
1728
1729                         HARD_TX_LOCK(dev, cpu);
1730
1731                         if (!netif_queue_stopped(dev) &&
1732                             !netif_subqueue_stopped(dev, skb)) {
1733                                 rc = 0;
1734                                 if (!dev_hard_start_xmit(skb, dev)) {
1735                                         HARD_TX_UNLOCK(dev);
1736                                         goto out;
1737                                 }
1738                         }
1739                         HARD_TX_UNLOCK(dev);
1740                         if (net_ratelimit())
1741                                 printk(KERN_CRIT "Virtual device %s asks to "
1742                                        "queue packet!\n", dev->name);
1743                 } else {
1744                         /* Recursion is detected! It is possible,
1745                          * unfortunately */
1746                         if (net_ratelimit())
1747                                 printk(KERN_CRIT "Dead loop on virtual device "
1748                                        "%s, fix it urgently!\n", dev->name);
1749                 }
1750         }
1751
1752         rc = -ENETDOWN;
1753         rcu_read_unlock_bh();
1754
1755 out_kfree_skb:
1756         kfree_skb(skb);
1757         return rc;
1758 out:
1759         rcu_read_unlock_bh();
1760         return rc;
1761 }
1762
1763
1764 /*=======================================================================
1765                         Receiver routines
1766   =======================================================================*/
1767
1768 int netdev_max_backlog __read_mostly = 1000;
1769 int netdev_budget __read_mostly = 300;
1770 int weight_p __read_mostly = 64;            /* old backlog weight */
1771
1772 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1773
1774
1775 /**
1776  *      netif_rx        -       post buffer to the network code
1777  *      @skb: buffer to post
1778  *
1779  *      This function receives a packet from a device driver and queues it for
1780  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1781  *      may be dropped during processing for congestion control or by the
1782  *      protocol layers.
1783  *
1784  *      return values:
1785  *      NET_RX_SUCCESS  (no congestion)
1786  *      NET_RX_DROP     (packet was dropped)
1787  *
1788  */
1789
1790 int netif_rx(struct sk_buff *skb)
1791 {
1792         struct softnet_data *queue;
1793         unsigned long flags;
1794
1795         /* if netpoll wants it, pretend we never saw it */
1796         if (netpoll_rx(skb))
1797                 return NET_RX_DROP;
1798
1799         if (!skb->tstamp.tv64)
1800                 net_timestamp(skb);
1801
1802         /*
1803          * The code is rearranged so that the path is the most
1804          * short when CPU is congested, but is still operating.
1805          */
1806         local_irq_save(flags);
1807         queue = &__get_cpu_var(softnet_data);
1808
1809         __get_cpu_var(netdev_rx_stat).total++;
1810         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1811                 if (queue->input_pkt_queue.qlen) {
1812 enqueue:
1813                         dev_hold(skb->dev);
1814                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1815                         local_irq_restore(flags);
1816                         return NET_RX_SUCCESS;
1817                 }
1818
1819                 napi_schedule(&queue->backlog);
1820                 goto enqueue;
1821         }
1822
1823         __get_cpu_var(netdev_rx_stat).dropped++;
1824         local_irq_restore(flags);
1825
1826         kfree_skb(skb);
1827         return NET_RX_DROP;
1828 }
1829
/*
 * Wrapper around netif_rx() that disables preemption for the call and
 * then runs any softirqs pending on this CPU before re-enabling it.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int rc;

        preempt_disable();

        rc = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();

        return rc;
}

EXPORT_SYMBOL(netif_rx_ni);
1844
1845 static inline struct net_device *skb_bond(struct sk_buff *skb)
1846 {
1847         struct net_device *dev = skb->dev;
1848
1849         if (dev->master) {
1850                 if (skb_bond_should_drop(skb)) {
1851                         kfree_skb(skb);
1852                         return NULL;
1853                 }
1854                 skb->dev = dev->master;
1855         }
1856
1857         return dev;
1858 }
1859
1860
1861 static void net_tx_action(struct softirq_action *h)
1862 {
1863         struct softnet_data *sd = &__get_cpu_var(softnet_data);
1864
1865         if (sd->completion_queue) {
1866                 struct sk_buff *clist;
1867
1868                 local_irq_disable();
1869                 clist = sd->completion_queue;
1870                 sd->completion_queue = NULL;
1871                 local_irq_enable();
1872
1873                 while (clist) {
1874                         struct sk_buff *skb = clist;
1875                         clist = clist->next;
1876
1877                         BUG_TRAP(!atomic_read(&skb->users));
1878                         __kfree_skb(skb);
1879                 }
1880         }
1881
1882         if (sd->output_queue) {
1883                 struct net_device *head;
1884
1885                 local_irq_disable();
1886                 head = sd->output_queue;
1887                 sd->output_queue = NULL;
1888                 local_irq_enable();
1889
1890                 while (head) {
1891                         struct net_device *dev = head;
1892                         head = head->next_sched;
1893
1894                         smp_mb__before_clear_bit();
1895                         clear_bit(__LINK_STATE_SCHED, &dev->state);
1896
1897                         if (spin_trylock(&dev->queue_lock)) {
1898                                 qdisc_run(dev);
1899                                 spin_unlock(&dev->queue_lock);
1900                         } else {
1901                                 netif_schedule(dev);
1902                         }
1903                 }
1904         }
1905 }
1906
1907 static inline int deliver_skb(struct sk_buff *skb,
1908                               struct packet_type *pt_prev,
1909                               struct net_device *orig_dev)
1910 {
1911         atomic_inc(&skb->users);
1912         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1913 }
1914
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks defined here for ATM */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
                                                unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/* Frame hook called by handle_bridge() for packets on a bridge port. */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
                                        struct sk_buff *skb) __read_mostly;

/*
 * If the packet's device is a bridge port, call the bridging hook.
 * Loopback packets and devices without a br_port are passed through
 * untouched.  A handler still pending in *pt_prev is delivered first.
 * Returns the hook's result; NULL means the packet was consumed.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
                                            struct packet_type **pt_prev, int *ret,
                                            struct net_device *orig_dev)
{
        struct net_bridge_port *br_port;

        if (skb->pkt_type == PACKET_LOOPBACK)
                return skb;

        br_port = rcu_dereference(skb->dev->br_port);
        if (br_port == NULL)
                return skb;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        return br_handle_frame_hook(br_port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)      (skb)
#endif
1948
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/* Frame hook called by handle_macvlan() for devices with a macvlan port. */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * If the packet's device has a macvlan port, deliver any handler still
 * pending in *pt_prev and then pass the packet to the macvlan hook.
 * Otherwise the skb is returned untouched.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
                                             struct packet_type **pt_prev,
                                             int *ret,
                                             struct net_device *orig_dev)
{
        if (!skb->dev->macvlan_port)
                return skb;

        if (*pt_prev != NULL) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)     (skb)
#endif
1970
1971 #ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 * instructions (a compare and 2 extra stores) right now if we don't
 * have it on but do have CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
1980 static int ing_filter(struct sk_buff *skb)
1981 {
1982         struct Qdisc *q;
1983         struct net_device *dev = skb->dev;
1984         int result = TC_ACT_OK;
1985         u32 ttl = G_TC_RTTL(skb->tc_verd);
1986
1987         if (MAX_RED_LOOP < ttl++) {
1988                 printk(KERN_WARNING
1989                        "Redir loop detected Dropping packet (%d->%d)\n",
1990                        skb->iif, dev->ifindex);
1991                 return TC_ACT_SHOT;
1992         }
1993
1994         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
1995         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1996
1997         spin_lock(&dev->ingress_lock);
1998         if ((q = dev->qdisc_ingress) != NULL)
1999                 result = q->enqueue(skb, q);
2000         spin_unlock(&dev->ingress_lock);
2001
2002         return result;
2003 }
2004
2005 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2006                                          struct packet_type **pt_prev,
2007                                          int *ret, struct net_device *orig_dev)
2008 {
2009         if (!skb->dev->qdisc_ingress)
2010                 goto out;
2011
2012         if (*pt_prev) {
2013                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2014                 *pt_prev = NULL;
2015         } else {
2016                 /* Huh? Why does turning on AF_PACKET affect this? */
2017                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2018         }
2019
2020         switch (ing_filter(skb)) {
2021         case TC_ACT_SHOT:
2022         case TC_ACT_STOLEN:
2023                 kfree_skb(skb);
2024                 return NULL;
2025         }
2026
2027 out:
2028         skb->tc_verd = 0;
2029         return skb;
2030 }
2031 #endif
2032
2033 /**
2034  *      netif_receive_skb - process receive buffer from network
2035  *      @skb: buffer to process
2036  *
2037  *      netif_receive_skb() is the main receive data processing function.
2038  *      It always succeeds. The buffer may be dropped during processing
2039  *      for congestion control or by the protocol layers.
2040  *
2041  *      This function may only be called from softirq context and interrupts
2042  *      should be enabled.
2043  *
2044  *      Return values (usually ignored):
2045  *      NET_RX_SUCCESS: no congestion
2046  *      NET_RX_DROP: packet was dropped
2047  */
2048 int netif_receive_skb(struct sk_buff *skb)
2049 {
2050         struct packet_type *ptype, *pt_prev;
2051         struct net_device *orig_dev;
2052         int ret = NET_RX_DROP;
2053         __be16 type;
2054
2055         /* if we've gotten here through NAPI, check netpoll */
2056         if (netpoll_receive_skb(skb))
2057                 return NET_RX_DROP;
2058
2059         if (!skb->tstamp.tv64)
2060                 net_timestamp(skb);
2061
2062         if (!skb->iif)
2063                 skb->iif = skb->dev->ifindex;
2064
2065         orig_dev = skb_bond(skb);
2066
2067         if (!orig_dev)
2068                 return NET_RX_DROP;
2069
2070         __get_cpu_var(netdev_rx_stat).total++;
2071
2072         skb_reset_network_header(skb);
2073         skb_reset_transport_header(skb);
2074         skb->mac_len = skb->network_header - skb->mac_header;
2075
2076         pt_prev = NULL;
2077
2078         rcu_read_lock();
2079
2080 #ifdef CONFIG_NET_CLS_ACT
2081         if (skb->tc_verd & TC_NCLS) {
2082                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2083                 goto ncls;
2084         }
2085 #endif
2086
2087         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2088                 if (!ptype->dev || ptype->dev == skb->dev) {
2089                         if (pt_prev)
2090                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2091                         pt_prev = ptype;
2092                 }
2093         }
2094
2095 #ifdef CONFIG_NET_CLS_ACT
2096         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2097         if (!skb)
2098                 goto out;
2099 ncls:
2100 #endif
2101
2102         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2103         if (!skb)
2104                 goto out;
2105         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2106         if (!skb)
2107                 goto out;
2108
2109         type = skb->protocol;
2110         list_for_each_entry_rcu(ptype,
2111                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2112                 if (ptype->type == type &&
2113                     (!ptype->dev || ptype->dev == skb->dev)) {
2114                         if (pt_prev)
2115                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2116                         pt_prev = ptype;
2117                 }
2118         }
2119
2120         if (pt_prev) {
2121                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2122         } else {
2123                 kfree_skb(skb);
2124                 /* Jamal, now you will not able to escape explaining
2125                  * me how you were going to use this. :-)
2126                  */
2127                 ret = NET_RX_DROP;
2128         }
2129
2130 out:
2131         rcu_read_unlock();
2132         return ret;
2133 }
2134
2135 static int process_backlog(struct napi_struct *napi, int quota)
2136 {
2137         int work = 0;
2138         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2139         unsigned long start_time = jiffies;
2140
2141         napi->weight = weight_p;
2142         do {
2143                 struct sk_buff *skb;
2144                 struct net_device *dev;
2145
2146                 local_irq_disable();
2147                 skb = __skb_dequeue(&queue->input_pkt_queue);
2148                 if (!skb) {
2149                         __napi_complete(napi);
2150                         local_irq_enable();
2151                         break;
2152                 }
2153
2154                 local_irq_enable();
2155
2156                 dev = skb->dev;
2157
2158                 netif_receive_skb(skb);
2159
2160                 dev_put(dev);
2161         } while (++work < quota && jiffies == start_time);
2162
2163         return work;
2164 }
2165
2166 /**
2167  * __napi_schedule - schedule for receive
2168  * @n: entry to schedule
2169  *
2170  * The entry's receive function will be scheduled to run
2171  */
2172 void __napi_schedule(struct napi_struct *n)
2173 {
2174         unsigned long flags;
2175
2176         local_irq_save(flags);
2177         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2178         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2179         local_irq_restore(flags);
2180 }
2181 EXPORT_SYMBOL(__napi_schedule);
2182
2183
2184 static void net_rx_action(struct softirq_action *h)
2185 {
2186         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2187         unsigned long start_time = jiffies;
2188         int budget = netdev_budget;
2189         void *have;
2190
2191         local_irq_disable();
2192
2193         while (!list_empty(list)) {
2194                 struct napi_struct *n;
2195                 int work, weight;
2196
2197                 /* If softirq window is exhuasted then punt.
2198                  *
2199                  * Note that this is a slight policy change from the
2200                  * previous NAPI code, which would allow up to 2
2201                  * jiffies to pass before breaking out.  The test
2202                  * used to be "jiffies - start_time > 1".
2203                  */
2204                 if (unlikely(budget <= 0 || jiffies != start_time))
2205                         goto softnet_break;
2206
2207                 local_irq_enable();
2208
2209                 /* Even though interrupts have been re-enabled, this
2210                  * access is safe because interrupts can only add new
2211                  * entries to the tail of this list, and only ->poll()
2212                  * calls can remove this head entry from the list.
2213                  */
2214                 n = list_entry(list->next, struct napi_struct, poll_list);
2215
2216                 have = netpoll_poll_lock(n);
2217
2218                 weight = n->weight;
2219
2220                 /* This NAPI_STATE_SCHED test is for avoiding a race
2221                  * with netpoll's poll_napi().  Only the entity which
2222                  * obtains the lock and sees NAPI_STATE_SCHED set will
2223                  * actually make the ->poll() call.  Therefore we avoid
2224                  * accidently calling ->poll() when NAPI is not scheduled.
2225                  */
2226                 work = 0;
2227                 if (test_bit(NAPI_STATE_SCHED, &n->state))
2228                         work = n->poll(n, weight);
2229
2230                 WARN_ON_ONCE(work > weight);
2231
2232                 budget -= work;
2233
2234                 local_irq_disable();
2235
2236                 /* Drivers must not modify the NAPI state if they
2237                  * consume the entire weight.  In such cases this code
2238                  * still "owns" the NAPI instance and therefore can
2239                  * move the instance around on the list at-will.
2240                  */
2241                 if (unlikely(work == weight)) {
2242                         if (unlikely(napi_disable_pending(n)))
2243                                 __napi_complete(n);
2244                         else
2245                                 list_move_tail(&n->poll_list, list);
2246                 }
2247
2248                 netpoll_poll_unlock(have);
2249         }
2250 out:
2251         local_irq_enable();
2252
2253 #ifdef CONFIG_NET_DMA
2254         /*
2255          * There may not be any more sk_buffs coming right now, so push
2256          * any pending DMA copies to hardware
2257          */
2258         if (!cpus_empty(net_dma.channel_mask)) {
2259                 int chan_idx;
2260                 for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
2261                         struct dma_chan *chan = net_dma.channels[chan_idx];
2262                         if (chan)
2263                                 dma_async_memcpy_issue_pending(chan);
2264                 }
2265         }
2266 #endif
2267
2268         return;
2269
2270 softnet_break:
2271         __get_cpu_var(netdev_rx_stat).time_squeeze++;
2272         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2273         goto out;
2274 }
2275
2276 static gifconf_func_t * gifconf_list [NPROTO];
2277
2278 /**
2279  *      register_gifconf        -       register a SIOCGIF handler
2280  *      @family: Address family
2281  *      @gifconf: Function handler
2282  *
2283  *      Register protocol dependent address dumping routines. The handler
2284  *      that is passed must not be freed or reused until it has been replaced
2285  *      by another handler.
2286  */
2287 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2288 {
2289         if (family >= NPROTO)
2290                 return -EINVAL;
2291         gifconf_list[family] = gifconf;
2292         return 0;
2293 }
2294
2295
2296 /*
2297  *      Map an interface index to its name (SIOCGIFNAME)
2298  */
2299
2300 /*
2301  *      We need this ioctl for efficient implementation of the
2302  *      if_indextoname() function required by the IPv6 API.  Without
2303  *      it, we would have to search all the interfaces to find a
2304  *      match.  --pb
2305  */
2306
2307 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2308 {
2309         struct net_device *dev;
2310         struct ifreq ifr;
2311
2312         /*
2313          *      Fetch the caller's info block.
2314          */
2315
2316         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2317                 return -EFAULT;
2318
2319         read_lock(&dev_base_lock);
2320         dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2321         if (!dev) {
2322                 read_unlock(&dev_base_lock);
2323                 return -ENODEV;
2324         }
2325
2326         strcpy(ifr.ifr_name, dev->name);
2327         read_unlock(&dev_base_lock);
2328
2329         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2330                 return -EFAULT;
2331         return 0;
2332 }
2333
2334 /*
2335  *      Perform a SIOCGIFCONF call. This structure will change
2336  *      size eventually, and there is nothing I can do about it.
2337  *      Thus we will need a 'compatibility mode'.
2338  */
2339
2340 static int dev_ifconf(struct net *net, char __user *arg)
2341 {
2342         struct ifconf ifc;
2343         struct net_device *dev;
2344         char __user *pos;
2345         int len;
2346         int total;
2347         int i;
2348
2349         /*
2350          *      Fetch the caller's info block.
2351          */
2352
2353         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2354                 return -EFAULT;
2355
2356         pos = ifc.ifc_buf;
2357         len = ifc.ifc_len;
2358
2359         /*
2360          *      Loop over the interfaces, and write an info block for each.
2361          */
2362
2363         total = 0;
2364         for_each_netdev(net, dev) {
2365                 for (i = 0; i < NPROTO; i++) {
2366                         if (gifconf_list[i]) {
2367                                 int done;
2368                                 if (!pos)
2369                                         done = gifconf_list[i](dev, NULL, 0);
2370                                 else
2371                                         done = gifconf_list[i](dev, pos + total,
2372                                                                len - total);
2373                                 if (done < 0)
2374                                         return -EFAULT;
2375                                 total += done;
2376                         }
2377                 }
2378         }
2379
2380         /*
2381          *      All done.  Write the updated control block back to the caller.
2382          */
2383         ifc.ifc_len = total;
2384
2385         /*
2386          *      Both BSD and Solaris return 0 here, so we do too.
2387          */
2388         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2389 }
2390
2391 #ifdef CONFIG_PROC_FS
2392 /*
2393  *      This is invoked by the /proc filesystem handler to display a device
2394  *      in detail.
2395  */
2396 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2397         __acquires(dev_base_lock)
2398 {
2399         struct net *net = seq_file_net(seq);
2400         loff_t off;
2401         struct net_device *dev;
2402
2403         read_lock(&dev_base_lock);
2404         if (!*pos)
2405                 return SEQ_START_TOKEN;
2406
2407         off = 1;
2408         for_each_netdev(net, dev)
2409                 if (off++ == *pos)
2410                         return dev;
2411
2412         return NULL;
2413 }
2414
2415 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2416 {
2417         struct net *net = seq_file_net(seq);
2418         ++*pos;
2419         return v == SEQ_START_TOKEN ?
2420                 first_net_device(net) : next_net_device((struct net_device *)v);
2421 }
2422
2423 void dev_seq_stop(struct seq_file *seq, void *v)
2424         __releases(dev_base_lock)
2425 {
2426         read_unlock(&dev_base_lock);
2427 }
2428
2429 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2430 {
2431         struct net_device_stats *stats = dev->get_stats(dev);
2432
2433         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2434                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2435                    dev->name, stats->rx_bytes, stats->rx_packets,
2436                    stats->rx_errors,
2437                    stats->rx_dropped + stats->rx_missed_errors,
2438                    stats->rx_fifo_errors,
2439                    stats->rx_length_errors + stats->rx_over_errors +
2440                     stats->rx_crc_errors + stats->rx_frame_errors,
2441                    stats->rx_compressed, stats->multicast,
2442                    stats->tx_bytes, stats->tx_packets,
2443                    stats->tx_errors, stats->tx_dropped,
2444                    stats->tx_fifo_errors, stats->collisions,
2445                    stats->tx_carrier_errors +
2446                     stats->tx_aborted_errors +
2447                     stats->tx_window_errors +
2448                     stats->tx_heartbeat_errors,
2449                    stats->tx_compressed);
2450 }
2451
2452 /*
2453  *      Called from the PROCfs module. This now uses the new arbitrary sized
2454  *      /proc/net interface to create /proc/net/dev
2455  */
2456 static int dev_seq_show(struct seq_file *seq, void *v)
2457 {
2458         if (v == SEQ_START_TOKEN)
2459                 seq_puts(seq, "Inter-|   Receive                            "
2460                               "                    |  Transmit\n"
2461                               " face |bytes    packets errs drop fifo frame "
2462                               "compressed multicast|bytes    packets errs "
2463                               "drop fifo colls carrier compressed\n");
2464         else
2465                 dev_seq_printf_stats(seq, v);
2466         return 0;
2467 }
2468
2469 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2470 {
2471         struct netif_rx_stats *rc = NULL;
2472
2473         while (*pos < nr_cpu_ids)
2474                 if (cpu_online(*pos)) {
2475                         rc = &per_cpu(netdev_rx_stat, *pos);
2476                         break;
2477                 } else
2478                         ++*pos;
2479         return rc;
2480 }
2481
2482 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2483 {
2484         return softnet_get_online(pos);
2485 }
2486
2487 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2488 {
2489         ++*pos;
2490         return softnet_get_online(pos);
2491 }
2492
/* seq_file ->stop: nothing to release, softnet_seq_start() takes no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
2496
2497 static int softnet_seq_show(struct seq_file *seq, void *v)
2498 {
2499         struct netif_rx_stats *s = v;
2500
2501         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2502                    s->total, s->dropped, s->time_squeeze, 0,
2503                    0, 0, 0, 0, /* was fastroute */
2504                    s->cpu_collision );
2505         return 0;
2506 }
2507
2508 static const struct seq_operations dev_seq_ops = {
2509         .start = dev_seq_start,
2510         .next  = dev_seq_next,
2511         .stop  = dev_seq_stop,
2512         .show  = dev_seq_show,
2513 };
2514
2515 static int dev_seq_open(struct inode *inode, struct file *file)
2516 {
2517         return seq_open_net(inode, file, &dev_seq_ops,
2518                             sizeof(struct seq_net_private));
2519 }
2520
2521 static const struct file_operations dev_seq_fops = {
2522         .owner   = THIS_MODULE,
2523         .open    = dev_seq_open,
2524         .read    = seq_read,
2525         .llseek  = seq_lseek,
2526         .release = seq_release_net,
2527 };
2528
2529 static const struct seq_operations softnet_seq_ops = {
2530         .start = softnet_seq_start,
2531         .next  = softnet_seq_next,
2532         .stop  = softnet_seq_stop,
2533         .show  = softnet_seq_show,
2534 };
2535
2536 static int softnet_seq_open(struct inode *inode, struct file *file)
2537 {
2538         return seq_open(file, &softnet_seq_ops);
2539 }
2540
2541 static const struct file_operations softnet_seq_fops = {
2542         .owner   = THIS_MODULE,
2543         .open    = softnet_seq_open,
2544         .read    = seq_read,
2545         .llseek  = seq_lseek,
2546         .release = seq_release,
2547 };
2548
2549 static void *ptype_get_idx(loff_t pos)
2550 {
2551         struct packet_type *pt = NULL;
2552         loff_t i = 0;
2553         int t;
2554
2555         list_for_each_entry_rcu(pt, &ptype_all, list) {
2556                 if (i == pos)
2557                         return pt;
2558                 ++i;
2559         }
2560
2561         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
2562                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2563                         if (i == pos)
2564                                 return pt;
2565                         ++i;
2566                 }
2567         }
2568         return NULL;
2569 }
2570
2571 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2572         __acquires(RCU)
2573 {
2574         rcu_read_lock();
2575         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2576 }
2577
2578 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2579 {
2580         struct packet_type *pt;
2581         struct list_head *nxt;
2582         int hash;
2583
2584         ++*pos;
2585         if (v == SEQ_START_TOKEN)
2586                 return ptype_get_idx(0);
2587
2588         pt = v;
2589         nxt = pt->list.next;
2590         if (pt->type == htons(ETH_P_ALL)) {
2591                 if (nxt != &ptype_all)
2592                         goto found;
2593                 hash = 0;
2594                 nxt = ptype_base[0].next;
2595         } else
2596                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
2597
2598         while (nxt == &ptype_base[hash]) {
2599                 if (++hash >= PTYPE_HASH_SIZE)
2600                         return NULL;
2601                 nxt = ptype_base[hash].next;
2602         }
2603 found:
2604         return list_entry(nxt, struct packet_type, list);
2605 }
2606
2607 static void ptype_seq_stop(struct seq_file *seq, void *v)
2608         __releases(RCU)
2609 {
2610         rcu_read_unlock();
2611 }
2612
/*
 *      Print a description of the address @sym into @seq.
 *
 *      With CONFIG_KALLSYMS the address is resolved through
 *      kallsyms_lookup() and printed as ":module:symbol+0xoffset" (the
 *      module part and its colons are empty when no module name is
 *      returned). If the lookup fails, or kallsyms is not configured,
 *      the raw pointer is printed as "[%p]".
 */
static void ptype_seq_decode(struct seq_file *seq, void *sym)
{
#ifdef CONFIG_KALLSYMS
        unsigned long offset = 0, symsize;
        const char *symname;
        char *modname;
        /*
         * Size the buffer with KSYM_NAME_LEN (from <linux/kallsyms.h>,
         * the header that declares kallsyms_lookup()) rather than a
         * hard-coded 128, so it tracks the size the lookup expects.
         */
        char namebuf[KSYM_NAME_LEN];

        symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
                                  &modname, namebuf);

        if (symname) {
                char *delim = ":";

                /* No module name: drop both the name and its delimiters. */
                if (!modname)
                        modname = delim = "";
                seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim,
                           symname, offset);
                return;
        }
#endif

        seq_printf(seq, "[%p]", sym);
}
2637
2638 static int ptype_seq_show(struct seq_file *seq, void *v)
2639 {
2640         struct packet_type *pt = v;
2641
2642         if (v == SEQ_START_TOKEN)
2643                 seq_puts(seq, "Type Device      Function\n");
2644         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
2645                 if (pt->type == htons(ETH_P_ALL))
2646                         seq_puts(seq, "ALL ");
2647                 else
2648                         seq_printf(seq, "%04x", ntohs(pt->type));
2649
2650                 seq_printf(seq, " %-8s ",
2651                            pt->dev ? pt->dev->name : "");
2652                 ptype_seq_decode(seq,  pt->func);
2653                 seq_putc(seq, '\n');
2654         }
2655
2656         return 0;
2657 }
2658
2659 static const struct seq_operations ptype_seq_ops = {
2660         .start = ptype_seq_start,
2661         .next  = ptype_seq_next,
2662         .stop  = ptype_seq_stop,
2663         .show  = ptype_seq_show,
2664 };
2665
2666 static int ptype_seq_open(struct inode *inode, struct file *file)
2667 {
2668         return seq_open_net(inode, file, &ptype_seq_ops,
2669                         sizeof(struct seq_net_private));
2670 }
2671
2672 static const struct file_operations ptype_seq_fops = {
2673         .owner   = THIS_MODULE,
2674         .open    = ptype_seq_open,
2675         .read    = seq_read,
2676         .llseek  = seq_lseek,
2677         .release = seq_release_net,
2678 };
2679
2680
2681 static int __net_init dev_proc_net_init(struct net *net)
2682 {
2683         int rc = -ENOMEM;
2684
2685         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
2686                 goto out;
2687         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
2688                 goto out_dev;
2689         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
2690                 goto out_softnet;
2691
2692         if (wext_proc_init(net))
2693                 goto out_ptype;
2694         rc = 0;
2695 out:
2696         return rc;
2697 out_ptype:
2698         proc_net_remove(net, "ptype");
2699 out_softnet:
2700         proc_net_remove(net, "softnet_stat");
2701 out_dev:
2702         proc_net_remove(net, "dev");
2703         goto out;
2704 }
2705
2706 static void __net_exit dev_proc_net_exit(struct net *net)
2707 {
2708         wext_proc_exit(net);
2709
2710         proc_net_remove(net, "ptype");
2711         proc_net_remove(net, "softnet_stat");
2712         proc_net_remove(net, "dev");
2713 }
2714
2715 static struct pernet_operations __net_initdata dev_proc_ops = {
2716         .init = dev_proc_net_init,
2717         .exit = dev_proc_net_exit,
2718 };
2719
2720 static int __init dev_proc_init(void)
2721 {
2722         return register_pernet_subsys(&dev_proc_ops);
2723 }
2724 #else
2725 #define dev_proc_init() 0
2726 #endif  /* CONFIG_PROC_FS */
2727
2728
2729 /**
2730  *      netdev_set_master       -       set up master/slave pair
2731  *      @slave: slave device
2732  *      @master: new master device
2733  *
2734  *      Changes the master device of the slave. Pass %NULL to break the
2735  *      bonding. The caller must hold the RTNL semaphore. On a failure
2736  *      a negative errno code is returned. On success the reference counts
2737  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2738  *      function returns zero.
2739  */
2740 int netdev_set_master(struct net_device *slave, struct net_device *master)
2741 {
2742         struct net_device *old = slave->master;
2743
2744         ASSERT_RTNL();
2745
2746         if (master) {
2747                 if (old)
2748                         return -EBUSY;
2749                 dev_hold(master);
2750         }
2751
2752         slave->master = master;
2753
2754         synchronize_net();
2755
2756         if (old)
2757                 dev_put(old);
2758
2759         if (master)
2760                 slave->flags |= IFF_SLAVE;
2761         else
2762                 slave->flags &= ~IFF_SLAVE;
2763
2764         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2765         return 0;
2766 }
2767
2768 static void __dev_set_promiscuity(struct net_device *dev, int inc)
2769 {
2770         unsigned short old_flags = dev->flags;
2771
2772         ASSERT_RTNL();
2773
2774         if ((dev->promiscuity += inc) == 0)
2775                 dev->flags &= ~IFF_PROMISC;
2776         else
2777                 dev->flags |= IFF_PROMISC;
2778         if (dev->flags != old_flags) {
2779                 printk(KERN_INFO "device %s %s promiscuous mode\n",
2780                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2781                                                                "left");
2782                 if (audit_enabled)
2783                         audit_log(current->audit_context, GFP_ATOMIC,
2784                                 AUDIT_ANOM_PROMISCUOUS,
2785                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
2786                                 dev->name, (dev->flags & IFF_PROMISC),
2787                                 (old_flags & IFF_PROMISC),
2788                                 audit_get_loginuid(current),
2789                                 current->uid, current->gid,
2790                                 audit_get_sessionid(current));
2791
2792                 if (dev->change_rx_flags)
2793                         dev->change_rx_flags(dev, IFF_PROMISC);
2794         }
2795 }
2796
2797 /**
2798  *      dev_set_promiscuity     - update promiscuity count on a device
2799  *      @dev: device
2800  *      @inc: modifier
2801  *
2802  *      Add or remove promiscuity from a device. While the count in the device
2803  *      remains above zero the interface remains promiscuous. Once it hits zero
2804  *      the device reverts back to normal filtering operation. A negative inc
2805  *      value is used to drop promiscuity on the device.
2806  */
2807 void dev_set_promiscuity(struct net_device *dev, int inc)
2808 {
2809         unsigned short old_flags = dev->flags;
2810
2811         __dev_set_promiscuity(dev, inc);
2812         if (dev->flags != old_flags)
2813                 dev_set_rx_mode(dev);
2814 }
2815
2816 /**
2817  *      dev_set_allmulti        - update allmulti count on a device
2818  *      @dev: device
2819  *      @inc: modifier
2820  *
2821  *      Add or remove reception of all multicast frames to a device. While the
2822  *      count in the device remains above zero the interface remains listening
2823  *      to all interfaces. Once it hits zero the device reverts back to normal
2824  *      filtering operation. A negative @inc value is used to drop the counter
2825  *      when releasing a resource needing all multicasts.
2826  */
2827
2828 void dev_set_allmulti(struct net_device *dev, int inc)
2829 {
2830         unsigned short old_flags = dev->flags;
2831
2832         ASSERT_RTNL();
2833
2834         dev->flags |= IFF_ALLMULTI;
2835         if ((dev->allmulti += inc) == 0)
2836                 dev->flags &= ~IFF_ALLMULTI;
2837         if (dev->flags ^ old_flags) {
2838                 if (dev->change_rx_flags)
2839                         dev->change_rx_flags(dev, IFF_ALLMULTI);
2840                 dev_set_rx_mode(dev);
2841         }
2842 }
2843
2844 /*
2845  *      Upload unicast and multicast address lists to device and
2846  *      configure RX filtering. When the device doesn't support unicast
2847  *      filtering it is put in promiscuous mode while unicast addresses
2848  *      are present.
2849  */
2850 void __dev_set_rx_mode(struct net_device *dev)
2851 {
2852         /* dev_open will call this function so the list will stay sane. */
2853         if (!(dev->flags&IFF_UP))
2854                 return;
2855
2856         if (!netif_device_present(dev))
2857                 return;
2858
2859         if (dev->set_rx_mode)
2860                 dev->set_rx_mode(dev);
2861         else {
2862                 /* Unicast addresses changes may only happen under the rtnl,
2863                  * therefore calling __dev_set_promiscuity here is safe.
2864                  */
2865                 if (dev->uc_count > 0 && !dev->uc_promisc) {
2866                         __dev_set_promiscuity(dev, 1);
2867                         dev->uc_promisc = 1;
2868                 } else if (dev->uc_count == 0 && dev->uc_promisc) {
2869                         __dev_set_promiscuity(dev, -1);
2870                         dev->uc_promisc = 0;
2871                 }
2872
2873                 if (dev->set_multicast_list)
2874                         dev->set_multicast_list(dev);
2875         }
2876 }
2877
/* Locked wrapper: run __dev_set_rx_mode() under netif_tx_lock_bh(). */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_tx_lock_bh(dev);

        __dev_set_rx_mode(dev);

        netif_tx_unlock_bh(dev);
}
2884
2885 int __dev_addr_delete(struct dev_addr_list **list, int *count,
2886                       void *addr, int alen, int glbl)
2887 {
2888         struct dev_addr_list *da;
2889
2890         for (; (da = *list) != NULL; list = &da->next) {
2891                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2892                     alen == da->da_addrlen) {
2893                         if (glbl) {
2894                                 int old_glbl = da->da_gusers;
2895                                 da->da_gusers = 0;
2896                                 if (old_glbl == 0)
2897                                         break;
2898                         }
2899                         if (--da->da_users)
2900                                 return 0;
2901
2902                         *list = da->next;
2903                         kfree(da);
2904                         (*count)--;
2905                         return 0;
2906                 }
2907         }
2908         return -ENOENT;
2909 }
2910
2911 int __dev_addr_add(struct dev_addr_list **list, int *count,
2912                    void *addr, int alen, int glbl)
2913 {
2914         struct dev_addr_list *da;
2915
2916         for (da = *list; da != NULL; da = da->next) {
2917                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
2918                     da->da_addrlen == alen) {
2919                         if (glbl) {
2920                                 int old_glbl = da->da_gusers;
2921                                 da->da_gusers = 1;
2922                                 if (old_glbl)
2923                                         return 0;
2924                         }
2925                         da->da_users++;
2926                         return 0;
2927                 }
2928         }
2929
2930         da = kzalloc(sizeof(*da), GFP_ATOMIC);
2931         if (da == NULL)
2932                 return -ENOMEM;
2933         memcpy(da->da_addr, addr, alen);
2934         da->da_addrlen = alen;
2935         da->da_users = 1;
2936         da->da_gusers = glbl ? 1 : 0;
2937         da->next = *list;
2938         *list = da;
2939         (*count)++;
2940         return 0;
2941 }
2942
2943 /**
2944  *      dev_unicast_delete      - Release secondary unicast address.
2945  *      @dev: device
2946  *      @addr: address to delete
2947  *      @alen: length of @addr
2948  *
2949  *      Release reference to a secondary unicast address and remove it
2950  *      from the device if the reference count drops to zero.
2951  *
2952  *      The caller must hold the rtnl_mutex.
2953  */
2954 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
2955 {
2956         int err;
2957
2958         ASSERT_RTNL();
2959
2960         netif_tx_lock_bh(dev);
2961         err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2962         if (!err)
2963                 __dev_set_rx_mode(dev);
2964         netif_tx_unlock_bh(dev);
2965         return err;
2966 }
2967 EXPORT_SYMBOL(dev_unicast_delete);
2968
2969 /**
2970  *      dev_unicast_add         - add a secondary unicast address
2971  *      @dev: device
2972  *      @addr: address to delete
2973  *      @alen: length of @addr
2974  *
2975  *      Add a secondary unicast address to the device or increase
2976  *      the reference count if it already exists.
2977  *
2978  *      The caller must hold the rtnl_mutex.
2979  */
2980 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
2981 {
2982         int err;
2983
2984         ASSERT_RTNL();
2985
2986         netif_tx_lock_bh(dev);
2987         err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
2988         if (!err)
2989                 __dev_set_rx_mode(dev);
2990         netif_tx_unlock_bh(dev);
2991         return err;
2992 }
2993 EXPORT_SYMBOL(dev_unicast_add);
2994
2995 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
2996                     struct dev_addr_list **from, int *from_count)
2997 {
2998         struct dev_addr_list *da, *next;
2999         int err = 0;
3000
3001         da = *from;
3002         while (da != NULL) {
3003                 next = da->next;
3004                 if (!da->da_synced) {
3005                         err = __dev_addr_add(to, to_count,
3006                                              da->da_addr, da->da_addrlen, 0);
3007                         if (err < 0)
3008                                 break;
3009                         da->da_synced = 1;
3010                         da->da_users++;
3011                 } else if (da->da_users == 1) {
3012                         __dev_addr_delete(to, to_count,
3013                                           da->da_addr, da->da_addrlen, 0);
3014                         __dev_addr_delete(from, from_count,
3015                                           da->da_addr, da->da_addrlen, 0);
3016                 }
3017                 da = next;
3018         }
3019         return err;
3020 }
3021
3022 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3023                        struct dev_addr_list **from, int *from_count)
3024 {
3025         struct dev_addr_list *da, *next;
3026
3027         da = *from;
3028         while (da != NULL) {
3029                 next = da->next;
3030                 if (da->da_synced) {
3031                         __dev_addr_delete(to, to_count,
3032                                           da->da_addr, da->da_addrlen, 0);
3033                         da->da_synced = 0;
3034                         __dev_addr_delete(from, from_count,
3035                                           da->da_addr, da->da_addrlen, 0);
3036                 }
3037                 da = next;
3038         }
3039 }
3040
3041 /**
3042  *      dev_unicast_sync - Synchronize device's unicast list to another device
3043  *      @to: destination device
3044  *      @from: source device
3045  *
3046  *      Add newly added addresses to the destination device and release
3047  *      addresses that have no users left. The source device must be
3048  *      locked by netif_tx_lock_bh.
3049  *
3050  *      This function is intended to be called from the dev->set_rx_mode
3051  *      function of layered software devices.
3052  */
3053 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3054 {
3055         int err = 0;
3056
3057         netif_tx_lock_bh(to);
3058         err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3059                               &from->uc_list, &from->uc_count);
3060         if (!err)
3061                 __dev_set_rx_mode(to);
3062         netif_tx_unlock_bh(to);
3063         return err;
3064 }
3065 EXPORT_SYMBOL(dev_unicast_sync);
3066
3067 /**
3068  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
3069  *      @to: destination device
3070  *      @from: source device
3071  *
3072  *      Remove all addresses that were added to the destination device by
3073  *      dev_unicast_sync(). This function is intended to be called from the
3074  *      dev->stop function of layered software devices.
3075  */
3076 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3077 {
3078         netif_tx_lock_bh(from);
3079         netif_tx_lock_bh(to);
3080
3081         __dev_addr_unsync(&to->uc_list, &to->uc_count,
3082                           &from->uc_list, &from->uc_count);
3083         __dev_set_rx_mode(to);
3084
3085         netif_tx_unlock_bh(to);
3086         netif_tx_unlock_bh(from);
3087 }
3088 EXPORT_SYMBOL(dev_unicast_unsync);
3089
3090 static void __dev_addr_discard(struct dev_addr_list **list)
3091 {
3092         struct dev_addr_list *tmp;
3093
3094         while (*list != NULL) {
3095                 tmp = *list;
3096                 *list = tmp->next;
3097                 if (tmp->da_users > tmp->da_gusers)
3098                         printk("__dev_addr_discard: address leakage! "
3099                                "da_users=%d\n", tmp->da_users);
3100                 kfree(tmp);
3101         }
3102 }
3103
3104 static void dev_addr_discard(struct net_device *dev)
3105 {
3106         netif_tx_lock_bh(dev);
3107
3108         __dev_addr_discard(&dev->uc_list);
3109         dev->uc_count = 0;
3110
3111         __dev_addr_discard(&dev->mc_list);
3112         dev->mc_count = 0;
3113
3114         netif_tx_unlock_bh(dev);
3115 }
3116
3117 unsigned dev_get_flags(const struct net_device *dev)
3118 {
3119         unsigned flags;
3120
3121         flags = (dev->flags & ~(IFF_PROMISC |
3122                                 IFF_ALLMULTI |
3123                                 IFF_RUNNING |
3124                                 IFF_LOWER_UP |
3125                                 IFF_DORMANT)) |
3126                 (dev->gflags & (IFF_PROMISC |
3127                                 IFF_ALLMULTI));
3128
3129         if (netif_running(dev)) {
3130                 if (netif_oper_up(dev))
3131                         flags |= IFF_RUNNING;
3132                 if (netif_carrier_ok(dev))
3133                         flags |= IFF_LOWER_UP;
3134                 if (netif_dormant(dev))
3135                         flags |= IFF_DORMANT;
3136         }
3137
3138         return flags;
3139 }
3140
3141 int dev_change_flags(struct net_device *dev, unsigned flags)
3142 {
3143         int ret, changes;
3144         int old_flags = dev->flags;
3145
3146         ASSERT_RTNL();
3147
3148         /*
3149          *      Set the flags on our device.
3150          */
3151
3152         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3153                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3154                                IFF_AUTOMEDIA)) |
3155                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3156                                     IFF_ALLMULTI));
3157
3158         /*
3159          *      Load in the correct multicast list now the flags have changed.
3160          */
3161
3162         if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST)
3163                 dev->change_rx_flags(dev, IFF_MULTICAST);
3164
3165         dev_set_rx_mode(dev);
3166
3167         /*
3168          *      Have we downed the interface. We handle IFF_UP ourselves
3169          *      according to user attempts to set it, rather than blindly
3170          *      setting it.
3171          */
3172
3173         ret = 0;
3174         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
3175                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3176
3177                 if (!ret)
3178                         dev_set_rx_mode(dev);
3179         }
3180
3181         if (dev->flags & IFF_UP &&
3182             ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3183                                           IFF_VOLATILE)))
3184                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
3185
3186         if ((flags ^ dev->gflags) & IFF_PROMISC) {
3187                 int inc = (flags & IFF_PROMISC) ? +1 : -1;
3188                 dev->gflags ^= IFF_PROMISC;
3189                 dev_set_promiscuity(dev, inc);
3190         }
3191
3192         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3193            is important. Some (broken) drivers set IFF_PROMISC, when
3194            IFF_ALLMULTI is requested not asking us and not reporting.
3195          */
3196         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3197                 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3198                 dev->gflags ^= IFF_ALLMULTI;
3199                 dev_set_allmulti(dev, inc);
3200         }
3201
3202         /* Exclude state transition flags, already notified */
3203         changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3204         if (changes)
3205                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3206
3207         return ret;
3208 }
3209
3210 int dev_set_mtu(struct net_device *dev, int new_mtu)
3211 {
3212         int err;
3213
3214         if (new_mtu == dev->mtu)
3215                 return 0;
3216
3217         /*      MTU must be positive.    */
3218         if (new_mtu < 0)
3219                 return -EINVAL;
3220
3221         if (!netif_device_present(dev))
3222                 return -ENODEV;
3223
3224         err = 0;
3225         if (dev->change_mtu)
3226                 err = dev->change_mtu(dev, new_mtu);
3227         else
3228                 dev->mtu = new_mtu;
3229         if (!err && dev->flags & IFF_UP)
3230                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3231         return err;
3232 }
3233
3234 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3235 {
3236         int err;
3237
3238         if (!dev->set_mac_address)
3239                 return -EOPNOTSUPP;
3240         if (sa->sa_family != dev->type)
3241                 return -EINVAL;
3242         if (!netif_device_present(dev))
3243                 return -ENODEV;
3244         err = dev->set_mac_address(dev, sa);
3245         if (!err)
3246                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3247         return err;
3248 }
3249
3250 /*
3251  *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3252  */
3253 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3254 {
3255         int err;
3256         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3257
3258         if (!dev)
3259                 return -ENODEV;
3260
3261         switch (cmd) {
3262                 case SIOCGIFFLAGS:      /* Get interface flags */
3263                         ifr->ifr_flags = dev_get_flags(dev);
3264                         return 0;
3265
3266                 case SIOCGIFMETRIC:     /* Get the metric on the interface
3267                                            (currently unused) */
3268                         ifr->ifr_metric = 0;
3269                         return 0;
3270
3271                 case SIOCGIFMTU:        /* Get the MTU of a device */
3272                         ifr->ifr_mtu = dev->mtu;
3273                         return 0;
3274
3275                 case SIOCGIFHWADDR:
3276                         if (!dev->addr_len)
3277                                 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3278                         else
3279                                 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3280                                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3281                         ifr->ifr_hwaddr.sa_family = dev->type;
3282                         return 0;
3283
3284                 case SIOCGIFSLAVE:
3285                         err = -EINVAL;
3286                         break;
3287
3288                 case SIOCGIFMAP:
3289                         ifr->ifr_map.mem_start = dev->mem_start;
3290                         ifr->ifr_map.mem_end   = dev->mem_end;
3291                         ifr->ifr_map.base_addr = dev->base_addr;
3292                         ifr->ifr_map.irq       = dev->irq;
3293                         ifr->ifr_map.dma       = dev->dma;
3294                         ifr->ifr_map.port      = dev->if_port;
3295                         return 0;
3296
3297                 case SIOCGIFINDEX:
3298                         ifr->ifr_ifindex = dev->ifindex;
3299                         return 0;
3300
3301                 case SIOCGIFTXQLEN:
3302                         ifr->ifr_qlen = dev->tx_queue_len;
3303                         return 0;
3304
3305                 default:
3306                         /* dev_ioctl() should ensure this case
3307                          * is never reached
3308                          */
3309                         WARN_ON(1);
3310                         err = -EINVAL;
3311                         break;
3312
3313         }
3314         return err;
3315 }
3316
3317 /*
3318  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
3319  */
3320 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3321 {
3322         int err;
3323         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3324
3325         if (!dev)
3326                 return -ENODEV;
3327
3328         switch (cmd) {
3329                 case SIOCSIFFLAGS:      /* Set interface flags */
3330                         return dev_change_flags(dev, ifr->ifr_flags);
3331
3332                 case SIOCSIFMETRIC:     /* Set the metric on the interface
3333                                            (currently unused) */
3334                         return -EOPNOTSUPP;
3335
3336                 case SIOCSIFMTU:        /* Set the MTU of a device */
3337                         return dev_set_mtu(dev, ifr->ifr_mtu);
3338
3339                 case SIOCSIFHWADDR:
3340                         return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3341
3342                 case SIOCSIFHWBROADCAST:
3343                         if (ifr->ifr_hwaddr.sa_family != dev->type)
3344                                 return -EINVAL;
3345                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3346                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3347                         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3348                         return 0;
3349
3350                 case SIOCSIFMAP:
3351                         if (dev->set_config) {
3352                                 if (!netif_device_present(dev))
3353                                         return -ENODEV;
3354                                 return dev->set_config(dev, &ifr->ifr_map);
3355                         }
3356                         return -EOPNOTSUPP;
3357
3358                 case SIOCADDMULTI:
3359                         if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3360                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3361                                 return -EINVAL;
3362                         if (!netif_device_present(dev))
3363                                 return -ENODEV;
3364                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3365                                           dev->addr_len, 1);
3366
3367                 case SIOCDELMULTI:
3368                         if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
3369                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3370                                 return -EINVAL;
3371                         if (!netif_device_present(dev))
3372                                 return -ENODEV;
3373                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3374                                              dev->addr_len, 1);
3375
3376                 case SIOCSIFTXQLEN:
3377                         if (ifr->ifr_qlen < 0)
3378                                 return -EINVAL;
3379                         dev->tx_queue_len = ifr->ifr_qlen;
3380                         return 0;
3381
3382                 case SIOCSIFNAME:
3383                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3384                         return dev_change_name(dev, ifr->ifr_newname);
3385
3386                 /*
3387                  *      Unknown or private ioctl
3388                  */
3389
3390                 default:
3391                         if ((cmd >= SIOCDEVPRIVATE &&
3392                             cmd <= SIOCDEVPRIVATE + 15) ||
3393                             cmd == SIOCBONDENSLAVE ||
3394                             cmd == SIOCBONDRELEASE ||
3395                             cmd == SIOCBONDSETHWADDR ||
3396                             cmd == SIOCBONDSLAVEINFOQUERY ||
3397                             cmd == SIOCBONDINFOQUERY ||
3398                             cmd == SIOCBONDCHANGEACTIVE ||
3399                             cmd == SIOCGMIIPHY ||
3400                             cmd == SIOCGMIIREG ||
3401                             cmd == SIOCSMIIREG ||
3402                             cmd == SIOCBRADDIF ||
3403                             cmd == SIOCBRDELIF ||
3404                             cmd == SIOCWANDEV) {
3405                                 err = -EOPNOTSUPP;
3406                                 if (dev->do_ioctl) {
3407                                         if (netif_device_present(dev))
3408                                                 err = dev->do_ioctl(dev, ifr,
3409                                                                     cmd);
3410                                         else
3411                                                 err = -ENODEV;
3412                                 }
3413                         } else
3414                                 err = -EINVAL;
3415
3416         }
3417         return err;
3418 }
3419
3420 /*
3421  *      This function handles all "interface"-type I/O control requests. The actual
3422  *      'doing' part of this is dev_ifsioc above.
3423  */
3424
3425 /**
3426  *      dev_ioctl       -       network device ioctl
3427  *      @net: the applicable net namespace
3428  *      @cmd: command to issue
3429  *      @arg: pointer to a struct ifreq in user space
3430  *
3431  *      Issue ioctl functions to devices. This is normally called by the
3432  *      user space syscall interfaces but can sometimes be useful for
3433  *      other purposes. The return value is the return from the syscall if
3434  *      positive or a negative errno code on error.
3435  */
3436
3437 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3438 {
3439         struct ifreq ifr;
3440         int ret;
3441         char *colon;
3442
3443         /* One special case: SIOCGIFCONF takes ifconf argument
3444            and requires shared lock, because it sleeps writing
3445            to user space.
3446          */
3447
3448         if (cmd == SIOCGIFCONF) {
3449                 rtnl_lock();
3450                 ret = dev_ifconf(net, (char __user *) arg);
3451                 rtnl_unlock();
3452                 return ret;
3453         }
3454         if (cmd == SIOCGIFNAME)
3455                 return dev_ifname(net, (struct ifreq __user *)arg);
3456
3457         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3458                 return -EFAULT;
3459
3460         ifr.ifr_name[IFNAMSIZ-1] = 0;
3461
3462         colon = strchr(ifr.ifr_name, ':');
3463         if (colon)
3464                 *colon = 0;
3465
3466         /*
3467          *      See which interface the caller is talking about.
3468          */
3469
3470         switch (cmd) {
3471                 /*
3472                  *      These ioctl calls:
3473                  *      - can be done by all.
3474                  *      - atomic and do not require locking.
3475                  *      - return a value
3476                  */
3477                 case SIOCGIFFLAGS:
3478                 case SIOCGIFMETRIC:
3479                 case SIOCGIFMTU:
3480                 case SIOCGIFHWADDR:
3481                 case SIOCGIFSLAVE:
3482                 case SIOCGIFMAP:
3483                 case SIOCGIFINDEX:
3484                 case SIOCGIFTXQLEN:
3485                         dev_load(net, ifr.ifr_name);
3486                         read_lock(&dev_base_lock);
3487                         ret = dev_ifsioc_locked(net, &ifr, cmd);
3488                         read_unlock(&dev_base_lock);
3489                         if (!ret) {
3490                                 if (colon)
3491                                         *colon = ':';
3492                                 if (copy_to_user(arg, &ifr,
3493                                                  sizeof(struct ifreq)))
3494                                         ret = -EFAULT;
3495                         }
3496                         return ret;
3497
3498                 case SIOCETHTOOL:
3499                         dev_load(net, ifr.ifr_name);
3500                         rtnl_lock();
3501                         ret = dev_ethtool(net, &ifr);
3502                         rtnl_unlock();
3503                         if (!ret) {
3504                                 if (colon)
3505                                         *colon = ':';
3506                                 if (copy_to_user(arg, &ifr,
3507                                                  sizeof(struct ifreq)))
3508                                         ret = -EFAULT;
3509                         }
3510                         return ret;
3511
3512                 /*
3513                  *      These ioctl calls:
3514                  *      - require superuser power.
3515                  *      - require strict serialization.
3516                  *      - return a value
3517                  */
3518                 case SIOCGMIIPHY:
3519                 case SIOCGMIIREG:
3520                 case SIOCSIFNAME:
3521                         if (!capable(CAP_NET_ADMIN))
3522                                 return -EPERM;
3523                         dev_load(net, ifr.ifr_name);
3524                         rtnl_lock();
3525                         ret = dev_ifsioc(net, &ifr, cmd);
3526                         rtnl_unlock();
3527                         if (!ret) {
3528                                 if (colon)
3529                                         *colon = ':';
3530                                 if (copy_to_user(arg, &ifr,
3531                                                  sizeof(struct ifreq)))
3532                                         ret = -EFAULT;
3533                         }
3534                         return ret;
3535
3536                 /*
3537                  *      These ioctl calls:
3538                  *      - require superuser power.
3539                  *      - require strict serialization.
3540                  *      - do not return a value
3541                  */
3542                 case SIOCSIFFLAGS:
3543                 case SIOCSIFMETRIC:
3544                 case SIOCSIFMTU:
3545                 case SIOCSIFMAP:
3546                 case SIOCSIFHWADDR:
3547                 case SIOCSIFSLAVE:
3548                 case SIOCADDMULTI:
3549                 case SIOCDELMULTI:
3550                 case SIOCSIFHWBROADCAST:
3551                 case SIOCSIFTXQLEN:
3552                 case SIOCSMIIREG:
3553                 case SIOCBONDENSLAVE:
3554                 case SIOCBONDRELEASE:
3555                 case SIOCBONDSETHWADDR:
3556                 case SIOCBONDCHANGEACTIVE:
3557                 case SIOCBRADDIF:
3558                 case SIOCBRDELIF:
3559                         if (!capable(CAP_NET_ADMIN))
3560                                 return -EPERM;
3561                         /* fall through */
3562                 case SIOCBONDSLAVEINFOQUERY:
3563                 case SIOCBONDINFOQUERY:
3564                         dev_load(net, ifr.ifr_name);
3565                         rtnl_lock();
3566                         ret = dev_ifsioc(net, &ifr, cmd);
3567                         rtnl_unlock();
3568                         return ret;
3569
3570                 case SIOCGIFMEM:
3571                         /* Get the per device memory space. We can add this but
3572                          * currently do not support it */
3573                 case SIOCSIFMEM:
3574                         /* Set the per device memory buffer space.
3575                          * Not applicable in our case */
3576                 case SIOCSIFLINK:
3577                         return -EINVAL;
3578
3579                 /*
3580                  *      Unknown or private ioctl.
3581                  */
3582                 default:
3583                         if (cmd == SIOCWANDEV ||
3584                             (cmd >= SIOCDEVPRIVATE &&
3585                              cmd <= SIOCDEVPRIVATE + 15)) {
3586                                 dev_load(net, ifr.ifr_name);
3587                                 rtnl_lock();
3588                                 ret = dev_ifsioc(net, &ifr, cmd);
3589                                 rtnl_unlock();
3590                                 if (!ret && copy_to_user(arg, &ifr,
3591                                                          sizeof(struct ifreq)))
3592                                         ret = -EFAULT;
3593                                 return ret;
3594                         }
3595                         /* Take care of Wireless Extensions */
3596                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
3597                                 return wext_handle_ioctl(net, &ifr, cmd, arg);
3598                         return -EINVAL;
3599         }
3600 }
3601
3602
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	The candidate is kept in a function-local static counter that is
 *	advanced until __dev_get_by_index() reports no device with that
 *	index in @net. Values are always positive; the counter restarts
 *	at 1 after it wraps.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	for (;;) {
		/*
		 * Step the counter in unsigned arithmetic: incrementing a
		 * signed int past its maximum is undefined behaviour, while
		 * the unsigned addition wraps and the conversion back is
		 * merely implementation-defined (two's complement wrap with
		 * GCC), which the range check below then catches.
		 */
		ifindex = (int)((unsigned int)ifindex + 1U);
		if (ifindex <= 0)
			ifindex = 1;
		if (!__dev_get_by_index(net, ifindex))
			return ifindex;
	}
}
3621
3622 /* Delayed registration/unregisteration */
3623 static DEFINE_SPINLOCK(net_todo_list_lock);
3624 static LIST_HEAD(net_todo_list);
3625
3626 static void net_set_todo(struct net_device *dev)
3627 {
3628         spin_lock(&net_todo_list_lock);
3629         list_add_tail(&dev->todo_list, &net_todo_list);
3630         spin_unlock(&net_todo_list_lock);
3631 }
3632
3633 static void rollback_registered(struct net_device *dev)
3634 {
3635         BUG_ON(dev_boot_phase);
3636         ASSERT_RTNL();
3637
3638         /* Some devices call without registering for initialization unwind. */
3639         if (dev->reg_state == NETREG_UNINITIALIZED) {
3640                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3641                                   "was registered\n", dev->name, dev);
3642
3643                 WARN_ON(1);
3644                 return;
3645         }
3646
3647         BUG_ON(dev->reg_state != NETREG_REGISTERED);
3648
3649         /* If device is running, close it first. */
3650         dev_close(dev);
3651
3652         /* And unlink it from device chain. */
3653         unlist_netdevice(dev);
3654
3655         dev->reg_state = NETREG_UNREGISTERING;
3656
3657         synchronize_net();
3658
3659         /* Shutdown queueing discipline. */
3660         dev_shutdown(dev);
3661
3662
3663         /* Notify protocols, that we are about to destroy
3664            this device. They should clean all the things.
3665         */
3666         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3667
3668         /*
3669          *      Flush the unicast and multicast chains
3670          */
3671         dev_addr_discard(dev);
3672
3673         if (dev->uninit)
3674                 dev->uninit(dev);
3675
3676         /* Notifier chain MUST detach us from master device. */
3677         BUG_TRAP(!dev->master);
3678
3679         /* Remove entries from kobject tree */
3680         netdev_unregister_kobject(dev);
3681
3682         synchronize_net();
3683
3684         dev_put(dev);
3685 }
3686
3687 /**
3688  *      register_netdevice      - register a network device
3689  *      @dev: device to register
3690  *
3691  *      Take a completed network device structure and add it to the kernel
3692  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3693  *      chain. 0 is returned on success. A negative errno code is returned
3694  *      on a failure to set up the device, or if the name is a duplicate.
3695  *
3696  *      Callers must hold the rtnl semaphore. You may want
3697  *      register_netdev() instead of this.
3698  *
3699  *      BUGS:
3700  *      The locking appears insufficient to guarantee two parallel registers
3701  *      will not get the same name.
3702  */
3703
3704 int register_netdevice(struct net_device *dev)
3705 {
3706         struct hlist_head *head;
3707         struct hlist_node *p;
3708         int ret;
3709         struct net *net;
3710
3711         BUG_ON(dev_boot_phase);
3712         ASSERT_RTNL();
3713
3714         might_sleep();
3715
3716         /* When net_device's are persistent, this will be fatal. */
3717         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3718         BUG_ON(!dev_net(dev));
3719         net = dev_net(dev);
3720
3721         spin_lock_init(&dev->queue_lock);
3722         spin_lock_init(&dev->_xmit_lock);
3723         netdev_set_lockdep_class(&dev->_xmit_lock, dev->type);
3724         dev->xmit_lock_owner = -1;
3725         spin_lock_init(&dev->ingress_lock);
3726
3727         dev->iflink = -1;
3728
3729         /* Init, if this function is available */
3730         if (dev->init) {
3731                 ret = dev->init(dev);
3732                 if (ret) {
3733                         if (ret > 0)
3734                                 ret = -EIO;
3735                         goto out;
3736                 }
3737         }
3738
3739         if (!dev_valid_name(dev->name)) {
3740                 ret = -EINVAL;
3741                 goto err_uninit;
3742         }
3743
3744         dev->ifindex = dev_new_index(net);
3745         if (dev->iflink == -1)
3746                 dev->iflink = dev->ifindex;
3747
3748         /* Check for existence of name */
3749         head = dev_name_hash(net, dev->name);
3750         hlist_for_each(p, head) {
3751                 struct net_device *d
3752                         = hlist_entry(p, struct net_device, name_hlist);
3753                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3754                         ret = -EEXIST;
3755                         goto err_uninit;
3756                 }
3757         }
3758
3759         /* Fix illegal checksum combinations */
3760         if ((dev->features & NETIF_F_HW_CSUM) &&
3761             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3762                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
3763                        dev->name);
3764                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
3765         }
3766
3767         if ((dev->features & NETIF_F_NO_CSUM) &&
3768             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
3769                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
3770                        dev->name);
3771                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
3772         }
3773
3774
3775         /* Fix illegal SG+CSUM combinations. */
3776         if ((dev->features & NETIF_F_SG) &&
3777             !(dev->features & NETIF_F_ALL_CSUM)) {
3778                 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3779                        dev->name);
3780                 dev->features &= ~NETIF_F_SG;
3781         }
3782
3783         /* TSO requires that SG is present as well. */
3784         if ((dev->features & NETIF_F_TSO) &&
3785             !(dev->features & NETIF_F_SG)) {
3786                 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3787                        dev->name);
3788                 dev->features &= ~NETIF_F_TSO;
3789         }
3790         if (dev->features & NETIF_F_UFO) {
3791                 if (!(dev->features & NETIF_F_HW_CSUM)) {
3792                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3793                                         "NETIF_F_HW_CSUM feature.\n",
3794                                                         dev->name);
3795                         dev->features &= ~NETIF_F_UFO;
3796                 }
3797                 if (!(dev->features & NETIF_F_SG)) {
3798                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3799                                         "NETIF_F_SG feature.\n",
3800                                         dev->name);
3801                         dev->features &= ~NETIF_F_UFO;
3802                 }
3803         }
3804
3805         netdev_initialize_kobject(dev);
3806         ret = netdev_register_kobject(dev);
3807         if (ret)
3808                 goto err_uninit;
3809         dev->reg_state = NETREG_REGISTERED;
3810
3811         /*
3812          *      Default initial state at registry is that the
3813          *      device is present.
3814          */
3815
3816         set_bit(__LINK_STATE_PRESENT, &dev->state);
3817
3818         dev_init_scheduler(dev);
3819         dev_hold(dev);
3820         list_netdevice(dev);
3821
3822         /* Notify protocols, that a new device appeared. */
3823         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
3824         ret = notifier_to_errno(ret);
3825         if (ret) {
3826                 rollback_registered(dev);
3827                 dev->reg_state = NETREG_UNREGISTERED;
3828         }
3829
3830 out:
3831         return ret;
3832
3833 err_uninit:
3834         if (dev->uninit)
3835                 dev->uninit(dev);
3836         goto out;
3837 }
3838
3839 /**
3840  *      register_netdev - register a network device
3841  *      @dev: device to register
3842  *
3843  *      Take a completed network device structure and add it to the kernel
3844  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3845  *      chain. 0 is returned on success. A negative errno code is returned
3846  *      on a failure to set up the device, or if the name is a duplicate.
3847  *
3848  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
3849  *      and expands the device name if you passed a format string to
3850  *      alloc_netdev.
3851  */
3852 int register_netdev(struct net_device *dev)
3853 {
3854         int err;
3855
3856         rtnl_lock();
3857
3858         /*
3859          * If the name is a format string the caller wants us to do a
3860          * name allocation.
3861          */
3862         if (strchr(dev->name, '%')) {
3863                 err = dev_alloc_name(dev, dev->name);
3864                 if (err < 0)
3865                         goto out;
3866         }
3867
3868         err = register_netdevice(dev);
3869 out:
3870         rtnl_unlock();
3871         return err;
3872 }
3873 EXPORT_SYMBOL(register_netdev);
3874
3875 /*
3876  * netdev_wait_allrefs - wait until all references are gone.
3877  *
3878  * This is called when unregistering network devices.
3879  *
3880  * Any protocol or device that holds a reference should register
3881  * for netdevice notification, and cleanup and put back the
3882  * reference if they receive an UNREGISTER event.
3883  * We can get stuck here if buggy protocols don't correctly
3884  * call dev_put.
3885  */
3886 static void netdev_wait_allrefs(struct net_device *dev)
3887 {
3888         unsigned long rebroadcast_time, warning_time;
3889
3890         rebroadcast_time = warning_time = jiffies;
3891         while (atomic_read(&dev->refcnt) != 0) {
3892                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3893                         rtnl_lock();
3894
3895                         /* Rebroadcast unregister notification */
3896                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
3897
3898                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3899                                      &dev->state)) {
3900                                 /* We must not have linkwatch events
3901                                  * pending on unregister. If this
3902                                  * happens, we simply run the queue
3903                                  * unscheduled, resulting in a noop
3904                                  * for this device.
3905                                  */
3906                                 linkwatch_run_queue();
3907                         }
3908
3909                         __rtnl_unlock();
3910
3911                         rebroadcast_time = jiffies;
3912                 }
3913
3914                 msleep(250);
3915
3916                 if (time_after(jiffies, warning_time + 10 * HZ)) {
3917                         printk(KERN_EMERG "unregister_netdevice: "
3918                                "waiting for %s to become free. Usage "
3919                                "count = %d\n",
3920                                dev->name, atomic_read(&dev->refcnt));
3921                         warning_time = jiffies;
3922                 }
3923         }
3924 }
3925
3926 /* The sequence is:
3927  *
3928  *      rtnl_lock();
3929  *      ...
3930  *      register_netdevice(x1);
3931  *      register_netdevice(x2);
3932  *      ...
3933  *      unregister_netdevice(y1);
3934  *      unregister_netdevice(y2);
3935  *      ...
3936  *      rtnl_unlock();
3937  *      free_netdev(y1);
3938  *      free_netdev(y2);
3939  *
3940  * We are invoked by rtnl_unlock() after it drops the semaphore.
3941  * This allows us to deal with problems:
3942  * 1) We can delete sysfs objects which invoke hotplug
3943  *    without deadlocking with linkwatch via keventd.
3944  * 2) Since we run with the RTNL semaphore not held, we can sleep
3945  *    safely in order to wait for the netdev refcnt to drop to zero.
3946  */
3947 static DEFINE_MUTEX(net_todo_run_mutex);
3948 void netdev_run_todo(void)
3949 {
3950         struct list_head list;
3951
3952         /* Need to guard against multiple cpu's getting out of order. */
3953         mutex_lock(&net_todo_run_mutex);
3954
3955         /* Not safe to do outside the semaphore.  We must not return
3956          * until all unregister events invoked by the local processor
3957          * have been completed (either by this todo run, or one on
3958          * another cpu).
3959          */
3960         if (list_empty(&net_todo_list))
3961                 goto out;
3962
3963         /* Snapshot list, allow later requests */
3964         spin_lock(&net_todo_list_lock);
3965         list_replace_init(&net_todo_list, &list);
3966         spin_unlock(&net_todo_list_lock);
3967
3968         while (!list_empty(&list)) {
3969                 struct net_device *dev
3970                         = list_entry(list.next, struct net_device, todo_list);
3971                 list_del(&dev->todo_list);
3972
3973                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
3974                         printk(KERN_ERR "network todo '%s' but state %d\n",
3975                                dev->name, dev->reg_state);
3976                         dump_stack();
3977                         continue;
3978                 }
3979
3980                 dev->reg_state = NETREG_UNREGISTERED;
3981
3982                 netdev_wait_allrefs(dev);
3983
3984                 /* paranoia */
3985                 BUG_ON(atomic_read(&dev->refcnt));
3986                 BUG_TRAP(!dev->ip_ptr);
3987                 BUG_TRAP(!dev->ip6_ptr);
3988                 BUG_TRAP(!dev->dn_ptr);
3989
3990                 if (dev->destructor)
3991                         dev->destructor(dev);
3992
3993                 /* Free network device */
3994                 kobject_put(&dev->dev.kobj);
3995         }
3996
3997 out:
3998         mutex_unlock(&net_todo_run_mutex);
3999 }
4000
4001 static struct net_device_stats *internal_stats(struct net_device *dev)
4002 {
4003         return &dev->stats;
4004 }
4005
4006 /**
4007  *      alloc_netdev_mq - allocate network device
4008  *      @sizeof_priv:   size of private data to allocate space for
4009  *      @name:          device name format string
4010  *      @setup:         callback to initialize device
4011  *      @queue_count:   the number of subqueues to allocate
4012  *
4013  *      Allocates a struct net_device with private data area for driver use
4014  *      and performs basic initialization.  Also allocates subquue structs
4015  *      for each queue on the device at the end of the netdevice.
4016  */
4017 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4018                 void (*setup)(struct net_device *), unsigned int queue_count)
4019 {
4020         void *p;
4021         struct net_device *dev;
4022         int alloc_size;
4023
4024         BUG_ON(strlen(name) >= sizeof(dev->name));
4025
4026         alloc_size = sizeof(struct net_device) +
4027                      sizeof(struct net_device_subqueue) * (queue_count - 1);
4028         if (sizeof_priv) {
4029                 /* ensure 32-byte alignment of private area */
4030                 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4031                 alloc_size += sizeof_priv;
4032         }
4033         /* ensure 32-byte alignment of whole construct */
4034         alloc_size += NETDEV_ALIGN_CONST;
4035
4036         p = kzalloc(alloc_size, GFP_KERNEL);
4037         if (!p) {
4038                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4039                 return NULL;
4040         }
4041
4042         dev = (struct net_device *)
4043                 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4044         dev->padded = (char *)dev - (char *)p;
4045         dev_net_set(dev, &init_net);
4046
4047         if (sizeof_priv) {
4048                 dev->priv = ((char *)dev +
4049                              ((sizeof(struct net_device) +
4050                                (sizeof(struct net_device_subqueue) *
4051                                 (queue_count - 1)) + NETDEV_ALIGN_CONST)
4052                               & ~NETDEV_ALIGN_CONST));
4053         }
4054
4055         dev->egress_subqueue_count = queue_count;
4056         dev->gso_max_size = GSO_MAX_SIZE;
4057
4058         dev->get_stats = internal_stats;
4059         netpoll_netdev_init(dev);
4060         setup(dev);
4061         strcpy(dev->name, name);
4062         return dev;
4063 }
4064 EXPORT_SYMBOL(alloc_netdev_mq);
4065
4066 /**
4067  *      free_netdev - free network device
4068  *      @dev: device
4069  *
4070  *      This function does the last stage of destroying an allocated device
4071  *      interface. The reference to the device object is released.
4072  *      If this is the last reference then it will be freed.
4073  */
4074 void free_netdev(struct net_device *dev)
4075 {
4076         release_net(dev_net(dev));
4077
4078         /*  Compatibility with error handling in drivers */
4079         if (dev->reg_state == NETREG_UNINITIALIZED) {
4080                 kfree((char *)dev - dev->padded);
4081                 return;
4082         }
4083
4084         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4085         dev->reg_state = NETREG_RELEASED;
4086
4087         /* will free via device release */
4088         put_device(&dev->dev);
4089 }
4090
/*
 * Synchronize with packet receive processing by waiting in
 * synchronize_rcu(). May sleep.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
4097
/**
 *	unregister_netdevice - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables. The remaining work is queued on the
 *	todo list.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	rollback_registered(dev);

	/* The rest of the unregister is finished after the unlock. */
	net_set_todo(dev);
}
4117
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	/* Hold the rtnl semaphore for the duration of the call. */
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
4137
4138 /**
4139  *      dev_change_net_namespace - move device to different nethost namespace
4140  *      @dev: device
4141  *      @net: network namespace
4142  *      @pat: If not NULL name pattern to try if the current device name
4143  *            is already taken in the destination network namespace.
4144  *
4145  *      This function shuts down a device interface and moves it
4146  *      to a new network namespace. On success 0 is returned, on
4147  *      a failure a netagive errno code is returned.
4148  *
4149  *      Callers must hold the rtnl semaphore.
4150  */
4151
4152 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4153 {
4154         char buf[IFNAMSIZ];
4155         const char *destname;
4156         int err;
4157
4158         ASSERT_RTNL();
4159
4160         /* Don't allow namespace local devices to be moved. */
4161         err = -EINVAL;
4162         if (dev->features & NETIF_F_NETNS_LOCAL)
4163                 goto out;
4164
4165         /* Ensure the device has been registrered */
4166         err = -EINVAL;
4167         if (dev->reg_state != NETREG_REGISTERED)
4168                 goto out;
4169
4170         /* Get out if there is nothing todo */
4171         err = 0;
4172         if (net_eq(dev_net(dev), net))
4173                 goto out;
4174
4175         /* Pick the destination device name, and ensure
4176          * we can use it in the destination network namespace.
4177          */
4178         err = -EEXIST;
4179         destname = dev->name;
4180         if (__dev_get_by_name(net, destname)) {
4181                 /* We get here if we can't use the current device name */
4182                 if (!pat)
4183                         goto out;
4184                 if (!dev_valid_name(pat))
4185                         goto out;
4186                 if (strchr(pat, '%')) {
4187                         if (__dev_alloc_name(net, pat, buf) < 0)
4188                                 goto out;
4189                         destname = buf;
4190                 } else
4191                         destname = pat;
4192                 if (__dev_get_by_name(net, destname))
4193                         goto out;
4194         }
4195
4196         /*
4197          * And now a mini version of register_netdevice unregister_netdevice.
4198          */
4199
4200         /* If device is running close it first. */
4201         dev_close(dev);
4202
4203         /* And unlink it from device chain */
4204         err = -ENODEV;
4205         unlist_netdevice(dev);
4206
4207         synchronize_net();
4208
4209         /* Shutdown queueing discipline. */
4210         dev_shutdown(dev);
4211
4212         /* Notify protocols, that we are about to destroy
4213            this device. They should clean all the things.
4214         */
4215         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4216
4217         /*
4218          *      Flush the unicast and multicast chains
4219          */
4220         dev_addr_discard(dev);
4221
4222         /* Actually switch the network namespace */
4223         dev_net_set(dev, net);
4224
4225         /* Assign the new device name */
4226         if (destname != dev->name)
4227                 strcpy(dev->name, destname);
4228
4229         /* If there is an ifindex conflict assign a new one */
4230         if (__dev_get_by_index(net, dev->ifindex)) {
4231                 int iflink = (dev->iflink == dev->ifindex);
4232                 dev->ifindex = dev_new_index(net);
4233                 if (iflink)
4234                         dev->iflink = dev->ifindex;
4235         }
4236
4237         /* Fixup kobjects */
4238         netdev_unregister_kobject(dev);
4239         err = netdev_register_kobject(dev);
4240         WARN_ON(err);
4241
4242         /* Add the device back in the hashes */
4243         list_netdevice(dev);
4244
4245         /* Notify protocols, that a new device appeared. */
4246         call_netdevice_notifiers(NETDEV_REGISTER, dev);
4247
4248         synchronize_net();
4249         err = 0;
4250 out:
4251         return err;
4252 }
4253
4254 static int dev_cpu_callback(struct notifier_block *nfb,
4255                             unsigned long action,
4256                             void *ocpu)
4257 {
4258         struct sk_buff **list_skb;
4259         struct net_device **list_net;
4260         struct sk_buff *skb;
4261         unsigned int cpu, oldcpu = (unsigned long)ocpu;
4262         struct softnet_data *sd, *oldsd;
4263
4264         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4265                 return NOTIFY_OK;
4266
4267         local_irq_disable();
4268         cpu = smp_processor_id();
4269         sd = &per_cpu(softnet_data, cpu);
4270         oldsd = &per_cpu(softnet_data, oldcpu);
4271
4272         /* Find end of our completion_queue. */
4273         list_skb = &sd->completion_queue;
4274         while (*list_skb)
4275                 list_skb = &(*list_skb)->next;
4276         /* Append completion queue from offline CPU. */
4277         *list_skb = oldsd->completion_queue;
4278         oldsd->completion_queue = NULL;
4279
4280         /* Find end of our output_queue. */
4281         list_net = &sd->output_queue;
4282         while (*list_net)
4283                 list_net = &(*list_net)->next_sched;
4284         /* Append output queue from offline CPU. */
4285         *list_net = oldsd->output_queue;
4286         oldsd->output_queue = NULL;
4287
4288         raise_softirq_irqoff(NET_TX_SOFTIRQ);
4289         local_irq_enable();
4290
4291         /* Process offline CPU's input_pkt_queue */
4292         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4293                 netif_rx(skb);
4294
4295         return NOTIFY_OK;
4296 }
4297
4298 #ifdef CONFIG_NET_DMA
4299 /**
4300  * net_dma_rebalance - try to maintain one DMA channel per CPU
4301  * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4302  *
4303  * This is called when the number of channels allocated to the net_dma client
4304  * changes.  The net_dma client tries to have one DMA channel per CPU.
4305  */
4306
4307 static void net_dma_rebalance(struct net_dma *net_dma)
4308 {
4309         unsigned int cpu, i, n, chan_idx;
4310         struct dma_chan *chan;
4311
4312         if (cpus_empty(net_dma->channel_mask)) {
4313                 for_each_online_cpu(cpu)
4314                         rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
4315                 return;
4316         }
4317
4318         i = 0;
4319         cpu = first_cpu(cpu_online_map);
4320
4321         for_each_cpu_mask(chan_idx, net_dma->channel_mask) {
4322                 chan = net_dma->channels[chan_idx];
4323
4324                 n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4325                    + (i < (num_online_cpus() %
4326                         cpus_weight(net_dma->channel_mask)) ? 1 : 0));
4327
4328                 while(n) {
4329                         per_cpu(softnet_data, cpu).net_dma = chan;
4330                         cpu = next_cpu(cpu, cpu_online_map);
4331                         n--;
4332                 }
4333                 i++;
4334         }
4335 }
4336
4337 /**
4338  * netdev_dma_event - event callback for the net_dma_client
4339  * @client: should always be net_dma_client
4340  * @chan: DMA channel for the event
4341  * @state: DMA state to be handled
4342  */
4343 static enum dma_state_client
4344 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4345         enum dma_state state)
4346 {
4347         int i, found = 0, pos = -1;
4348         struct net_dma *net_dma =
4349                 container_of(client, struct net_dma, client);
4350         enum dma_state_client ack = DMA_DUP; /* default: take no action */
4351
4352         spin_lock(&net_dma->lock);
4353         switch (state) {
4354         case DMA_RESOURCE_AVAILABLE:
4355                 for (i = 0; i < nr_cpu_ids; i++)
4356                         if (net_dma->channels[i] == chan) {
4357                                 found = 1;
4358                                 break;
4359                         } else if (net_dma->channels[i] == NULL && pos < 0)
4360                                 pos = i;
4361
4362                 if (!found && pos >= 0) {
4363                         ack = DMA_ACK;
4364                         net_dma->channels[pos] = chan;
4365                         cpu_set(pos, net_dma->channel_mask);
4366                         net_dma_rebalance(net_dma);
4367                 }
4368                 break;
4369         case DMA_RESOURCE_REMOVED:
4370                 for (i = 0; i < nr_cpu_ids; i++)
4371                         if (net_dma->channels[i] == chan) {
4372                                 found = 1;
4373                                 pos = i;
4374                                 break;
4375                         }
4376
4377                 if (found) {
4378                         ack = DMA_ACK;
4379                         cpu_clear(pos, net_dma->channel_mask);
4380                         net_dma->channels[i] = NULL;
4381                         net_dma_rebalance(net_dma);
4382                 }
4383                 break;
4384         default:
4385                 break;
4386         }
4387         spin_unlock(&net_dma->lock);
4388
4389         return ack;
4390 }
4391
4392 /**
4393  * netdev_dma_regiser - register the networking subsystem as a DMA client
4394  */
4395 static int __init netdev_dma_register(void)
4396 {
4397         net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma),
4398                                                                 GFP_KERNEL);
4399         if (unlikely(!net_dma.channels)) {
4400                 printk(KERN_NOTICE
4401                                 "netdev_dma: no memory for net_dma.channels\n");
4402                 return -ENOMEM;
4403         }
4404         spin_lock_init(&net_dma.lock);
4405         dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
4406         dma_async_client_register(&net_dma.client);
4407         dma_async_client_chan_request(&net_dma.client);
4408         return 0;
4409 }
4410
4411 #else
4412 static int __init netdev_dma_register(void) { return -ENODEV; }
4413 #endif /* CONFIG_NET_DMA */
4414
4415 /**
4416  *      netdev_compute_feature - compute conjunction of two feature sets
4417  *      @all: first feature set
4418  *      @one: second feature set
4419  *
4420  *      Computes a new feature set after adding a device with feature set
4421  *      @one to the master device with current feature set @all.  Returns
4422  *      the new feature set.
4423  */
4424 int netdev_compute_features(unsigned long all, unsigned long one)
4425 {
4426         /* if device needs checksumming, downgrade to hw checksumming */
4427         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
4428                 all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM;
4429
4430         /* if device can't do all checksum, downgrade to ipv4/ipv6 */
4431         if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM))
4432                 all ^= NETIF_F_HW_CSUM
4433                         | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
4434
4435         if (one & NETIF_F_GSO)
4436                 one |= NETIF_F_GSO_SOFTWARE;
4437         one |= NETIF_F_GSO;
4438
4439         /* If even one device supports robust GSO, enable it for all. */
4440         if (one & NETIF_F_GSO_ROBUST)
4441                 all |= NETIF_F_GSO_ROBUST;
4442
4443         all &= one | NETIF_F_LLTX;
4444
4445         if (!(all & NETIF_F_ALL_CSUM))
4446                 all &= ~NETIF_F_SG;
4447         if (!(all & NETIF_F_SG))
4448                 all &= ~NETIF_F_GSO_MASK;
4449
4450         return all;
4451 }
4452 EXPORT_SYMBOL(netdev_compute_features);
4453
4454 static struct hlist_head *netdev_create_hash(void)
4455 {
4456         int i;
4457         struct hlist_head *hash;
4458
4459         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
4460         if (hash != NULL)
4461                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
4462                         INIT_HLIST_HEAD(&hash[i]);
4463
4464         return hash;
4465 }
4466
4467 /* Initialize per network namespace state */
4468 static int __net_init netdev_init(struct net *net)
4469 {
4470         INIT_LIST_HEAD(&net->dev_base_head);
4471
4472         net->dev_name_head = netdev_create_hash();
4473         if (net->dev_name_head == NULL)
4474                 goto err_name;
4475
4476         net->dev_index_head = netdev_create_hash();
4477         if (net->dev_index_head == NULL)
4478                 goto err_idx;
4479
4480         return 0;
4481
4482 err_idx:
4483         kfree(net->dev_name_head);
4484 err_name:
4485         return -ENOMEM;
4486 }
4487
4488 static void __net_exit netdev_exit(struct net *net)
4489 {
4490         kfree(net->dev_name_head);
4491         kfree(net->dev_index_head);
4492 }
4493
4494 static struct pernet_operations __net_initdata netdev_net_ops = {
4495         .init = netdev_init,
4496         .exit = netdev_exit,
4497 };
4498
4499 static void __net_exit default_device_exit(struct net *net)
4500 {
4501         struct net_device *dev, *next;
4502         /*
4503          * Push all migratable of the network devices back to the
4504          * initial network namespace
4505          */
4506         rtnl_lock();
4507         for_each_netdev_safe(net, dev, next) {
4508                 int err;
4509                 char fb_name[IFNAMSIZ];
4510
4511                 /* Ignore unmoveable devices (i.e. loopback) */
4512                 if (dev->features & NETIF_F_NETNS_LOCAL)
4513                         continue;
4514
4515                 /* Push remaing network devices to init_net */
4516                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
4517                 err = dev_change_net_namespace(dev, &init_net, fb_name);
4518                 if (err) {
4519                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
4520                                 __func__, dev->name, err);
4521                         BUG();
4522                 }
4523         }
4524         rtnl_unlock();
4525 }
4526
4527 static struct pernet_operations __net_initdata default_device_ops = {
4528         .exit = default_device_exit,
4529 };
4530
4531 /*
4532  *      Initialize the DEV module. At boot time this walks the device list and
4533  *      unhooks any devices that fail to initialise (normally hardware not
4534  *      present) and leaves us with a valid list of present and active devices.
4535  *
4536  */
4537
4538 /*
4539  *       This is called single threaded during boot, so no need
4540  *       to take the rtnl semaphore.
4541  */
4542 static int __init net_dev_init(void)
4543 {
4544         int i, rc = -ENOMEM;
4545
4546         BUG_ON(!dev_boot_phase);
4547
4548         if (dev_proc_init())
4549                 goto out;
4550
4551         if (netdev_kobject_init())
4552                 goto out;
4553
4554         INIT_LIST_HEAD(&ptype_all);
4555         for (i = 0; i < PTYPE_HASH_SIZE; i++)
4556                 INIT_LIST_HEAD(&ptype_base[i]);
4557
4558         if (register_pernet_subsys(&netdev_net_ops))
4559                 goto out;
4560
4561         if (register_pernet_device(&default_device_ops))
4562                 goto out;
4563
4564         /*
4565          *      Initialise the packet receive queues.
4566          */
4567
4568         for_each_possible_cpu(i) {
4569                 struct softnet_data *queue;
4570
4571                 queue = &per_cpu(softnet_data, i);
4572                 skb_queue_head_init(&queue->input_pkt_queue);
4573                 queue->completion_queue = NULL;
4574                 INIT_LIST_HEAD(&queue->poll_list);
4575
4576                 queue->backlog.poll = process_backlog;
4577                 queue->backlog.weight = weight_p;
4578         }
4579
4580         netdev_dma_register();
4581
4582         dev_boot_phase = 0;
4583
4584         open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
4585         open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
4586
4587         hotcpu_notifier(dev_cpu_callback, 0);
4588         dst_init();
4589         dev_mcast_init();
4590         rc = 0;
4591 out:
4592         return rc;
4593 }
4594
4595 subsys_initcall(net_dev_init);
4596
/* Symbols exported from this file. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(dev_get_flags);

/* Bridge hooks: exported only when bridging is built in or as a module. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_handle_frame_hook);
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
#endif

/* dev_load is exported only with CONFIG_KMOD. */
#ifdef CONFIG_KMOD
EXPORT_SYMBOL(dev_load);
#endif

/* Per-CPU softnet state. */
EXPORT_PER_CPU_SYMBOL(softnet_data);