S: Maintained
F: Documentation/ABI/testing/sysfs-class-net-batman-adv
F: Documentation/ABI/testing/sysfs-class-net-mesh
-F: Documentation/networking/batman-adv.txt
+F: Documentation/networking/batman-adv.rst
F: include/uapi/linux/batman_adv.h
F: net/batman-adv/
F: include/linux/phy.h
F: include/linux/phy_fixed.h
F: include/linux/platform_data/mdio-gpio.h
+F: include/linux/platform_data/mdio-bcm-unimac.h
F: include/trace/events/mdio.h
F: include/uapi/linux/mdio.h
F: include/uapi/linux/mii.h
F: drivers/net/ethernet/hisilicon/
F: Documentation/devicetree/bindings/net/hisilicon*.txt
+HISILICON NETWORK SUBSYSTEM 3 DRIVER (HNS3)
+M: Yisen Zhuang <yisen.zhuang@huawei.com>
+M: Salil Mehta <salil.mehta@huawei.com>
+L: netdev@vger.kernel.org
+W: http://www.hisilicon.com
+S: Maintained
+F: drivers/net/ethernet/hisilicon/hns3/
+
HISILICON ROCE DRIVER
M: Lijun Ou <oulijun@huawei.com>
M: Wei Hu(Xavier) <xavier.huwei@huawei.com>
M: Stephen Hemminger <sthemmin@microsoft.com>
L: devel@linuxdriverproject.org
S: Maintained
+F: Documentation/networking/netvsc.txt
F: arch/x86/include/asm/mshyperv.h
F: arch/x86/include/uapi/asm/hyperv.h
F: arch/x86/kernel/cpu/mshyperv.c
L: linux-kernel@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
- T: git git://git.infradead.org/users/jcooper/linux.git irqchip/core
F: Documentation/devicetree/bindings/interrupt-controller/
F: drivers/irqchip/
MEDIATEK ETHERNET DRIVER
M: Felix Fietkau <nbd@openwrt.org>
-M: John Crispin <blogic@openwrt.org>
+M: John Crispin <john@phrozen.org>
+M: Sean Wang <sean.wang@mediatek.com>
+M: Nelson Chang <nelson.chang@mediatek.com>
L: netdev@vger.kernel.org
S: Maintained
F: drivers/net/ethernet/mediatek/
/* Virtual PCI function needs to determine UAR page size from
* firmware. Only master PCI function can set the uar page size
*/
- if (enable_4k_uar)
+ if (enable_4k_uar || !dev->persist->num_vfs)
dev->uar_page_shift = DEFAULT_UAR_PAGE_SHIFT;
else
dev->uar_page_shift = PAGE_SHIFT;
mlx4_replace_zero_macs(dev);
dev->caps.qp0_qkey = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
- dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
- dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
- dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
- dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+ dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
+ dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
+ dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
+ dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy ||
!dev->caps.qp1_tunnel || !dev->caps.qp1_proxy ||
dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1;
- if (enable_4k_uar) {
+ if (enable_4k_uar || !dev->persist->num_vfs) {
init_hca.log_uar_sz = ilog2(dev->caps.num_uars) +
PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT;
init_hca.uar_page_sz = DEFAULT_UAR_PAGE_SHIFT - 12;
dev->caps.rx_checksum_flags_port[2] = params.rx_csum_flags_port_2;
}
priv->eq_table.inta_pin = adapter.inta_pin;
- memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id);
+ memcpy(dev->board_id, adapter.board_id, sizeof(dev->board_id));
return 0;
dev->caps.num_eqs - dev->caps.reserved_eqs,
MAX_MSIX);
- entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL);
+ entries = kcalloc(nreq, sizeof(*entries), GFP_KERNEL);
if (!entries)
goto no_msi;
err_disable_pdev:
mlx4_pci_disable_device(&priv->dev);
- pci_set_drvdata(pdev, NULL);
return err;
}
devlink_unregister(devlink);
kfree(dev->persist);
devlink_free(devlink);
- pci_set_drvdata(pdev, NULL);
}
static int restore_current_port_types(struct mlx4_dev *dev,
return NETDEV_TX_OK;
err_unmap:
- --f;
- while (f >= 0) {
+ while (--f >= 0) {
frag = &skb_shinfo(skb)->frags[f];
dma_unmap_page(dp->dev, tx_ring->txbufs[wr_idx].dma_addr,
skb_frag_size(frag), DMA_TO_DEVICE);
/* Step 2: Tell NFP
*/
nfp_net_clear_config_and_disable(nn);
+ nfp_port_configure(netdev, false);
/* Step 3: Free resources
*/
goto err_free_all;
/* Step 2: Configure the NFP
+ * - Ifup the physical interface if it exists
* - Enable rings from 0 to tx_rings/rx_rings - 1.
* - Write MAC address (in case it changed)
* - Set the MTU
* - Set the Freelist buffer size
* - Enable the FW
*/
- err = nfp_net_set_config_and_enable(nn);
+ err = nfp_port_configure(netdev, true);
if (err)
goto err_free_all;
+ err = nfp_net_set_config_and_enable(nn);
+ if (err)
+ goto err_port_disable;
+
/* Step 3: Enable for kernel
* - put some freelist descriptors on each RX ring
* - enable NAPI on each ring
return 0;
+err_port_disable:
+ nfp_port_configure(netdev, false);
err_free_all:
nfp_net_close_free_all(nn);
return err;
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
#include <linux/uaccess.h>
} while (0)
#endif
+#define TUN_HEADROOM 256
+#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD + TUN_HEADROOM)
+
/* TUN device flags */
/* IFF_ATTACH_QUEUE is never stored in device flags,
struct net_device *dev;
netdev_features_t set_features;
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
- NETIF_F_TSO6|NETIF_F_UFO)
+ NETIF_F_TSO6)
int align;
int vnet_hdr_sz;
u32 flow_count;
u32 rx_batched;
struct tun_pcpu_stats __percpu *pcpu_stats;
+ struct bpf_prog __rcu *xdp_prog;
};
#ifdef CONFIG_TUN_VNET_CROSS_LE
static void tun_detach_all(struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
+ struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog);
struct tun_file *tfile, *tmp;
int i, n = tun->numqueues;
}
BUG_ON(tun->numdisabled != 0);
+ if (xdp_prog)
+ bpf_prog_put(xdp_prog);
+
if (tun->flags & IFF_PERSIST)
module_put(THIS_MODULE);
}
sk_filter(tfile->socket.sk, skb))
goto drop;
- if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
goto drop;
skb_tx_timestamp(skb);
stats->tx_dropped = tx_dropped;
}
+static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+ struct netlink_ext_ack *extack)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+ struct bpf_prog *old_prog;
+
+ old_prog = rtnl_dereference(tun->xdp_prog);
+ rcu_assign_pointer(tun->xdp_prog, prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+
+ return 0;
+}
+
+static u32 tun_xdp_query(struct net_device *dev)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+ const struct bpf_prog *xdp_prog;
+
+ xdp_prog = rtnl_dereference(tun->xdp_prog);
+ if (xdp_prog)
+ return xdp_prog->aux->id;
+
+ return 0;
+}
+
+static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+ switch (xdp->command) {
+ case XDP_SETUP_PROG:
+ return tun_xdp_set(dev, xdp->prog, xdp->extack);
+ case XDP_QUERY_PROG:
+ xdp->prog_id = tun_xdp_query(dev);
+ xdp->prog_attached = !!xdp->prog_id;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = tun_set_headroom,
.ndo_get_stats64 = tun_net_get_stats64,
+ .ndo_xdp = tun_xdp,
};
static void tun_flow_init(struct tun_struct *tun)
}
}
+static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
+ int len, int noblock, bool zerocopy)
+{
+ if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+ return false;
+
+ if (tfile->socket.sk->sk_sndbuf != INT_MAX)
+ return false;
+
+ if (!noblock)
+ return false;
+
+ if (zerocopy)
+ return false;
+
+ if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+ return false;
+
+ return true;
+}
+
+static struct sk_buff *tun_build_skb(struct tun_struct *tun,
+ struct tun_file *tfile,
+ struct iov_iter *from,
+ struct virtio_net_hdr *hdr,
+ int len, int *generic_xdp)
+{
+ struct page_frag *alloc_frag = ¤t->task_frag;
+ struct sk_buff *skb;
+ struct bpf_prog *xdp_prog;
+ int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ unsigned int delta = 0;
+ char *buf;
+ size_t copied;
+ bool xdp_xmit = false;
+ int err;
+
+ if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+ return ERR_PTR(-ENOMEM);
+
+ buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+ copied = copy_page_from_iter(alloc_frag->page,
+ alloc_frag->offset + TUN_RX_PAD,
+ len, from);
+ if (copied != len)
+ return ERR_PTR(-EFAULT);
+
+ if (hdr->gso_type)
+ *generic_xdp = 1;
+ else
+ *generic_xdp = 0;
+
+ rcu_read_lock();
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog && !*generic_xdp) {
+ struct xdp_buff xdp;
+ void *orig_data;
+ u32 act;
+
+ xdp.data_hard_start = buf;
+ xdp.data = buf + TUN_RX_PAD;
+ xdp.data_end = xdp.data + len;
+ orig_data = xdp.data;
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+ switch (act) {
+ case XDP_REDIRECT:
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+ err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
+ if (err)
+ goto err_redirect;
+ return NULL;
+ case XDP_TX:
+ xdp_xmit = true;
+ /* fall through */
+ case XDP_PASS:
+ delta = orig_data - xdp.data;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(tun->dev, xdp_prog, act);
+ /* fall through */
+ case XDP_DROP:
+ goto err_xdp;
+ }
+ }
+
+ skb = build_skb(buf, buflen);
+ if (!skb) {
+ rcu_read_unlock();
+ return ERR_PTR(-ENOMEM);
+ }
+
+ skb_reserve(skb, TUN_RX_PAD - delta);
+ skb_put(skb, len + delta);
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+
+ if (xdp_xmit) {
+ skb->dev = tun->dev;
+ generic_xdp_tx(skb, xdp_prog);
+ rcu_read_lock();
+ return NULL;
+ }
+
+ rcu_read_unlock();
+
+ return skb;
+
+err_redirect:
+ put_page(alloc_frag->page);
+err_xdp:
+ rcu_read_unlock();
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ return NULL;
+}
+
/* Get packet from user space buffer */
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
void *msg_control, struct iov_iter *from,
bool zerocopy = false;
int err;
u32 rxhash;
+ int generic_xdp = 1;
if (!(tun->dev->flags & IFF_UP))
return -EIO;
zerocopy = true;
}
- if (!zerocopy) {
- copylen = len;
- if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
- linear = good_linear;
- else
- linear = tun16_to_cpu(tun, gso.hdr_len);
- }
-
- skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
- if (IS_ERR(skb)) {
- if (PTR_ERR(skb) != -EAGAIN)
+ if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+ skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
+ if (IS_ERR(skb)) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
- return PTR_ERR(skb);
- }
+ return PTR_ERR(skb);
+ }
+ if (!skb)
+ return total_len;
+ } else {
+ if (!zerocopy) {
+ copylen = len;
+ if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
+ linear = good_linear;
+ else
+ linear = tun16_to_cpu(tun, gso.hdr_len);
+ }
- if (zerocopy)
- err = zerocopy_sg_from_iter(skb, from);
- else
- err = skb_copy_datagram_from_iter(skb, 0, from, len);
+ skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+ if (IS_ERR(skb)) {
+ if (PTR_ERR(skb) != -EAGAIN)
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ return PTR_ERR(skb);
+ }
- if (err) {
- this_cpu_inc(tun->pcpu_stats->rx_dropped);
- kfree_skb(skb);
- return -EFAULT;
+ if (zerocopy)
+ err = zerocopy_sg_from_iter(skb, from);
+ else
+ err = skb_copy_datagram_from_iter(skb, 0, from, len);
+
+ if (err) {
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ kfree_skb(skb);
+ return -EFAULT;
+ }
}
if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
skb_reset_network_header(skb);
skb_probe_transport_header(skb, 0);
+ if (generic_xdp) {
+ struct bpf_prog *xdp_prog;
+ int ret;
+
+ rcu_read_lock();
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog) {
+ ret = do_xdp_generic(xdp_prog, skb);
+ if (ret != XDP_PASS) {
+ rcu_read_unlock();
+ return total_len;
+ }
+ }
+ rcu_read_unlock();
+ }
+
rxhash = __skb_get_hash_symmetric(skb);
#ifndef CONFIG_4KSTACKS
tun_rx_batched(tun, tfile, skb, more);
err_detach:
tun_detach_all(dev);
+ /* register_netdevice() already called tun_free_netdev() */
+ goto err_free_dev;
+
err_free_flow:
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
features |= NETIF_F_TSO6;
arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
}
-
- if (arg & TUN_F_UFO) {
- features |= NETIF_F_UFO;
- arg &= ~TUN_F_UFO;
- }
}
/* This gives the user a way to test for new features in future by
int n = tun->numqueues + tun->numdisabled;
int ret, i;
- arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
+ arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL);
if (!arrays)
return -ENOMEM;
#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
#define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb))
+/* return enslaved device index if relevant */
+static inline int inet_sdif(struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
+ return IPCB(skb)->iif;
+#endif
+ return 0;
+}
+
struct ip_ra_chain {
struct ip_ra_chain __rcu *next;
struct sock *sk;
!forwarding)
return dst_mtu(dst);
- return min(dst->dev->mtu, IP_MAX_MTU);
+ return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU);
}
static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
}
- return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU);
+ return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
}
u32 ip_idents_reserve(u32 hash, int segs);
void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
__be32 daddr, struct rtable *rt, int is_frag);
-int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
- const struct ip_options *sopt);
-static inline int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
+int __ip_options_echo(struct net *net, struct ip_options *dopt,
+ struct sk_buff *skb, const struct ip_options *sopt);
+static inline int ip_options_echo(struct net *net, struct ip_options *dopt,
+ struct sk_buff *skb)
{
- return __ip_options_echo(dopt, skb, &IPCB(skb)->opt);
+ return __ip_options_echo(net, dopt, skb, &IPCB(skb)->opt);
}
void ip_options_fragment(struct sk_buff *skb);
/* Filter manipulation */
struct tcf_block * (*tcf_block)(struct Qdisc *, unsigned long);
- bool (*tcf_cl_offload)(u32 classid);
unsigned long (*bind_tcf)(struct Qdisc *, unsigned long,
u32 classid);
void (*unbind_tcf)(struct Qdisc *, unsigned long);
int (*init)(struct tcf_proto*);
void (*destroy)(struct tcf_proto*);
- unsigned long (*get)(struct tcf_proto*, u32 handle);
+ void* (*get)(struct tcf_proto*, u32 handle);
int (*change)(struct net *net, struct sk_buff *,
struct tcf_proto*, unsigned long,
u32 handle, struct nlattr **,
- unsigned long *, bool);
- int (*delete)(struct tcf_proto*, unsigned long, bool*);
+ void **, bool);
+ int (*delete)(struct tcf_proto*, void *, bool*);
void (*walk)(struct tcf_proto*, struct tcf_walker *arg);
/* rtnetlink specific */
- int (*dump)(struct net*, struct tcf_proto*, unsigned long,
+ int (*dump)(struct net*, struct tcf_proto*, void *,
struct sk_buff *skb, struct tcmsg*);
struct module *owner;
struct Qdisc_class_common *cl;
unsigned int h;
+ if (!id)
+ return NULL;
+
h = qdisc_class_hash(id, hash->hashmask);
hlist_for_each_entry(cl, &hash->hash[h], hnode) {
if (cl->classid == id)
old = *pold;
*pold = new;
if (old != NULL) {
- qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog);
+ unsigned int qlen = old->q.qlen;
+ unsigned int backlog = old->qstats.backlog;
+
qdisc_reset(old);
+ qdisc_tree_reduce_backlog(old, qlen, backlog);
}
sch_tree_unlock(sch);
* @sk_stamp: time stamp of last packet received
* @sk_tsflags: SO_TIMESTAMPING socket options
* @sk_tskey: counter to disambiguate concurrent tstamp requests
+ * @sk_zckey: counter to order MSG_ZEROCOPY notifications
* @sk_socket: Identd and reporting IO signals
* @sk_user_data: RPC layer private data
* @sk_frag: cached page frag
u16 sk_tsflags;
u8 sk_shutdown;
u32 sk_tskey;
+ atomic_t sk_zckey;
struct socket *sk_socket;
void *sk_user_data;
#ifdef CONFIG_SECURITY
static inline int sk_peek_offset(struct sock *sk, int flags)
{
if (unlikely(flags & MSG_PEEK)) {
- s32 off = READ_ONCE(sk->sk_peek_off);
- if (off >= 0)
- return off;
+ return READ_ONCE(sk->sk_peek_off);
}
return 0;
gfp_t priority);
void __sock_wfree(struct sk_buff *skb);
void sock_wfree(struct sk_buff *skb);
+struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
+ gfp_t priority);
void skb_orphan_partial(struct sk_buff *skb);
void sock_rfree(struct sk_buff *skb);
void sock_efree(struct sk_buff *skb);
int sock_no_getsockopt(struct socket *, int , int, char __user *, int __user *);
int sock_no_setsockopt(struct socket *, int, int, char __user *, unsigned int);
int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
+int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
int sock_no_mmap(struct file *file, struct socket *sock,
struct vm_area_struct *vma);
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset,
size_t size, int flags);
+ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags);
/*
* Functions to fill in entries in struct proto_ops when a protocol
return can_add_hw;
}
+ /*
+ * Complement to update_event_times(). This computes the tstamp_* values to
+ * continue 'enabled' state from @now, and effectively discards the time
+ * between the prior tstamp_stopped and now (as we were in the OFF state, or
+ * just switched (context) time base).
+ *
+ * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
+ * cannot have been scheduled in yet. And going into INACTIVE state means
+ * '@event->tstamp_stopped = @now'.
+ *
+ * Thus given the rules of update_event_times():
+ *
+ * total_time_enabled = tstamp_stopped - tstamp_enabled
+ * total_time_running = tstamp_stopped - tstamp_running
+ *
+ * We can insert 'tstamp_stopped == now' and reverse them to compute new
+ * tstamp_* values.
+ */
+ static void __perf_event_enable_time(struct perf_event *event, u64 now)
+ {
+ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
+
+ event->tstamp_stopped = now;
+ event->tstamp_enabled = now - event->total_time_enabled;
+ event->tstamp_running = now - event->total_time_running;
+ }
+
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = tstamp;
- event->tstamp_running = tstamp;
- event->tstamp_stopped = tstamp;
+ /*
+ * We can be called with event->state == STATE_OFF when we create with
+ * .disabled = 1. In that case the IOC_ENABLE will call this function.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ __perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = tstamp - event->total_time_enabled;
+ __perf_event_enable_time(event, tstamp);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ /* XXX should not be > INACTIVE if event isn't */
if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ __perf_event_enable_time(sub, tstamp);
}
}
atomic_inc(&event->rb->aux_mmap_count);
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
unsigned long size = perf_data_size(rb);
if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event);
+ event->pmu->event_unmapped(event, vma->vm_mm);
/*
* rb->aux_mmap_count will always drop before rb->mmap_count and
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
return ret;
}
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
- bool is_kprobe, is_tracepoint;
+ bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
- if (!is_kprobe && !is_tracepoint)
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+ if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
return PTR_ERR(prog);
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
- (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
+ (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
+ (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}
- if (is_tracepoint) {
+ if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event);
if (prog->aux->max_ctx_offset > off) {
int *peeked, int *off, int *err,
struct sk_buff **last)
{
+ bool peek_at_off = false;
struct sk_buff *skb;
- int _off = *off;
+ int _off = 0;
+
+ if (unlikely(flags & MSG_PEEK && *off >= 0)) {
+ peek_at_off = true;
+ _off = *off;
+ }
*last = queue->prev;
skb_queue_walk(queue, skb) {
if (flags & MSG_PEEK) {
- if (_off >= skb->len && (skb->len || _off ||
- skb->peeked)) {
+ if (peek_at_off && _off >= skb->len &&
+ (_off || skb->peeked)) {
_off -= skb->len;
continue;
}
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
-/**
- * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
- * @skb: buffer to copy
- * @from: the source to copy from
- *
- * The function will first copy up to headlen, and then pin the userspace
- * pages and build frags through them.
- *
- * Returns 0, -EFAULT or -EMSGSIZE.
- */
-int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *from, size_t length)
{
- int len = iov_iter_count(from);
- int copy = min_t(int, skb_headlen(skb), len);
- int frag = 0;
+ int frag = skb_shinfo(skb)->nr_frags;
- /* copy up to skb headlen */
- if (skb_copy_datagram_from_iter(skb, 0, from, copy))
- return -EFAULT;
-
- while (iov_iter_count(from)) {
+ while (length && iov_iter_count(from)) {
struct page *pages[MAX_SKB_FRAGS];
size_t start;
ssize_t copied;
if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
- copied = iov_iter_get_pages(from, pages, ~0U,
+ copied = iov_iter_get_pages(from, pages, length,
MAX_SKB_FRAGS - frag, &start);
if (copied < 0)
return -EFAULT;
iov_iter_advance(from, copied);
+ length -= copied;
truesize = PAGE_ALIGN(copied + start);
skb->data_len += copied;
skb->len += copied;
skb->truesize += truesize;
- refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ if (sk && sk->sk_type == SOCK_STREAM) {
+ sk->sk_wmem_queued += truesize;
+ sk_mem_charge(sk, truesize);
+ } else {
+ refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ }
while (copied) {
int size = min_t(int, copied, PAGE_SIZE - start);
skb_fill_page_desc(skb, frag++, pages[n], start, size);
}
return 0;
}
+EXPORT_SYMBOL(__zerocopy_sg_from_iter);
+
+/**
+ * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
+ * @skb: buffer to copy
+ * @from: the source to copy from
+ *
+ * The function will first copy up to headlen, and then pin the userspace
+ * pages and build frags through them.
+ *
+ * Returns 0, -EFAULT or -EMSGSIZE.
+ */
+int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
+{
+ int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));
+
+ /* copy up to skb headlen */
+ if (skb_copy_datagram_from_iter(skb, 0, from, copy))
+ return -EFAULT;
+
+ return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
+}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
{
/* This basically follows the spec line by line -- see RFC1112 */
struct igmphdr *ih;
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
int len = skb->len;
bool dropped = true;
+ if (netif_is_l3_master(dev)) {
+ dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif);
+ if (!dev)
+ goto drop;
+ }
+
+ in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
goto drop;
/*
* check if a multicast source filter allows delivery for a given <src,dst,intf>
*/
-int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
+int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
+ int dif, int sdif)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *pmc;
rcu_read_lock();
for_each_pmc_rcu(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
- pmc->multi.imr_ifindex == dif)
+ (pmc->multi.imr_ifindex == dif ||
+ (sdif && pmc->multi.imr_ifindex == sdif)))
break;
}
ret = inet->mc_all;
if (mtu)
return mtu;
- mtu = dst->dev->mtu;
+ mtu = READ_ONCE(dst->dev->mtu);
if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
if (rt->rt_uses_gateway && mtu > 576)
struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
struct rtable *rt = (struct rtable *) dst;
- if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
+ if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
kfree(p);
if (!list_empty(&rt->rt_uncached)) {
dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
if (fi->fib_metrics != &dst_default_metrics) {
rt->dst._metrics |= DST_METRICS_REFCOUNTED;
- atomic_inc(&fi->fib_metrics->refcnt);
+ refcount_inc(&fi->fib_metrics->refcnt);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
if (!rth)
return ERR_PTR(-ENOBUFS);
- rth->rt_iif = orig_oif ? : 0;
+ rth->rt_iif = orig_oif;
if (res->table)
rth->rt_table_id = res->table->tb_id;
/* L3 master device is the loopback for that domain */
dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
net->loopback_dev;
+
+ /* make sure orig_oif points to fib result device even
+ * though packet rx/tx happens over loopback or l3mdev
+ */
+ orig_oif = FIB_RES_OIF(*res);
+
fl4->flowi4_oif = dev_out->ifindex;
flags |= RTCF_LOCAL;
goto make_route;
if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
table_id = rt->rt_table_id;
- if (rtm->rtm_flags & RTM_F_FIB_MATCH)
+ if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
+ if (!res.fi) {
+ err = fib_props[res.type].error;
+ if (!err)
+ err = -EHOSTUNREACH;
+ goto errout_free;
+ }
err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
rt->rt_type, res.prefix, res.prefixlen,
fl4.flowi4_tos, res.fi, 0);
- else
+ } else {
err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
+ }
if (err < 0)
goto errout_free;
xfrm_init();
xfrm4_init();
#endif
- rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
+ RTNL_FLAG_DOIT_UNLOCKED);
#ifdef CONFIG_SYSCTL
register_pernet_subsys(&sysctl_route_ops);
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */
-#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->prior_cwnd = tp->snd_cwnd;
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
tcp_init_undo(tp);
/* delta_us may not be positive if the socket is locked
* when the retrans timer fires and is rescheduled.
*/
- if (delta_us > 0)
- rto = usecs_to_jiffies(delta_us);
+ rto = usecs_to_jiffies(max_t(int, delta_us, 1));
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
TCP_RTO_MAX);
if (tp->snd_wnd != nwin) {
tp->snd_wnd = nwin;
- /* Note, it is the only place, where
- * fast path is recovered for sending TCP.
- */
- tp->pred_flags = 0;
- tcp_fast_path_check(sk);
-
if (tcp_send_head(sk))
tcp_slow_start_after_idle_check(sk);
u32 lost = tp->lost;
int acked = 0; /* Number of packets newly acked */
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ u32 ack_ev_flags = 0;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
if (flag & FLAG_UPDATE_TS_RECENT)
tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
- /* Window is constant, pure forward advance.
- * No more checks are required.
- * Note, we use the fact that SND.UNA>=SND.WL2.
- */
- tcp_update_wl(tp, ack_seq);
- tcp_snd_una_update(tp, ack);
- flag |= FLAG_WIN_UPDATE;
-
- tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
-
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
- } else {
- u32 ack_ev_flags = CA_ACK_SLOWPATH;
-
- if (ack_seq != TCP_SKB_CB(skb)->end_seq)
- flag |= FLAG_DATA;
- else
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+ if (ack_seq != TCP_SKB_CB(skb)->end_seq)
+ flag |= FLAG_DATA;
+ else
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
- flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
+ flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
- if (TCP_SKB_CB(skb)->sacked)
- flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_state);
+ if (TCP_SKB_CB(skb)->sacked)
+ flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+ &sack_state);
- if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
- flag |= FLAG_ECE;
- ack_ev_flags |= CA_ACK_ECE;
- }
+ if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
+ flag |= FLAG_ECE;
+ ack_ev_flags = CA_ACK_ECE;
+ }
- if (flag & FLAG_WIN_UPDATE)
- ack_ev_flags |= CA_ACK_WIN_UPDATE;
+ if (flag & FLAG_WIN_UPDATE)
+ ack_ev_flags |= CA_ACK_WIN_UPDATE;
- tcp_in_ack_event(sk, ack_ev_flags);
- }
+ tcp_in_ack_event(sk, ack_ev_flags);
/* We passed data and got it acked, remove any soft error
* log. Something worked...
return;
}
- /* Disable header prediction. */
- tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- bool fragstolen = false;
- int eaten = -1;
+ bool fragstolen;
+ int eaten;
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
goto out_of_window;
/* Ok. In sequence. In window. */
- if (tp->ucopy.task == current &&
- tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
- sock_owned_by_user(sk) && !tp->urg_data) {
- int chunk = min_t(unsigned int, skb->len,
- tp->ucopy.len);
-
- __set_current_state(TASK_RUNNING);
-
- if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
- tp->ucopy.len -= chunk;
- tp->copied_seq += chunk;
- eaten = (chunk == skb->len);
- tcp_rcv_space_adjust(sk);
- }
- }
-
- if (eaten <= 0) {
queue_and_out:
- if (eaten < 0) {
- if (skb_queue_len(&sk->sk_receive_queue) == 0)
- sk_forced_mem_schedule(sk, skb->truesize);
- else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
- goto drop;
- }
- eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
- }
+ if (skb_queue_len(&sk->sk_receive_queue) == 0)
+ sk_forced_mem_schedule(sk, skb->truesize);
+ else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto drop;
+
+ eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (tp->rx_opt.num_sacks)
tcp_sack_remove(tp);
- tcp_fast_path_check(sk);
-
if (eaten > 0)
kfree_skb_partial(skb, fragstolen);
if (!sock_flag(sk, SOCK_DEAD))
NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */
- tp->pred_flags = 0;
return -1;
}
tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = ptr;
-
- /* Disable header prediction. */
- tp->pred_flags = 0;
}
/* This is the 'fast' part of urgent handling. */
}
}
-static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- int chunk = skb->len - hlen;
- int err;
-
- if (skb_csum_unnecessary(skb))
- err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
- else
- err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg);
-
- if (!err) {
- tp->ucopy.len -= chunk;
- tp->copied_seq += chunk;
- tcp_rcv_space_adjust(sk);
- }
-
- return err;
-}
-
/* Accept RST for rcv_nxt - 1 after a FIN.
* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
* FIN is sent followed by a RST packet. The RST is sent with the same
/*
* TCP receive function for the ESTABLISHED state.
- *
- * It is split into a fast path and a slow path. The fast path is
- * disabled when:
- * - A zero window was announced from us - zero window probing
- * is only handled properly in the slow path.
- * - Out of order segments arrived.
- * - Urgent data is expected.
- * - There is no buffer space left
- * - Unexpected TCP flags/window values/header lengths are received
- * (detected by checking the TCP header against pred_flags)
- * - Data is sent in both directions. Fast path only supports pure senders
- * or pure receivers (this means either the sequence number or the ack
- * value must stay constant)
- * - Unexpected TCP option.
- *
- * When these conditions are not satisfied it drops into a standard
- * receive procedure patterned after RFC793 to handle all cases.
- * The first three cases are guaranteed by proper pred_flags setting,
- * the rest is checked inline. Fast processing is turned on in
- * tcp_data_queue when everything is OK.
*/
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th, unsigned int len)
+ const struct tcphdr *th)
{
+ unsigned int len = skb->len;
struct tcp_sock *tp = tcp_sk(sk);
tcp_mstamp_refresh(tp);
if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
- /*
- * Header prediction.
- * The code loosely follows the one in the famous
- * "30 instruction TCP receive" Van Jacobson mail.
- *
- * Van's trick is to deposit buffers into socket queue
- * on a device interrupt, to call tcp_recv function
- * on the receive process context and checksum and copy
- * the buffer to user space. smart...
- *
- * Our current scheme is not silly either but we take the
- * extra cost of the net_bh soft interrupt processing...
- * We do checksum and copy also but from device to kernel.
- */
tp->rx_opt.saw_tstamp = 0;
- /* pred_flags is 0xS?10 << 16 + snd_wnd
- * if header_prediction is to be made
- * 'S' will always be tp->tcp_header_len >> 2
- * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
- * turn it off (when there are holes in the receive
- * space for instance)
- * PSH flag is ignored.
- */
-
- if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
- TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
- !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
- int tcp_header_len = tp->tcp_header_len;
-
- /* Timestamp header prediction: tcp_header_len
- * is automatically equal to th->doff*4 due to pred_flags
- * match.
- */
-
- /* Check timestamp */
- if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
- /* No? Slow path! */
- if (!tcp_parse_aligned_timestamp(tp, th))
- goto slow_path;
-
- /* If PAWS failed, check it more carefully in slow path */
- if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
- goto slow_path;
-
- /* DO NOT update ts_recent here, if checksum fails
- * and timestamp was corrupted part, it will result
- * in a hung connection since we will drop all
- * future packets due to the PAWS test.
- */
- }
-
- if (len <= tcp_header_len) {
- /* Bulk data transfer: sender */
- if (len == tcp_header_len) {
- /* Predicted packet is in window by definition.
- * seq == rcv_nxt and rcv_wup <= rcv_nxt.
- * Hence, check seq<=rcv_wup reduces to:
- */
- if (tcp_header_len ==
- (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
- tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
-
- /* We know that such packets are checksummed
- * on entry.
- */
- tcp_ack(sk, skb, 0);
- __kfree_skb(skb);
- tcp_data_snd_check(sk);
- return;
- } else { /* Header too small */
- TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
- goto discard;
- }
- } else {
- int eaten = 0;
- bool fragstolen = false;
-
- if (tp->ucopy.task == current &&
- tp->copied_seq == tp->rcv_nxt &&
- len - tcp_header_len <= tp->ucopy.len &&
- sock_owned_by_user(sk)) {
- __set_current_state(TASK_RUNNING);
-
- if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
- /* Predicted packet is in window by definition.
- * seq == rcv_nxt and rcv_wup <= rcv_nxt.
- * Hence, check seq<=rcv_wup reduces to:
- */
- if (tcp_header_len ==
- (sizeof(struct tcphdr) +
- TCPOLEN_TSTAMP_ALIGNED) &&
- tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
-
- tcp_rcv_rtt_measure_ts(sk, skb);
-
- __skb_pull(skb, tcp_header_len);
- tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPHPHITSTOUSER);
- eaten = 1;
- }
- }
- if (!eaten) {
- if (tcp_checksum_complete(skb))
- goto csum_error;
-
- if ((int)skb->truesize > sk->sk_forward_alloc)
- goto step5;
-
- /* Predicted packet is in window by definition.
- * seq == rcv_nxt and rcv_wup <= rcv_nxt.
- * Hence, check seq<=rcv_wup reduces to:
- */
- if (tcp_header_len ==
- (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
- tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
-
- tcp_rcv_rtt_measure_ts(sk, skb);
-
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
-
- /* Bulk data transfer: receiver */
- eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
- &fragstolen);
- }
-
- tcp_event_data_recv(sk, skb);
-
- if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
- /* Well, only one small jumplet in fast path... */
- tcp_ack(sk, skb, FLAG_DATA);
- tcp_data_snd_check(sk);
- if (!inet_csk_ack_scheduled(sk))
- goto no_ack;
- }
-
- __tcp_ack_snd_check(sk, 0);
-no_ack:
- if (eaten)
- kfree_skb_partial(skb, fragstolen);
- sk->sk_data_ready(sk);
- return;
- }
- }
-
-slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn)
goto discard;
- /*
- * Standard slow path.
- */
-
if (!tcp_validate_incoming(sk, skb, th, 1))
return;
-step5:
- if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
+ if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0)
goto discard;
tcp_rcv_rtt_measure_ts(sk, skb);
if (sock_flag(sk, SOCK_KEEPOPEN))
inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
-
- if (!tp->rx_opt.snd_wscale)
- __tcp_fast_path_on(tp, tp->snd_wnd);
- else
- tp->pred_flags = 0;
-
}
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tcp_ecn_rcv_synack(tp, th);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- tcp_ack(sk, skb, FLAG_SLOWPATH);
+ tcp_ack(sk, skb, 0);
/* Ok.. it's good. Set up sequence numbers and
* move to established.
return 0;
/* step 5: check the ACK field */
- acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
- FLAG_UPDATE_TS_RECENT |
+
+ acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT |
FLAG_NO_CHALLENGE_ACK) > 0;
if (!acceptable) {
tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk);
- tcp_fast_path_on(tp);
break;
case TCP_FIN_WAIT1: {
static int compute_score(struct sock *sk, struct net *net,
__be32 saddr, __be16 sport,
- __be32 daddr, unsigned short hnum, int dif,
- bool exact_dif)
+ __be32 daddr, unsigned short hnum,
+ int dif, int sdif, bool exact_dif)
{
int score;
struct inet_sock *inet;
}
if (sk->sk_bound_dev_if || exact_dif) {
- if (sk->sk_bound_dev_if != dif)
+ bool dev_match = (sk->sk_bound_dev_if == dif ||
+ sk->sk_bound_dev_if == sdif);
+
+ if (exact_dif && !dev_match)
return -1;
- score += 4;
+ if (sk->sk_bound_dev_if && dev_match)
+ score += 4;
}
+
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
return score;
/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
- __be32 saddr, __be16 sport,
- __be32 daddr, unsigned int hnum, int dif, bool exact_dif,
- struct udp_hslot *hslot2,
- struct sk_buff *skb)
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif, bool exact_dif,
+ struct udp_hslot *hslot2,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
int score, badness, matches = 0, reuseport = 0;
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, exact_dif);
+ daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
* harder than this. -DaveM
*/
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
- __be16 sport, __be32 daddr, __be16 dport,
- int dif, struct udp_table *udptable, struct sk_buff *skb)
+ __be16 sport, __be32 daddr, __be16 dport, int dif,
+ int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
struct sock *sk, *result;
unsigned short hnum = ntohs(dport);
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif,
+ daddr, hnum, dif, sdif,
exact_dif, hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif,
+ daddr, hnum, dif, sdif,
exact_dif, hslot2, skb);
}
return result;
badness = 0;
sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, exact_dif);
+ daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
- udptable, skb);
+ inet_sdif(skb), udptable, skb);
}
struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
struct sock *sk;
sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
- dif, &udp_table, NULL);
+ dif, 0, &udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif, unsigned short hnum)
+ int dif, int sdif, unsigned short hnum)
{
struct inet_sock *inet = inet_sk(sk);
(inet->inet_dport != rmt_port && inet->inet_dport) ||
(inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
ipv6_only_sock(sk) ||
- (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+ (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
+ sk->sk_bound_dev_if != sdif))
return false;
- if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif))
+ if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
return false;
return true;
}
struct net *net = dev_net(skb->dev);
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
- iph->saddr, uh->source, skb->dev->ifindex, udptable,
- NULL);
+ iph->saddr, uh->source, skb->dev->ifindex, 0,
+ udptable, NULL);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
scratch->is_linear = !skb_is_nonlinear(skb);
#endif
- if (likely(!skb->_skb_refdst))
+ /* all head states execept sp (dst, sk, nf) are always cleared by
+ * udp_rcv() and we need to preserve secpath, if present, to eventually
+ * process IP_CMSG_PASSSEC at recvmsg() time
+ */
+ if (likely(!skb_sec_path(skb)))
scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
}
return ip_recv_error(sk, msg, len, addr_len);
try_again:
- peeking = off = sk_peek_offset(sk, flags);
+ peeking = flags & MSG_PEEK;
+ off = sk_peek_offset(sk, flags);
skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
if (!skb)
return err;
sk_mark_napi_id_once(sk, skb);
}
- /* At recvmsg() time we may access skb->dst or skb->sp depending on
- * the IP options and the cmsg flags, elsewhere can we clear all
- * pending head states while they are hot in the cache
- */
- if (likely(IPCB(skb)->opt.optlen == 0 && !skb_sec_path(skb)))
- skb_release_head_state(skb);
-
rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
unsigned int offset = offsetof(typeof(*sk), sk_node);
int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
- uh->source, saddr, dif, hnum))
+ uh->source, saddr, dif, sdif, hnum))
continue;
if (!first) {
static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif)
+ int dif, int sdif)
{
struct sock *sk, *result;
unsigned short hnum = ntohs(loc_port);
result = NULL;
sk_for_each_rcu(sk, &hslot->head) {
if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
- rmt_port, rmt_addr, dif, hnum)) {
+ rmt_port, rmt_addr, dif, sdif, hnum)) {
if (result)
return NULL;
result = sk;
static struct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
- int dif)
+ int dif, int sdif)
{
unsigned short hnum = ntohs(loc_port);
unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (INET_MATCH(sk, net, acookie, rmt_addr,
- loc_addr, ports, dif))
+ loc_addr, ports, dif, sdif))
return sk;
/* Only check first socket in chain */
break;
struct sock *sk = NULL;
struct dst_entry *dst;
int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
int ours;
/* validate the packet */
}
sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
- uh->source, iph->saddr, dif);
+ uh->source, iph->saddr,
+ dif, sdif);
} else if (skb->pkt_type == PACKET_HOST) {
sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
- uh->source, iph->saddr, dif);
+ uh->source, iph->saddr, dif, sdif);
}
if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/lwtunnel.h>
+#include <net/fib_notifier.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
kmem_cache_free(fib6_node_kmem, fn);
}
-static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
+void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
{
int cpu;
free_percpu(non_pcpu_rt->rt6i_pcpu);
non_pcpu_rt->rt6i_pcpu = NULL;
}
-
-static void rt6_release(struct rt6_info *rt)
-{
- if (atomic_dec_and_test(&rt->rt6i_ref)) {
- rt6_free_pcpu(rt);
- dst_dev_put(&rt->dst);
- dst_release(&rt->dst);
- }
-}
+EXPORT_SYMBOL_GPL(rt6_free_pcpu);
static void fib6_link_table(struct net *net, struct fib6_table *tb)
{
#endif
+unsigned int fib6_tables_seq_read(struct net *net)
+{
+ unsigned int h, fib_seq = 0;
+
+ rcu_read_lock();
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv6.fib_table_hash[h];
+ struct fib6_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
+ read_lock_bh(&tb->tb6_lock);
+ fib_seq += tb->fib_seq;
+ read_unlock_bh(&tb->tb6_lock);
+ }
+ }
+ rcu_read_unlock();
+
+ return fib_seq;
+}
+
+static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct rt6_info *rt)
+{
+ struct fib6_entry_notifier_info info = {
+ .rt = rt,
+ };
+
+ return call_fib6_notifier(nb, net, event_type, &info.info);
+}
+
+static int call_fib6_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct rt6_info *rt)
+{
+ struct fib6_entry_notifier_info info = {
+ .rt = rt,
+ };
+
+ rt->rt6i_table->fib_seq++;
+ return call_fib6_notifiers(net, event_type, &info.info);
+}
+
+struct fib6_dump_arg {
+ struct net *net;
+ struct notifier_block *nb;
+};
+
+static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg)
+{
+ if (rt == arg->net->ipv6.ip6_null_entry)
+ return;
+ call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
+}
+
+static int fib6_node_dump(struct fib6_walker *w)
+{
+ struct rt6_info *rt;
+
+ for (rt = w->leaf; rt; rt = rt->dst.rt6_next)
+ fib6_rt_dump(rt, w->args);
+ w->leaf = NULL;
+ return 0;
+}
+
+static void fib6_table_dump(struct net *net, struct fib6_table *tb,
+ struct fib6_walker *w)
+{
+ w->root = &tb->tb6_root;
+ read_lock_bh(&tb->tb6_lock);
+ fib6_walk(net, w);
+ read_unlock_bh(&tb->tb6_lock);
+}
+
+/* Called with rcu_read_lock() */
+int fib6_tables_dump(struct net *net, struct notifier_block *nb)
+{
+ struct fib6_dump_arg arg;
+ struct fib6_walker *w;
+ unsigned int h;
+
+ w = kzalloc(sizeof(*w), GFP_ATOMIC);
+ if (!w)
+ return -ENOMEM;
+
+ w->func = fib6_node_dump;
+ arg.net = net;
+ arg.nb = nb;
+ w->args = &arg;
+
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv6.fib_table_hash[h];
+ struct fib6_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist)
+ fib6_table_dump(net, tb, w);
+ }
+
+ kfree(w);
+
+ return 0;
+}
+
static int fib6_dump_node(struct fib6_walker *w)
{
int res;
}
fn = fn->parent;
}
- /* No more references are possible at this point. */
- BUG_ON(atomic_read(&rt->rt6i_ref) != 1);
}
}
*ins = rt;
rt->rt6i_node = fn;
atomic_inc(&rt->rt6i_ref);
+ call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD,
+ rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
rt->rt6i_node = fn;
rt->dst.rt6_next = iter->dst.rt6_next;
atomic_inc(&rt->rt6i_ref);
+ call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
+ rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
if (!(fn->fn_flags & RTN_RTINFO)) {
fn->fn_flags |= RTN_RTINFO;
}
nsiblings = iter->rt6i_nsiblings;
+ iter->rt6i_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
+ if (fn->rr_ptr == iter)
+ fn->rr_ptr = NULL;
rt6_release(iter);
if (nsiblings) {
break;
if (rt6_qualify_for_ecmp(iter)) {
*ins = iter->dst.rt6_next;
+ iter->rt6i_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
+ if (fn->rr_ptr == iter)
+ fn->rr_ptr = NULL;
rt6_release(iter);
nsiblings--;
} else {
/* Create subtree root node */
sfn = node_alloc();
if (!sfn)
- goto st_failure;
+ goto failure;
sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
if (IS_ERR(sn)) {
/* If it is failed, discard just allocated
- root, and then (in st_failure) stale node
+ root, and then (in failure) stale node
in main tree.
*/
node_free(sfn);
err = PTR_ERR(sn);
- goto st_failure;
+ goto failure;
}
/* Now link new subtree to main tree */
if (IS_ERR(sn)) {
err = PTR_ERR(sn);
- goto st_failure;
+ goto failure;
}
}
atomic_inc(&pn->leaf->rt6i_ref);
}
#endif
- /* Always release dst as dst->__refcnt is guaranteed
- * to be taken before entering this function
- */
- dst_release_immediate(&rt->dst);
+ goto failure;
}
return err;
- #ifdef CONFIG_IPV6_SUBTREES
- /* Subtree creation failed, probably main tree node
- is orphan. If it is, shoot it.
+ failure:
+ /* fn->leaf could be NULL if fn is an intermediate node and we
+ * failed to add the new route to it in both subtree creation
+ * failure and fib6_add_rt2node() failure case.
+ * In both cases, fib6_repair_tree() should be called to fix
+ * fn->leaf.
*/
- st_failure:
if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
fib6_repair_tree(info->nl_net, fn);
/* Always release dst as dst->__refcnt is guaranteed
*/
dst_release_immediate(&rt->dst);
return err;
- #endif
}
/*
fib6_purge_rt(rt, fn, net);
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
rt6_release(rt);
static int __net_init fib6_net_init(struct net *net)
{
size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
+ int err;
+
+ err = fib6_notifier_init(net);
+ if (err)
+ return err;
spin_lock_init(&net->ipv6.fib6_gc_lock);
rwlock_init(&net->ipv6.fib6_walker_lock);
out_rt6_stats:
kfree(net->ipv6.rt6_stats);
out_timer:
+ fib6_notifier_exit(net);
return -ENOMEM;
}
kfree(net->ipv6.fib6_main_tbl);
kfree(net->ipv6.fib_table_hash);
kfree(net->ipv6.rt6_stats);
+ fib6_notifier_exit(net);
}
static struct pernet_operations fib6_net_ops = {
goto out_kmem_cache_create;
ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
- NULL);
+ 0);
if (ret)
goto out_unregister_subsys;
static int compute_score(struct sock *sk, struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned short hnum,
- int dif, bool exact_dif)
+ int dif, int sdif, bool exact_dif)
{
int score;
struct inet_sock *inet;
}
if (sk->sk_bound_dev_if || exact_dif) {
- if (sk->sk_bound_dev_if != dif)
+ bool dev_match = (sk->sk_bound_dev_if == dif ||
+ sk->sk_bound_dev_if == sdif);
+
+ if (exact_dif && !dev_match)
return -1;
- score++;
+ if (sk->sk_bound_dev_if && dev_match)
+ score++;
}
if (sk->sk_incoming_cpu == raw_smp_processor_id())
/* called with rcu_read_lock() */
static struct sock *udp6_lib_lookup2(struct net *net,
const struct in6_addr *saddr, __be16 sport,
- const struct in6_addr *daddr, unsigned int hnum, int dif,
- bool exact_dif, struct udp_hslot *hslot2,
- struct sk_buff *skb)
+ const struct in6_addr *daddr, unsigned int hnum,
+ int dif, int sdif, bool exact_dif,
+ struct udp_hslot *hslot2, struct sk_buff *skb)
{
struct sock *sk, *result;
int score, badness, matches = 0, reuseport = 0;
badness = -1;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, exact_dif);
+ daddr, hnum, dif, sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
/* rcu_read_lock() must be held */
struct sock *__udp6_lib_lookup(struct net *net,
- const struct in6_addr *saddr, __be16 sport,
- const struct in6_addr *daddr, __be16 dport,
- int dif, struct udp_table *udptable,
- struct sk_buff *skb)
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr, __be16 dport,
+ int dif, int sdif, struct udp_table *udptable,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
unsigned short hnum = ntohs(dport);
goto begin;
result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, exact_dif,
+ daddr, hnum, dif, sdif, exact_dif,
hslot2, skb);
if (!result) {
unsigned int old_slot2 = slot2;
goto begin;
result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif,
+ daddr, hnum, dif, sdif,
exact_dif, hslot2,
skb);
}
badness = -1;
sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
- exact_dif);
+ sdif, exact_dif);
if (score > badness) {
reuseport = sk->sk_reuseport;
if (reuseport) {
return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb),
- udptable, skb);
+ inet6_sdif(skb), udptable, skb);
}
struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb),
- &udp_table, skb);
+ inet6_sdif(skb), &udp_table, skb);
}
EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
struct sock *sk;
sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport,
- dif, &udp_table, NULL);
+ dif, 0, &udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
try_again:
- peeking = off = sk_peek_offset(sk, flags);
+ peeking = flags & MSG_PEEK;
+ off = sk_peek_offset(sk, flags);
skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
if (!skb)
return err;
struct net *net = dev_net(skb->dev);
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
- inet6_iif(skb), udptable, skb);
+ inet6_iif(skb), 0, udptable, skb);
if (!sk) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
ICMP6_MIB_INERRORS);
static struct sock *__udp6_lib_demux_lookup(struct net *net,
__be16 loc_port, const struct in6_addr *loc_addr,
__be16 rmt_port, const struct in6_addr *rmt_addr,
- int dif)
+ int dif, int sdif)
{
unsigned short hnum = ntohs(loc_port);
unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum);
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (sk->sk_state == TCP_ESTABLISHED &&
- INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif))
+ INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif))
return sk;
/* Only check first socket in chain */
break;
struct sock *sk;
struct dst_entry *dst;
int dif = skb->dev->ifindex;
+ int sdif = inet6_sdif(skb);
if (!pskb_may_pull(skb, skb_transport_offset(skb) +
sizeof(struct udphdr)))
sk = __udp6_lib_demux_lookup(net, uh->dest,
&ipv6_hdr(skb)->daddr,
uh->source, &ipv6_hdr(skb)->saddr,
- dif);
+ dif, sdif);
else
return;
const struct dp_upcall_info *upcall_info,
uint32_t cutlen)
{
- unsigned short gso_type = skb_shinfo(skb)->gso_type;
- struct sw_flow_key later_key;
struct sk_buff *segs, *nskb;
int err;
if (segs == NULL)
return -EINVAL;
- if (gso_type & SKB_GSO_UDP) {
- /* The initial flow key extracted by ovs_flow_key_extract()
- * in this case is for a first fragment, so we need to
- * properly mark later fragments.
- */
- later_key = *key;
- later_key.ip.frag = OVS_FRAG_TYPE_LATER;
- }
-
/* Queue all of the segments. */
skb = segs;
do {
- if (gso_type & SKB_GSO_UDP && skb != segs)
- key = &later_key;
-
err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen);
if (err)
break;
}
static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
- unsigned int hdrlen)
+ unsigned int hdrlen, int actions_attrlen)
{
size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
/* OVS_PACKET_ATTR_ACTIONS */
if (upcall_info->actions_len)
- size += nla_total_size(upcall_info->actions_len);
+ size += nla_total_size(actions_attrlen);
/* OVS_PACKET_ATTR_MRU */
if (upcall_info->mru)
else
hlen = skb->len;
- len = upcall_msg_size(upcall_info, hlen - cutlen);
+ len = upcall_msg_size(upcall_info, hlen - cutlen,
+ OVS_CB(skb)->acts_origlen);
user_skb = genlmsg_new(len, GFP_ATOMIC);
if (!user_skb) {
err = -ENOMEM;
}
EXPORT_SYMBOL(unregister_tcf_proto_ops);
-static int tfilter_notify(struct net *net, struct sk_buff *oskb,
- struct nlmsghdr *n, struct tcf_proto *tp,
- unsigned long fh, int event, bool unicast);
-
-static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
- struct nlmsghdr *n,
- struct tcf_chain *chain, int event)
-{
- struct tcf_proto *tp;
-
- for (tp = rtnl_dereference(chain->filter_chain);
- tp; tp = rtnl_dereference(tp->next))
- tfilter_notify(net, oskb, n, tp, 0, event, false);
-}
-
/* Select new prio value from the range, managed by kernel. */
static inline u32 tcf_auto_prio(struct tcf_proto *tp)
{
struct tcf_proto *tp;
- if (*chain->p_filter_chain)
+ if (chain->p_filter_chain)
RCU_INIT_POINTER(*chain->p_filter_chain, NULL);
while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) {
RCU_INIT_POINTER(chain->filter_chain, tp->next);
return tp;
}
+static int tcf_fill_node(struct net *net, struct sk_buff *skb,
+ struct tcf_proto *tp, void *fh, u32 portid,
+ u32 seq, u16 flags, int event)
+{
+ struct tcmsg *tcm;
+ struct nlmsghdr *nlh;
+ unsigned char *b = skb_tail_pointer(skb);
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
+ tcm->tcm_parent = tp->classid;
+ tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
+ if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index))
+ goto nla_put_failure;
+ if (!fh) {
+ tcm->tcm_handle = 0;
+ } else {
+ if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
+ goto nla_put_failure;
+ }
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tfilter_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct tcf_proto *tp,
+ void *fh, int event, bool unicast)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
+ n->nlmsg_flags, event) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (unicast)
+ return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n, struct tcf_proto *tp,
+ void *fh, bool unicast, bool *last)
+{
+ struct sk_buff *skb;
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ int err;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
+ n->nlmsg_flags, RTM_DELTFILTER) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ err = tp->ops->delete(tp, fh, last);
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ if (unicast)
+ return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
+ n->nlmsg_flags & NLM_F_ECHO);
+}
+
+static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
+ struct nlmsghdr *n,
+ struct tcf_chain *chain, int event)
+{
+ struct tcf_proto *tp;
+
+ for (tp = rtnl_dereference(chain->filter_chain);
+ tp; tp = rtnl_dereference(tp->next))
+ tfilter_notify(net, oskb, n, tp, 0, event, false);
+}
+
/* Add/change/delete/get a filter node */
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
struct tcf_proto *tp;
const struct Qdisc_class_ops *cops;
unsigned long cl;
- unsigned long fh;
+ void *fh;
int err;
int tp_created;
fh = tp->ops->get(tp, t->tcm_handle);
- if (fh == 0) {
+ if (!fh) {
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
tcf_chain_tp_remove(chain, &chain_info, tp);
tfilter_notify(net, skb, n, tp, fh,
}
break;
case RTM_DELTFILTER:
- err = tp->ops->delete(tp, fh, &last);
+ err = tfilter_del_notify(net, skb, n, tp, fh, false,
+ &last);
if (err)
goto errout;
- tfilter_notify(net, skb, n, tp, t->tcm_handle,
- RTM_DELTFILTER, false);
if (last) {
tcf_chain_tp_remove(chain, &chain_info, tp);
tcf_proto_destroy(tp);
return err;
}
-static int tcf_fill_node(struct net *net, struct sk_buff *skb,
- struct tcf_proto *tp, unsigned long fh, u32 portid,
- u32 seq, u16 flags, int event)
-{
- struct tcmsg *tcm;
- struct nlmsghdr *nlh;
- unsigned char *b = skb_tail_pointer(skb);
-
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
- if (!nlh)
- goto out_nlmsg_trim;
- tcm = nlmsg_data(nlh);
- tcm->tcm_family = AF_UNSPEC;
- tcm->tcm__pad1 = 0;
- tcm->tcm__pad2 = 0;
- tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
- tcm->tcm_parent = tp->classid;
- tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
- if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
- goto nla_put_failure;
- if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index))
- goto nla_put_failure;
- tcm->tcm_handle = fh;
- if (RTM_DELTFILTER != event) {
- tcm->tcm_handle = 0;
- if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
- goto nla_put_failure;
- }
- nlh->nlmsg_len = skb_tail_pointer(skb) - b;
- return skb->len;
-
-out_nlmsg_trim:
-nla_put_failure:
- nlmsg_trim(skb, b);
- return -1;
-}
-
-static int tfilter_notify(struct net *net, struct sk_buff *oskb,
- struct nlmsghdr *n, struct tcf_proto *tp,
- unsigned long fh, int event, bool unicast)
-{
- struct sk_buff *skb;
- u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
-
- if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq,
- n->nlmsg_flags, event) <= 0) {
- kfree_skb(skb);
- return -EINVAL;
- }
-
- if (unicast)
- return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
-
- return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
- n->nlmsg_flags & NLM_F_ECHO);
-}
-
struct tcf_dump_args {
struct tcf_walker w;
struct sk_buff *skb;
struct netlink_callback *cb;
};
-static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
- struct tcf_walker *arg)
+static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
struct tcf_dump_args *a = (void *)arg;
struct net *net = sock_net(a->skb->sk);
}
EXPORT_SYMBOL(tcf_exts_validate);
-void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
- struct tcf_exts *src)
+void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src)
{
#ifdef CONFIG_NET_CLS_ACT
struct tcf_exts old = *dst;
- tcf_tree_lock(tp);
- dst->nr_actions = src->nr_actions;
- dst->actions = src->actions;
- dst->type = src->type;
- tcf_tree_unlock(tp);
-
+ *dst = *src;
tcf_exts_destroy(&old);
#endif
}
#ifdef CONFIG_NET_CLS_ACT
struct nlattr *nest;
- if (exts->action && exts->nr_actions) {
+ if (exts->action && tcf_exts_has_actions(exts)) {
/*
* again for backward compatible mode - we want
* to work with both old and new modes of entering
const struct tc_action *a;
LIST_HEAD(actions);
- if (tc_no_actions(exts))
+ if (!tcf_exts_has_actions(exts))
return -EINVAL;
tcf_exts_to_list(exts, &actions);
static int __init tc_filter_init(void)
{
- rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
- rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);
+ rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
- tc_dump_tfilter, NULL);
+ tc_dump_tfilter, 0);
return 0;
}
union sctp_addr *daddr = &t->ipaddr;
union sctp_addr dst_saddr;
struct in6_addr *final_p, final;
+ enum sctp_scope scope;
__u8 matchlen = 0;
- sctp_scope_t scope;
memset(fl6, 0, sizeof(struct flowi6));
fl6->daddr = daddr->v6.sin6_addr;
static int sctp_v6_to_addr_param(const union sctp_addr *addr,
union sctp_addr_param *param)
{
- int length = sizeof(sctp_ipv6addr_param_t);
+ int length = sizeof(struct sctp_ipv6addr_param);
param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS;
param->v6.param_hdr.length = htons(length);
{
addr->sa.sa_family = AF_INET6;
addr->v6.sin6_port = port;
+ addr->v6.sin6_flowinfo = 0;
addr->v6.sin6_addr = *saddr;
+ addr->v6.sin6_scope_id = 0;
}
/* Compare addresses exactly.
}
/* What is the scope of 'addr'? */
-static sctp_scope_t sctp_v6_scope(union sctp_addr *addr)
+static enum sctp_scope sctp_v6_scope(union sctp_addr *addr)
{
+ enum sctp_scope retval;
int v6scope;
- sctp_scope_t retval;
/* The IPv6 scope is really a set of bit fields.
* See IFA_* in <net/if_inet6.h>. Map to a generic SCTP scope.
return false;
}
-#define MAX_RECURSION_LEVEL 4
-
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
int i;
- unsigned char max_level = 0;
if (too_many_unix_fds(current))
return -ETOOMANYREFS;
- for (i = scm->fp->count - 1; i >= 0; i--) {
- struct sock *sk = unix_get_socket(scm->fp->fp[i]);
-
- if (sk)
- max_level = max(max_level,
- unix_sk(sk)->recursion_level);
- }
- if (unlikely(max_level > MAX_RECURSION_LEVEL))
- return -ETOOMANYREFS;
-
/*
* Need to duplicate file references for the sake of garbage
* collection. Otherwise a socket in the fps might become a
for (i = scm->fp->count - 1; i >= 0; i--)
unix_inflight(scm->fp->user, scm->fp->fp[i]);
- return max_level;
+ return 0;
}
static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
struct sk_buff *skb;
long timeo;
struct scm_cookie scm;
- int max_level;
int data_len = 0;
int sk_locked;
err = unix_scm_to_skb(&scm, skb, true);
if (err < 0)
goto out_free;
- max_level = err + 1;
skb_put(skb, len - data_len);
skb->data_len = data_len;
__net_timestamp(skb);
maybe_add_creds(skb, sock, other);
skb_queue_tail(&other->sk_receive_queue, skb);
- if (max_level > unix_sk(other)->recursion_level)
- unix_sk(other)->recursion_level = max_level;
unix_state_unlock(other);
other->sk_data_ready(other);
sock_put(other);
int sent = 0;
struct scm_cookie scm;
bool fds_sent = false;
- int max_level;
int data_len;
wait_for_unix_gc();
kfree_skb(skb);
goto out_err;
}
- max_level = err + 1;
fds_sent = true;
skb_put(skb, size - data_len);
maybe_add_creds(skb, sock, other);
skb_queue_tail(&other->sk_receive_queue, skb);
- if (max_level > unix_sk(other)->recursion_level)
- unix_sk(other)->recursion_level = max_level;
unix_state_unlock(other);
other->sk_data_ready(other);
sent += size;
*/
mutex_lock(&u->iolock);
- if (flags & MSG_PEEK)
- skip = sk_peek_offset(sk, flags);
- else
- skip = 0;
+ skip = max(sk_peek_offset(sk, flags), 0);
do {
int chunk;
last_len = last ? last->len : 0;
again:
if (skb == NULL) {
- unix_sk(sk)->recursion_level = 0;
if (copied >= target)
goto unlock;
size_t j;
int err = *pfd;
- pr_warning("failed to create map: %s\n",
+ pr_warning("failed to create map (name: '%s'): %s\n",
+ obj->maps[i].name,
strerror(errno));
for (j = 0; j < i; j++)
zclose(obj->maps[j].fd);
return PTR_ERR(ptr);
return 0;
}
+
+int bpf_prog_load(const char *file, enum bpf_prog_type type,
+ struct bpf_object **pobj, int *prog_fd)
+{
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ int err;
+
+ obj = bpf_object__open(file);
+ if (IS_ERR(obj))
+ return -ENOENT;
+
+ prog = bpf_program__next(NULL, obj);
+ if (!prog) {
+ bpf_object__close(obj);
+ return -ENOENT;
+ }
+
+ bpf_program__set_type(prog, type);
+ err = bpf_object__load(obj);
+ if (err) {
+ bpf_object__close(obj);
+ return -EINVAL;
+ }
+
+ *pobj = obj;
+ *prog_fd = bpf_program__fd(prog);
+ return 0;
+}