whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit c93d9218ea561d6a91b23449cfd637ddec91dc23
parent fa3294c58c58c4fa87ee0356b6cb1901db00533e
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sat,  2 Mar 2019 08:46:34 -0800

Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

Pull networking fixes from David Miller:

 1) Fix refcount leak in act_ipt during replace, from Davide Caratti.

 2) Set task state properly in tun during blocking reads, from Timur
    Celik.

 3) Leaked reference in DSA, from Wen Yang.

 4) NULL deref in act_tunnel_key, from Vlad Buslov.

 5) cipso_v4_erro can reference the skb IPCB in inappropriate contexts
    thus referencing garbage, from Nazarov Sergey.

 6) Don't accept RTA_VIA and RTA_GATEWAY in contexts where those
    attributes make no sense.

 7) Fix hung sendto in tipc, from Tung Nguyen.

 8) Out-of-bounds access in netlabel, from Paul Moore.

 9) Grant reference leak in xen-netback, from Igor Druzhinin.

10) Fix tx stalls with lan743x, from Bryan Whitehead.

11) Fix interrupt storm with mv88e6xxx, from Hein Kallweit.

12) Memory leak in sit on device registry failure, from Mao Wenan.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (44 commits)
  net: sit: fix memory leak in sit_init_net()
  net: dsa: mv88e6xxx: Fix statistics on mv88e6161
  geneve: correctly handle ipv6.disable module parameter
  net: dsa: mv88e6xxx: prevent interrupt storm caused by mv88e6390x_port_set_cmode
  bpf: fix sanitation rewrite in case of non-pointers
  ipv4: Add ICMPv6 support when parse route ipproto
  MIPS: eBPF: Fix icache flush end address
  lan743x: Fix TX Stall Issue
  net: phy: phylink: fix uninitialized variable in phylink_get_mac_state
  net: aquantia: regression on cpus with high cores: set mode with 8 queues
  selftests: fixes for UDP GRO
  bpf: drop refcount if bpf_map_new_fd() fails in map_create()
  net: dsa: mv88e6xxx: power serdes on/off for 10G interfaces on 6390X
  net: dsa: mv88e6xxx: Fix u64 statistics
  xen-netback: don't populate the hash cache on XenBus disconnect
  xen-netback: fix occasional leak of grant ref mappings under memory pressure
  sctp: chunk.c: correct format string for size_t in printk
  net: netem: fix skb length BUG_ON in __skb_to_sgvec
  netlabel: fix out-of-bounds memory accesses
  ipv4: Pass original device to ip_rcv_finish_core
  ...

Diffstat:
March/mips/net/ebpf_jit.c | 2+-
Mdrivers/net/dsa/lantiq_gswip.c | 6++++++
Mdrivers/net/dsa/mv88e6xxx/chip.c | 14++++++++++++--
Mdrivers/net/dsa/mv88e6xxx/port.c | 8++++++--
Mdrivers/net/dsa/mv88e6xxx/port.h | 1+
Mdrivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 3+++
Mdrivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c | 9+++++++++
Mdrivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h | 4++++
Mdrivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h | 13+++++++++++++
Mdrivers/net/ethernet/broadcom/bnxt/bnxt.c | 6++++++
Mdrivers/net/ethernet/microchip/enc28j60.c | 2+-
Mdrivers/net/ethernet/microchip/lan743x_main.c | 16++++++++++++----
Mdrivers/net/geneve.c | 11++++++++---
Mdrivers/net/hyperv/netvsc_drv.c | 22+++++++++++++++++++---
Mdrivers/net/phy/dp83867.c | 3+++
Mdrivers/net/phy/micrel.c | 13++++++++++++-
Mdrivers/net/phy/phylink.c | 4++++
Mdrivers/net/tun.c | 4++--
Mdrivers/net/xen-netback/hash.c | 2++
Mdrivers/net/xen-netback/interface.c | 7+++++++
Mdrivers/net/xen-netback/netback.c | 10+++++-----
Minclude/linux/netdevice.h | 2+-
Minclude/net/icmp.h | 9++++++++-
Minclude/net/ip.h | 4+++-
Mkernel/bpf/syscall.c | 6+++---
Mkernel/bpf/verifier.c | 3++-
Mnet/dsa/dsa2.c | 16++++++++++------
Mnet/dsa/port.c | 1+
Mnet/ipv4/cipso_ipv4.c | 20+++++++++++++++++---
Mnet/ipv4/fib_frontend.c | 4++++
Mnet/ipv4/icmp.c | 7++++---
Mnet/ipv4/ip_input.c | 9+++++----
Mnet/ipv4/ip_options.c | 22+++++++++++++++++-----
Mnet/ipv4/netlink.c | 17+++++++++++++----
Mnet/ipv4/route.c | 2+-
Mnet/ipv6/route.c | 7++++++-
Mnet/ipv6/sit.c | 1+
Mnet/mpls/af_mpls.c | 3+++
Mnet/netlabel/netlabel_kapi.c | 3++-
Mnet/nfc/llcp_commands.c | 20++++++++++++++++++++
Mnet/nfc/llcp_core.c | 24++++++++++++++++++++----
Mnet/sched/act_ipt.c | 3+--
Mnet/sched/act_skbedit.c | 3+--
Mnet/sched/act_tunnel_key.c | 3++-
Mnet/sched/sch_netem.c | 10+++++++---
Mnet/sctp/chunk.c | 2+-
Mnet/socket.c | 1+
Mnet/tipc/socket.c | 6+++++-
Mtools/testing/selftests/net/pmtu.sh | 96++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
Mtools/testing/selftests/net/udpgro.sh | 8++++----
Mtools/testing/selftests/net/udpgso_bench_rx.c | 42+++++++++++++++++++++++++++++-------------
51 files changed, 408 insertions(+), 106 deletions(-)

diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c @@ -1819,7 +1819,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* Update the icache */ flush_icache_range((unsigned long)ctx.target, - (unsigned long)(ctx.target + ctx.idx * sizeof(u32))); + (unsigned long)&ctx.target[ctx.idx]); if (bpf_jit_enable > 1) /* Dump JIT code */ diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c @@ -1162,6 +1162,12 @@ static struct platform_driver gswip_driver = { module_platform_driver(gswip_driver); +MODULE_FIRMWARE("lantiq/xrx300_phy11g_a21.bin"); +MODULE_FIRMWARE("lantiq/xrx300_phy22f_a21.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy11g_a14.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy11g_a22.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy22f_a14.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy22f_a22.bin"); MODULE_AUTHOR("Hauke Mehrtens <hauke@hauke-m.de>"); MODULE_DESCRIPTION("Lantiq / Intel GSWIP driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c @@ -896,7 +896,7 @@ static uint64_t _mv88e6xxx_get_ethtool_stat(struct mv88e6xxx_chip *chip, default: return U64_MAX; } - value = (((u64)high) << 16) | low; + value = (((u64)high) << 32) | low; return value; } @@ -3093,7 +3093,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .port_disable_pri_override = mv88e6xxx_port_disable_pri_override, .port_link_state = mv88e6352_port_link_state, .port_get_cmode = mv88e6185_port_get_cmode, - .stats_snapshot = mv88e6320_g1_stats_snapshot, + .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_set_histogram = mv88e6095_g1_stats_set_histogram, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, @@ -4595,6 +4595,14 @@ static int mv88e6xxx_smi_init(struct mv88e6xxx_chip *chip, return 0; } +static void mv88e6xxx_ports_cmode_init(struct mv88e6xxx_chip *chip) +{ + int i; + + for (i = 0; i < mv88e6xxx_num_ports(chip); i++) + chip->ports[i].cmode = MV88E6XXX_PORT_STS_CMODE_INVALID; +} + static enum dsa_tag_protocol mv88e6xxx_get_tag_protocol(struct dsa_switch *ds, int port) { @@ -4631,6 +4639,8 @@ static const char *mv88e6xxx_drv_probe(struct device *dsa_dev, if (err) goto free; + mv88e6xxx_ports_cmode_init(chip); + mutex_lock(&chip->reg_lock); err = mv88e6xxx_switch_reset(chip); mutex_unlock(&chip->reg_lock); diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c @@ -398,6 +398,10 @@ int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port, cmode = 0; } + /* cmode doesn't change, nothing to do for us */ + if (cmode == chip->ports[port].cmode) + return 0; + lane = mv88e6390x_serdes_get_lane(chip, port); if (lane < 0) return lane; @@ -408,7 +412,7 @@ int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port, return err; } - err = mv88e6390_serdes_power(chip, port, false); + err = mv88e6390x_serdes_power(chip, port, false); if (err) return err; @@ -424,7 +428,7 @@ int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port, if (err) return err; - err = mv88e6390_serdes_power(chip, port, true); + err = mv88e6390x_serdes_power(chip, port, true); if (err) return err; diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h @@ -52,6 +52,7 @@ #define MV88E6185_PORT_STS_CMODE_1000BASE_X 0x0005 #define MV88E6185_PORT_STS_CMODE_PHY 0x0006 #define MV88E6185_PORT_STS_CMODE_DISABLED 0x0007 +#define MV88E6XXX_PORT_STS_CMODE_INVALID 0xff /* Offset 0x01: MAC (or PCS or Physical) Control Register */ #define MV88E6XXX_PORT_MAC_CTL 0x01 diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -275,6 +275,9 @@ static int hw_atl_b0_hw_offload_set(struct aq_hw_s *self, static int hw_atl_b0_hw_init_tx_path(struct aq_hw_s *self) { + /* Tx TC/Queue number config */ + hw_atl_rpb_tps_tx_tc_mode_set(self, 1U); + hw_atl_thm_lso_tcp_flag_of_first_pkt_set(self, 0x0FF6U); hw_atl_thm_lso_tcp_flag_of_middle_pkt_set(self, 0x0FF6U); hw_atl_thm_lso_tcp_flag_of_last_pkt_set(self, 0x0F7FU); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c @@ -1274,6 +1274,15 @@ void hw_atl_tpb_tx_buff_en_set(struct aq_hw_s *aq_hw, u32 tx_buff_en) HW_ATL_TPB_TX_BUF_EN_SHIFT, tx_buff_en); } +void hw_atl_rpb_tps_tx_tc_mode_set(struct aq_hw_s *aq_hw, + u32 tx_traf_class_mode) +{ + aq_hw_write_reg_bit(aq_hw, HW_ATL_TPB_TX_TC_MODE_ADDR, + HW_ATL_TPB_TX_TC_MODE_MSK, + HW_ATL_TPB_TX_TC_MODE_SHIFT, + tx_traf_class_mode); +} + void hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(struct aq_hw_s *aq_hw, u32 tx_buff_hi_threshold_per_tc, u32 buffer) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h @@ -605,6 +605,10 @@ void hw_atl_thm_lso_tcp_flag_of_middle_pkt_set(struct aq_hw_s *aq_hw, /* tpb */ +/* set TX Traffic Class Mode */ +void hw_atl_rpb_tps_tx_tc_mode_set(struct aq_hw_s *aq_hw, + u32 tx_traf_class_mode); + /* set tx buffer enable */ void hw_atl_tpb_tx_buff_en_set(struct aq_hw_s *aq_hw, u32 tx_buff_en); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h @@ -1948,6 +1948,19 @@ /* default value of bitfield tx_buf_en */ #define HW_ATL_TPB_TX_BUF_EN_DEFAULT 0x0 +/* register address for bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_ADDR 0x00007900 +/* bitmask for bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_MSK 0x00000100 +/* inverted bitmask for bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_MSKN 0xFFFFFEFF +/* lower bit position of bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_SHIFT 8 +/* width of bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_WIDTH 1 +/* default value of bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_DEFAULT 0x0 + /* tx tx{b}_hi_thresh[c:0] bitfield definitions * preprocessor definitions for the bitfield "tx{b}_hi_thresh[c:0]". * parameter: buffer {b} | stride size 0x10 | range [0, 7] diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -500,6 +500,12 @@ normal_tx: } length >>= 9; + if (unlikely(length >= ARRAY_SIZE(bnxt_lhint_arr))) { + dev_warn_ratelimited(&pdev->dev, "Dropped oversize %d bytes TX packet.\n", + skb->len); + i = 0; + goto tx_dma_error; + } flags |= bnxt_lhint_arr[length]; txbd->tx_bd_len_flags_type = cpu_to_le32(flags); diff --git a/drivers/net/ethernet/microchip/enc28j60.c b/drivers/net/ethernet/microchip/enc28j60.c @@ -1681,5 +1681,5 @@ MODULE_DESCRIPTION(DRV_NAME " ethernet driver"); MODULE_AUTHOR("Claudio Lanconelli <lanconelli.claudio@eptar.com>"); MODULE_LICENSE("GPL"); module_param_named(debug, debug.msg_enable, int, 0); -MODULE_PARM_DESC(debug, "Debug verbosity level (0=none, ..., ffff=all)"); +MODULE_PARM_DESC(debug, "Debug verbosity level in amount of bits set (0=none, ..., 31=all)"); MODULE_ALIAS("spi:" DRV_NAME); diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1400,7 +1400,8 @@ static int lan743x_tx_frame_start(struct lan743x_tx *tx, } static void lan743x_tx_frame_add_lso(struct lan743x_tx *tx, - unsigned int frame_length) + unsigned int frame_length, + int nr_frags) { /* called only from within lan743x_tx_xmit_frame. * assuming tx->ring_lock has already been acquired. @@ -1410,6 +1411,10 @@ static void lan743x_tx_frame_add_lso(struct lan743x_tx *tx, /* wrap up previous descriptor */ tx->frame_data0 |= TX_DESC_DATA0_EXT_; + if (nr_frags <= 0) { + tx->frame_data0 |= TX_DESC_DATA0_LS_; + tx->frame_data0 |= TX_DESC_DATA0_IOC_; + } tx_descriptor = &tx->ring_cpu_ptr[tx->frame_tail]; tx_descriptor->data0 = tx->frame_data0; @@ -1514,8 +1519,11 @@ static void lan743x_tx_frame_end(struct lan743x_tx *tx, u32 tx_tail_flags = 0; /* wrap up previous descriptor */ - tx->frame_data0 |= TX_DESC_DATA0_LS_; - tx->frame_data0 |= TX_DESC_DATA0_IOC_; + if ((tx->frame_data0 & TX_DESC_DATA0_DTYPE_MASK_) == + TX_DESC_DATA0_DTYPE_DATA_) { + tx->frame_data0 |= TX_DESC_DATA0_LS_; + tx->frame_data0 |= TX_DESC_DATA0_IOC_; + } tx_descriptor = &tx->ring_cpu_ptr[tx->frame_tail]; buffer_info = &tx->buffer_info[tx->frame_tail]; @@ -1600,7 +1608,7 @@ static netdev_tx_t lan743x_tx_xmit_frame(struct lan743x_tx *tx, } if (gso) - lan743x_tx_frame_add_lso(tx, frame_length); + lan743x_tx_frame_add_lso(tx, frame_length, nr_frags); if (nr_frags <= 0) goto finish; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c @@ -692,15 +692,20 @@ out: static int geneve_open(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); - bool ipv6 = !!(geneve->info.mode & IP_TUNNEL_INFO_IPV6); bool metadata = geneve->collect_md; + bool ipv4, ipv6; int ret = 0; + ipv6 = geneve->info.mode & IP_TUNNEL_INFO_IPV6 || metadata; + ipv4 = !ipv6 || metadata; #if IS_ENABLED(CONFIG_IPV6) - if (ipv6 || metadata) + if (ipv6) { ret = geneve_sock_add(geneve, true); + if (ret < 0 && ret != -EAFNOSUPPORT) + ipv4 = false; + } #endif - if (!ret && (!ipv6 || metadata)) + if (ipv4) ret = geneve_sock_add(geneve, false); if (ret < 0) geneve_sock_release(geneve); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c @@ -744,6 +744,14 @@ void netvsc_linkstatus_callback(struct net_device *net, schedule_delayed_work(&ndev_ctx->dwork, 0); } +static void netvsc_comp_ipcsum(struct sk_buff *skb) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + + iph->check = 0; + iph->check = ip_fast_csum(iph, iph->ihl); +} + static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, struct netvsc_channel *nvchan) { @@ -770,9 +778,17 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, /* skb is already created with CHECKSUM_NONE */ skb_checksum_none_assert(skb); - /* - * In Linux, the IP checksum is always checked. - * Do L4 checksum offload if enabled and present. + /* Incoming packets may have IP header checksum verified by the host. + * They may not have IP header checksum computed after coalescing. + * We compute it here if the flags are set, because on Linux, the IP + * checksum is always checked. + */ + if (csum_info && csum_info->receive.ip_checksum_value_invalid && + csum_info->receive.ip_checksum_succeeded && + skb->protocol == htons(ETH_P_IP)) + netvsc_comp_ipcsum(skb); + + /* Do L4 checksum offload if enabled and present. */ if (csum_info && (net->features & NETIF_F_RXCSUM)) { if (csum_info->receive.tcp_checksum_succeeded || diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/of.h> #include <linux/phy.h> +#include <linux/delay.h> #include <dt-bindings/net/ti-dp83867.h> @@ -325,6 +326,8 @@ static int dp83867_phy_reset(struct phy_device *phydev) if (err < 0) return err; + usleep_range(10, 20); + return dp83867_config_init(phydev); } diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c @@ -344,6 +344,17 @@ static int ksz8041_config_aneg(struct phy_device *phydev) return genphy_config_aneg(phydev); } +static int ksz8061_config_init(struct phy_device *phydev) +{ + int ret; + + ret = phy_write_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_DEVID1, 0xB61A); + if (ret) + return ret; + + return kszphy_config_init(phydev); +} + static int ksz9021_load_values_from_of(struct phy_device *phydev, const struct device_node *of_node, u16 reg, @@ -1040,7 +1051,7 @@ static struct phy_driver ksphy_driver[] = { .name = "Micrel KSZ8061", .phy_id_mask = MICREL_PHY_ID_MASK, .features = PHY_BASIC_FEATURES, - .config_init = kszphy_config_init, + .config_init = ksz8061_config_init, .ack_interrupt = kszphy_ack_interrupt, .config_intr = kszphy_config_intr, .suspend = genphy_suspend, diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c @@ -320,6 +320,10 @@ static int phylink_get_mac_state(struct phylink *pl, struct phylink_link_state * linkmode_zero(state->lp_advertising); state->interface = pl->link_config.interface; state->an_enabled = pl->link_config.an_enabled; + state->speed = SPEED_UNKNOWN; + state->duplex = DUPLEX_UNKNOWN; + state->pause = MLO_PAUSE_NONE; + state->an_complete = 0; state->link = 1; return pl->ops->mac_link_state(ndev, state); diff --git a/drivers/net/tun.c b/drivers/net/tun.c @@ -2167,9 +2167,9 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) } add_wait_queue(&tfile->wq.wait, &wait); - current->state = TASK_INTERRUPTIBLE; while (1) { + set_current_state(TASK_INTERRUPTIBLE); ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) break; @@ -2185,7 +2185,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) schedule(); } - current->state = TASK_RUNNING; + __set_current_state(TASK_RUNNING); remove_wait_queue(&tfile->wq.wait, &wait); out: diff --git a/drivers/net/xen-netback/hash.c b/drivers/net/xen-netback/hash.c @@ -454,6 +454,8 @@ void xenvif_init_hash(struct xenvif *vif) if (xenvif_hash_cache_size == 0) return; + BUG_ON(vif->hash.cache.count); + spin_lock_init(&vif->hash.cache.lock); INIT_LIST_HEAD(&vif->hash.cache.list); } diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c @@ -153,6 +153,13 @@ static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb, { struct xenvif *vif = netdev_priv(dev); unsigned int size = vif->hash.size; + unsigned int num_queues; + + /* If queues are not set up internally - always return 0 + * as the packet going to be dropped anyway */ + num_queues = READ_ONCE(vif->num_queues); + if (num_queues < 1) + return 0; if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE) return fallback(dev, skb, NULL) % dev->real_num_tx_queues; diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c @@ -1072,11 +1072,6 @@ static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *s skb_frag_size_set(&frags[i], len); } - /* Copied all the bits from the frag list -- free it. */ - skb_frag_list_init(skb); - xenvif_skb_zerocopy_prepare(queue, nskb); - kfree_skb(nskb); - /* Release all the original (foreign) frags. */ for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) skb_frag_unref(skb, f); @@ -1145,6 +1140,8 @@ static int xenvif_tx_submit(struct xenvif_queue *queue) xenvif_fill_frags(queue, skb); if (unlikely(skb_has_frag_list(skb))) { + struct sk_buff *nskb = skb_shinfo(skb)->frag_list; + xenvif_skb_zerocopy_prepare(queue, nskb); if (xenvif_handle_frag_list(queue, skb)) { if (net_ratelimit()) netdev_err(queue->vif->dev, @@ -1153,6 +1150,9 @@ static int xenvif_tx_submit(struct xenvif_queue *queue) kfree_skb(skb); continue; } + /* Copied all the bits from the frag list -- free it. */ + skb_frag_list_init(skb); + kfree_skb(nskb); } skb->dev = queue->vif->dev; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h @@ -3861,7 +3861,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) if (debug_value == 0) /* no output */ return 0; /* set low N bits */ - return (1 << debug_value) - 1; + return (1U << debug_value) - 1; } static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) diff --git a/include/net/icmp.h b/include/net/icmp.h @@ -22,6 +22,7 @@ #include <net/inet_sock.h> #include <net/snmp.h> +#include <net/ip.h> struct icmp_err { int errno; @@ -39,7 +40,13 @@ struct net_proto_family; struct sk_buff; struct net; -void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info); +void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, + const struct ip_options *opt); +static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) +{ + __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt); +} + int icmp_rcv(struct sk_buff *skb); int icmp_err(struct sk_buff *skb, u32 info); int icmp_init(void); diff --git a/include/net/ip.h b/include/net/ip.h @@ -667,6 +667,8 @@ static inline int ip_options_echo(struct net *net, struct ip_options *dopt, } void ip_options_fragment(struct sk_buff *skb); +int __ip_options_compile(struct net *net, struct ip_options *opt, + struct sk_buff *skb, __be32 *info); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb); int ip_options_get(struct net *net, struct ip_options_rcu **optp, @@ -716,7 +718,7 @@ extern int sysctl_icmp_msgs_burst; int ip_misc_proc_init(void); #endif -int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, +int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family, struct netlink_ext_ack *extack); #endif /* _IP_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c @@ -559,12 +559,12 @@ static int map_create(union bpf_attr *attr) err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. - * bpf_map_put() is needed because the above + * bpf_map_put_with_uref() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ - bpf_map_put(map); + bpf_map_put_with_uref(map); return err; } @@ -1986,7 +1986,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) fd = bpf_map_new_fd(map, f_flags); if (fd < 0) - bpf_map_put(map); + bpf_map_put_with_uref(map); return fd; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c @@ -6920,7 +6920,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) u32 off_reg; aux = &env->insn_aux_data[i + delta]; - if (!aux->alu_state) + if (!aux->alu_state || + aux->alu_state == BPF_ALU_NON_POINTER) continue; isneg = aux->alu_state & BPF_ALU_NEG_VALUE; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c @@ -612,8 +612,8 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, { struct device_node *ports, *port; struct dsa_port *dp; + int err = 0; u32 reg; - int err; ports = of_get_child_by_name(dn, "ports"); if (!ports) { @@ -624,19 +624,23 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, for_each_available_child_of_node(ports, port) { err = of_property_read_u32(port, "reg", &reg); if (err) - return err; + goto out_put_node; - if (reg >= ds->num_ports) - return -EINVAL; + if (reg >= ds->num_ports) { + err = -EINVAL; + goto out_put_node; + } dp = &ds->ports[reg]; err = dsa_port_parse_of(dp, port); if (err) - return err; + goto out_put_node; } - return 0; +out_put_node: + of_node_put(ports); + return err; } static int dsa_switch_parse_member_of(struct dsa_switch *ds, diff --git a/net/dsa/port.c b/net/dsa/port.c @@ -292,6 +292,7 @@ static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) return ERR_PTR(-EPROBE_DEFER); } + of_node_put(phy_dn); return phydev; } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c @@ -667,7 +667,8 @@ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level) case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: - if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL) + if ((level < doi_def->map.std->lvl.cipso_size) && + (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)) return 0; break; } @@ -1735,13 +1736,26 @@ validate_return: */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options *opt = (struct ip_options *)optbuf; + if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; + /* + * We might be called above the IP layer, + * so we can not use icmp_send and IPCB here. + */ + + memset(opt, 0, sizeof(struct ip_options)); + opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); + if (__ip_options_compile(dev_net(skb->dev), opt, skb, NULL)) + return; + if (gateway) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt); else - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt); } /** diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c @@ -710,6 +710,10 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_GATEWAY: cfg->fc_gw = nla_get_be32(attr); break; + case RTA_VIA: + NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute"); + err = -EINVAL; + goto errout; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c @@ -570,7 +570,8 @@ relookup_failed: * MUST reply to only the first fragment. */ -void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) +void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, + const struct ip_options *opt) { struct iphdr *iph; int room; @@ -691,7 +692,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in)) + if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) goto out_unlock; @@ -742,7 +743,7 @@ out_bh_enable: local_bh_enable(); out:; } -EXPORT_SYMBOL(icmp_send); +EXPORT_SYMBOL(__icmp_send); static void icmp_socket_deliver(struct sk_buff *skb, u32 info) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c @@ -307,11 +307,10 @@ drop: } static int ip_rcv_finish_core(struct net *net, struct sock *sk, - struct sk_buff *skb) + struct sk_buff *skb, struct net_device *dev) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); - struct net_device *dev = skb->dev; struct rtable *rt; int err; @@ -400,6 +399,7 @@ drop_error: static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + struct net_device *dev = skb->dev; int ret; /* if ingress device is enslaved to an L3 master device pass the @@ -409,7 +409,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) if (!skb) return NET_RX_SUCCESS; - ret = ip_rcv_finish_core(net, sk, skb); + ret = ip_rcv_finish_core(net, sk, skb, dev); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; @@ -545,6 +545,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; struct dst_entry *dst; skb_list_del_init(skb); @@ -554,7 +555,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip_rcv(skb); if (!skb) continue; - if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP) + if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP) continue; dst = skb_dst(skb); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c @@ -251,8 +251,9 @@ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) * If opt == NULL, then skb->data should point to IP header. */ -int ip_options_compile(struct net *net, - struct ip_options *opt, struct sk_buff *skb) +int __ip_options_compile(struct net *net, + struct ip_options *opt, struct sk_buff *skb, + __be32 *info) { __be32 spec_dst = htonl(INADDR_ANY); unsigned char *pp_ptr = NULL; @@ -468,11 +469,22 @@ eol: return 0; error: - if (skb) { - icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24)); - } + if (info) + *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } + +int ip_options_compile(struct net *net, + struct ip_options *opt, struct sk_buff *skb) +{ + int ret; + __be32 info; + + ret = __ip_options_compile(net, opt, skb, &info); + if (ret != 0 && skb) + icmp_send(skb, ICMP_PARAMETERPROB, 0, info); + return ret; +} EXPORT_SYMBOL(ip_options_compile); /* diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c @@ -3,9 +3,10 @@ #include <linux/types.h> #include <net/net_namespace.h> #include <net/netlink.h> +#include <linux/in6.h> #include <net/ip.h> -int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, +int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family, struct netlink_ext_ack *extack) { *ip_proto = nla_get_u8(attr); @@ -13,11 +14,19 @@ int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, switch (*ip_proto) { case IPPROTO_TCP: case IPPROTO_UDP: + return 0; case IPPROTO_ICMP: + if (family != AF_INET) + break; + return 0; +#if IS_ENABLED(CONFIG_IPV6) + case IPPROTO_ICMPV6: + if (family != AF_INET6) + break; return 0; - default: - NL_SET_ERR_MSG(extack, "Unsupported ip proto"); - return -EOPNOTSUPP; +#endif } + NL_SET_ERR_MSG(extack, "Unsupported ip proto"); + return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto); diff --git a/net/ipv4/route.c b/net/ipv4/route.c @@ -2803,7 +2803,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], - &ip_proto, extack); + &ip_proto, AF_INET, extack); if (err) return err; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c @@ -4182,6 +4182,10 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; } + if (tb[RTA_VIA]) { + NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); + goto errout; + } if (tb[RTA_DST]) { int plen = (rtm->rtm_dst_len + 7) >> 3; @@ -4889,7 +4893,8 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], - &fl6.flowi6_proto, extack); + &fl6.flowi6_proto, AF_INET6, + extack); if (err) goto errout; } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c @@ -1873,6 +1873,7 @@ static int __net_init sit_init_net(struct net *net) err_reg_dev: ipip6_dev_free(sitn->fb_tunnel_dev); + free_netdev(sitn->fb_tunnel_dev); err_alloc_dev: return err; } diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c @@ -1838,6 +1838,9 @@ static int rtm_to_route_config(struct sk_buff *skb, goto errout; break; } + case RTA_GATEWAY: + NL_SET_ERR_MSG(extack, "MPLS does not support RTA_GATEWAY attribute"); + goto errout; case RTA_VIA: { if (nla_get_via(nla, &cfg->rc_via_alen, diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c @@ -903,7 +903,8 @@ int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len, (state == 0 && (byte & bitmask) == 0)) return bit_spot; - bit_spot++; + if (++bit_spot >= bitmap_len) + return -1; bitmask >>= 1; if (bitmask == 0) { byte = bitmap[++byte_offset]; diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c @@ -419,6 +419,10 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) sock->service_name, sock->service_name_len, &service_name_tlv_length); + if (!service_name_tlv) { + err = -ENOMEM; + goto error_tlv; + } size += service_name_tlv_length; } @@ -429,9 +433,17 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0, &miux_tlv_length); + if (!miux_tlv) { + err = -ENOMEM; + goto error_tlv; + } size += miux_tlv_length; rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length); + if (!rw_tlv) { + err = -ENOMEM; + goto error_tlv; + } size += rw_tlv_length; pr_debug("SKB size %d SN length %zu\n", size, sock->service_name_len); @@ -484,9 +496,17 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0, &miux_tlv_length); + if (!miux_tlv) { + err = -ENOMEM; + goto error_tlv; + } size += miux_tlv_length; rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length); + if (!rw_tlv) { + err = -ENOMEM; + goto error_tlv; + } size += rw_tlv_length; skb = llcp_allocate_pdu(sock, LLCP_PDU_CC, size); diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c @@ -532,10 +532,10 @@ static u8 nfc_llcp_reserve_sdp_ssap(struct nfc_llcp_local *local) static int nfc_llcp_build_gb(struct nfc_llcp_local *local) { - u8 *gb_cur, *version_tlv, version, version_length; - u8 *lto_tlv, lto_length; - u8 *wks_tlv, wks_length; - u8 *miux_tlv, miux_length; + u8 *gb_cur, version, version_length; + u8 lto_length, wks_length, miux_length; + u8 *version_tlv = NULL, *lto_tlv = NULL, + *wks_tlv = NULL, *miux_tlv = NULL; __be16 wks = cpu_to_be16(local->local_wks); u8 gb_len = 0; int ret = 0; @@ -543,17 +543,33 @@ static int nfc_llcp_build_gb(struct nfc_llcp_local *local) version = LLCP_VERSION_11; version_tlv = nfc_llcp_build_tlv(LLCP_TLV_VERSION, &version, 1, &version_length); + if (!version_tlv) { + ret = -ENOMEM; + goto out; + } gb_len += version_length; lto_tlv = nfc_llcp_build_tlv(LLCP_TLV_LTO, &local->lto, 1, &lto_length); + if (!lto_tlv) { + ret = -ENOMEM; + goto out; + } gb_len += lto_length; pr_debug("Local wks 0x%lx\n", local->local_wks); wks_tlv = nfc_llcp_build_tlv(LLCP_TLV_WKS, (u8 *)&wks, 2, &wks_length); + if (!wks_tlv) { + ret = -ENOMEM; + goto out; + } gb_len += wks_length; miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&local->miux, 0, &miux_length); + if (!miux_tlv) { + ret = -ENOMEM; + goto out; + } gb_len += miux_length; gb_len += ARRAY_SIZE(llcp_magic); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c @@ -199,8 +199,7 @@ err3: err2: kfree(tname); err1: - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return err; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c @@ -189,8 +189,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c @@ -377,7 +377,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, return ret; release_tun_meta: - dst_release(&metadata->dst); + if (metadata) + dst_release(&metadata->dst); err_out: if (exists) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c @@ -447,6 +447,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, int nb = 0; int count = 1; int rc = NET_XMIT_SUCCESS; + int rc_drop = NET_XMIT_DROP; /* Do not fool qdisc_drop_all() */ skb->prev = NULL; @@ -486,6 +487,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->duplicate = 0; rootq->enqueue(skb2, rootq, to_free); q->duplicate = dupsave; + rc_drop = NET_XMIT_SUCCESS; } /* @@ -498,7 +500,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (skb_is_gso(skb)) { segs = netem_segment(skb, sch, to_free); if (!segs) - return NET_XMIT_DROP; + return rc_drop; } else { segs = skb; } @@ -521,8 +523,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1<<(prandom_u32() % 8); } - if (unlikely(sch->q.qlen >= sch->limit)) - return qdisc_drop_all(skb, sch, to_free); + if (unlikely(sch->q.qlen >= sch->limit)) { + qdisc_drop_all(skb, sch, to_free); + return rc_drop; + } qdisc_qstats_backlog_inc(sch, skb); diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c @@ -192,7 +192,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (unlikely(!max_data)) { max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), sctp_datachk_len(&asoc->stream)); - pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%Zu)", + pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%zu)", __func__, asoc, max_data); } diff --git a/net/socket.c b/net/socket.c @@ -577,6 +577,7 @@ static void __sock_release(struct socket *sock, struct inode *inode) if (inode) inode_lock(inode); sock->ops->release(sock); + sock->sk = NULL; if (inode) inode_unlock(inode); sock->ops = NULL; diff --git a/net/tipc/socket.c b/net/tipc/socket.c @@ -379,11 +379,13 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout) #define tipc_wait_for_cond(sock_, timeo_, condition_) \ ({ \ + DEFINE_WAIT_FUNC(wait_, woken_wake_function); \ struct sock *sk_; \ int rc_; \ \ while ((rc_ = !(condition_))) { \ - DEFINE_WAIT_FUNC(wait_, woken_wake_function); \ + /* coupled with smp_wmb() in tipc_sk_proto_rcv() */ \ + smp_rmb(); \ sk_ = (sock_)->sk; \ rc_ = tipc_sk_sock_err((sock_), timeo_); \ if (rc_) \ @@ -1983,6 +1985,8 @@ static void tipc_sk_proto_rcv(struct sock *sk, return; case SOCK_WAKEUP: tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0); + /* coupled with smp_rmb() in tipc_wait_for_cond() */ + smp_wmb(); tsk->cong_link_cnt--; wakeup = true; break; diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh @@ -103,6 +103,15 @@ # and check that configured MTU is used on link creation and changes, and # that MTU is properly calculated instead when MTU is not configured from # userspace +# +# - cleanup_ipv4_exception +# Similar to pmtu_ipv4_vxlan4_exception, but explicitly generate PMTU +# exceptions on multiple CPUs and check that the veth device tear-down +# happens in a timely manner +# +# - cleanup_ipv6_exception +# Same as above, but use IPv6 transport from A to B + # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 @@ -135,7 +144,9 @@ tests=" pmtu_vti6_default_mtu vti6: default MTU assignment pmtu_vti4_link_add_mtu vti4: MTU setting on link creation pmtu_vti6_link_add_mtu vti6: MTU setting on link creation - pmtu_vti6_link_change_mtu vti6: MTU changes on link changes" + pmtu_vti6_link_change_mtu vti6: MTU changes on link changes + cleanup_ipv4_exception ipv4: cleanup of cached exceptions + cleanup_ipv6_exception ipv6: cleanup of cached exceptions" NS_A="ns-$(mktemp -u XXXXXX)" NS_B="ns-$(mktemp -u XXXXXX)" @@ -263,8 +274,6 @@ setup_fou_or_gue() { ${ns_a} ip link set ${encap}_a up ${ns_b} ip link set ${encap}_b up - - sleep 1 } setup_fou44() { @@ -302,6 +311,10 @@ setup_gue66() { setup_namespaces() { for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do ip netns add ${n} || return 1 + + # Disable DAD, so that we don't have to wait to use the + # configured IPv6 addresses + ip netns exec ${n} sysctl -q net/ipv6/conf/default/accept_dad=0 done } @@ -337,8 +350,6 @@ setup_vti() { ${ns_a} ip link set vti${proto}_a up ${ns_b} ip link set vti${proto}_b up - - sleep 1 } setup_vti4() { @@ -375,8 +386,6 @@ setup_vxlan_or_geneve() { ${ns_a} ip link set ${type}_a up ${ns_b} ip link set ${type}_b up - - sleep 1 } setup_geneve4() { @@ -588,8 +597,8 @@ test_pmtu_ipvX() { mtu "${ns_b}" veth_B-R2 1500 # Create route exceptions - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1800 ${dst1} > /dev/null - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1800 ${dst2} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst1} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst2} > /dev/null # Check that exceptions have been created with the correct PMTU pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})" @@ -621,7 +630,7 @@ test_pmtu_ipvX() { # Decrease remote MTU on path via R2, get new exception mtu "${ns_r2}" veth_R2-B 400 mtu "${ns_b}" veth_B-R2 400 - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1400 ${dst2} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1400 ${dst2} > /dev/null pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})" check_pmtu_value "lock 552" "${pmtu_2}" "exceeding MTU, with MTU < min_pmtu" || return 1 @@ -638,7 +647,7 @@ test_pmtu_ipvX() { check_pmtu_value "1500" "${pmtu_2}" "increasing local MTU" || return 1 # Get new exception - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1400 ${dst2} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1400 ${dst2} > /dev/null pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})" check_pmtu_value "lock 552" "${pmtu_2}" "exceeding MTU, with MTU < min_pmtu" || return 1 } @@ -687,7 +696,7 @@ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() { mtu "${ns_a}" ${type}_a $((${ll_mtu} + 1000)) mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000)) - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((${ll_mtu} + 500)) ${dst} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst} > /dev/null # Check that exception was created pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" @@ -767,7 +776,7 @@ test_pmtu_ipvX_over_fouY_or_gueY() { mtu "${ns_a}" ${encap}_a $((${ll_mtu} + 1000)) mtu "${ns_b}" ${encap}_b $((${ll_mtu} + 1000)) - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((${ll_mtu} + 500)) ${dst} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst} > /dev/null # Check that exception was created pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" @@ -825,13 +834,13 @@ test_pmtu_vti4_exception() { # Send DF packet without exceeding link layer MTU, check that no # exception is created - ${ns_a} ping -q -M want -i 0.1 -w 2 -s ${ping_payload} ${tunnel4_b_addr} > /dev/null + ${ns_a} ping -q -M want -i 0.1 -w 1 -s ${ping_payload} ${tunnel4_b_addr} > /dev/null pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})" check_pmtu_value "" "${pmtu}" "sending packet smaller than PMTU (IP payload length ${esp_payload_rfc4106})" || return 1 # Now exceed link layer MTU by one byte, check that exception is created # with the right PMTU value - ${ns_a} ping -q -M want -i 0.1 -w 2 -s $((ping_payload + 1)) ${tunnel4_b_addr} > /dev/null + ${ns_a} ping -q -M want -i 0.1 -w 1 -s $((ping_payload + 1)) ${tunnel4_b_addr} > /dev/null pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})" check_pmtu_value "${esp_payload_rfc4106}" "${pmtu}" "exceeding PMTU (IP payload length $((esp_payload_rfc4106 + 1)))" } @@ -847,7 +856,7 @@ test_pmtu_vti6_exception() { mtu "${ns_b}" veth_b 4000 mtu "${ns_a}" vti6_a 5000 mtu "${ns_b}" vti6_b 5000 - ${ns_a} ${ping6} -q -i 0.1 -w 2 -s 60000 ${tunnel6_b_addr} > /dev/null + ${ns_a} ${ping6} -q -i 0.1 -w 1 -s 60000 ${tunnel6_b_addr} > /dev/null # Check that exception was created pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})" @@ -1008,6 +1017,61 @@ test_pmtu_vti6_link_change_mtu() { return ${fail} } +check_command() { + cmd=${1} + + if ! which ${cmd} > /dev/null 2>&1; then + err " missing required command: '${cmd}'" + return 1 + fi + return 0 +} + +test_cleanup_vxlanX_exception() { + outer="${1}" + encap="vxlan" + ll_mtu=4000 + + check_command taskset || return 2 + cpu_list=$(grep -m 2 processor /proc/cpuinfo | cut -d ' ' -f 2) + + setup namespaces routing ${encap}${outer} || return 2 + trace "${ns_a}" ${encap}_a "${ns_b}" ${encap}_b \ + "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ + "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B + + # Create route exception by exceeding link layer MTU + mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000)) + mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000)) + mtu "${ns_b}" veth_B-R1 ${ll_mtu} + mtu "${ns_r1}" veth_R1-B ${ll_mtu} + + mtu "${ns_a}" ${encap}_a $((${ll_mtu} + 1000)) + mtu "${ns_b}" ${encap}_b $((${ll_mtu} + 1000)) + + # Fill exception cache for multiple CPUs (2) + # we can always use inner IPv4 for that + for cpu in ${cpu_list}; do + taskset --cpu-list ${cpu} ${ns_a} ping -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${tunnel4_b_addr} > /dev/null + done + + ${ns_a} ip link del dev veth_A-R1 & + iplink_pid=$! + sleep 1 + if [ "$(cat /proc/${iplink_pid}/cmdline 2>/dev/null | tr -d '\0')" = "iplinkdeldevveth_A-R1" ]; then + err " can't delete veth device in a timely manner, PMTU dst likely leaked" + return 1 + fi +} + +test_cleanup_ipv6_exception() { + test_cleanup_vxlanX_exception 6 +} + +test_cleanup_ipv4_exception() { + test_cleanup_vxlanX_exception 4 +} + usage() { echo echo "$0 [OPTIONS] [TEST]..." diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh @@ -37,7 +37,7 @@ run_one() { cfg_veth - ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} && \ + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} && \ echo "ok" || \ echo "failed" & @@ -81,7 +81,7 @@ run_one_nat() { # will land on the 'plain' one ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -b ${addr1} -n 0 & pid=$! - ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${family} -b ${addr2%/*} ${rx_args} && \ + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} && \ echo "ok" || \ echo "failed"& @@ -99,8 +99,8 @@ run_one_2sock() { cfg_veth - ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -p 12345 & - ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} && \ + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} -p 12345 & + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} && \ echo "ok" || \ echo "failed" & diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -45,6 +45,8 @@ static int cfg_alen = sizeof(struct sockaddr_in6); static int cfg_expected_pkt_nr; static int cfg_expected_pkt_len; static int cfg_expected_gso_size; +static int cfg_connect_timeout_ms; +static int cfg_rcv_timeout_ms; static struct sockaddr_storage cfg_bind_addr; static bool interrupted; @@ -87,7 +89,7 @@ static unsigned long gettimeofday_ms(void) return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); } -static void do_poll(int fd) +static void do_poll(int fd, int timeout_ms) { struct pollfd pfd; int ret; @@ -102,8 +104,16 @@ static void do_poll(int fd) break; if (ret == -1) error(1, errno, "poll"); - if (ret == 0) - continue; + if (ret == 0) { + if (!timeout_ms) + continue; + + timeout_ms -= 10; + if (timeout_ms <= 0) { + interrupted = true; + break; + } + } if (pfd.revents != POLLIN) error(1, errno, "poll: 0x%x expected 0x%x\n", pfd.revents, POLLIN); @@ -134,7 +144,7 @@ static int do_socket(bool do_tcp) if (listen(accept_fd, 1)) error(1, errno, "listen"); - do_poll(accept_fd); + do_poll(accept_fd, cfg_connect_timeout_ms); if (interrupted) exit(0); @@ -273,7 +283,9 @@ static void do_flush_udp(int fd) static void usage(const char *filepath) { - error(1, 0, "Usage: %s [-Grtv] [-b addr] [-p port] [-l pktlen] [-n packetnr] [-S gsosize]", filepath); + error(1, 0, "Usage: %s [-C connect_timeout] [-Grtv] [-b addr] [-p port]" + " [-l pktlen] [-n packetnr] [-R rcv_timeout] [-S gsosize]", + filepath); } static void parse_opts(int argc, char **argv) @@ -282,7 +294,7 @@ static void parse_opts(int argc, char **argv) /* bind to any by default */ setup_sockaddr(PF_INET6, "::", &cfg_bind_addr); - while ((c = getopt(argc, argv, "4b:Gl:n:p:rS:tv")) != -1) { + while ((c = getopt(argc, argv, "4b:C:Gl:n:p:rR:S:tv")) != -1) { switch (c) { case '4': cfg_family = PF_INET; @@ -292,6 +304,9 @@ static void parse_opts(int argc, char **argv) case 'b': setup_sockaddr(cfg_family, optarg, &cfg_bind_addr); break; + case 'C': + cfg_connect_timeout_ms = strtoul(optarg, NULL, 0); + break; case 'G': cfg_gro_segment = true; break; @@ -307,6 +322,9 @@ static void parse_opts(int argc, char **argv) case 'r': cfg_read_all = true; break; + case 'R': + cfg_rcv_timeout_ms = strtoul(optarg, NULL, 0); + break; case 'S': cfg_expected_gso_size = strtol(optarg, NULL, 0); break; @@ -329,8 +347,9 @@ static void parse_opts(int argc, char **argv) static void do_recv(void) { + int timeout_ms = cfg_tcp ? cfg_rcv_timeout_ms : cfg_connect_timeout_ms; unsigned long tnow, treport; - int fd, loop = 0; + int fd; fd = do_socket(cfg_tcp); @@ -342,12 +361,7 @@ static void do_recv(void) treport = gettimeofday_ms() + 1000; do { - /* force termination after the second poll(); this cope both - * with sender slower than receiver and missing packet errors - */ - if (cfg_expected_pkt_nr && loop++) - interrupted = true; - do_poll(fd); + do_poll(fd, timeout_ms); if (cfg_tcp) do_flush_tcp(fd); @@ -365,6 +379,8 @@ static void do_recv(void) treport = tnow + 1000; } + timeout_ms = cfg_rcv_timeout_ms; + } while (!interrupted); if (cfg_expected_pkt_nr && (packets != cfg_expected_pkt_nr))