whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 60b548237fed4b4164bab13c994dd9615f6c4323
parent b26b2b24b1d8cc6e24d81872e6e18c0d76382a81
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Wed, 28 Nov 2018 12:53:48 -0800

Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net

Pull networking fixes from David Miller:

 1) ARM64 JIT fixes for subprog handling from Daniel Borkmann.

 2) Various sparc64 JIT bug fixes (fused branch convergance, frame
    pointer usage detection logic, PSEODU call argument handling).

 3) Fix to use BH locking in nf_conncount, from Taehee Yoo.

 4) Fix race of TX skb freeing in ipheth driver, from Bernd Eckstein.

 5) Handle return value of TX NAPI completion properly in lan743x
    driver, from Bryan Whitehead.

 6) MAC filter deletion in i40e driver clears wrong state bit, from
    Lihong Yang.

 7) Fix use after free in rionet driver, from Pan Bian.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (53 commits)
  s390/qeth: fix length check in SNMP processing
  net: hisilicon: remove unexpected free_netdev
  rapidio/rionet: do not free skb before reading its length
  i40e: fix kerneldoc for xsk methods
  ixgbe: recognize 1000BaseLX SFP modules as 1Gbps
  i40e: Fix deletion of MAC filters
  igb: fix uninitialized variables
  netfilter: nf_tables: deactivate expressions in rule replecement routine
  lan743x: Enable driver to work with LAN7431
  tipc: fix lockdep warning during node delete
  lan743x: fix return value for lan743x_tx_napi_poll
  net: via: via-velocity: fix spelling mistake "alignement" -> "alignment"
  qed: fix spelling mistake "attnetion" -> "attention"
  net: thunderx: fix NULL pointer dereference in nic_remove
  sctp: increase sk_wmem_alloc when head->truesize is increased
  firestream: fix spelling mistake: "Inititing" -> "Initializing"
  net: phy: add workaround for issue where PHY driver doesn't bind to the device
  usbnet: ipheth: fix potential recvmsg bug and recvmsg bug 2
  sparc: Adjust bpf JIT prologue for PSEUDO calls.
  bpf, doc: add entries of who looks over which jits
  ...

Diffstat:
MMAINTAINERS | 63++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
March/arm64/net/bpf_jit_comp.c | 26+++++++++++++++++---------
March/powerpc/net/bpf_jit_comp64.c | 57++++++++++++++++++++++++++++++++++++++-------------------
March/sparc/net/bpf_jit_comp_64.c | 97+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Mdrivers/atm/firestream.c | 4++--
Mdrivers/net/ethernet/cavium/thunder/nic_main.c | 3+++
Mdrivers/net/ethernet/hisilicon/hip04_eth.c | 4+---
Mdrivers/net/ethernet/intel/i40e/i40e_main.c | 2+-
Mdrivers/net/ethernet/intel/i40e/i40e_xsk.c | 14+++++++-------
Mdrivers/net/ethernet/intel/igb/e1000_i210.c | 1+
Mdrivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 4+++-
Mdrivers/net/ethernet/microchip/lan743x_main.c | 11++++++-----
Mdrivers/net/ethernet/microchip/lan743x_main.h | 1+
Mdrivers/net/ethernet/qlogic/qed/qed_debug.c | 2+-
Mdrivers/net/ethernet/via/via-velocity.c | 2+-
Mdrivers/net/phy/phy_device.c | 8++++++++
Mdrivers/net/rionet.c | 2+-
Mdrivers/net/usb/ipheth.c | 10++++------
Mdrivers/s390/net/qeth_core_main.c | 27++++++++++++---------------
Minclude/linux/filter.h | 4++++
Minclude/linux/netfilter/nf_conntrack_proto_gre.h | 13+++++++++++++
Minclude/net/netfilter/ipv4/nf_nat_masquerade.h | 2+-
Minclude/net/netfilter/ipv6/nf_nat_masquerade.h | 2+-
Mkernel/bpf/core.c | 34++++++++++++++++++++++++++++++++++
Mkernel/bpf/local_storage.c | 3++-
Mkernel/bpf/queue_stack_maps.c | 16++++++++--------
Mkernel/bpf/verifier.c | 2+-
Mkernel/trace/bpf_trace.c | 8+++++---
Mnet/core/filter.c | 5++---
Mnet/ipv4/ip_output.c | 3++-
Mnet/ipv4/netfilter/ipt_MASQUERADE.c | 7+++++--
Mnet/ipv4/netfilter/nf_nat_masquerade_ipv4.c | 38++++++++++++++++++++++++++++++--------
Mnet/ipv4/netfilter/nft_masq_ipv4.c | 4+++-
Mnet/ipv4/tcp_input.c | 16++++++++++------
Mnet/ipv4/tcp_timer.c | 10++++++----
Mnet/ipv6/ip6_output.c | 3++-
Mnet/ipv6/netfilter.c | 3++-
Mnet/ipv6/netfilter/ip6t_MASQUERADE.c | 8++++++--
Mnet/ipv6/netfilter/nf_nat_masquerade_ipv6.c | 49++++++++++++++++++++++++++++++++++++-------------
Mnet/ipv6/netfilter/nft_masq_ipv6.c | 4+++-
Mnet/netfilter/ipvs/ip_vs_ctl.c | 3+++
Mnet/netfilter/nf_conncount.c | 44++++++++++++++++++++++++++++----------------
Mnet/netfilter/nf_conntrack_proto_gre.c | 14++------------
Mnet/netfilter/nf_tables_api.c | 46+++++++++++++++++-----------------------------
Mnet/netfilter/nfnetlink_cttimeout.c | 15+++++++++++++--
Mnet/netfilter/nft_compat.c | 3++-
Mnet/netfilter/nft_flow_offload.c | 5++++-
Mnet/netfilter/xt_RATEEST.c | 10----------
Mnet/netfilter/xt_hashlimit.c | 9+++------
Mnet/sctp/output.c | 1+
Mnet/tipc/node.c | 7+++++--
Mtools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 8+++++++-
Mtools/bpf/bpftool/Documentation/bpftool-map.rst | 8+++++++-
Mtools/bpf/bpftool/Documentation/bpftool-net.rst | 8+++++++-
Mtools/bpf/bpftool/Documentation/bpftool-perf.rst | 8+++++++-
Mtools/bpf/bpftool/Documentation/bpftool-prog.rst | 11+++++++++--
Mtools/bpf/bpftool/Documentation/bpftool.rst | 9+++++++--
Mtools/bpf/bpftool/common.c | 17+++++++++--------
Mtools/bpf/bpftool/main.h | 2+-
Mtools/bpf/bpftool/prog.c | 13++++++++-----
Atools/include/uapi/linux/pkt_cls.h | 612+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/include/uapi/linux/tc_act/tc_bpf.h | 37+++++++++++++++++++++++++++++++++++++
Mtools/testing/selftests/Makefile | 1+
Mtools/testing/selftests/bpf/test_netcnt.c | 5++++-
Mtools/testing/selftests/bpf/test_verifier.c | 19+++++++++++++++++++
Atools/testing/selftests/netfilter/Makefile | 6++++++
Atools/testing/selftests/netfilter/config | 2++
Atools/testing/selftests/netfilter/nft_trans_stress.sh | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
68 files changed, 1312 insertions(+), 261 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS @@ -2801,7 +2801,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git Q: https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147 S: Supported -F: arch/x86/net/bpf_jit* +F: arch/*/net/* F: Documentation/networking/filter.txt F: Documentation/bpf/ F: include/linux/bpf* @@ -2821,6 +2821,67 @@ F: tools/bpf/ F: tools/lib/bpf/ F: tools/testing/selftests/bpf/ +BPF JIT for ARM +M: Shubham Bansal <illusionist.neo@gmail.com> +L: netdev@vger.kernel.org +S: Maintained +F: arch/arm/net/ + +BPF JIT for ARM64 +M: Daniel Borkmann <daniel@iogearbox.net> +M: Alexei Starovoitov <ast@kernel.org> +M: Zi Shen Lim <zlim.lnx@gmail.com> +L: netdev@vger.kernel.org +S: Supported +F: arch/arm64/net/ + +BPF JIT for MIPS (32-BIT AND 64-BIT) +M: Paul Burton <paul.burton@mips.com> +L: netdev@vger.kernel.org +S: Maintained +F: arch/mips/net/ + +BPF JIT for NFP NICs +M: Jakub Kicinski <jakub.kicinski@netronome.com> +L: netdev@vger.kernel.org +S: Supported +F: drivers/net/ethernet/netronome/nfp/bpf/ + +BPF JIT for POWERPC (32-BIT AND 64-BIT) +M: Naveen N. Rao <naveen.n.rao@linux.ibm.com> +M: Sandipan Das <sandipan@linux.ibm.com> +L: netdev@vger.kernel.org +S: Maintained +F: arch/powerpc/net/ + +BPF JIT for S390 +M: Martin Schwidefsky <schwidefsky@de.ibm.com> +M: Heiko Carstens <heiko.carstens@de.ibm.com> +L: netdev@vger.kernel.org +S: Maintained +F: arch/s390/net/ +X: arch/s390/net/pnet.c + +BPF JIT for SPARC (32-BIT AND 64-BIT) +M: David S. Miller <davem@davemloft.net> +L: netdev@vger.kernel.org +S: Maintained +F: arch/sparc/net/ + +BPF JIT for X86 32-BIT +M: Wang YanQing <udknight@gmail.com> +L: netdev@vger.kernel.org +S: Maintained +F: arch/x86/net/bpf_jit_comp32.c + +BPF JIT for X86 64-BIT +M: Alexei Starovoitov <ast@kernel.org> +M: Daniel Borkmann <daniel@iogearbox.net> +L: netdev@vger.kernel.org +S: Supported +F: arch/x86/net/ +X: arch/x86/net/bpf_jit_comp32.c + BROADCOM B44 10/100 ETHERNET DRIVER M: Michael Chan <michael.chan@broadcom.com> L: netdev@vger.kernel.org diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c @@ -351,7 +351,8 @@ static void build_epilogue(struct jit_ctx *ctx) * >0 - successfully JITed a 16-byte eBPF instruction. * <0 - failed to JIT. */ -static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) +static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, + bool extra_pass) { const u8 code = insn->code; const u8 dst = bpf2a64[insn->dst_reg]; @@ -625,12 +626,19 @@ emit_cond_jmp: case BPF_JMP | BPF_CALL: { const u8 r0 = bpf2a64[BPF_REG_0]; - const u64 func = (u64)__bpf_call_base + imm; + bool func_addr_fixed; + u64 func_addr; + int ret; - if (ctx->prog->is_func) - emit_addr_mov_i64(tmp, func, ctx); + ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, + &func_addr, &func_addr_fixed); + if (ret < 0) + return ret; + if (func_addr_fixed) + /* We can use optimized emission here. */ + emit_a64_mov_i64(tmp, func_addr, ctx); else - emit_a64_mov_i64(tmp, func, ctx); + emit_addr_mov_i64(tmp, func_addr, ctx); emit(A64_BLR(tmp), ctx); emit(A64_MOV(1, r0, A64_R(0)), ctx); break; @@ -753,7 +761,7 @@ emit_cond_jmp: return 0; } -static int build_body(struct jit_ctx *ctx) +static int build_body(struct jit_ctx *ctx, bool extra_pass) { const struct bpf_prog *prog = ctx->prog; int i; @@ -762,7 +770,7 @@ static int build_body(struct jit_ctx *ctx) const struct bpf_insn *insn = &prog->insnsi[i]; int ret; - ret = build_insn(insn, ctx); + ret = build_insn(insn, ctx, extra_pass); if (ret > 0) { i++; if (ctx->image == NULL) @@ -858,7 +866,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* 1. Initial fake pass to compute ctx->idx. */ /* Fake pass to fill in ctx->offset. */ - if (build_body(&ctx)) { + if (build_body(&ctx, extra_pass)) { prog = orig_prog; goto out_off; } @@ -888,7 +896,7 @@ skip_init_ctx: build_prologue(&ctx, was_classic); - if (build_body(&ctx)) { + if (build_body(&ctx, extra_pass)) { bpf_jit_binary_free(header); prog = orig_prog; goto out_off; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c @@ -166,7 +166,33 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) PPC_BLR(); } -static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func) +static void bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, + u64 func) +{ +#ifdef PPC64_ELF_ABI_v1 + /* func points to the function descriptor */ + PPC_LI64(b2p[TMP_REG_2], func); + /* Load actual entry point from function descriptor */ + PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0); + /* ... and move it to LR */ + PPC_MTLR(b2p[TMP_REG_1]); + /* + * Load TOC from function descriptor at offset 8. + * We can clobber r2 since we get called through a + * function pointer (so caller will save/restore r2) + * and since we don't use a TOC ourself. + */ + PPC_BPF_LL(2, b2p[TMP_REG_2], 8); +#else + /* We can clobber r12 */ + PPC_FUNC_ADDR(12, func); + PPC_MTLR(12); +#endif + PPC_BLRL(); +} + +static void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, + u64 func) { unsigned int i, ctx_idx = ctx->idx; @@ -273,7 +299,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, { const struct bpf_insn *insn = fp->insnsi; int flen = fp->len; - int i; + int i, ret; /* Start of epilogue code - will only be valid 2nd pass onwards */ u32 exit_addr = addrs[flen]; @@ -284,8 +310,9 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 src_reg = b2p[insn[i].src_reg]; s16 off = insn[i].off; s32 imm = insn[i].imm; + bool func_addr_fixed; + u64 func_addr; u64 imm64; - u8 *func; u32 true_cond; u32 tmp_idx; @@ -711,23 +738,15 @@ emit_clear: case BPF_JMP | BPF_CALL: ctx->seen |= SEEN_FUNC; - /* bpf function call */ - if (insn[i].src_reg == BPF_PSEUDO_CALL) - if (!extra_pass) - func = NULL; - else if (fp->aux->func && off < fp->aux->func_cnt) - /* use the subprog id from the off - * field to lookup the callee address - */ - func = (u8 *) fp->aux->func[off]->bpf_func; - else - return -EINVAL; - /* kernel helper call */ - else - func = (u8 *) __bpf_call_base + imm; - - bpf_jit_emit_func_call(image, ctx, (u64)func); + ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass, + &func_addr, &func_addr_fixed); + if (ret < 0) + return ret; + if (func_addr_fixed) + bpf_jit_emit_func_call_hlp(image, ctx, func_addr); + else + bpf_jit_emit_func_call_rel(image, ctx, func_addr); /* move return value from r3 to BPF_REG_0 */ PPC_MR(b2p[BPF_REG_0], 3); break; diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c @@ -791,7 +791,7 @@ static int emit_compare_and_branch(const u8 code, const u8 dst, u8 src, } /* Just skip the save instruction and the ctx register move. */ -#define BPF_TAILCALL_PROLOGUE_SKIP 16 +#define BPF_TAILCALL_PROLOGUE_SKIP 32 #define BPF_TAILCALL_CNT_SP_OFF (STACK_BIAS + 128) static void build_prologue(struct jit_ctx *ctx) @@ -824,9 +824,15 @@ static void build_prologue(struct jit_ctx *ctx) const u8 vfp = bpf2sparc[BPF_REG_FP]; emit(ADD | IMMED | RS1(FP) | S13(STACK_BIAS) | RD(vfp), ctx); + } else { + emit_nop(ctx); } emit_reg_move(I0, O0, ctx); + emit_reg_move(I1, O1, ctx); + emit_reg_move(I2, O2, ctx); + emit_reg_move(I3, O3, ctx); + emit_reg_move(I4, O4, ctx); /* If you add anything here, adjust BPF_TAILCALL_PROLOGUE_SKIP above. */ } @@ -1270,6 +1276,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) const u8 tmp2 = bpf2sparc[TMP_REG_2]; u32 opcode = 0, rs2; + if (insn->dst_reg == BPF_REG_FP) + ctx->saw_frame_pointer = true; + ctx->tmp_2_used = true; emit_loadimm(imm, tmp2, ctx); @@ -1308,6 +1317,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) const u8 tmp = bpf2sparc[TMP_REG_1]; u32 opcode = 0, rs2; + if (insn->dst_reg == BPF_REG_FP) + ctx->saw_frame_pointer = true; + switch (BPF_SIZE(code)) { case BPF_W: opcode = ST32; @@ -1340,6 +1352,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) const u8 tmp2 = bpf2sparc[TMP_REG_2]; const u8 tmp3 = bpf2sparc[TMP_REG_3]; + if (insn->dst_reg == BPF_REG_FP) + ctx->saw_frame_pointer = true; + ctx->tmp_1_used = true; ctx->tmp_2_used = true; ctx->tmp_3_used = true; @@ -1360,6 +1375,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) const u8 tmp2 = bpf2sparc[TMP_REG_2]; const u8 tmp3 = bpf2sparc[TMP_REG_3]; + if (insn->dst_reg == BPF_REG_FP) + ctx->saw_frame_pointer = true; + ctx->tmp_1_used = true; ctx->tmp_2_used = true; ctx->tmp_3_used = true; @@ -1425,12 +1443,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) struct bpf_prog *tmp, *orig_prog = prog; struct sparc64_jit_data *jit_data; struct bpf_binary_header *header; + u32 prev_image_size, image_size; bool tmp_blinded = false; bool extra_pass = false; struct jit_ctx ctx; - u32 image_size; u8 *image_ptr; - int pass; + int pass, i; if (!prog->jit_requested) return orig_prog; @@ -1461,61 +1479,82 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) header = jit_data->header; extra_pass = true; image_size = sizeof(u32) * ctx.idx; + prev_image_size = image_size; + pass = 1; goto skip_init_ctx; } memset(&ctx, 0, sizeof(ctx)); ctx.prog = prog; - ctx.offset = kcalloc(prog->len, sizeof(unsigned int), GFP_KERNEL); + ctx.offset = kmalloc_array(prog->len, sizeof(unsigned int), GFP_KERNEL); if (ctx.offset == NULL) { prog = orig_prog; goto out_off; } - /* Fake pass to detect features used, and get an accurate assessment - * of what the final image size will be. + /* Longest sequence emitted is for bswap32, 12 instructions. Pre-cook + * the offset array so that we converge faster. */ - if (build_body(&ctx)) { - prog = orig_prog; - goto out_off; - } - build_prologue(&ctx); - build_epilogue(&ctx); - - /* Now we know the actual image size. */ - image_size = sizeof(u32) * ctx.idx; - header = bpf_jit_binary_alloc(image_size, &image_ptr, - sizeof(u32), jit_fill_hole); - if (header == NULL) { - prog = orig_prog; - goto out_off; - } + for (i = 0; i < prog->len; i++) + ctx.offset[i] = i * (12 * 4); - ctx.image = (u32 *)image_ptr; -skip_init_ctx: - for (pass = 1; pass < 3; pass++) { + prev_image_size = ~0U; + for (pass = 1; pass < 40; pass++) { ctx.idx = 0; build_prologue(&ctx); - if (build_body(&ctx)) { - bpf_jit_binary_free(header); prog = orig_prog; goto out_off; } - build_epilogue(&ctx); if (bpf_jit_enable > 1) - pr_info("Pass %d: shrink = %d, seen = [%c%c%c%c%c%c]\n", pass, - image_size - (ctx.idx * 4), + pr_info("Pass %d: size = %u, seen = [%c%c%c%c%c%c]\n", pass, + ctx.idx * 4, ctx.tmp_1_used ? '1' : ' ', ctx.tmp_2_used ? '2' : ' ', ctx.tmp_3_used ? '3' : ' ', ctx.saw_frame_pointer ? 'F' : ' ', ctx.saw_call ? 'C' : ' ', ctx.saw_tail_call ? 'T' : ' '); + + if (ctx.idx * 4 == prev_image_size) + break; + prev_image_size = ctx.idx * 4; + cond_resched(); + } + + /* Now we know the actual image size. */ + image_size = sizeof(u32) * ctx.idx; + header = bpf_jit_binary_alloc(image_size, &image_ptr, + sizeof(u32), jit_fill_hole); + if (header == NULL) { + prog = orig_prog; + goto out_off; + } + + ctx.image = (u32 *)image_ptr; +skip_init_ctx: + ctx.idx = 0; + + build_prologue(&ctx); + + if (build_body(&ctx)) { + bpf_jit_binary_free(header); + prog = orig_prog; + goto out_off; + } + + build_epilogue(&ctx); + + if (ctx.idx * 4 != prev_image_size) { + pr_err("bpf_jit: Failed to converge, prev_size=%u size=%d\n", + prev_image_size, ctx.idx * 4); + bpf_jit_binary_free(header); + prog = orig_prog; + goto out_off; } if (bpf_jit_enable > 1) diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c @@ -1410,7 +1410,7 @@ static int init_q(struct fs_dev *dev, struct queue *txq, int queue, func_enter (); - fs_dprintk (FS_DEBUG_INIT, "Inititing queue at %x: %d entries:\n", + fs_dprintk (FS_DEBUG_INIT, "Initializing queue at %x: %d entries:\n", queue, nentries); p = aligned_kmalloc (sz, GFP_KERNEL, 0x10); @@ -1443,7 +1443,7 @@ static int init_fp(struct fs_dev *dev, struct freepool *fp, int queue, { func_enter (); - fs_dprintk (FS_DEBUG_INIT, "Inititing free pool at %x:\n", queue); + fs_dprintk (FS_DEBUG_INIT, "Initializing free pool at %x:\n", queue); write_fs (dev, FP_CNF(queue), (bufsize * RBFP_RBS) | RBFP_RBSVAL | RBFP_CME); write_fs (dev, FP_SA(queue), 0); diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c @@ -1441,6 +1441,9 @@ static void nic_remove(struct pci_dev *pdev) { struct nicpf *nic = pci_get_drvdata(pdev); + if (!nic) + return; + if (nic->flags & NIC_SRIOV_ENABLED) pci_disable_sriov(pdev); diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c @@ -915,10 +915,8 @@ static int hip04_mac_probe(struct platform_device *pdev) } ret = register_netdev(ndev); - if (ret) { - free_netdev(ndev); + if (ret) goto alloc_fail; - } return 0; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1413,7 +1413,7 @@ void __i40e_del_filter(struct i40e_vsi *vsi, struct i40e_mac_filter *f) } vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; - set_bit(__I40E_MACVLAN_SYNC_PENDING, vsi->state); + set_bit(__I40E_MACVLAN_SYNC_PENDING, vsi->back->state); } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -33,7 +33,7 @@ static int i40e_alloc_xsk_umems(struct i40e_vsi *vsi) } /** - * i40e_add_xsk_umem - Store an UMEM for a certain ring/qid + * i40e_add_xsk_umem - Store a UMEM for a certain ring/qid * @vsi: Current VSI * @umem: UMEM to store * @qid: Ring/qid to associate with the UMEM @@ -56,7 +56,7 @@ static int i40e_add_xsk_umem(struct i40e_vsi *vsi, struct xdp_umem *umem, } /** - * i40e_remove_xsk_umem - Remove an UMEM for a certain ring/qid + * i40e_remove_xsk_umem - Remove a UMEM for a certain ring/qid * @vsi: Current VSI * @qid: Ring/qid associated with the UMEM **/ @@ -130,7 +130,7 @@ static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem) } /** - * i40e_xsk_umem_enable - Enable/associate an UMEM to a certain ring/qid + * i40e_xsk_umem_enable - Enable/associate a UMEM to a certain ring/qid * @vsi: Current VSI * @umem: UMEM * @qid: Rx ring to associate UMEM to @@ -189,7 +189,7 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem, } /** - * i40e_xsk_umem_disable - Diassociate an UMEM from a certain ring/qid + * i40e_xsk_umem_disable - Disassociate a UMEM from a certain ring/qid * @vsi: Current VSI * @qid: Rx ring to associate UMEM to * @@ -255,12 +255,12 @@ int i40e_xsk_umem_query(struct i40e_vsi *vsi, struct xdp_umem **umem, } /** - * i40e_xsk_umem_query - Queries a certain ring/qid for its UMEM + * i40e_xsk_umem_setup - Enable/disassociate a UMEM to/from a ring/qid * @vsi: Current VSI * @umem: UMEM to enable/associate to a ring, or NULL to disable * @qid: Rx ring to (dis)associate UMEM (from)to * - * This function enables or disables an UMEM to a certain ring. + * This function enables or disables a UMEM to a certain ring. * * Returns 0 on success, <0 on failure **/ @@ -276,7 +276,7 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem, * @rx_ring: Rx ring * @xdp: xdp_buff used as input to the XDP program * - * This function enables or disables an UMEM to a certain ring. + * This function enables or disables a UMEM to a certain ring. * * Returns any of I40E_XDP_{PASS, CONSUMED, TX, REDIR} **/ diff --git a/drivers/net/ethernet/intel/igb/e1000_i210.c b/drivers/net/ethernet/intel/igb/e1000_i210.c @@ -842,6 +842,7 @@ s32 igb_pll_workaround_i210(struct e1000_hw *hw) nvm_word = E1000_INVM_DEFAULT_AL; tmp_nvm = nvm_word | E1000_INVM_PLL_WO_VAL; igb_write_phy_reg_82580(hw, I347AT4_PAGE_SELECT, E1000_PHY_PLL_FREQ_PAGE); + phy_word = E1000_PHY_PLL_UNCONF; for (i = 0; i < E1000_MAX_PLL_TRIES; i++) { /* check current state directly from internal PHY */ igb_read_phy_reg_82580(hw, E1000_PHY_PLL_FREQ_REG, &phy_word); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c @@ -2262,7 +2262,9 @@ static s32 ixgbe_get_link_capabilities_X550em(struct ixgbe_hw *hw, *autoneg = false; if (hw->phy.sfp_type == ixgbe_sfp_type_1g_sx_core0 || - hw->phy.sfp_type == ixgbe_sfp_type_1g_sx_core1) { + hw->phy.sfp_type == ixgbe_sfp_type_1g_sx_core1 || + hw->phy.sfp_type == ixgbe_sfp_type_1g_lx_core0 || + hw->phy.sfp_type == ixgbe_sfp_type_1g_lx_core1) { *speed = IXGBE_LINK_SPEED_1GB_FULL; return 0; } diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1672,7 +1672,7 @@ static int lan743x_tx_napi_poll(struct napi_struct *napi, int weight) netif_wake_queue(adapter->netdev); } - if (!napi_complete_done(napi, weight)) + if (!napi_complete(napi)) goto done; /* enable isr */ @@ -1681,7 +1681,7 @@ static int lan743x_tx_napi_poll(struct napi_struct *napi, int weight) lan743x_csr_read(adapter, INT_STS); done: - return weight; + return 0; } static void lan743x_tx_ring_cleanup(struct lan743x_tx *tx) @@ -1870,9 +1870,9 @@ static int lan743x_tx_open(struct lan743x_tx *tx) tx->vector_flags = lan743x_intr_get_vector_flags(adapter, INT_BIT_DMA_TX_ (tx->channel_number)); - netif_napi_add(adapter->netdev, - &tx->napi, lan743x_tx_napi_poll, - tx->ring_size - 1); + netif_tx_napi_add(adapter->netdev, + &tx->napi, lan743x_tx_napi_poll, + tx->ring_size - 1); napi_enable(&tx->napi); data = 0; @@ -3017,6 +3017,7 @@ static const struct dev_pm_ops lan743x_pm_ops = { static const struct pci_device_id lan743x_pcidev_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_SMSC, PCI_DEVICE_ID_SMSC_LAN7430) }, + { PCI_DEVICE(PCI_VENDOR_ID_SMSC, PCI_DEVICE_ID_SMSC_LAN7431) }, { 0, } }; diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h @@ -548,6 +548,7 @@ struct lan743x_adapter; /* SMSC acquired EFAR late 1990's, MCHP acquired SMSC 2012 */ #define PCI_VENDOR_ID_SMSC PCI_VENDOR_ID_EFAR #define PCI_DEVICE_ID_SMSC_LAN7430 (0x7430) +#define PCI_DEVICE_ID_SMSC_LAN7431 (0x7431) #define PCI_CONFIG_LENGTH (0x1000) diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -6071,7 +6071,7 @@ static const char * const s_igu_fifo_error_strs[] = { "no error", "length error", "function disabled", - "VF sent command to attnetion address", + "VF sent command to attention address", "host sent prod update command", "read of during interrupt register while in MIMD mode", "access to PXP BAR reserved address", diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c @@ -3605,7 +3605,7 @@ static const char velocity_gstrings[][ETH_GSTRING_LEN] = { "tx_jumbo", "rx_mac_control_frames", "tx_mac_control_frames", - "rx_frame_alignement_errors", + "rx_frame_alignment_errors", "rx_long_ok", "rx_long_err", "tx_sqe_errors", diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c @@ -2197,6 +2197,14 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner) new_driver->mdiodrv.driver.remove = phy_remove; new_driver->mdiodrv.driver.owner = owner; + /* The following works around an issue where the PHY driver doesn't bind + * to the device, resulting in the genphy driver being used instead of + * the dedicated driver. The root cause of the issue isn't known yet + * and seems to be in the base driver core. Once this is fixed we may + * remove this workaround. + */ + new_driver->mdiodrv.driver.probe_type = PROBE_FORCE_SYNCHRONOUS; + retval = driver_register(&new_driver->mdiodrv.driver); if (retval) { pr_err("%s: Error %d in registering driver\n", diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c @@ -216,9 +216,9 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) * it just report sending a packet to the target * (without actual packet transfer). */ - dev_kfree_skb_any(skb); ndev->stats.tx_packets++; ndev->stats.tx_bytes += skb->len; + dev_kfree_skb_any(skb); } } diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c @@ -140,7 +140,6 @@ struct ipheth_device { struct usb_device *udev; struct usb_interface *intf; struct net_device *net; - struct sk_buff *tx_skb; struct urb *tx_urb; struct urb *rx_urb; unsigned char *tx_buf; @@ -230,6 +229,7 @@ static void ipheth_rcvbulk_callback(struct urb *urb) case -ENOENT: case -ECONNRESET: case -ESHUTDOWN: + case -EPROTO: return; case 0: break; @@ -281,7 +281,6 @@ static void ipheth_sndbulk_callback(struct urb *urb) dev_err(&dev->intf->dev, "%s: urb status: %d\n", __func__, status); - dev_kfree_skb_irq(dev->tx_skb); if (status == 0) netif_wake_queue(dev->net); else @@ -423,7 +422,7 @@ static int ipheth_tx(struct sk_buff *skb, struct net_device *net) if (skb->len > IPHETH_BUF_SIZE) { WARN(1, "%s: skb too large: %d bytes\n", __func__, skb->len); dev->net->stats.tx_dropped++; - dev_kfree_skb_irq(skb); + dev_kfree_skb_any(skb); return NETDEV_TX_OK; } @@ -443,12 +442,11 @@ static int ipheth_tx(struct sk_buff *skb, struct net_device *net) dev_err(&dev->intf->dev, "%s: usb_submit_urb: %d\n", __func__, retval); dev->net->stats.tx_errors++; - dev_kfree_skb_irq(skb); + dev_kfree_skb_any(skb); } else { - dev->tx_skb = skb; - dev->net->stats.tx_packets++; dev->net->stats.tx_bytes += skb->len; + dev_consume_skb_any(skb); netif_stop_queue(net); } diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c @@ -4518,8 +4518,8 @@ static int qeth_snmp_command_cb(struct qeth_card *card, { struct qeth_ipa_cmd *cmd; struct qeth_arp_query_info *qinfo; - struct qeth_snmp_cmd *snmp; unsigned char *data; + void *snmp_data; __u16 data_len; QETH_CARD_TEXT(card, 3, "snpcmdcb"); @@ -4527,7 +4527,6 @@ static int qeth_snmp_command_cb(struct qeth_card *card, cmd = (struct qeth_ipa_cmd *) sdata; data = (unsigned char *)((char *)cmd - reply->offset); qinfo = (struct qeth_arp_query_info *) reply->param; - snmp = &cmd->data.setadapterparms.data.snmp; if (cmd->hdr.return_code) { QETH_CARD_TEXT_(card, 4, "scer1%x", cmd->hdr.return_code); @@ -4540,10 +4539,15 @@ static int qeth_snmp_command_cb(struct qeth_card *card, return 0; } data_len = *((__u16 *)QETH_IPA_PDU_LEN_PDU1(data)); - if (cmd->data.setadapterparms.hdr.seq_no == 1) - data_len -= (__u16)((char *)&snmp->data - (char *)cmd); - else - data_len -= (__u16)((char *)&snmp->request - (char *)cmd); + if (cmd->data.setadapterparms.hdr.seq_no == 1) { + snmp_data = &cmd->data.setadapterparms.data.snmp; + data_len -= offsetof(struct qeth_ipa_cmd, + data.setadapterparms.data.snmp); + } else { + snmp_data = &cmd->data.setadapterparms.data.snmp.request; + data_len -= offsetof(struct qeth_ipa_cmd, + data.setadapterparms.data.snmp.request); + } /* check if there is enough room in userspace */ if ((qinfo->udata_len - qinfo->udata_offset) < data_len) { @@ -4556,16 +4560,9 @@ static int qeth_snmp_command_cb(struct qeth_card *card, QETH_CARD_TEXT_(card, 4, "sseqn%i", cmd->data.setadapterparms.hdr.seq_no); /*copy entries to user buffer*/ - if (cmd->data.setadapterparms.hdr.seq_no == 1) { - memcpy(qinfo->udata + qinfo->udata_offset, - (char *)snmp, - data_len + offsetof(struct qeth_snmp_cmd, data)); - qinfo->udata_offset += offsetof(struct qeth_snmp_cmd, data); - } else { - memcpy(qinfo->udata + qinfo->udata_offset, - (char *)&snmp->request, data_len); - } + memcpy(qinfo->udata + qinfo->udata_offset, snmp_data, data_len); qinfo->udata_offset += data_len; + /* check if all replies received ... */ QETH_CARD_TEXT_(card, 4, "srtot%i", cmd->data.setadapterparms.hdr.used_total); diff --git a/include/linux/filter.h b/include/linux/filter.h @@ -866,6 +866,10 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr); void bpf_jit_free(struct bpf_prog *fp); +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed); + struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h @@ -21,6 +21,19 @@ struct nf_ct_gre_keymap { struct nf_conntrack_tuple tuple; }; +enum grep_conntrack { + GRE_CT_UNREPLIED, + GRE_CT_REPLIED, + GRE_CT_MAX +}; + +struct netns_proto_gre { + struct nf_proto_net nf; + rwlock_t keymap_lock; + struct list_head keymap_list; + unsigned int gre_timeouts[GRE_CT_MAX]; +}; + /* add new tuple->key_reply pair to keymap */ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, struct nf_conntrack_tuple *t); diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h @@ -9,7 +9,7 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct nf_nat_range2 *range, const struct net_device *out); -void nf_nat_masquerade_ipv4_register_notifier(void); +int nf_nat_masquerade_ipv4_register_notifier(void); void nf_nat_masquerade_ipv4_unregister_notifier(void); #endif /*_NF_NAT_MASQUERADE_IPV4_H_ */ diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h @@ -5,7 +5,7 @@ unsigned int nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out); -void nf_nat_masquerade_ipv6_register_notifier(void); +int nf_nat_masquerade_ipv6_register_notifier(void); void nf_nat_masquerade_ipv6_unregister_notifier(void); #endif /* _NF_NAT_MASQUERADE_IPV6_H_ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c @@ -672,6 +672,40 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } +int bpf_jit_get_func_addr(const struct bpf_prog *prog, + const struct bpf_insn *insn, bool extra_pass, + u64 *func_addr, bool *func_addr_fixed) +{ + s16 off = insn->off; + s32 imm = insn->imm; + u8 *addr; + + *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; + if (!*func_addr_fixed) { + /* Place-holder address till the last pass has collected + * all addresses for JITed subprograms in which case we + * can pick them up from prog->aux. + */ + if (!extra_pass) + addr = NULL; + else if (prog->aux->func && + off >= 0 && off < prog->aux->func_cnt) + addr = (u8 *)prog->aux->func[off]->bpf_func; + else + return -EINVAL; + } else { + /* Address of a BPF helper call. Since part of the core + * kernel, it's always at a fixed location. __bpf_call_base + * and the helper with imm relative to it are both in core + * kernel. + */ + addr = (u8 *)__bpf_call_base + imm; + } + + *func_addr = (unsigned long)addr; + return 0; +} + static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c @@ -139,7 +139,8 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return -ENOENT; new = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, __GFP_ZERO | GFP_USER, + map->value_size, + __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, map->numa_node); if (!new) return -ENOMEM; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c @@ -7,6 +7,7 @@ #include <linux/bpf.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/capability.h> #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -45,8 +46,12 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || + attr->value_size == 0 || attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) return -EINVAL; @@ -63,15 +68,10 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; - u32 size, value_size; - u64 queue_size, cost; - - size = attr->max_entries + 1; - value_size = attr->value_size; - - queue_size = sizeof(*qs) + (u64) value_size * size; + u64 size, queue_size, cost; - cost = queue_size; + size = (u64) attr->max_entries + 1; + cost = queue_size = sizeof(*qs) + size * attr->value_size; if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c @@ -5650,7 +5650,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len return; /* NOTE: fake 'exit' subprog should be updated as well. */ for (i = 0; i <= env->subprog_cnt; i++) { - if (env->subprog_info[i].start < off) + if (env->subprog_info[i].start <= off) continue; env->subprog_info[i].start += len - 1; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c @@ -196,11 +196,13 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, i++; } else if (fmt[i] == 'p' || fmt[i] == 's') { mod[fmt_cnt]++; - i++; - if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + /* disallow any further format extensions */ + if (fmt[i + 1] != 0 && + !isspace(fmt[i + 1]) && + !ispunct(fmt[i + 1])) return -EINVAL; fmt_cnt++; - if (fmt[i - 1] == 's') { + if (fmt[i] == 's') { if (str_seen) /* allow only one '%s' per fmt string */ return -EINVAL; diff --git a/net/core/filter.c b/net/core/filter.c @@ -4852,18 +4852,17 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; - u16 hnum = ntohs(tuple->ipv6.dport); int sdif = inet6_sdif(skb); if (proto == IPPROTO_TCP) sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_bpf_stub)) sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, - dst6, hnum, + dst6, tuple->ipv6.dport, dif, sdif, &udp_table, skb); #endif diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c @@ -939,7 +939,7 @@ static int __ip_append_data(struct sock *sk, unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; - unsigned int pagedlen = 0; + unsigned int pagedlen; struct sk_buff *skb_prev; alloc_new_skb: skb_prev = skb; @@ -956,6 +956,7 @@ alloc_new_skb: if (datalen > mtu - fragheaderlen) datalen = maxfraglen - fragheaderlen; fraglen = datalen + fragheaderlen; + pagedlen = 0; if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -81,9 +81,12 @@ static int __init masquerade_tg_init(void) int ret; ret = xt_register_target(&masquerade_tg_reg); + if (ret) + return ret; - if (ret == 0) - nf_nat_masquerade_ipv4_register_notifier(); + ret = nf_nat_masquerade_ipv4_register_notifier(); + if (ret) + xt_unregister_target(&masquerade_tg_reg); return ret; } diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -147,28 +147,50 @@ static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; -static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); +static int masq_refcnt; +static DEFINE_MUTEX(masq_mutex); -void nf_nat_masquerade_ipv4_register_notifier(void) +int nf_nat_masquerade_ipv4_register_notifier(void) { + int ret = 0; + + mutex_lock(&masq_mutex); /* check if the notifier was already set */ - if (atomic_inc_return(&masquerade_notifier_refcount) > 1) - return; + if (++masq_refcnt > 1) + goto out_unlock; /* Register for device down reports */ - register_netdevice_notifier(&masq_dev_notifier); + ret = register_netdevice_notifier(&masq_dev_notifier); + if (ret) + goto err_dec; /* Register IP address change reports */ - register_inetaddr_notifier(&masq_inet_notifier); + ret = register_inetaddr_notifier(&masq_inet_notifier); + if (ret) + goto err_unregister; + + mutex_unlock(&masq_mutex); + return ret; + +err_unregister: + unregister_netdevice_notifier(&masq_dev_notifier); +err_dec: + masq_refcnt--; +out_unlock: + mutex_unlock(&masq_mutex); + return ret; } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier); void nf_nat_masquerade_ipv4_unregister_notifier(void) { + mutex_lock(&masq_mutex); /* check if the notifier still has clients */ - if (atomic_dec_return(&masquerade_notifier_refcount) > 0) - return; + if (--masq_refcnt > 0) + goto out_unlock; unregister_netdevice_notifier(&masq_dev_notifier); unregister_inetaddr_notifier(&masq_inet_notifier); +out_unlock: + mutex_unlock(&masq_mutex); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -69,7 +69,9 @@ static int __init nft_masq_ipv4_module_init(void) if (ret < 0) return ret; - nf_nat_masquerade_ipv4_register_notifier(); + ret = nf_nat_masquerade_ipv4_register_notifier(); + if (ret) + nft_unregister_expr(&nft_masq_ipv4_type); return ret; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c @@ -579,10 +579,12 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; u32 delta_us; - if (!delta) - delta = 1; - delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - tcp_rcv_rtt_update(tp, delta_us, 0); + if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + tcp_rcv_rtt_update(tp, delta_us, 0); + } } } @@ -2910,9 +2912,11 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) { u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - seq_rtt_us = ca_rtt_us = delta_us; + if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + ca_rtt_us = seq_rtt_us; + } } rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c @@ -40,15 +40,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 elapsed, start_ts; + s32 remaining; start_ts = tcp_retransmit_stamp(sk); if (!icsk->icsk_user_timeout || !start_ts) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; - if (elapsed >= icsk->icsk_user_timeout) + remaining = icsk->icsk_user_timeout - elapsed; + if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ - else - return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed)); + + return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining)); } /** @@ -209,7 +211,7 @@ static bool retransmits_timed_out(struct sock *sk, (boundary - linear_backoff_thresh) * TCP_RTO_MAX; timeout = jiffies_to_msecs(timeout); } - return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout; + return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; } /* A write timeout has occurred. Process the after effects. */ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c @@ -1354,7 +1354,7 @@ emsgsize: unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; - unsigned int pagedlen = 0; + unsigned int pagedlen; alloc_new_skb: /* There's no room in the current skb */ if (skb) @@ -1378,6 +1378,7 @@ alloc_new_skb: if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; fraglen = datalen + fragheaderlen; + pagedlen = 0; if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c @@ -24,7 +24,8 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb) unsigned int hh_len; struct dst_entry *dst; struct flowi6 fl6 = { - .flowi6_oif = sk ? sk->sk_bound_dev_if : 0, + .flowi6_oif = sk && sk->sk_bound_dev_if ? sk->sk_bound_dev_if : + rt6_need_strict(&iph->daddr) ? skb_dst(skb)->dev->ifindex : 0, .flowi6_mark = skb->mark, .flowi6_uid = sock_net_uid(net, sk), .daddr = iph->daddr, diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -58,8 +58,12 @@ static int __init masquerade_tg6_init(void) int err; err = xt_register_target(&masquerade_tg6_reg); - if (err == 0) - nf_nat_masquerade_ipv6_register_notifier(); + if (err) + return err; + + err = nf_nat_masquerade_ipv6_register_notifier(); + if (err) + xt_unregister_target(&masquerade_tg6_reg); return err; } diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -132,8 +132,8 @@ static void iterate_cleanup_work(struct work_struct *work) * of ipv6 addresses being deleted), we also need to add an upper * limit to the number of queued work items. */ -static int masq_inet_event(struct notifier_block *this, - unsigned long event, void *ptr) +static int masq_inet6_event(struct notifier_block *this, + unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; @@ -171,30 +171,53 @@ static int masq_inet_event(struct notifier_block *this, return NOTIFY_DONE; } -static struct notifier_block masq_inet_notifier = { - .notifier_call = masq_inet_event, +static struct notifier_block masq_inet6_notifier = { + .notifier_call = masq_inet6_event, }; -static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0); +static int masq_refcnt; +static DEFINE_MUTEX(masq_mutex); -void nf_nat_masquerade_ipv6_register_notifier(void) +int nf_nat_masquerade_ipv6_register_notifier(void) { + int ret = 0; + + mutex_lock(&masq_mutex); /* check if the notifier is already set */ - if (atomic_inc_return(&masquerade_notifier_refcount) > 1) - return; + if (++masq_refcnt > 1) + goto out_unlock; + + ret = register_netdevice_notifier(&masq_dev_notifier); + if (ret) + goto err_dec; + + ret = register_inet6addr_notifier(&masq_inet6_notifier); + if (ret) + goto err_unregister; - register_netdevice_notifier(&masq_dev_notifier); - register_inet6addr_notifier(&masq_inet_notifier); + mutex_unlock(&masq_mutex); + return ret; + +err_unregister: + unregister_netdevice_notifier(&masq_dev_notifier); +err_dec: + masq_refcnt--; +out_unlock: + mutex_unlock(&masq_mutex); + return ret; } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier); void nf_nat_masquerade_ipv6_unregister_notifier(void) { + mutex_lock(&masq_mutex); /* check if the notifier still has clients */ - if (atomic_dec_return(&masquerade_notifier_refcount) > 0) - return; + if (--masq_refcnt > 0) + goto out_unlock; - unregister_inet6addr_notifier(&masq_inet_notifier); + unregister_inet6addr_notifier(&masq_inet6_notifier); unregister_netdevice_notifier(&masq_dev_notifier); +out_unlock: + mutex_unlock(&masq_mutex); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier); diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -70,7 +70,9 @@ static int __init nft_masq_ipv6_module_init(void) if (ret < 0) return ret; - nf_nat_masquerade_ipv6_register_notifier(); + ret = nf_nat_masquerade_ipv6_register_notifier(); + if (ret) + nft_unregister_expr(&nft_masq_ipv6_type); return ret; } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3980,6 +3980,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) static struct notifier_block ip_vs_dst_notifier = { .notifier_call = ip_vs_dst_event, +#ifdef CONFIG_IP_VS_IPV6 + .priority = ADDRCONF_NOTIFY_PRIORITY + 5, +#endif }; int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c @@ -49,6 +49,7 @@ struct nf_conncount_tuple { struct nf_conntrack_zone zone; int cpu; u32 jiffies32; + bool dead; struct rcu_head rcu_head; }; @@ -106,15 +107,16 @@ nf_conncount_add(struct nf_conncount_list *list, conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; - spin_lock(&list->list_lock); + conn->dead = false; + spin_lock_bh(&list->list_lock); if (list->dead == true) { kmem_cache_free(conncount_conn_cachep, conn); - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); return NF_CONNCOUNT_SKIP; } list_add_tail(&conn->node, &list->head); list->count++; - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); return NF_CONNCOUNT_ADDED; } EXPORT_SYMBOL_GPL(nf_conncount_add); @@ -132,19 +134,22 @@ static bool conn_free(struct nf_conncount_list *list, { bool free_entry = false; - spin_lock(&list->list_lock); + spin_lock_bh(&list->list_lock); - if (list->count == 0) { - spin_unlock(&list->list_lock); - return free_entry; + if (conn->dead) { + spin_unlock_bh(&list->list_lock); + return free_entry; } list->count--; + conn->dead = true; list_del_rcu(&conn->node); - if (list->count == 0) + if (list->count == 0) { + list->dead = true; free_entry = true; + } - spin_unlock(&list->list_lock); + spin_unlock_bh(&list->list_lock); call_rcu(&conn->rcu_head, __conn_free); return free_entry; } @@ -245,7 +250,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list) { spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); - list->count = 1; + list->count = 0; list->dead = false; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); @@ -259,6 +264,7 @@ bool nf_conncount_gc_list(struct net *net, struct nf_conn *found_ct; unsigned int collected = 0; bool free_entry = false; + bool ret = false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn, &free_entry); @@ -288,7 +294,15 @@ bool nf_conncount_gc_list(struct net *net, if (collected > CONNCOUNT_GC_MAX_NODES) return false; } - return false; + + spin_lock_bh(&list->list_lock); + if (!list->count) { + list->dead = true; + ret = true; + } + spin_unlock_bh(&list->list_lock); + + return ret; } EXPORT_SYMBOL_GPL(nf_conncount_gc_list); @@ -309,11 +323,8 @@ static void tree_nodes_free(struct rb_root *root, while (gc_count) { rbconn = gc_nodes[--gc_count]; spin_lock(&rbconn->list.list_lock); - if (rbconn->list.count == 0 && rbconn->list.dead == false) { - rbconn->list.dead = true; - rb_erase(&rbconn->node, root); - call_rcu(&rbconn->rcu_head, __tree_nodes_free); - } + rb_erase(&rbconn->node, root); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); spin_unlock(&rbconn->list.list_lock); } } @@ -414,6 +425,7 @@ insert_tree(struct net *net, nf_conncount_list_init(&rbconn->list); list_add(&conn->node, &rbconn->list.head); count = 1; + rbconn->list.count = count; rb_link_node(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c @@ -43,24 +43,12 @@ #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> -enum grep_conntrack { - GRE_CT_UNREPLIED, - GRE_CT_REPLIED, - GRE_CT_MAX -}; - static const unsigned int gre_timeouts[GRE_CT_MAX] = { [GRE_CT_UNREPLIED] = 30*HZ, [GRE_CT_REPLIED] = 180*HZ, }; static unsigned int proto_gre_net_id __read_mostly; -struct netns_proto_gre { - struct nf_proto_net nf; - rwlock_t keymap_lock; - struct list_head keymap_list; - unsigned int gre_timeouts[GRE_CT_MAX]; -}; static inline struct netns_proto_gre *gre_pernet(struct net *net) { @@ -402,6 +390,8 @@ static int __init nf_ct_proto_gre_init(void) { int ret; + BUILD_BUG_ON(offsetof(struct netns_proto_gre, nf) != 0); + ret = register_pernet_subsys(&proto_gre_net_ops); if (ret < 0) goto out_pernet; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c @@ -2457,7 +2457,7 @@ err: static void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) { - struct nft_expr *expr; + struct nft_expr *expr, *next; /* * Careful: some expressions might not be initialized in case this @@ -2465,8 +2465,9 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, */ expr = nft_expr_first(rule); while (expr != nft_expr_last(rule) && expr->ops) { + next = nft_expr_next(expr); nf_tables_expr_destroy(ctx, expr); - expr = nft_expr_next(expr); + expr = next; } kfree(rule); } @@ -2589,17 +2590,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (chain->use == UINT_MAX) return -EOVERFLOW; - } - - if (nla[NFTA_RULE_POSITION]) { - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) - return -EOPNOTSUPP; - pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nft_rule_lookup(chain, pos_handle); - if (IS_ERR(old_rule)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); - return PTR_ERR(old_rule); + if (nla[NFTA_RULE_POSITION]) { + pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); + old_rule = __nft_rule_lookup(chain, pos_handle); + if (IS_ERR(old_rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); + return PTR_ERR(old_rule); + } } } @@ -2669,21 +2667,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } if (nlh->nlmsg_flags & NLM_F_REPLACE) { - if (!nft_is_active_next(net, old_rule)) { - err = -ENOENT; - goto err2; - } - trans = nft_trans_rule_add(&ctx, NFT_MSG_DELRULE, - old_rule); + trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; goto err2; } - nft_deactivate_next(net, old_rule); - chain->use--; - - if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { - err = -ENOMEM; + err = nft_delrule(&ctx, old_rule); + if (err < 0) { + nft_trans_destroy(trans); goto err2; } @@ -6324,7 +6315,7 @@ static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules) call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old); } -static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain) +static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain) { struct nft_rule **g0, **g1; bool next_genbit; @@ -6441,11 +6432,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) /* step 2. Make rules_gen_X visible to packet path */ list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(chain, &table->chains, list) { - if (!nft_is_active_next(net, chain)) - continue; - nf_tables_commit_chain_active(net, chain); - } + list_for_each_entry(chain, &table->chains, list) + nf_tables_commit_chain(net, chain); } /* diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c @@ -455,7 +455,8 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, case IPPROTO_TCP: timeouts = nf_tcp_pernet(net)->timeouts; break; - case IPPROTO_UDP: + case IPPROTO_UDP: /* fallthrough */ + case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(net)->timeouts; break; case IPPROTO_DCCP: @@ -471,11 +472,21 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, timeouts = nf_sctp_pernet(net)->timeouts; #endif break; + case IPPROTO_GRE: +#ifdef CONFIG_NF_CT_PROTO_GRE + if (l4proto->net_id) { + struct netns_proto_gre *net_gre; + + net_gre = net_generic(net, *l4proto->net_id); + timeouts = net_gre->gre_timeouts; + } +#endif + break; case 255: timeouts = &nf_generic_pernet(net)->timeout; break; default: - WARN_ON_ONCE(1); + WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto); break; } diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c @@ -520,6 +520,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr, void *info) { struct xt_match *match = expr->ops->data; + struct module *me = match->me; struct xt_mtdtor_param par; par.net = ctx->net; @@ -530,7 +531,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr, par.match->destroy(&par); if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops))) - module_put(match->me); + module_put(me); } static void diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c @@ -214,7 +214,9 @@ static int __init nft_flow_offload_module_init(void) { int err; - register_netdevice_notifier(&flow_offload_netdev_notifier); + err = register_netdevice_notifier(&flow_offload_netdev_notifier); + if (err) + goto err; err = nft_register_expr(&nft_flow_offload_type); if (err < 0) @@ -224,6 +226,7 @@ static int __init nft_flow_offload_module_init(void) register_expr: unregister_netdevice_notifier(&flow_offload_netdev_notifier); +err: return err; } diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c @@ -201,18 +201,8 @@ static __net_init int xt_rateest_net_init(struct net *net) return 0; } -static void __net_exit xt_rateest_net_exit(struct net *net) -{ - struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); - int i; - - for (i = 0; i < ARRAY_SIZE(xn->hash); i++) - WARN_ON_ONCE(!hlist_empty(&xn->hash[i])); -} - static struct pernet_operations xt_rateest_net_ops = { .init = xt_rateest_net_init, - .exit = xt_rateest_net_exit, .id = &xt_rateest_id, .size = sizeof(struct xt_rateest_net), }; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c @@ -295,9 +295,10 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, /* copy match config into hashtable config */ ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3); - - if (ret) + if (ret) { + vfree(hinfo); return ret; + } hinfo->cfg.size = size; if (hinfo->cfg.max == 0) @@ -814,7 +815,6 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) int ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); - if (ret) return ret; @@ -830,7 +830,6 @@ hashlimit_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) int ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 2); - if (ret) return ret; @@ -921,7 +920,6 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); - if (ret) return ret; @@ -940,7 +938,6 @@ static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) return ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 2); - if (ret) return ret; diff --git a/net/sctp/output.c b/net/sctp/output.c @@ -410,6 +410,7 @@ static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) head->truesize += skb->truesize; head->data_len += skb->len; head->len += skb->len; + refcount_add(skb->truesize, &head->sk->sk_wmem_alloc); __skb_header_release(skb); } diff --git a/net/tipc/node.c b/net/tipc/node.c @@ -584,12 +584,15 @@ static void tipc_node_clear_links(struct tipc_node *node) /* tipc_node_cleanup - delete nodes that does not * have active links for NODE_CLEANUP_AFTER time */ -static int tipc_node_cleanup(struct tipc_node *peer) +static bool tipc_node_cleanup(struct tipc_node *peer) { struct tipc_net *tn = tipc_net(peer->net); bool deleted = false; - spin_lock_bh(&tn->node_list_lock); + /* If lock held by tipc_node_stop() the node will be deleted anyway */ + if (!spin_trylock_bh(&tn->node_list_lock)) + return false; + tipc_node_write_lock(peer); if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -137,4 +137,10 @@ EXAMPLES SEE ALSO ======== - **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8) + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-prog**\ (8), + **bpftool-map**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -171,4 +171,10 @@ The following three commands are equivalent: SEE ALSO ======== - **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8) + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-prog**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -136,4 +136,10 @@ EXAMPLES SEE ALSO ======== - **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8) + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-prog**\ (8), + **bpftool-map**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -78,4 +78,10 @@ EXAMPLES SEE ALSO ======== - **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8) + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-prog**\ (8), + **bpftool-map**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-net**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -124,7 +124,8 @@ OPTIONS Generate human-readable JSON output. Implies **-j**. -f, --bpffs - Show file names of pinned programs. + When showing BPF programs, show file names of pinned + programs. EXAMPLES ======== @@ -206,4 +207,10 @@ EXAMPLES SEE ALSO ======== - **bpftool**\ (8), **bpftool-map**\ (8), **bpftool-cgroup**\ (8) + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-map**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -63,5 +63,10 @@ OPTIONS SEE ALSO ======== - **bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8) - **bpftool-perf**\ (8), **bpftool-net**\ (8) + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool-prog**\ (8), + **bpftool-map**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c @@ -130,16 +130,17 @@ static int mnt_bpffs(const char *target, char *buff, size_t bufflen) return 0; } -int open_obj_pinned(char *path) +int open_obj_pinned(char *path, bool quiet) { int fd; fd = bpf_obj_get(path); if (fd < 0) { - p_err("bpf obj get (%s): %s", path, - errno == EACCES && !is_bpffs(dirname(path)) ? - "directory not in bpf file system (bpffs)" : - strerror(errno)); + if (!quiet) + p_err("bpf obj get (%s): %s", path, + errno == EACCES && !is_bpffs(dirname(path)) ? + "directory not in bpf file system (bpffs)" : + strerror(errno)); return -1; } @@ -151,7 +152,7 @@ int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type) enum bpf_obj_type type; int fd; - fd = open_obj_pinned(path); + fd = open_obj_pinned(path, false); if (fd < 0) return -1; @@ -304,7 +305,7 @@ char *get_fdinfo(int fd, const char *key) return NULL; } - while ((n = getline(&line, &line_n, fdi))) { + while ((n = getline(&line, &line_n, fdi)) > 0) { char *value; int len; @@ -384,7 +385,7 @@ int build_pinned_obj_table(struct pinned_obj_table *tab, while ((ftse = fts_read(fts))) { if (!(ftse->fts_info & FTS_F)) continue; - fd = open_obj_pinned(ftse->fts_path); + fd = open_obj_pinned(ftse->fts_path, true); if (fd < 0) continue; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h @@ -127,7 +127,7 @@ int cmd_select(const struct cmd *cmds, int argc, char **argv, int get_fd_type(int fd); const char *get_fd_type_name(enum bpf_obj_type type); char *get_fdinfo(int fd, const char *key); -int open_obj_pinned(char *path); +int open_obj_pinned(char *path, bool quiet); int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type); int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32)); int do_pin_fd(int fd, const char *name); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c @@ -357,10 +357,9 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd) if (!hash_empty(prog_table.table)) { struct pinned_obj *obj; - printf("\n"); hash_for_each_possible(prog_table.table, obj, hash, info->id) { if (obj->id == info->id) - printf("\tpinned %s\n", obj->path); + printf("\n\tpinned %s", obj->path); } } @@ -845,6 +844,7 @@ static int do_load(int argc, char **argv) } NEXT_ARG(); } else if (is_prefix(*argv, "map")) { + void *new_map_replace; char *endptr, *name; int fd; @@ -878,12 +878,15 @@ static int do_load(int argc, char **argv) if (fd < 0) goto err_free_reuse_maps; - map_replace = reallocarray(map_replace, old_map_fds + 1, - sizeof(*map_replace)); - if (!map_replace) { + new_map_replace = reallocarray(map_replace, + old_map_fds + 1, + sizeof(*map_replace)); + if (!new_map_replace) { p_err("mem alloc failed"); goto err_free_reuse_maps; } + map_replace = new_map_replace; + map_replace[old_map_fds].idx = idx; map_replace[old_map_fds].name = name; map_replace[old_map_fds].fd = fd; diff --git a/tools/include/uapi/linux/pkt_cls.h b/tools/include/uapi/linux/pkt_cls.h @@ -0,0 +1,612 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_CLS_H +#define __LINUX_PKT_CLS_H + +#include <linux/types.h> +#include <linux/pkt_sched.h> + +#define TC_COOKIE_MAX_SIZE 16 + +/* Action attributes */ +enum { + TCA_ACT_UNSPEC, + TCA_ACT_KIND, + TCA_ACT_OPTIONS, + TCA_ACT_INDEX, + TCA_ACT_STATS, + TCA_ACT_PAD, + TCA_ACT_COOKIE, + __TCA_ACT_MAX +}; + +#define TCA_ACT_MAX __TCA_ACT_MAX +#define TCA_OLD_COMPAT (TCA_ACT_MAX+1) +#define TCA_ACT_MAX_PRIO 32 +#define TCA_ACT_BIND 1 +#define TCA_ACT_NOBIND 0 +#define TCA_ACT_UNBIND 1 +#define TCA_ACT_NOUNBIND 0 +#define TCA_ACT_REPLACE 1 +#define TCA_ACT_NOREPLACE 0 + +#define TC_ACT_UNSPEC (-1) +#define TC_ACT_OK 0 +#define TC_ACT_RECLASSIFY 1 +#define TC_ACT_SHOT 2 +#define TC_ACT_PIPE 3 +#define TC_ACT_STOLEN 4 +#define TC_ACT_QUEUED 5 +#define TC_ACT_REPEAT 6 +#define TC_ACT_REDIRECT 7 +#define TC_ACT_TRAP 8 /* For hw path, this means "trap to cpu" + * and don't further process the frame + * in hardware. For sw path, this is + * equivalent of TC_ACT_STOLEN - drop + * the skb and act like everything + * is alright. + */ +#define TC_ACT_VALUE_MAX TC_ACT_TRAP + +/* There is a special kind of actions called "extended actions", + * which need a value parameter. These have a local opcode located in + * the highest nibble, starting from 1. The rest of the bits + * are used to carry the value. These two parts together make + * a combined opcode. + */ +#define __TC_ACT_EXT_SHIFT 28 +#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT) +#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1) +#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK)) +#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode) + +#define TC_ACT_JUMP __TC_ACT_EXT(1) +#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) +#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN + +/* Action type identifiers*/ +enum { + TCA_ID_UNSPEC=0, + TCA_ID_POLICE=1, + /* other actions go here */ + __TCA_ID_MAX=255 +}; + +#define TCA_ID_MAX __TCA_ID_MAX + +struct tc_police { + __u32 index; + int action; +#define TC_POLICE_UNSPEC TC_ACT_UNSPEC +#define TC_POLICE_OK TC_ACT_OK +#define TC_POLICE_RECLASSIFY TC_ACT_RECLASSIFY +#define TC_POLICE_SHOT TC_ACT_SHOT +#define TC_POLICE_PIPE TC_ACT_PIPE + + __u32 limit; + __u32 burst; + __u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; + int refcnt; + int bindcnt; + __u32 capab; +}; + +struct tcf_t { + __u64 install; + __u64 lastuse; + __u64 expires; + __u64 firstuse; +}; + +struct tc_cnt { + int refcnt; + int bindcnt; +}; + +#define tc_gen \ + __u32 index; \ + __u32 capab; \ + int action; \ + int refcnt; \ + int bindcnt + +enum { + TCA_POLICE_UNSPEC, + TCA_POLICE_TBF, + TCA_POLICE_RATE, + TCA_POLICE_PEAKRATE, + TCA_POLICE_AVRATE, + TCA_POLICE_RESULT, + TCA_POLICE_TM, + TCA_POLICE_PAD, + __TCA_POLICE_MAX +#define TCA_POLICE_RESULT TCA_POLICE_RESULT +}; + +#define TCA_POLICE_MAX (__TCA_POLICE_MAX - 1) + +/* tca flags definitions */ +#define TCA_CLS_FLAGS_SKIP_HW (1 << 0) /* don't offload filter to HW */ +#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) /* don't use filter in SW */ +#define TCA_CLS_FLAGS_IN_HW (1 << 2) /* filter is offloaded to HW */ +#define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */ +#define TCA_CLS_FLAGS_VERBOSE (1 << 4) /* verbose logging */ + +/* U32 filters */ + +#define TC_U32_HTID(h) ((h)&0xFFF00000) +#define TC_U32_USERHTID(h) (TC_U32_HTID(h)>>20) +#define TC_U32_HASH(h) (((h)>>12)&0xFF) +#define TC_U32_NODE(h) ((h)&0xFFF) +#define TC_U32_KEY(h) ((h)&0xFFFFF) +#define TC_U32_UNSPEC 0 +#define TC_U32_ROOT (0xFFF00000) + +enum { + TCA_U32_UNSPEC, + TCA_U32_CLASSID, + TCA_U32_HASH, + TCA_U32_LINK, + TCA_U32_DIVISOR, + TCA_U32_SEL, + TCA_U32_POLICE, + TCA_U32_ACT, + TCA_U32_INDEV, + TCA_U32_PCNT, + TCA_U32_MARK, + TCA_U32_FLAGS, + TCA_U32_PAD, + __TCA_U32_MAX +}; + +#define TCA_U32_MAX (__TCA_U32_MAX - 1) + +struct tc_u32_key { + __be32 mask; + __be32 val; + int off; + int offmask; +}; + +struct tc_u32_sel { + unsigned char flags; + unsigned char offshift; + unsigned char nkeys; + + __be16 offmask; + __u16 off; + short offoff; + + short hoff; + __be32 hmask; + struct tc_u32_key keys[0]; +}; + +struct tc_u32_mark { + __u32 val; + __u32 mask; + __u32 success; +}; + +struct tc_u32_pcnt { + __u64 rcnt; + __u64 rhit; + __u64 kcnts[0]; +}; + +/* Flags */ + +#define TC_U32_TERMINAL 1 +#define TC_U32_OFFSET 2 +#define TC_U32_VAROFFSET 4 +#define TC_U32_EAT 8 + +#define TC_U32_MAXDEPTH 8 + + +/* RSVP filter */ + +enum { + TCA_RSVP_UNSPEC, + TCA_RSVP_CLASSID, + TCA_RSVP_DST, + TCA_RSVP_SRC, + TCA_RSVP_PINFO, + TCA_RSVP_POLICE, + TCA_RSVP_ACT, + __TCA_RSVP_MAX +}; + +#define TCA_RSVP_MAX (__TCA_RSVP_MAX - 1 ) + +struct tc_rsvp_gpi { + __u32 key; + __u32 mask; + int offset; +}; + +struct tc_rsvp_pinfo { + struct tc_rsvp_gpi dpi; + struct tc_rsvp_gpi spi; + __u8 protocol; + __u8 tunnelid; + __u8 tunnelhdr; + __u8 pad; +}; + +/* ROUTE filter */ + +enum { + TCA_ROUTE4_UNSPEC, + TCA_ROUTE4_CLASSID, + TCA_ROUTE4_TO, + TCA_ROUTE4_FROM, + TCA_ROUTE4_IIF, + TCA_ROUTE4_POLICE, + TCA_ROUTE4_ACT, + __TCA_ROUTE4_MAX +}; + +#define TCA_ROUTE4_MAX (__TCA_ROUTE4_MAX - 1) + + +/* FW filter */ + +enum { + TCA_FW_UNSPEC, + TCA_FW_CLASSID, + TCA_FW_POLICE, + TCA_FW_INDEV, /* used by CONFIG_NET_CLS_IND */ + TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */ + TCA_FW_MASK, + __TCA_FW_MAX +}; + +#define TCA_FW_MAX (__TCA_FW_MAX - 1) + +/* TC index filter */ + +enum { + TCA_TCINDEX_UNSPEC, + TCA_TCINDEX_HASH, + TCA_TCINDEX_MASK, + TCA_TCINDEX_SHIFT, + TCA_TCINDEX_FALL_THROUGH, + TCA_TCINDEX_CLASSID, + TCA_TCINDEX_POLICE, + TCA_TCINDEX_ACT, + __TCA_TCINDEX_MAX +}; + +#define TCA_TCINDEX_MAX (__TCA_TCINDEX_MAX - 1) + +/* Flow filter */ + +enum { + FLOW_KEY_SRC, + FLOW_KEY_DST, + FLOW_KEY_PROTO, + FLOW_KEY_PROTO_SRC, + FLOW_KEY_PROTO_DST, + FLOW_KEY_IIF, + FLOW_KEY_PRIORITY, + FLOW_KEY_MARK, + FLOW_KEY_NFCT, + FLOW_KEY_NFCT_SRC, + FLOW_KEY_NFCT_DST, + FLOW_KEY_NFCT_PROTO_SRC, + FLOW_KEY_NFCT_PROTO_DST, + FLOW_KEY_RTCLASSID, + FLOW_KEY_SKUID, + FLOW_KEY_SKGID, + FLOW_KEY_VLAN_TAG, + FLOW_KEY_RXHASH, + __FLOW_KEY_MAX, +}; + +#define FLOW_KEY_MAX (__FLOW_KEY_MAX - 1) + +enum { + FLOW_MODE_MAP, + FLOW_MODE_HASH, +}; + +enum { + TCA_FLOW_UNSPEC, + TCA_FLOW_KEYS, + TCA_FLOW_MODE, + TCA_FLOW_BASECLASS, + TCA_FLOW_RSHIFT, + TCA_FLOW_ADDEND, + TCA_FLOW_MASK, + TCA_FLOW_XOR, + TCA_FLOW_DIVISOR, + TCA_FLOW_ACT, + TCA_FLOW_POLICE, + TCA_FLOW_EMATCHES, + TCA_FLOW_PERTURB, + __TCA_FLOW_MAX +}; + +#define TCA_FLOW_MAX (__TCA_FLOW_MAX - 1) + +/* Basic filter */ + +enum { + TCA_BASIC_UNSPEC, + TCA_BASIC_CLASSID, + TCA_BASIC_EMATCHES, + TCA_BASIC_ACT, + TCA_BASIC_POLICE, + __TCA_BASIC_MAX +}; + +#define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1) + + +/* Cgroup classifier */ + +enum { + TCA_CGROUP_UNSPEC, + TCA_CGROUP_ACT, + TCA_CGROUP_POLICE, + TCA_CGROUP_EMATCHES, + __TCA_CGROUP_MAX, +}; + +#define TCA_CGROUP_MAX (__TCA_CGROUP_MAX - 1) + +/* BPF classifier */ + +#define TCA_BPF_FLAG_ACT_DIRECT (1 << 0) + +enum { + TCA_BPF_UNSPEC, + TCA_BPF_ACT, + TCA_BPF_POLICE, + TCA_BPF_CLASSID, + TCA_BPF_OPS_LEN, + TCA_BPF_OPS, + TCA_BPF_FD, + TCA_BPF_NAME, + TCA_BPF_FLAGS, + TCA_BPF_FLAGS_GEN, + TCA_BPF_TAG, + TCA_BPF_ID, + __TCA_BPF_MAX, +}; + +#define TCA_BPF_MAX (__TCA_BPF_MAX - 1) + +/* Flower classifier */ + +enum { + TCA_FLOWER_UNSPEC, + TCA_FLOWER_CLASSID, + TCA_FLOWER_INDEV, + TCA_FLOWER_ACT, + TCA_FLOWER_KEY_ETH_DST, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_DST_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_IP_PROTO, /* u8 */ + TCA_FLOWER_KEY_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_IPV4_SRC_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_SRC_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC, /* be16 */ + TCA_FLOWER_KEY_TCP_DST, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC, /* be16 */ + TCA_FLOWER_KEY_UDP_DST, /* be16 */ + + TCA_FLOWER_FLAGS, + TCA_FLOWER_KEY_VLAN_ID, /* be16 */ + TCA_FLOWER_KEY_VLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_KEY_ID, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + + TCA_FLOWER_KEY_TCP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_TCP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST_MASK, /* be16 */ + + TCA_FLOWER_KEY_SCTP_SRC, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST, /* be16 */ + + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, /* be16 */ + + TCA_FLOWER_KEY_FLAGS, /* be32 */ + TCA_FLOWER_KEY_FLAGS_MASK, /* be32 */ + + TCA_FLOWER_KEY_ICMPV4_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */ + + TCA_FLOWER_KEY_ARP_SIP, /* be32 */ + TCA_FLOWER_KEY_ARP_SIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_OP, /* u8 */ + TCA_FLOWER_KEY_ARP_OP_MASK, /* u8 */ + TCA_FLOWER_KEY_ARP_SHA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_SHA_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA_MASK, /* ETH_ALEN */ + + TCA_FLOWER_KEY_MPLS_TTL, /* u8 - 8 bits */ + TCA_FLOWER_KEY_MPLS_BOS, /* u8 - 1 bit */ + TCA_FLOWER_KEY_MPLS_TC, /* u8 - 3 bits */ + TCA_FLOWER_KEY_MPLS_LABEL, /* be32 - 20 bits */ + + TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ + TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + + TCA_FLOWER_KEY_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_CVLAN_ID, /* be16 */ + TCA_FLOWER_KEY_CVLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_ENC_OPTS, + TCA_FLOWER_KEY_ENC_OPTS_MASK, + + TCA_FLOWER_IN_HW_COUNT, + + __TCA_FLOWER_MAX, +}; + +#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPTS_UNSPEC, + TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ + * attributes + */ + __TCA_FLOWER_KEY_ENC_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPTS_MAX (__TCA_FLOWER_KEY_ENC_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPT_GENEVE_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, /* u16 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA, /* 4 to 128 bytes */ + + __TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) + +enum { + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), +}; + +/* Match-all classifier */ + +enum { + TCA_MATCHALL_UNSPEC, + TCA_MATCHALL_CLASSID, + TCA_MATCHALL_ACT, + TCA_MATCHALL_FLAGS, + __TCA_MATCHALL_MAX, +}; + +#define TCA_MATCHALL_MAX (__TCA_MATCHALL_MAX - 1) + +/* Extended Matches */ + +struct tcf_ematch_tree_hdr { + __u16 nmatches; + __u16 progid; +}; + +enum { + TCA_EMATCH_TREE_UNSPEC, + TCA_EMATCH_TREE_HDR, + TCA_EMATCH_TREE_LIST, + __TCA_EMATCH_TREE_MAX +}; +#define TCA_EMATCH_TREE_MAX (__TCA_EMATCH_TREE_MAX - 1) + +struct tcf_ematch_hdr { + __u16 matchid; + __u16 kind; + __u16 flags; + __u16 pad; /* currently unused */ +}; + +/* 0 1 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + * +-----------------------+-+-+---+ + * | Unused |S|I| R | + * +-----------------------+-+-+---+ + * + * R(2) ::= relation to next ematch + * where: 0 0 END (last ematch) + * 0 1 AND + * 1 0 OR + * 1 1 Unused (invalid) + * I(1) ::= invert result + * S(1) ::= simple payload + */ +#define TCF_EM_REL_END 0 +#define TCF_EM_REL_AND (1<<0) +#define TCF_EM_REL_OR (1<<1) +#define TCF_EM_INVERT (1<<2) +#define TCF_EM_SIMPLE (1<<3) + +#define TCF_EM_REL_MASK 3 +#define TCF_EM_REL_VALID(v) (((v) & TCF_EM_REL_MASK) != TCF_EM_REL_MASK) + +enum { + TCF_LAYER_LINK, + TCF_LAYER_NETWORK, + TCF_LAYER_TRANSPORT, + __TCF_LAYER_MAX +}; +#define TCF_LAYER_MAX (__TCF_LAYER_MAX - 1) + +/* Ematch type assignments + * 1..32767 Reserved for ematches inside kernel tree + * 32768..65535 Free to use, not reliable + */ +#define TCF_EM_CONTAINER 0 +#define TCF_EM_CMP 1 +#define TCF_EM_NBYTE 2 +#define TCF_EM_U32 3 +#define TCF_EM_META 4 +#define TCF_EM_TEXT 5 +#define TCF_EM_VLAN 6 +#define TCF_EM_CANID 7 +#define TCF_EM_IPSET 8 +#define TCF_EM_IPT 9 +#define TCF_EM_MAX 9 + +enum { + TCF_EM_PROG_TC +}; + +enum { + TCF_EM_OPND_EQ, + TCF_EM_OPND_GT, + TCF_EM_OPND_LT +}; + +#endif diff --git a/tools/include/uapi/linux/tc_act/tc_bpf.h b/tools/include/uapi/linux/tc_act/tc_bpf.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __LINUX_TC_BPF_H +#define __LINUX_TC_BPF_H + +#include <linux/pkt_cls.h> + +#define TCA_ACT_BPF 13 + +struct tc_act_bpf { + tc_gen; +}; + +enum { + TCA_ACT_BPF_UNSPEC, + TCA_ACT_BPF_TM, + TCA_ACT_BPF_PARMS, + TCA_ACT_BPF_OPS_LEN, + TCA_ACT_BPF_OPS, + TCA_ACT_BPF_FD, + TCA_ACT_BPF_NAME, + TCA_ACT_BPF_PAD, + TCA_ACT_BPF_TAG, + TCA_ACT_BPF_ID, + __TCA_ACT_BPF_MAX, +}; +#define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1) + +#endif diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile @@ -24,6 +24,7 @@ TARGETS += memory-hotplug TARGETS += mount TARGETS += mqueue TARGETS += net +TARGETS += netfilter TARGETS += nsfs TARGETS += powerpc TARGETS += proc diff --git a/tools/testing/selftests/bpf/test_netcnt.c b/tools/testing/selftests/bpf/test_netcnt.c @@ -81,7 +81,10 @@ int main(int argc, char **argv) goto err; } - assert(system("ping localhost -6 -c 10000 -f -q > /dev/null") == 0); + if (system("which ping6 &>/dev/null") == 0) + assert(!system("ping6 localhost -c 10000 -f -q > /dev/null")); + else + assert(!system("ping -6 localhost -c 10000 -f -q > /dev/null")); if (bpf_prog_query(cgroup_fd, BPF_CGROUP_INET_EGRESS, 0, NULL, NULL, &prog_cnt)) { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c @@ -13896,6 +13896,25 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, }, + { + "calls: ctx read at start of subprog", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5), + BPF_JMP_REG(BPF_JSGT, BPF_REG_0, BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, + .errstr_unpriv = "function calls to other bpf functions are allowed for root only", + .result_unpriv = REJECT, + .result = ACCEPT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for netfilter selftests + +TEST_PROGS := nft_trans_stress.sh + +include ../lib.mk diff --git a/tools/testing/selftests/netfilter/config b/tools/testing/selftests/netfilter/config @@ -0,0 +1,2 @@ +CONFIG_NET_NS=y +NF_TABLES_INET=y diff --git a/tools/testing/selftests/netfilter/nft_trans_stress.sh b/tools/testing/selftests/netfilter/nft_trans_stress.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# +# This test is for stress-testing the nf_tables config plane path vs. +# packet path processing: Make sure we never release rules that are +# still visible to other cpus. +# +# set -e + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +testns=testns1 +tables="foo bar baz quux" + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +tmp=$(mktemp) + +for table in $tables; do + echo add table inet "$table" >> "$tmp" + echo flush table inet "$table" >> "$tmp" + + echo "add chain inet $table INPUT { type filter hook input priority 0; }" >> "$tmp" + echo "add chain inet $table OUTPUT { type filter hook output priority 0; }" >> "$tmp" + for c in $(seq 1 400); do + chain=$(printf "chain%03u" "$c") + echo "add chain inet $table $chain" >> "$tmp" + done + + for c in $(seq 1 400); do + chain=$(printf "chain%03u" "$c") + for BASE in INPUT OUTPUT; do + echo "add rule inet $table $BASE counter jump $chain" >> "$tmp" + done + echo "add rule inet $table $chain counter return" >> "$tmp" + done +done + +ip netns add "$testns" +ip -netns "$testns" link set lo up + +lscpu | grep ^CPU\(s\): | ( read cpu cpunum ; +cpunum=$((cpunum-1)) +for i in $(seq 0 $cpunum);do + mask=$(printf 0x%x $((1<<$i))) + ip netns exec "$testns" taskset $mask ping -4 127.0.0.1 -fq > /dev/null & + ip netns exec "$testns" taskset $mask ping -6 ::1 -fq > /dev/null & +done) + +sleep 1 + +for i in $(seq 1 10) ; do ip netns exec "$testns" nft -f "$tmp" & done + +for table in $tables;do + randsleep=$((RANDOM%10)) + sleep $randsleep + ip netns exec "$testns" nft delete table inet $table 2>/dev/null +done + +randsleep=$((RANDOM%10)) +sleep $randsleep + +pkill -9 ping + +wait + +rm -f "$tmp" +ip netns del "$testns"