whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit a50243b1ddcdd766d0d17fbfeeb1a22e62fdc461
parent 2901752c14b8e1b7dd898d2e5245c93e531aa624
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sat,  9 Mar 2019 15:53:03 -0800

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma updates from Jason Gunthorpe:
 "This has been a slightly more active cycle than normal with ongoing
  core changes and quite a lot of collected driver updates.

   - Various driver fixes for bnxt_re, cxgb4, hns, mlx5, pvrdma, rxe

   - A new data transfer mode for HFI1 giving higher performance

   - Significant functional and bug fix update to the mlx5
     On-Demand-Paging MR feature

   - A chip hang reset recovery system for hns

   - Change mm->pinned_vm to an atomic64

   - Update bnxt_re to support a new 57500 chip

   - A sane netlink 'rdma link add' method for creating rxe devices and
     fixing the various unregistration race conditions in rxe's
     unregister flow

   - Allow lookup up objects by an ID over netlink

   - Various reworking of the core to driver interface:
       - drivers should not assume umem SGLs are in PAGE_SIZE chunks
       - ucontext is accessed via udata not other means
       - start to make the core code responsible for object memory
         allocation
       - drivers should convert struct device to struct ib_device via a
         helper
       - drivers have more tools to avoid use after unregister problems"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (280 commits)
  net/mlx5: ODP support for XRC transport is not enabled by default in FW
  IB/hfi1: Close race condition on user context disable and close
  RDMA/umem: Revert broken 'off by one' fix
  RDMA/umem: minor bug fix in error handling path
  RDMA/hns: Use GFP_ATOMIC in hns_roce_v2_modify_qp
  cxgb4: kfree mhp after the debug print
  IB/rdmavt: Fix concurrency panics in QP post_send and modify to error
  IB/rdmavt: Fix loopback send with invalidate ordering
  IB/iser: Fix dma_nents type definition
  IB/mlx5: Set correct write permissions for implicit ODP MR
  bnxt_re: Clean cq for kernel consumers only
  RDMA/uverbs: Don't do double free of allocated PD
  RDMA: Handle ucontext allocations by IB/core
  RDMA/core: Fix a WARN() message
  bnxt_re: fix the regression due to changes in alloc_pbl
  IB/mlx4: Increase the timeout for CM cache
  IB/core: Abort page fault handler silently during owning process exit
  IB/mlx5: Validate correct PD before prefetch MR
  IB/mlx5: Protect against prefetch of invalid MR
  RDMA/uverbs: Store PR pointer before it is overwritten
  ...

Diffstat:
M.clang-format | 2++
MDocumentation/infiniband/user_verbs.txt | 4++--
Mdrivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 8+++++++-
Mdrivers/infiniband/Kconfig | 15+++++++--------
Mdrivers/infiniband/core/Makefile | 4++--
Mdrivers/infiniband/core/cache.c | 118+++++++++++++++++++++++++++++++------------------------------------------------
Mdrivers/infiniband/core/cgroup.c | 5++---
Mdrivers/infiniband/core/cm.c | 3+--
Mdrivers/infiniband/core/cma.c | 139+++++++++++++++++++++++++++++--------------------------------------------------
Mdrivers/infiniband/core/cma_priv.h | 4+++-
Mdrivers/infiniband/core/core_priv.h | 35++++++++++++++++++++++++-----------
Mdrivers/infiniband/core/device.c | 1323+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Mdrivers/infiniband/core/iwcm.c | 13++++++++-----
Mdrivers/infiniband/core/iwpm_msg.c | 232+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
Mdrivers/infiniband/core/iwpm_util.c | 86++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mdrivers/infiniband/core/iwpm_util.h | 12++++++++++++
Mdrivers/infiniband/core/mad.c | 4++--
Mdrivers/infiniband/core/netlink.c | 4+---
Mdrivers/infiniband/core/nldev.c | 492++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
Mdrivers/infiniband/core/rdma_core.c | 42+++++++++++++++++++++++++++++++++++-------
Mdrivers/infiniband/core/restrack.c | 210+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
Adrivers/infiniband/core/restrack.h | 28++++++++++++++++++++++++++++
Mdrivers/infiniband/core/rw.c | 12+++++-------
Mdrivers/infiniband/core/sa_query.c | 4+---
Mdrivers/infiniband/core/security.c | 96++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Mdrivers/infiniband/core/sysfs.c | 93++++++++++++++++++++++++++++++++-----------------------------------------------
Mdrivers/infiniband/core/ucma.c | 7+++++++
Mdrivers/infiniband/core/umem.c | 60++++++++++++++++--------------------------------------------
Mdrivers/infiniband/core/umem_odp.c | 21++++++++++++++-------
Mdrivers/infiniband/core/user_mad.c | 52+++++++++++++++++++++-------------------------------
Mdrivers/infiniband/core/uverbs_cmd.c | 69+++++++++++++++++++++++++++++++++++++++------------------------------
Mdrivers/infiniband/core/uverbs_ioctl.c | 3+++
Mdrivers/infiniband/core/uverbs_main.c | 2++
Mdrivers/infiniband/core/uverbs_std_types.c | 2+-
Mdrivers/infiniband/core/uverbs_uapi.c | 15++++++++++-----
Mdrivers/infiniband/core/verbs.c | 73+++++++++++++++++++++++++++++++++++++++++--------------------------------
Mdrivers/infiniband/hw/bnxt_re/Kconfig | 1+
Mdrivers/infiniband/hw/bnxt_re/Makefile | 2+-
Mdrivers/infiniband/hw/bnxt_re/bnxt_re.h | 1+
Mdrivers/infiniband/hw/bnxt_re/ib_verbs.c | 268+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Mdrivers/infiniband/hw/bnxt_re/ib_verbs.h | 16+++++++---------
Mdrivers/infiniband/hw/bnxt_re/main.c | 134++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
Mdrivers/infiniband/hw/bnxt_re/qplib_fp.c | 193+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mdrivers/infiniband/hw/bnxt_re/qplib_fp.h | 47++++++++++++++++++++++++++++++++++++++++++-----
Mdrivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 40++++++++++++++++++++++++++--------------
Mdrivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 45+++++++++++++++++++++++++++++++++++++++++----
Mdrivers/infiniband/hw/bnxt_re/qplib_res.c | 22++++++++++++----------
Mdrivers/infiniband/hw/bnxt_re/qplib_res.h | 30++++++++++++++++++++++++++++--
Mdrivers/infiniband/hw/bnxt_re/qplib_sp.c | 3++-
Mdrivers/infiniband/hw/bnxt_re/roce_hsi.h | 160++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
Mdrivers/infiniband/hw/cxgb3/Makefile | 2+-
Mdrivers/infiniband/hw/cxgb3/iwch.c | 2+-
Mdrivers/infiniband/hw/cxgb3/iwch_provider.c | 95++++++++++++++++++++++++++++++++++++-------------------------------------------
Mdrivers/infiniband/hw/cxgb4/Makefile | 4++--
Mdrivers/infiniband/hw/cxgb4/cm.c | 199++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
Mdrivers/infiniband/hw/cxgb4/device.c | 10++--------
Mdrivers/infiniband/hw/cxgb4/iw_cxgb4.h | 16+++-------------
Mdrivers/infiniband/hw/cxgb4/mem.c | 36+++++++++++++++---------------------
Mdrivers/infiniband/hw/cxgb4/provider.c | 85++++++++++++++++++++++++++++++-------------------------------------------------
Mdrivers/infiniband/hw/cxgb4/qp.c | 33+++++++++++++++++++--------------
Mdrivers/infiniband/hw/cxgb4/t4.h | 1+
Mdrivers/infiniband/hw/hfi1/Makefile | 1+
Mdrivers/infiniband/hw/hfi1/chip.c | 13+++++++++++++
Mdrivers/infiniband/hw/hfi1/chip.h | 4+++-
Mdrivers/infiniband/hw/hfi1/common.h | 4++++
Mdrivers/infiniband/hw/hfi1/debugfs.c | 58++++++++++++++++++++++++++++------------------------------
Mdrivers/infiniband/hw/hfi1/debugfs.h | 12------------
Mdrivers/infiniband/hw/hfi1/driver.c | 58+++++++++++++++++++++++++++++++++++++---------------------
Mdrivers/infiniband/hw/hfi1/fault.c | 53++++++++++++++++++++---------------------------------
Mdrivers/infiniband/hw/hfi1/hfi.h | 24++++++++++++++++++++++--
Mdrivers/infiniband/hw/hfi1/init.c | 35++++++++++++++++++++++++++++-------
Mdrivers/infiniband/hw/hfi1/iowait.c | 34+++++++++++++++++++++++++++++++++-
Mdrivers/infiniband/hw/hfi1/iowait.h | 99+++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
Adrivers/infiniband/hw/hfi1/opfn.c | 323+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adrivers/infiniband/hw/hfi1/opfn.h | 85+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdrivers/infiniband/hw/hfi1/pio.c | 18++++++++++++------
Mdrivers/infiniband/hw/hfi1/qp.c | 76+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mdrivers/infiniband/hw/hfi1/qp.h | 7+++++++
Mdrivers/infiniband/hw/hfi1/rc.c | 1141+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
Adrivers/infiniband/hw/hfi1/rc.h | 51+++++++++++++++++++++++++++++++++++++++++++++++++++
Mdrivers/infiniband/hw/hfi1/ruc.c | 48++++++++++++++++++++++++++++++++++--------------
Mdrivers/infiniband/hw/hfi1/sdma.c | 24+++++++++++++++---------
Mdrivers/infiniband/hw/hfi1/sdma_txreq.h | 1+
Mdrivers/infiniband/hw/hfi1/sysfs.c | 16++++++++--------
Mdrivers/infiniband/hw/hfi1/tid_rdma.c | 5418+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdrivers/infiniband/hw/hfi1/tid_rdma.h | 311++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mdrivers/infiniband/hw/hfi1/trace.c | 118+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdrivers/infiniband/hw/hfi1/trace.h | 1+
Mdrivers/infiniband/hw/hfi1/trace_ibhdrs.h | 8++++++++
Mdrivers/infiniband/hw/hfi1/trace_rc.h | 48++++++++++++++++++++++++++++++++++++++++++++++++
Mdrivers/infiniband/hw/hfi1/trace_rx.h | 107+------------------------------------------------------------------------------
Adrivers/infiniband/hw/hfi1/trace_tid.h | 1610+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdrivers/infiniband/hw/hfi1/trace_tx.h | 18++++++++++++++++--
Mdrivers/infiniband/hw/hfi1/uc.c | 3++-
Mdrivers/infiniband/hw/hfi1/ud.c | 24++----------------------
Mdrivers/infiniband/hw/hfi1/user_exp_rcv.h | 1-
Mdrivers/infiniband/hw/hfi1/user_pages.c | 12+++---------
Mdrivers/infiniband/hw/hfi1/user_sdma.c | 9++++++---
Mdrivers/infiniband/hw/hfi1/verbs.c | 210++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
Mdrivers/infiniband/hw/hfi1/verbs.h | 104++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mdrivers/infiniband/hw/hfi1/verbs_txreq.h | 1+
Mdrivers/infiniband/hw/hfi1/vnic_sdma.c | 6++++--
Mdrivers/infiniband/hw/hns/Kconfig | 1-
Mdrivers/infiniband/hw/hns/Makefile | 2+-
Mdrivers/infiniband/hw/hns/hns_roce_cmd.c | 32++++++++++++++++++++++++--------
Mdrivers/infiniband/hw/hns/hns_roce_cmd.h | 12++++++++++++
Mdrivers/infiniband/hw/hns/hns_roce_cq.c | 9+++++----
Mdrivers/infiniband/hw/hns/hns_roce_db.c | 6+++---
Mdrivers/infiniband/hw/hns/hns_roce_device.h | 63++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mdrivers/infiniband/hw/hns/hns_roce_hem.c | 68+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mdrivers/infiniband/hw/hns/hns_roce_hem.h | 3+++
Mdrivers/infiniband/hw/hns/hns_roce_hw_v1.c | 36++++++++++++++++++++----------------
Mdrivers/infiniband/hw/hns/hns_roce_hw_v2.c | 596+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
Mdrivers/infiniband/hw/hns/hns_roce_hw_v2.h | 92+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Mdrivers/infiniband/hw/hns/hns_roce_main.c | 88+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Mdrivers/infiniband/hw/hns/hns_roce_mr.c | 95++++++++++++++++++++++++++++++++++---------------------------------------------
Mdrivers/infiniband/hw/hns/hns_roce_pd.c | 25++++++++-----------------
Mdrivers/infiniband/hw/hns/hns_roce_qp.c | 92++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
Mdrivers/infiniband/hw/hns/hns_roce_srq.c | 16++++++++--------
Mdrivers/infiniband/hw/i40iw/Makefile | 2+-
Mdrivers/infiniband/hw/i40iw/i40iw_utils.c | 1-
Mdrivers/infiniband/hw/i40iw/i40iw_verbs.c | 137++++++++++++++++++++++++++++++-------------------------------------------------
Mdrivers/infiniband/hw/mlx4/Kconfig | 1-
Mdrivers/infiniband/hw/mlx4/cm.c | 2+-
Mdrivers/infiniband/hw/mlx4/cq.c | 19+++++++++----------
Mdrivers/infiniband/hw/mlx4/doorbell.c | 6+++---
Mdrivers/infiniband/hw/mlx4/main.c | 77++++++++++++++++++++++++++++++-----------------------------------------------
Mdrivers/infiniband/hw/mlx4/mlx4_ib.h | 3++-
Mdrivers/infiniband/hw/mlx4/mr.c | 13++++++-------
Mdrivers/infiniband/hw/mlx4/qp.c | 84+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mdrivers/infiniband/hw/mlx4/srq.c | 12+++++++-----
Mdrivers/infiniband/hw/mlx5/Kconfig | 1-
Mdrivers/infiniband/hw/mlx5/cong.c | 15+++++----------
Mdrivers/infiniband/hw/mlx5/cq.c | 15+++++++--------
Mdrivers/infiniband/hw/mlx5/devx.c | 463+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
Mdrivers/infiniband/hw/mlx5/doorbell.c | 6+++---
Mdrivers/infiniband/hw/mlx5/ib_rep.c | 6++++--
Mdrivers/infiniband/hw/mlx5/main.c | 249++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Mdrivers/infiniband/hw/mlx5/mem.c | 5+----
Mdrivers/infiniband/hw/mlx5/mlx5_ib.h | 41+++++++++++++++++++++++++++--------------
Mdrivers/infiniband/hw/mlx5/mr.c | 126+++++++++++++++++++++++++++++--------------------------------------------------
Mdrivers/infiniband/hw/mlx5/odp.c | 316++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Mdrivers/infiniband/hw/mlx5/qp.c | 308++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
Mdrivers/infiniband/hw/mlx5/srq.c | 11+++++------
Mdrivers/infiniband/hw/mlx5/srq.h | 2--
Mdrivers/infiniband/hw/mlx5/srq_cmd.c | 16+++++++---------
Mdrivers/infiniband/hw/mthca/mthca_main.c | 2+-
Mdrivers/infiniband/hw/mthca/mthca_provider.c | 139+++++++++++++++++++++++++++++++++----------------------------------------------
Mdrivers/infiniband/hw/mthca/mthca_qp.c | 13+++++++++----
Mdrivers/infiniband/hw/mthca/mthca_srq.c | 21+++++++++++++--------
Mdrivers/infiniband/hw/nes/Kconfig | 2+-
Mdrivers/infiniband/hw/nes/nes_verbs.c | 313+++++++++++++++++++++++++++++++------------------------------------------------
Mdrivers/infiniband/hw/nes/nes_verbs.h | 1-
Mdrivers/infiniband/hw/ocrdma/Makefile | 2+-
Mdrivers/infiniband/hw/ocrdma/ocrdma_main.c | 12++++++++----
Mdrivers/infiniband/hw/ocrdma/ocrdma_stats.c | 67++++++++++++++++++++++---------------------------------------------
Mdrivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 189+++++++++++++++++++++++++++++++++----------------------------------------------
Mdrivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 11+++++------
Mdrivers/infiniband/hw/qedr/main.c | 9++++++---
Mdrivers/infiniband/hw/qedr/qedr_iw_cm.c | 2+-
Mdrivers/infiniband/hw/qedr/verbs.c | 192+++++++++++++++++++++++++++++++------------------------------------------------
Mdrivers/infiniband/hw/qedr/verbs.h | 10+++++-----
Mdrivers/infiniband/hw/qib/qib_debugfs.c | 27++++++++-------------------
Mdrivers/infiniband/hw/qib/qib_rc.c | 7+------
Mdrivers/infiniband/hw/qib/qib_sdma.c | 26++------------------------
Mdrivers/infiniband/hw/qib/qib_sysfs.c | 18+++++++++---------
Mdrivers/infiniband/hw/qib/qib_ud.c | 6+-----
Mdrivers/infiniband/hw/qib/qib_user_pages.c | 75++++++++++++++++++++++++++++-----------------------------------------------
Mdrivers/infiniband/hw/qib/qib_verbs.c | 20++++----------------
Mdrivers/infiniband/hw/usnic/Makefile | 2+-
Mdrivers/infiniband/hw/usnic/usnic_debugfs.c | 26--------------------------
Mdrivers/infiniband/hw/usnic/usnic_ib_main.c | 57+++++++++++++++++++++++++++------------------------------
Mdrivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 26+++++++++++---------------
Mdrivers/infiniband/hw/usnic/usnic_ib_verbs.c | 114+++++++++++--------------------------------------------------------------------
Mdrivers/infiniband/hw/usnic/usnic_ib_verbs.h | 28+++++-----------------------
Mdrivers/infiniband/hw/usnic/usnic_uiom.c | 65+++++++++--------------------------------------------------------
Mdrivers/infiniband/hw/usnic/usnic_uiom.h | 1-
Mdrivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 2+-
Mdrivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h | 15++++++++++-----
Mdrivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 12+++++++++---
Mdrivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c | 21++++++++-------------
Mdrivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c | 3+--
Mdrivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 6++----
Mdrivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c | 4+---
Mdrivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 98++++++++++++++++++++++++++++---------------------------------------------------
Mdrivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 12+++++-------
Mdrivers/infiniband/sw/rdmavt/mr.c | 21+++++++++------------
Mdrivers/infiniband/sw/rdmavt/pd.c | 29++++++++---------------------
Mdrivers/infiniband/sw/rdmavt/pd.h | 7+++----
Mdrivers/infiniband/sw/rdmavt/qp.c | 104++++++++++++++++++++++++++++++++++++++++---------------------------------------
Mdrivers/infiniband/sw/rdmavt/rc.c | 13+++++++++++++
Mdrivers/infiniband/sw/rdmavt/srq.c | 5++++-
Mdrivers/infiniband/sw/rdmavt/trace_cq.h | 10++++++++--
Mdrivers/infiniband/sw/rdmavt/vt.c | 34++++++++++++----------------------
Mdrivers/infiniband/sw/rxe/rxe.c | 67+++++++++++++++++++++++++++++++++++++------------------------------
Mdrivers/infiniband/sw/rxe/rxe.h | 16+++++++++-------
Mdrivers/infiniband/sw/rxe/rxe_av.c | 7+++++++
Mdrivers/infiniband/sw/rxe/rxe_comp.c | 6+++---
Mdrivers/infiniband/sw/rxe/rxe_loc.h | 9++++-----
Mdrivers/infiniband/sw/rxe/rxe_mr.c | 15+++++++--------
Mdrivers/infiniband/sw/rxe/rxe_net.c | 97+++++++++++++++++++------------------------------------------------------------
Mdrivers/infiniband/sw/rxe/rxe_net.h | 2+-
Mdrivers/infiniband/sw/rxe/rxe_param.h | 3++-
Mdrivers/infiniband/sw/rxe/rxe_pool.c | 77++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Mdrivers/infiniband/sw/rxe/rxe_pool.h | 4++++
Mdrivers/infiniband/sw/rxe/rxe_qp.c | 15+++++++--------
Mdrivers/infiniband/sw/rxe/rxe_recv.c | 12+++++-------
Mdrivers/infiniband/sw/rxe/rxe_resp.c | 3+--
Mdrivers/infiniband/sw/rxe/rxe_sysfs.c | 40+++++++++++++++++-----------------------
Mdrivers/infiniband/sw/rxe/rxe_verbs.c | 103++++++++++++++++++++++++++++++++-----------------------------------------------
Mdrivers/infiniband/sw/rxe/rxe_verbs.h | 9+++------
Mdrivers/infiniband/ulp/ipoib/ipoib.h | 4++--
Mdrivers/infiniband/ulp/ipoib/ipoib_fs.c | 7+------
Mdrivers/infiniband/ulp/ipoib/ipoib_main.c | 14++++++--------
Mdrivers/infiniband/ulp/iser/iscsi_iser.h | 2+-
Mdrivers/infiniband/ulp/iser/iser_memory.c | 19+++++++++----------
Mdrivers/infiniband/ulp/isert/Makefile | 1-
Mdrivers/infiniband/ulp/srp/ib_srp.c | 26++++++++++++--------------
Mdrivers/infiniband/ulp/srpt/Makefile | 1-
Mdrivers/media/pci/intel/ipu3/ipu3-cio2.c | 4++--
Mdrivers/misc/mic/scif/scif_rma.c | 38++++++++++++--------------------------
Mdrivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 3---
Mdrivers/net/ethernet/chelsio/cxgb4/t4_msg.h | 8++++++++
Mdrivers/net/ethernet/chelsio/cxgb4/t4_tcb.h | 12++++++++++++
Mdrivers/net/ethernet/mellanox/mlx5/core/main.c | 85++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
Mfs/proc/task_mmu.c | 2+-
Minclude/linux/cgroup_rdma.h | 2+-
Minclude/linux/mlx5/driver.h | 5+----
Minclude/linux/mm_types.h | 2+-
Minclude/linux/scatterlist.h | 49++++++++++++++++++++++++++++++++++++++++---------
Minclude/rdma/ib_hdrs.h | 14+++++++++++++-
Minclude/rdma/ib_mad.h | 5++---
Minclude/rdma/ib_umem.h | 8+++++---
Minclude/rdma/ib_umem_odp.h | 34++++++++++++++--------------------
Minclude/rdma/ib_verbs.h | 274++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
Minclude/rdma/iw_cm.h | 16+++++++++++++++-
Minclude/rdma/iw_portmap.h | 144+++----------------------------------------------------------------------------
Minclude/rdma/rdma_cm.h | 1+
Minclude/rdma/rdma_netlink.h | 11+++++++++++
Minclude/rdma/rdma_vt.h | 30+++++++++++++++++++++++-------
Minclude/rdma/rdmavt_qp.h | 20+++++++++++++++++++-
Minclude/rdma/restrack.h | 58++++++++++------------------------------------------------
Ainclude/rdma/tid_rdma_defs.h | 108+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/rdma/uverbs_ioctl.h | 18++++++++++++++++++
Minclude/rdma/uverbs_std_types.h | 18+++++++++++++-----
Minclude/rdma/uverbs_types.h | 1+
Minclude/uapi/rdma/bnxt_re-abi.h | 11+++++++++++
Minclude/uapi/rdma/ib_user_verbs.h | 2++
Minclude/uapi/rdma/mlx5_user_ioctl_cmds.h | 18++++++++++++++++++
Minclude/uapi/rdma/mlx5_user_ioctl_verbs.h | 5+++++
Minclude/uapi/rdma/rdma_netlink.h | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Minclude/uapi/rdma/rdma_user_cm.h | 4++++
Minclude/uapi/rdma/rdma_user_rxe.h | 3+--
Mkernel/cgroup/rdma.c | 5+----
Mkernel/events/core.c | 8++++----
Mkernel/fork.c | 2+-
Mlib/irq_poll.c | 2+-
Mlib/scatterlist.c | 26++++++++++++++++++++++++++
Mmm/debug.c | 5+++--
Mnet/rds/ib.h | 12++++--------
Mnet/rds/ib_fmr.c | 8++++----
Mnet/rds/ib_frmr.c | 4++--
Mnet/rds/ib_recv.c | 8+++-----
Mnet/rds/ib_send.c | 15+++++++--------
264 files changed, 16710 insertions(+), 5014 deletions(-)

diff --git a/.clang-format b/.clang-format @@ -240,6 +240,7 @@ ForEachMacros: - 'for_each_set_bit' - 'for_each_set_bit_from' - 'for_each_sg' + - 'for_each_sg_dma_page' - 'for_each_sg_page' - 'for_each_sibling_event' - '__for_each_thread' @@ -360,6 +361,7 @@ ForEachMacros: - 'radix_tree_for_each_slot' - 'radix_tree_for_each_tagged' - 'rbtree_postorder_for_each_entry_safe' + - 'rdma_for_each_port' - 'resource_list_for_each_entry' - 'resource_list_for_each_entry_safe' - 'rhl_for_each_entry_rcu' diff --git a/Documentation/infiniband/user_verbs.txt b/Documentation/infiniband/user_verbs.txt @@ -46,11 +46,11 @@ Memory pinning I/O targets be kept resident at the same physical address. The ib_uverbs module manages pinning and unpinning memory regions via get_user_pages() and put_page() calls. It also accounts for the - amount of memory pinned in the process's locked_vm, and checks that + amount of memory pinned in the process's pinned_vm, and checks that unprivileged processes do not exceed their RLIMIT_MEMLOCK limit. Pages that are pinned multiple times are counted each time they are - pinned, so the value of locked_vm may be an overestimate of the + pinned, so the value of pinned_vm may be an overestimate of the number of pages pinned by a process. /dev files diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c @@ -311,7 +311,13 @@ static dma_addr_t __vmw_piter_dma_addr(struct vmw_piter *viter) static dma_addr_t __vmw_piter_sg_addr(struct vmw_piter *viter) { - return sg_page_iter_dma_address(&viter->iter); + /* + * FIXME: This driver wrongly mixes DMA and CPU SG list iteration and + * needs revision. See + * https://lore.kernel.org/lkml/20190104223531.GA1705@ziepe.ca/ + */ + return sg_page_iter_dma_address( + container_of(&viter->iter, struct sg_dma_page_iter, base)); } diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig @@ -89,6 +89,7 @@ config INFINIBAND_ADDR_TRANS_CONFIGFS This allows the user to config the default GID type that the CM uses for each device, when initiaing new connections. +if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS source "drivers/infiniband/hw/mthca/Kconfig" source "drivers/infiniband/hw/qib/Kconfig" source "drivers/infiniband/hw/cxgb3/Kconfig" @@ -101,6 +102,12 @@ source "drivers/infiniband/hw/ocrdma/Kconfig" source "drivers/infiniband/hw/vmw_pvrdma/Kconfig" source "drivers/infiniband/hw/usnic/Kconfig" source "drivers/infiniband/hw/hns/Kconfig" +source "drivers/infiniband/hw/bnxt_re/Kconfig" +source "drivers/infiniband/hw/hfi1/Kconfig" +source "drivers/infiniband/hw/qedr/Kconfig" +source "drivers/infiniband/sw/rdmavt/Kconfig" +source "drivers/infiniband/sw/rxe/Kconfig" +endif source "drivers/infiniband/ulp/ipoib/Kconfig" @@ -111,13 +118,5 @@ source "drivers/infiniband/ulp/iser/Kconfig" source "drivers/infiniband/ulp/isert/Kconfig" source "drivers/infiniband/ulp/opa_vnic/Kconfig" -source "drivers/infiniband/sw/rdmavt/Kconfig" -source "drivers/infiniband/sw/rxe/Kconfig" - -source "drivers/infiniband/hw/hfi1/Kconfig" - -source "drivers/infiniband/hw/qedr/Kconfig" - -source "drivers/infiniband/hw/bnxt_re/Kconfig" endif # INFINIBAND diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile @@ -15,8 +15,6 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ nldev.o restrack.o ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o -ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o -ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o ib_cm-y := cm.o @@ -39,3 +37,5 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ uverbs_uapi.o uverbs_std_types_device.o +ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o +ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c @@ -185,7 +185,7 @@ EXPORT_SYMBOL(ib_cache_gid_parse_type_str); static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port) { - return device->cache.ports[port - rdma_start_port(device)].gid; + return device->port_data[port].cache.gid; } static bool is_gid_entry_free(const struct ib_gid_table_entry *entry) @@ -547,21 +547,19 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, unsigned long mask; int ret; - if (ib_dev->ops.get_netdev) { - idev = ib_dev->ops.get_netdev(ib_dev, port); - if (idev && attr->ndev != idev) { - union ib_gid default_gid; + idev = ib_device_get_netdev(ib_dev, port); + if (idev && attr->ndev != idev) { + union ib_gid default_gid; - /* Adding default GIDs in not permitted */ - make_default_gid(idev, &default_gid); - if (!memcmp(gid, &default_gid, sizeof(*gid))) { - dev_put(idev); - return -EPERM; - } - } - if (idev) + /* Adding default GIDs is not permitted */ + make_default_gid(idev, &default_gid); + if (!memcmp(gid, &default_gid, sizeof(*gid))) { dev_put(idev); + return -EPERM; + } } + if (idev) + dev_put(idev); mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | @@ -765,7 +763,7 @@ err_free_table: return NULL; } -static void release_gid_table(struct ib_device *device, u8 port, +static void release_gid_table(struct ib_device *device, struct ib_gid_table *table) { bool leak = false; @@ -863,31 +861,27 @@ static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port, static void gid_table_release_one(struct ib_device *ib_dev) { - struct ib_gid_table *table; - u8 port; + unsigned int p; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - table = ib_dev->cache.ports[port].gid; - release_gid_table(ib_dev, port, table); - ib_dev->cache.ports[port].gid = NULL; + rdma_for_each_port (ib_dev, p) { + release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid); + ib_dev->port_data[p].cache.gid = NULL; } } static int _gid_table_setup_one(struct ib_device *ib_dev) { - u8 port; struct ib_gid_table *table; + unsigned int rdma_port; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - u8 rdma_port = port + rdma_start_port(ib_dev); - - table = alloc_gid_table( - ib_dev->port_immutable[rdma_port].gid_tbl_len); + rdma_for_each_port (ib_dev, rdma_port) { + table = alloc_gid_table( + ib_dev->port_data[rdma_port].immutable.gid_tbl_len); if (!table) goto rollback_table_setup; gid_table_reserve_default(ib_dev, rdma_port, table); - ib_dev->cache.ports[port].gid = table; + ib_dev->port_data[rdma_port].cache.gid = table; } return 0; @@ -898,14 +892,11 @@ rollback_table_setup: static void gid_table_cleanup_one(struct ib_device *ib_dev) { - struct ib_gid_table *table; - u8 port; + unsigned int p; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - table = ib_dev->cache.ports[port].gid; - cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), - table); - } + rdma_for_each_port (ib_dev, p) + cleanup_gid_table_port(ib_dev, p, + ib_dev->port_data[p].cache.gid); } static int gid_table_setup_one(struct ib_device *ib_dev) @@ -983,17 +974,17 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; - u8 p; + unsigned int p; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; - for (p = 0; p < device->phys_port_cnt; p++) { + rdma_for_each_port(device, p) { struct ib_gid_table *table; unsigned long flags; int index; - table = device->cache.ports[p].gid; + table = device->port_data[p].cache.gid; read_lock_irqsave(&table->rwlock, flags); index = find_gid(table, gid, &gid_attr_val, false, mask, NULL); if (index >= 0) { @@ -1025,7 +1016,7 @@ int ib_get_cached_pkey(struct ib_device *device, read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + cache = device->port_data[port_num].cache.pkey; if (index < 0 || index >= cache->table_len) ret = -EINVAL; @@ -1043,14 +1034,12 @@ int ib_get_cached_subnet_prefix(struct ib_device *device, u64 *sn_pfx) { unsigned long flags; - int p; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - p = port_num - rdma_start_port(device); read_lock_irqsave(&device->cache.lock, flags); - *sn_pfx = device->cache.ports[p].subnet_prefix; + *sn_pfx = device->port_data[port_num].cache.subnet_prefix; read_unlock_irqrestore(&device->cache.lock, flags); return 0; @@ -1073,7 +1062,7 @@ int ib_find_cached_pkey(struct ib_device *device, read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + cache = device->port_data[port_num].cache.pkey; *index = -1; @@ -1113,7 +1102,7 @@ int ib_find_exact_cached_pkey(struct ib_device *device, read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + cache = device->port_data[port_num].cache.pkey; *index = -1; @@ -1141,7 +1130,7 @@ int ib_get_cached_lmc(struct ib_device *device, return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - *lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc; + *lmc = device->port_data[port_num].cache.lmc; read_unlock_irqrestore(&device->cache.lock, flags); return ret; @@ -1159,8 +1148,7 @@ int ib_get_cached_port_state(struct ib_device *device, return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - *port_state = device->cache.ports[port_num - - rdma_start_port(device)].port_state; + *port_state = device->port_data[port_num].cache.port_state; read_unlock_irqrestore(&device->cache.lock, flags); return ret; @@ -1361,16 +1349,13 @@ static void ib_cache_update(struct ib_device *device, write_lock_irq(&device->cache.lock); - old_pkey_cache = device->cache.ports[port - - rdma_start_port(device)].pkey; + old_pkey_cache = device->port_data[port].cache.pkey; - device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache; - device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc; - device->cache.ports[port - rdma_start_port(device)].port_state = - tprops->state; + device->port_data[port].cache.pkey = pkey_cache; + device->port_data[port].cache.lmc = tprops->lmc; + device->port_data[port].cache.port_state = tprops->state; - device->cache.ports[port - rdma_start_port(device)].subnet_prefix = - tprops->subnet_prefix; + device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix; write_unlock_irq(&device->cache.lock); if (enforce_security) @@ -1428,27 +1413,17 @@ static void ib_cache_event(struct ib_event_handler *handler, int ib_cache_setup_one(struct ib_device *device) { - int p; + unsigned int p; int err; rwlock_init(&device->cache.lock); - device->cache.ports = - kcalloc(rdma_end_port(device) - rdma_start_port(device) + 1, - sizeof(*device->cache.ports), - GFP_KERNEL); - if (!device->cache.ports) - return -ENOMEM; - err = gid_table_setup_one(device); - if (err) { - kfree(device->cache.ports); - device->cache.ports = NULL; + if (err) return err; - } - for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) - ib_cache_update(device, p + rdma_start_port(device), true); + rdma_for_each_port (device, p) + ib_cache_update(device, p, true); INIT_IB_EVENT_HANDLER(&device->cache.event_handler, device, ib_cache_event); @@ -1458,7 +1433,7 @@ int ib_cache_setup_one(struct ib_device *device) void ib_cache_release_one(struct ib_device *device) { - int p; + unsigned int p; /* * The release function frees all the cache elements. @@ -1466,11 +1441,10 @@ void ib_cache_release_one(struct ib_device *device) * all the device's resources when the cache could no * longer be accessed. */ - for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) - kfree(device->cache.ports[p].pkey); + rdma_for_each_port (device, p) + kfree(device->port_data[p].cache.pkey); gid_table_release_one(device); - kfree(device->cache.ports); } void ib_cache_cleanup_one(struct ib_device *device) diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c @@ -21,12 +21,11 @@ * Register with the rdma cgroup. Should be called before * exposing rdma device to user space applications to avoid * resource accounting leak. - * Returns 0 on success or otherwise failure code. */ -int ib_device_register_rdmacg(struct ib_device *device) +void ib_device_register_rdmacg(struct ib_device *device) { device->cg_device.name = device->name; - return rdmacg_register_device(&device->cg_device); + rdmacg_register_device(&device->cg_device); } /** diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c @@ -4052,8 +4052,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, atomic_long_inc(&port->counter_group[CM_RECV]. counter[attr_id - CM_ATTR_ID_OFFSET]); - work = kmalloc(sizeof(*work) + sizeof(struct sa_path_rec) * paths, - GFP_KERNEL); + work = kmalloc(struct_size(work, path, paths), GFP_KERNEL); if (!work) { ib_free_recv_mad(mad_recv_wc); return; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c @@ -659,7 +659,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; - u8 port; + unsigned int port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) @@ -673,8 +673,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) { - for (port = rdma_start_port(cma_dev->device); - port <= rdma_end_port(cma_dev->device); port++) { + rdma_for_each_port (cma_dev->device, port) { gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; @@ -888,6 +887,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; + id_priv->timeout_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); @@ -1130,6 +1130,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, } else ret = -ENOSYS; + if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) + qp_attr->timeout = id_priv->timeout; + return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); @@ -2410,6 +2413,7 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) return PTR_ERR(id); id->tos = id_priv->tos; + id->tos_set = id_priv->tos_set; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), @@ -2462,6 +2466,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; + dev_id_priv->tos_set = id_priv->tos_set; + dev_id_priv->tos = id_priv->tos; ret = rdma_listen(id, id_priv->backlog); if (ret) @@ -2490,6 +2496,34 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos) } EXPORT_SYMBOL(rdma_set_service_type); +/** + * rdma_set_ack_timeout() - Set the ack timeout of QP associated + * with a connection identifier. + * @id: Communication identifier to associated with service type. + * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. + * + * This function should be called before rdma_connect() on active side, + * and on passive side before rdma_accept(). It is applicable to primary + * path only. The timeout will affect the local side of the QP, it is not + * negotiated with remote side and zero disables the timer. + * + * Return: 0 for success + */ +int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) +{ + struct rdma_id_private *id_priv; + + if (id->qp_type != IB_QPT_RC) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->timeout = timeout; + id_priv->timeout_set = true; + + return 0; +} +EXPORT_SYMBOL(rdma_set_ack_timeout); + static void cma_query_handler(int status, struct sa_path_rec *path_rec, void *context) { @@ -2966,13 +3000,22 @@ static void addr_handler(int status, struct sockaddr *src_addr, { struct rdma_id_private *id_priv = context; struct rdma_cm_event event = {}; + struct sockaddr *addr; + struct sockaddr_storage old_addr; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; - memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); + /* + * Store the previous src address, so that if we fail to acquire + * matching rdma device, old address can be restored back, which helps + * to cancel the cma listen operation correctly. + */ + addr = cma_src_addr(id_priv); + memcpy(&old_addr, addr, rdma_addr_size(addr)); + memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { status = cma_acquire_dev_by_src_ip(id_priv); if (status) @@ -2983,6 +3026,8 @@ static void addr_handler(int status, struct sockaddr *src_addr, } if (status) { + memcpy(addr, &old_addr, + rdma_addr_size((struct sockaddr *)&old_addr)); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; @@ -3798,6 +3843,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, return PTR_ERR(cm_id); cm_id->tos = id_priv->tos; + cm_id->tos_set = id_priv->tos_set; id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), @@ -4501,7 +4547,7 @@ static void cma_add_one(struct ib_device *device) if (!cma_dev->default_roce_tos) goto free_gid_type; - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) @@ -4605,85 +4651,6 @@ static void cma_remove_one(struct ib_device *device, void *client_data) kfree(cma_dev); } -static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct nlmsghdr *nlh; - struct rdma_cm_id_stats *id_stats; - struct rdma_id_private *id_priv; - struct rdma_cm_id *id = NULL; - struct cma_device *cma_dev; - int i_dev = 0, i_id = 0; - - /* - * We export all of the IDs as a sequence of messages. Each - * ID gets its own netlink message. - */ - mutex_lock(&lock); - - list_for_each_entry(cma_dev, &dev_list, list) { - if (i_dev < cb->args[0]) { - i_dev++; - continue; - } - - i_id = 0; - list_for_each_entry(id_priv, &cma_dev->id_list, list) { - if (i_id < cb->args[1]) { - i_id++; - continue; - } - - id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, - sizeof *id_stats, RDMA_NL_RDMA_CM, - RDMA_NL_RDMA_CM_ID_STATS, - NLM_F_MULTI); - if (!id_stats) - goto out; - - memset(id_stats, 0, sizeof *id_stats); - id = &id_priv->id; - id_stats->node_type = id->route.addr.dev_addr.dev_type; - id_stats->port_num = id->port_num; - id_stats->bound_dev_if = - id->route.addr.dev_addr.bound_dev_if; - - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_src_addr(id_priv)), - cma_src_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) - goto out; - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_dst_addr(id_priv)), - cma_dst_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) - goto out; - - id_stats->pid = task_pid_vnr(id_priv->res.task); - id_stats->port_space = id->ps; - id_stats->cm_state = id_priv->state; - id_stats->qp_num = id_priv->qp_num; - id_stats->qp_type = id->qp_type; - - i_id++; - nlmsg_end(skb, nlh); - } - - cb->args[1] = 0; - i_dev++; - } - -out: - mutex_unlock(&lock); - cb->args[0] = i_dev; - cb->args[1] = i_id; - - return skb->len; -} - -static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, -}; - static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); @@ -4732,7 +4699,6 @@ static int __init cma_init(void) if (ret) goto err; - rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table); cma_configfs_init(); return 0; @@ -4748,7 +4714,6 @@ err_wq: static void __exit cma_cleanup(void) { cma_configfs_exit(); - rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); @@ -4756,7 +4721,5 @@ static void __exit cma_cleanup(void) destroy_workqueue(cma_wq); } -MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1); - module_init(cma_init); module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h @@ -84,9 +84,11 @@ struct rdma_id_private { u32 options; u8 srq; u8 tos; - bool tos_set; + u8 tos_set:1; + u8 timeout_set:1; u8 reuseaddr; u8 afonly; + u8 timeout; enum ib_gid_type gid_type; /* diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h @@ -54,9 +54,9 @@ struct pkey_index_qp_list { struct list_head qp_list; }; -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)); +extern const struct attribute_group ib_dev_attr_group; + +int ib_device_register_sysfs(struct ib_device *device); void ib_device_unregister_sysfs(struct ib_device *device); int ib_device_rename(struct ib_device *ibdev, const char *name); @@ -66,6 +66,9 @@ typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); +struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, + unsigned int port); + void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, @@ -117,7 +120,7 @@ void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); #ifdef CONFIG_CGROUP_RDMA -int ib_device_register_rdmacg(struct ib_device *device); +void ib_device_register_rdmacg(struct ib_device *device); void ib_device_unregister_rdmacg(struct ib_device *device); int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, @@ -128,21 +131,26 @@ void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index); #else -static inline int ib_device_register_rdmacg(struct ib_device *device) -{ return 0; } +static inline void ib_device_register_rdmacg(struct ib_device *device) +{ +} static inline void ib_device_unregister_rdmacg(struct ib_device *device) -{ } +{ +} static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) -{ return 0; } +{ + return 0; +} static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) -{ } +{ +} #endif static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, @@ -178,7 +186,7 @@ int ib_get_cached_subnet_prefix(struct ib_device *device, u64 *sn_pfx); #ifdef CONFIG_SECURITY_INFINIBAND -void ib_security_destroy_port_pkey_list(struct ib_device *device); +void ib_security_release_port_pkey_list(struct ib_device *device); void ib_security_cache_change(struct ib_device *device, u8 port_num, @@ -199,8 +207,9 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, enum ib_qp_type qp_type); void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); +void ib_mad_agent_security_change(void); #else -static inline void ib_security_destroy_port_pkey_list(struct ib_device *device) +static inline void ib_security_release_port_pkey_list(struct ib_device *device) { } @@ -264,6 +273,10 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, { return 0; } + +static inline void ib_mad_agent_security_change(void) +{ +} #endif struct ib_device *ib_device_get_by_index(u32 ifindex); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c @@ -37,54 +37,111 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> -#include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/security.h> #include <linux/notifier.h> +#include <linux/hashtable.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include "core_priv.h" +#include "restrack.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); -struct ib_client_data { - struct list_head list; - struct ib_client *client; - void * data; - /* The device or client is going down. Do not call client or device - * callbacks other than remove(). */ - bool going_down; -}; - struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); -/* The device_list and client_list contain devices and clients after their - * registration has completed, and the devices and clients are removed - * during unregistration. */ -static LIST_HEAD(device_list); -static LIST_HEAD(client_list); +/* + * Each of the three rwsem locks (devices, clients, client_data) protects the + * xarray of the same name. Specifically it allows the caller to assert that + * the MARK will/will not be changing under the lock, and for devices and + * clients, that the value in the xarray is still a valid pointer. Change of + * the MARK is linked to the object state, so holding the lock and testing the + * MARK also asserts that the contained object is in a certain state. + * + * This is used to build a two stage register/unregister flow where objects + * can continue to be in the xarray even though they are still in progress to + * register/unregister. + * + * The xarray itself provides additional locking, and restartable iteration, + * which is also relied on. + * + * Locks should not be nested, with the exception of client_data, which is + * allowed to nest under the read side of the other two locks. + * + * The devices_rwsem also protects the device name list, any change or + * assignment of device name must also hold the write side to guarantee unique + * names. + */ /* - * device_mutex and lists_rwsem protect access to both device_list and - * client_list. device_mutex protects writer access by device and client - * registration / de-registration. lists_rwsem protects reader access to - * these lists. Iterators of these lists must lock it for read, while updates - * to the lists must be done with a write lock. A special case is when the - * device_mutex is locked. In this case locking the lists for read access is - * not necessary as the device_mutex implies it. + * devices contains devices that have had their names assigned. The + * devices may not be registered. Users that care about the registration + * status need to call ib_device_try_get() on the device to ensure it is + * registered, and keep it registered, for the required duration. * - * lists_rwsem also protects access to the client data list. */ -static DEFINE_MUTEX(device_mutex); -static DECLARE_RWSEM(lists_rwsem); +static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); +static DECLARE_RWSEM(devices_rwsem); +#define DEVICE_REGISTERED XA_MARK_1 +static LIST_HEAD(client_list); +#define CLIENT_REGISTERED XA_MARK_1 +static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); +static DECLARE_RWSEM(clients_rwsem); + +/* + * If client_data is registered then the corresponding client must also still + * be registered. + */ +#define CLIENT_DATA_REGISTERED XA_MARK_1 +/* + * xarray has this behavior where it won't iterate over NULL values stored in + * allocated arrays. So we need our own iterator to see all values stored in + * the array. This does the same thing as xa_for_each except that it also + * returns NULL valued entries if the array is allocating. Simplified to only + * work on simple xarrays. + */ +static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, + xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp); + void *entry; + + rcu_read_lock(); + do { + entry = xas_find_marked(&xas, ULONG_MAX, filter); + if (xa_is_zero(entry)) + break; + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + if (entry) { + *indexp = xas.xa_index; + if (xa_is_zero(entry)) + return NULL; + return entry; + } + return XA_ERROR(-ENOENT); +} +#define xan_for_each_marked(xa, index, entry, filter) \ + for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ + !xa_is_err(entry); \ + (index)++, entry = xan_find_marked(xa, &(index), filter)) + +/* RCU hash table mapping netdevice pointers to struct ib_port_data */ +static DEFINE_SPINLOCK(ndev_hash_lock); +static DECLARE_HASHTABLE(ndev_hash, 5); + +static void free_netdevs(struct ib_device *ib_dev); +static void ib_unregister_work(struct work_struct *work); +static void __ib_unregister_device(struct ib_device *device); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); @@ -94,6 +151,12 @@ static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; +/* Pointer to the RCU head at the start of the ib_port_data array */ +struct ib_port_data_rcu { + struct rcu_head rcu_head; + struct ib_port_data pdata[]; +}; + static int ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } @@ -121,30 +184,18 @@ static int ib_device_check_mandatory(struct ib_device *device) }; int i; + device->kverbs_provider = true; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) &device->ops + mandatory_table[i].offset)) { - dev_warn(&device->dev, - "Device is missing mandatory function %s\n", - mandatory_table[i].name); - return -EINVAL; + device->kverbs_provider = false; + break; } } return 0; } -static struct ib_device *__ib_device_get_by_index(u32 index) -{ - struct ib_device *device; - - list_for_each_entry(device, &device_list, core_list) - if (device->index == index) - return device; - - return NULL; -} - /* * Caller must perform ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. @@ -153,13 +204,13 @@ struct ib_device *ib_device_get_by_index(u32 index) { struct ib_device *device; - down_read(&lists_rwsem); - device = __ib_device_get_by_index(index); + down_read(&devices_rwsem); + device = xa_load(&devices, index); if (device) { if (!ib_device_try_get(device)) device = NULL; } - up_read(&lists_rwsem); + up_read(&devices_rwsem); return device; } @@ -180,28 +231,56 @@ EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; + unsigned long index; - list_for_each_entry(device, &device_list, core_list) + xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } -int ib_device_rename(struct ib_device *ibdev, const char *name) +/** + * ib_device_get_by_name - Find an IB device by name + * @name: The name to look for + * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) + * + * Find and hold an ib_device by its name. The caller must call + * ib_device_put() on the returned pointer. + */ +struct ib_device *ib_device_get_by_name(const char *name, + enum rdma_driver_id driver_id) { struct ib_device *device; - int ret = 0; - if (!strcmp(name, dev_name(&ibdev->dev))) - return ret; + down_read(&devices_rwsem); + device = __ib_device_get_by_name(name); + if (device && driver_id != RDMA_DRIVER_UNKNOWN && + device->driver_id != driver_id) + device = NULL; - mutex_lock(&device_mutex); - list_for_each_entry(device, &device_list, core_list) { - if (!strcmp(name, dev_name(&device->dev))) { - ret = -EEXIST; - goto out; - } + if (device) { + if (!ib_device_try_get(device)) + device = NULL; + } + up_read(&devices_rwsem); + return device; +} +EXPORT_SYMBOL(ib_device_get_by_name); + +int ib_device_rename(struct ib_device *ibdev, const char *name) +{ + int ret; + + down_write(&devices_rwsem); + if (!strcmp(name, dev_name(&ibdev->dev))) { + ret = 0; + goto out; + } + + if (__ib_device_get_by_name(name)) { + ret = -EEXIST; + goto out; } ret = device_rename(&ibdev->dev, name); @@ -209,53 +288,60 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) goto out; strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); out: - mutex_unlock(&device_mutex); + up_write(&devices_rwsem); return ret; } static int alloc_name(struct ib_device *ibdev, const char *name) { - unsigned long *inuse; struct ib_device *device; + unsigned long index; + struct ida inuse; + int rc; int i; - inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); - if (!inuse) - return -ENOMEM; - - list_for_each_entry(device, &device_list, core_list) { + lockdep_assert_held_exclusive(&devices_rwsem); + ida_init(&inuse); + xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; - if (i < 0 || i >= PAGE_SIZE * 8) + if (i < 0 || i >= INT_MAX) continue; snprintf(buf, sizeof buf, name, i); - if (!strcmp(buf, dev_name(&device->dev))) - set_bit(i, inuse); + if (strcmp(buf, dev_name(&device->dev)) != 0) + continue; + + rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); + if (rc < 0) + goto out; } - i = find_first_zero_bit(inuse, PAGE_SIZE * 8); - free_page((unsigned long) inuse); + rc = ida_alloc(&inuse, GFP_KERNEL); + if (rc < 0) + goto out; - return dev_set_name(&ibdev->dev, name, i); + rc = dev_set_name(&ibdev->dev, name, rc); +out: + ida_destroy(&inuse); + return rc; } static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); - WARN_ON(dev->reg_state == IB_DEV_REGISTERED); - if (dev->reg_state == IB_DEV_UNREGISTERED) { - /* - * In IB_DEV_UNINITIALIZED state, cache or port table - * is not even created. Free cache and port table only when - * device reaches UNREGISTERED state. - */ - ib_cache_release_one(dev); - kfree(dev->port_immutable); - } - kfree(dev); + free_netdevs(dev); + WARN_ON(refcount_read(&dev->refcount)); + ib_cache_release_one(dev); + ib_security_release_port_pkey_list(dev); + xa_destroy(&dev->client_data); + if (dev->port_data) + kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, + pdata[0]), + rcu_head); + kfree_rcu(dev, rcu_head); } static int ib_device_uevent(struct device *device, @@ -278,7 +364,7 @@ static struct class ib_class = { }; /** - * ib_alloc_device - allocate an IB device struct + * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct @@ -287,7 +373,7 @@ static struct class ib_class = { * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ -struct ib_device *ib_alloc_device(size_t size) +struct ib_device *_ib_alloc_device(size_t size) { struct ib_device *device; @@ -298,23 +384,32 @@ struct ib_device *ib_alloc_device(size_t size) if (!device) return NULL; - rdma_restrack_init(&device->res); + if (rdma_restrack_init(device)) { + kfree(device); + return NULL; + } device->dev.class = &ib_class; + device->groups[0] = &ib_dev_attr_group; + device->dev.groups = device->groups; device_initialize(&device->dev); - dev_set_drvdata(&device->dev, device); - INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); - rwlock_init(&device->client_data_lock); - INIT_LIST_HEAD(&device->client_data_list); + mutex_init(&device->unregistration_lock); + /* + * client_data needs to be alloc because we don't want our mark to be + * destroyed if the user stores NULL in the client data. + */ + xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); + init_rwsem(&device->client_data_rwsem); INIT_LIST_HEAD(&device->port_list); init_completion(&device->unreg_completion); + INIT_WORK(&device->unregistration_work, ib_unregister_work); return device; } -EXPORT_SYMBOL(ib_alloc_device); +EXPORT_SYMBOL(_ib_alloc_device); /** * ib_dealloc_device - free an IB device struct @@ -324,32 +419,153 @@ EXPORT_SYMBOL(ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { - WARN_ON(!list_empty(&device->client_data_list)); - WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && - device->reg_state != IB_DEV_UNINITIALIZED); - rdma_restrack_clean(&device->res); + if (device->ops.dealloc_driver) + device->ops.dealloc_driver(device); + + /* + * ib_unregister_driver() requires all devices to remain in the xarray + * while their ops are callable. The last op we call is dealloc_driver + * above. This is needed to create a fence on op callbacks prior to + * allowing the driver module to unload. + */ + down_write(&devices_rwsem); + if (xa_load(&devices, device->index) == device) + xa_erase(&devices, device->index); + up_write(&devices_rwsem); + + /* Expedite releasing netdev references */ + free_netdevs(device); + + WARN_ON(!xa_empty(&device->client_data)); + WARN_ON(refcount_read(&device->refcount)); + rdma_restrack_clean(device); + /* Balances with device_initialize */ put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); -static int add_client_context(struct ib_device *device, struct ib_client *client) +/* + * add_client_context() and remove_client_context() must be safe against + * parallel calls on the same device - registration/unregistration of both the + * device and client can be occurring in parallel. + * + * The routines need to be a fence, any caller must not return until the add + * or remove is fully completed. + */ +static int add_client_context(struct ib_device *device, + struct ib_client *client) { - struct ib_client_data *context; + int ret = 0; - context = kmalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return -ENOMEM; + if (!device->kverbs_provider && !client->no_kverbs_req) + return 0; + + down_write(&device->client_data_rwsem); + /* + * Another caller to add_client_context got here first and has already + * completely initialized context. + */ + if (xa_get_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED)) + goto out; + + ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, + GFP_KERNEL)); + if (ret) + goto out; + downgrade_write(&device->client_data_rwsem); + if (client->add) + client->add(device); + + /* Readers shall not see a client until add has been completed */ + xa_set_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED); + up_read(&device->client_data_rwsem); + return 0; - context->client = client; - context->data = NULL; - context->going_down = false; +out: + up_write(&device->client_data_rwsem); + return ret; +} - down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_add(&context->list, &device->client_data_list); - write_unlock_irq(&device->client_data_lock); - up_write(&lists_rwsem); +static void remove_client_context(struct ib_device *device, + unsigned int client_id) +{ + struct ib_client *client; + void *client_data; + + down_write(&device->client_data_rwsem); + if (!xa_get_mark(&device->client_data, client_id, + CLIENT_DATA_REGISTERED)) { + up_write(&device->client_data_rwsem); + return; + } + client_data = xa_load(&device->client_data, client_id); + xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); + client = xa_load(&clients, client_id); + downgrade_write(&device->client_data_rwsem); + + /* + * Notice we cannot be holding any exclusive locks when calling the + * remove callback as the remove callback can recurse back into any + * public functions in this module and thus try for any locks those + * functions take. + * + * For this reason clients and drivers should not call the + * unregistration functions will holdling any locks. + * + * It tempting to drop the client_data_rwsem too, but this is required + * to ensure that unregister_client does not return until all clients + * are completely unregistered, which is required to avoid module + * unloading races. + */ + if (client->remove) + client->remove(device, client_data); + + xa_erase(&device->client_data, client_id); + up_read(&device->client_data_rwsem); +} + +static int alloc_port_data(struct ib_device *device) +{ + struct ib_port_data_rcu *pdata_rcu; + unsigned int port; + + if (device->port_data) + return 0; + /* This can only be called once the physical port range is defined */ + if (WARN_ON(!device->phys_port_cnt)) + return -EINVAL; + + /* + * device->port_data is indexed directly by the port number to make + * access to this data as efficient as possible. + * + * Therefore port_data is declared as a 1 based array with potential + * empty slots at the beginning. + */ + pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, + rdma_end_port(device) + 1), + GFP_KERNEL); + if (!pdata_rcu) + return -ENOMEM; + /* + * The rcu_head is put in front of the port data array and the stored + * pointer is adjusted since we never need to see that member until + * kfree_rcu. + */ + device->port_data = pdata_rcu->pdata; + + rdma_for_each_port (device, port) { + struct ib_port_data *pdata = &device->port_data[port]; + + pdata->ib_dev = device; + spin_lock_init(&pdata->pkey_list_lock); + INIT_LIST_HEAD(&pdata->pkey_list); + spin_lock_init(&pdata->netdev_lock); + INIT_HLIST_NODE(&pdata->ndev_hash_link); + } return 0; } @@ -359,29 +575,20 @@ static int verify_immutable(const struct ib_device *dev, u8 port) rdma_max_mad_size(dev, port) != 0); } -static int read_port_immutable(struct ib_device *device) +static int setup_port_data(struct ib_device *device) { + unsigned int port; int ret; - u8 start_port = rdma_start_port(device); - u8 end_port = rdma_end_port(device); - u8 port; - /** - * device->port_immutable is indexed directly by the port number to make - * access to this data as efficient as possible. - * - * Therefore port_immutable is declared as a 1 based array with - * potential empty slots at the beginning. - */ - device->port_immutable = kcalloc(end_port + 1, - sizeof(*device->port_immutable), - GFP_KERNEL); - if (!device->port_immutable) - return -ENOMEM; + ret = alloc_port_data(device); + if (ret) + return ret; - for (port = start_port; port <= end_port; ++port) { - ret = device->ops.get_port_immutable( - device, port, &device->port_immutable[port]); + rdma_for_each_port (device, port) { + struct ib_port_data *pdata = &device->port_data[port]; + + ret = device->ops.get_port_immutable(device, port, + &pdata->immutable); if (ret) return ret; @@ -400,39 +607,16 @@ void ib_get_device_fw_str(struct ib_device *dev, char *str) } EXPORT_SYMBOL(ib_get_device_fw_str); -static int setup_port_pkey_list(struct ib_device *device) -{ - int i; - - /** - * device->port_pkey_list is indexed directly by the port number, - * Therefore it is declared as a 1 based array with potential empty - * slots at the beginning. - */ - device->port_pkey_list = kcalloc(rdma_end_port(device) + 1, - sizeof(*device->port_pkey_list), - GFP_KERNEL); - - if (!device->port_pkey_list) - return -ENOMEM; - - for (i = 0; i < (rdma_end_port(device) + 1); i++) { - spin_lock_init(&device->port_pkey_list[i].list_lock); - INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list); - } - - return 0; -} - static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; + unsigned long index; - down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) { - int i; + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + unsigned int i; - for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) { + rdma_for_each_port (dev, i) { u64 sp; int ret = ib_get_cached_subnet_prefix(dev, i, @@ -445,7 +629,7 @@ static void ib_policy_change_task(struct work_struct *work) ib_security_cache_change(dev, i, sp); } } - up_read(&lists_rwsem); + up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, @@ -455,32 +639,52 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; schedule_work(&ib_policy_change_work); + ib_mad_agent_security_change(); return NOTIFY_OK; } -/** - * __dev_new_index - allocate an device index - * - * Returns a suitable unique value for a new device interface - * number. It assumes that there are less than 2^32-1 ib devices - * will be present in the system. +/* + * Assign the unique string device name and the unique device index. This is + * undone by ib_dealloc_device. */ -static u32 __dev_new_index(void) +static int assign_name(struct ib_device *device, const char *name) { - /* - * The device index to allow stable naming. - * Similar to struct net -> ifindex. - */ - static u32 index; + static u32 last_id; + int ret; - for (;;) { - if (!(++index)) - index = 1; + down_write(&devices_rwsem); + /* Assign a unique name to the device */ + if (strchr(name, '%')) + ret = alloc_name(device, name); + else + ret = dev_set_name(&device->dev, name); + if (ret) + goto out; + + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; + } + strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); - if (!__ib_device_get_by_index(index)) - return index; + /* Cyclically allocate a user visible ID for the device */ + device->index = last_id; + ret = xa_alloc(&devices, &device->index, INT_MAX, device, GFP_KERNEL); + if (ret == -ENOSPC) { + device->index = 0; + ret = xa_alloc(&devices, &device->index, INT_MAX, device, + GFP_KERNEL); } + if (ret) + goto out; + last_id = device->index + 1; + + ret = 0; + +out: + up_write(&devices_rwsem); + return ret; } static void setup_dma_device(struct ib_device *device) @@ -518,27 +722,25 @@ static void setup_dma_device(struct ib_device *device) } } -static void cleanup_device(struct ib_device *device) -{ - ib_cache_cleanup_one(device); - ib_cache_release_one(device); - kfree(device->port_pkey_list); - kfree(device->port_immutable); -} - +/* + * setup_device() allocates memory and sets up data that requires calling the + * device ops, this is the only reason these actions are not done during + * ib_alloc_device. It is undone by ib_dealloc_device(). + */ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; + setup_dma_device(device); + ret = ib_device_check_mandatory(device); if (ret) return ret; - ret = read_port_immutable(device); + ret = setup_port_data(device); if (ret) { - dev_warn(&device->dev, - "Couldn't create per port immutable data\n"); + dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret; } @@ -547,27 +749,76 @@ static int setup_device(struct ib_device *device) if (ret) { dev_warn(&device->dev, "Couldn't query the device attributes\n"); - goto port_cleanup; + return ret; } - ret = setup_port_pkey_list(device); - if (ret) { - dev_warn(&device->dev, "Couldn't create per port_pkey_list\n"); - goto port_cleanup; + return 0; +} + +static void disable_device(struct ib_device *device) +{ + struct ib_client *client; + + WARN_ON(!refcount_read(&device->refcount)); + + down_write(&devices_rwsem); + xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); + up_write(&devices_rwsem); + + down_read(&clients_rwsem); + list_for_each_entry_reverse(client, &client_list, list) + remove_client_context(device, client->client_id); + up_read(&clients_rwsem); + + /* Pairs with refcount_set in enable_device */ + ib_device_put(device); + wait_for_completion(&device->unreg_completion); + + /* Expedite removing unregistered pointers from the hash table */ + free_netdevs(device); +} + +/* + * An enabled device is visible to all clients and to all the public facing + * APIs that return a device pointer. This always returns with a new get, even + * if it fails. + */ +static int enable_device_and_get(struct ib_device *device) +{ + struct ib_client *client; + unsigned long index; + int ret = 0; + + /* + * One ref belongs to the xa and the other belongs to this + * thread. This is needed to guard against parallel unregistration. + */ + refcount_set(&device->refcount, 2); + down_write(&devices_rwsem); + xa_set_mark(&devices, device->index, DEVICE_REGISTERED); + + /* + * By using downgrade_write() we ensure that no other thread can clear + * DEVICE_REGISTERED while we are completing the client setup. + */ + downgrade_write(&devices_rwsem); + + if (device->ops.enable_driver) { + ret = device->ops.enable_driver(device); + if (ret) + goto out; } - ret = ib_cache_setup_one(device); - if (ret) { - dev_warn(&device->dev, - "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto pkey_cleanup; + down_read(&clients_rwsem); + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { + ret = add_client_context(device, client); + if (ret) + break; } - return 0; + up_read(&clients_rwsem); -pkey_cleanup: - kfree(device->port_pkey_list); -port_cleanup: - kfree(device->port_immutable); +out: + up_read(&devices_rwsem); return ret; } @@ -579,133 +830,253 @@ port_cleanup: * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). + * + * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() + * asynchronously then the device pointer may become freed as soon as this + * function returns. */ -int ib_register_device(struct ib_device *device, const char *name, - int (*port_callback)(struct ib_device *, u8, - struct kobject *)) +int ib_register_device(struct ib_device *device, const char *name) { int ret; - struct ib_client *client; - - setup_dma_device(device); - - mutex_lock(&device_mutex); - if (strchr(name, '%')) { - ret = alloc_name(device, name); - if (ret) - goto out; - } else { - ret = dev_set_name(&device->dev, name); - if (ret) - goto out; - } - if (__ib_device_get_by_name(dev_name(&device->dev))) { - ret = -ENFILE; - goto out; - } - strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); + ret = assign_name(device, name); + if (ret) + return ret; ret = setup_device(device); if (ret) - goto out; - - device->index = __dev_new_index(); + return ret; - ret = ib_device_register_rdmacg(device); + ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, - "Couldn't register device with rdma cgroup\n"); - goto dev_cleanup; + "Couldn't set up InfiniBand P_Key/GID cache\n"); + return ret; } - ret = ib_device_register_sysfs(device, port_callback); + ib_device_register_rdmacg(device); + + ret = device_add(&device->dev); + if (ret) + goto cg_cleanup; + + ret = ib_device_register_sysfs(device); if (ret) { dev_warn(&device->dev, "Couldn't register device with driver model\n"); - goto cg_cleanup; + goto dev_cleanup; } - refcount_set(&device->refcount, 1); - device->reg_state = IB_DEV_REGISTERED; + ret = enable_device_and_get(device); + if (ret) { + void (*dealloc_fn)(struct ib_device *); - list_for_each_entry(client, &client_list, list) - if (!add_client_context(device, client) && client->add) - client->add(device); + /* + * If we hit this error flow then we don't want to + * automatically dealloc the device since the caller is + * expected to call ib_dealloc_device() after + * ib_register_device() fails. This is tricky due to the + * possibility for a parallel unregistration along with this + * error flow. Since we have a refcount here we know any + * parallel flow is stopped in disable_device and will see the + * NULL pointers, causing the responsibility to + * ib_dealloc_device() to revert back to this thread. + */ + dealloc_fn = device->ops.dealloc_driver; + device->ops.dealloc_driver = NULL; + ib_device_put(device); + __ib_unregister_device(device); + device->ops.dealloc_driver = dealloc_fn; + return ret; + } + ib_device_put(device); - down_write(&lists_rwsem); - list_add_tail(&device->core_list, &device_list); - up_write(&lists_rwsem); - mutex_unlock(&device_mutex); return 0; +dev_cleanup: + device_del(&device->dev); cg_cleanup: ib_device_unregister_rdmacg(device); -dev_cleanup: - cleanup_device(device); -out: - mutex_unlock(&device_mutex); + ib_cache_cleanup_one(device); return ret; } EXPORT_SYMBOL(ib_register_device); +/* Callers must hold a get on the device. */ +static void __ib_unregister_device(struct ib_device *ib_dev) +{ + /* + * We have a registration lock so that all the calls to unregister are + * fully fenced, once any unregister returns the device is truely + * unregistered even if multiple callers are unregistering it at the + * same time. This also interacts with the registration flow and + * provides sane semantics if register and unregister are racing. + */ + mutex_lock(&ib_dev->unregistration_lock); + if (!refcount_read(&ib_dev->refcount)) + goto out; + + disable_device(ib_dev); + ib_device_unregister_sysfs(ib_dev); + device_del(&ib_dev->dev); + ib_device_unregister_rdmacg(ib_dev); + ib_cache_cleanup_one(ib_dev); + + /* + * Drivers using the new flow may not call ib_dealloc_device except + * in error unwind prior to registration success. + */ + if (ib_dev->ops.dealloc_driver) { + WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); + ib_dealloc_device(ib_dev); + } +out: + mutex_unlock(&ib_dev->unregistration_lock); +} + /** * ib_unregister_device - Unregister an IB device - * @device:Device to unregister + * @device: The device to unregister * * Unregister an IB device. All clients will receive a remove callback. + * + * Callers should call this routine only once, and protect against races with + * registration. Typically it should only be called as part of a remove + * callback in an implementation of driver core's struct device_driver and + * related. + * + * If ops.dealloc_driver is used then ib_dev will be freed upon return from + * this function. */ -void ib_unregister_device(struct ib_device *device) +void ib_unregister_device(struct ib_device *ib_dev) { - struct ib_client_data *context, *tmp; - unsigned long flags; + get_device(&ib_dev->dev); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device); - /* - * Wait for all netlink command callers to finish working on the - * device. - */ - ib_device_put(device); - wait_for_completion(&device->unreg_completion); +/** + * ib_unregister_device_and_put - Unregister a device while holding a 'get' + * device: The device to unregister + * + * This is the same as ib_unregister_device(), except it includes an internal + * ib_device_put() that should match a 'get' obtained by the caller. + * + * It is safe to call this routine concurrently from multiple threads while + * holding the 'get'. When the function returns the device is fully + * unregistered. + * + * Drivers using this flow MUST use the driver_unregister callback to clean up + * their resources associated with the device and dealloc it. + */ +void ib_unregister_device_and_put(struct ib_device *ib_dev) +{ + WARN_ON(!ib_dev->ops.dealloc_driver); + get_device(&ib_dev->dev); + ib_device_put(ib_dev); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device_and_put); - mutex_lock(&device_mutex); +/** + * ib_unregister_driver - Unregister all IB devices for a driver + * @driver_id: The driver to unregister + * + * This implements a fence for device unregistration. It only returns once all + * devices associated with the driver_id have fully completed their + * unregistration and returned from ib_unregister_device*(). + * + * If device's are not yet unregistered it goes ahead and starts unregistering + * them. + * + * This does not block creation of new devices with the given driver_id, that + * is the responsibility of the caller. + */ +void ib_unregister_driver(enum rdma_driver_id driver_id) +{ + struct ib_device *ib_dev; + unsigned long index; - down_write(&lists_rwsem); - list_del(&device->core_list); - write_lock_irq(&device->client_data_lock); - list_for_each_entry(context, &device->client_data_list, list) - context->going_down = true; - write_unlock_irq(&device->client_data_lock); - downgrade_write(&lists_rwsem); + down_read(&devices_rwsem); + xa_for_each (&devices, index, ib_dev) { + if (ib_dev->driver_id != driver_id) + continue; - list_for_each_entry(context, &device->client_data_list, list) { - if (context->client->remove) - context->client->remove(device, context->data); + get_device(&ib_dev->dev); + up_read(&devices_rwsem); + + WARN_ON(!ib_dev->ops.dealloc_driver); + __ib_unregister_device(ib_dev); + + put_device(&ib_dev->dev); + down_read(&devices_rwsem); } - up_read(&lists_rwsem); + up_read(&devices_rwsem); +} +EXPORT_SYMBOL(ib_unregister_driver); - ib_device_unregister_sysfs(device); - ib_device_unregister_rdmacg(device); +static void ib_unregister_work(struct work_struct *work) +{ + struct ib_device *ib_dev = + container_of(work, struct ib_device, unregistration_work); - mutex_unlock(&device_mutex); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} - ib_cache_cleanup_one(device); +/** + * ib_unregister_device_queued - Unregister a device using a work queue + * device: The device to unregister + * + * This schedules an asynchronous unregistration using a WQ for the device. A + * driver should use this to avoid holding locks while doing unregistration, + * such as holding the RTNL lock. + * + * Drivers using this API must use ib_unregister_driver before module unload + * to ensure that all scheduled unregistrations have completed. + */ +void ib_unregister_device_queued(struct ib_device *ib_dev) +{ + WARN_ON(!refcount_read(&ib_dev->refcount)); + WARN_ON(!ib_dev->ops.dealloc_driver); + get_device(&ib_dev->dev); + if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work)) + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device_queued); - ib_security_destroy_port_pkey_list(device); - kfree(device->port_pkey_list); +static int assign_client_id(struct ib_client *client) +{ + int ret; - down_write(&lists_rwsem); - write_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, - list) { - list_del(&context->list); - kfree(context); - } - write_unlock_irqrestore(&device->client_data_lock, flags); - up_write(&lists_rwsem); + down_write(&clients_rwsem); + /* + * The add/remove callbacks must be called in FIFO/LIFO order. To + * achieve this we assign client_ids so they are sorted in + * registration order, and retain a linked list we can reverse iterate + * to get the LIFO order. The extra linked list can go away if xarray + * learns to reverse iterate. + */ + if (list_empty(&client_list)) + client->client_id = 0; + else + client->client_id = + list_last_entry(&client_list, struct ib_client, list) + ->client_id; + ret = xa_alloc(&clients, &client->client_id, INT_MAX, client, + GFP_KERNEL); + if (ret) + goto out; + + xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); + list_add_tail(&client->list, &client_list); - device->reg_state = IB_DEV_UNREGISTERED; +out: + up_write(&clients_rwsem); + return ret; } -EXPORT_SYMBOL(ib_unregister_device); /** * ib_register_client - Register an IB client @@ -723,19 +1094,23 @@ EXPORT_SYMBOL(ib_unregister_device); int ib_register_client(struct ib_client *client) { struct ib_device *device; + unsigned long index; + int ret; - mutex_lock(&device_mutex); - - list_for_each_entry(device, &device_list, core_list) - if (!add_client_context(device, client) && client->add) - client->add(device); - - down_write(&lists_rwsem); - list_add_tail(&client->list, &client_list); - up_write(&lists_rwsem); - - mutex_unlock(&device_mutex); + ret = assign_client_id(client); + if (ret) + return ret; + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { + ret = add_client_context(device, client); + if (ret) { + up_read(&devices_rwsem); + ib_unregister_client(client); + return ret; + } + } + up_read(&devices_rwsem); return 0; } EXPORT_SYMBOL(ib_register_client); @@ -747,108 +1122,56 @@ EXPORT_SYMBOL(ib_register_client); * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. + * + * This is a full fence, once it returns no client callbacks will be called, + * or are running in another thread. */ void ib_unregister_client(struct ib_client *client) { - struct ib_client_data *context; struct ib_device *device; + unsigned long index; - mutex_lock(&device_mutex); + down_write(&clients_rwsem); + xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); + up_write(&clients_rwsem); + /* + * Every device still known must be serialized to make sure we are + * done with the client callbacks before we return. + */ + down_read(&devices_rwsem); + xa_for_each (&devices, index, device) + remove_client_context(device, client->client_id); + up_read(&devices_rwsem); - down_write(&lists_rwsem); + down_write(&clients_rwsem); list_del(&client->list); - up_write(&lists_rwsem); - - list_for_each_entry(device, &device_list, core_list) { - struct ib_client_data *found_context = NULL; - - down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - context->going_down = true; - found_context = context; - break; - } - write_unlock_irq(&device->client_data_lock); - up_write(&lists_rwsem); - - if (client->remove) - client->remove(device, found_context ? - found_context->data : NULL); - - if (!found_context) { - dev_warn(&device->dev, - "No client context found for %s\n", - client->name); - continue; - } - - down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_del(&found_context->list); - write_unlock_irq(&device->client_data_lock); - up_write(&lists_rwsem); - kfree(found_context); - } - - mutex_unlock(&device_mutex); + xa_erase(&clients, client->client_id); + up_write(&clients_rwsem); } EXPORT_SYMBOL(ib_unregister_client); /** - * ib_get_client_data - Get IB client context - * @device:Device to get context for - * @client:Client to get context for - * - * ib_get_client_data() returns client context set with - * ib_set_client_data(). - */ -void *ib_get_client_data(struct ib_device *device, struct ib_client *client) -{ - struct ib_client_data *context; - void *ret = NULL; - unsigned long flags; - - read_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - ret = context->data; - break; - } - read_unlock_irqrestore(&device->client_data_lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_get_client_data); - -/** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * - * ib_set_client_data() sets client context that can be retrieved with - * ib_get_client_data(). + * ib_set_client_data() sets client context data that can be retrieved with + * ib_get_client_data(). This can only be called while the client is + * registered to the device, once the ib_client remove() callback returns this + * cannot be called. */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { - struct ib_client_data *context; - unsigned long flags; - - write_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - context->data = data; - goto out; - } + void *rc; - dev_warn(&device->dev, "No client context found for %s\n", - client->name); + if (WARN_ON(IS_ERR(data))) + data = NULL; -out: - write_unlock_irqrestore(&device->client_data_lock, flags); + rc = xa_store(&device->client_data, client->client_id, data, + GFP_KERNEL); + WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); @@ -947,6 +1270,185 @@ int ib_query_port(struct ib_device *device, } EXPORT_SYMBOL(ib_query_port); +static void add_ndev_hash(struct ib_port_data *pdata) +{ + unsigned long flags; + + might_sleep(); + + spin_lock_irqsave(&ndev_hash_lock, flags); + if (hash_hashed(&pdata->ndev_hash_link)) { + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock_irqrestore(&ndev_hash_lock, flags); + /* + * We cannot do hash_add_rcu after a hash_del_rcu until the + * grace period + */ + synchronize_rcu(); + spin_lock_irqsave(&ndev_hash_lock, flags); + } + if (pdata->netdev) + hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, + (uintptr_t)pdata->netdev); + spin_unlock_irqrestore(&ndev_hash_lock, flags); +} + +/** + * ib_device_set_netdev - Associate the ib_dev with an underlying net_device + * @ib_dev: Device to modify + * @ndev: net_device to affiliate, may be NULL + * @port: IB port the net_device is connected to + * + * Drivers should use this to link the ib_device to a netdev so the netdev + * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be + * affiliated with any port. + * + * The caller must ensure that the given ndev is not unregistered or + * unregistering, and that either the ib_device is unregistered or + * ib_device_set_netdev() is called with NULL when the ndev sends a + * NETDEV_UNREGISTER event. + */ +int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, + unsigned int port) +{ + struct net_device *old_ndev; + struct ib_port_data *pdata; + unsigned long flags; + int ret; + + /* + * Drivers wish to call this before ib_register_driver, so we have to + * setup the port data early. + */ + ret = alloc_port_data(ib_dev); + if (ret) + return ret; + + if (!rdma_is_port_valid(ib_dev, port)) + return -EINVAL; + + pdata = &ib_dev->port_data[port]; + spin_lock_irqsave(&pdata->netdev_lock, flags); + old_ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (old_ndev == ndev) { + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + return 0; + } + + if (ndev) + dev_hold(ndev); + rcu_assign_pointer(pdata->netdev, ndev); + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + + add_ndev_hash(pdata); + if (old_ndev) + dev_put(old_ndev); + + return 0; +} +EXPORT_SYMBOL(ib_device_set_netdev); + +static void free_netdevs(struct ib_device *ib_dev) +{ + unsigned long flags; + unsigned int port; + + rdma_for_each_port (ib_dev, port) { + struct ib_port_data *pdata = &ib_dev->port_data[port]; + struct net_device *ndev; + + spin_lock_irqsave(&pdata->netdev_lock, flags); + ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (ndev) { + spin_lock(&ndev_hash_lock); + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock(&ndev_hash_lock); + + /* + * If this is the last dev_put there is still a + * synchronize_rcu before the netdev is kfreed, so we + * can continue to rely on unlocked pointer + * comparisons after the put + */ + rcu_assign_pointer(pdata->netdev, NULL); + dev_put(ndev); + } + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + } +} + +struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, + unsigned int port) +{ + struct ib_port_data *pdata; + struct net_device *res; + + if (!rdma_is_port_valid(ib_dev, port)) + return NULL; + + pdata = &ib_dev->port_data[port]; + + /* + * New drivers should use ib_device_set_netdev() not the legacy + * get_netdev(). + */ + if (ib_dev->ops.get_netdev) + res = ib_dev->ops.get_netdev(ib_dev, port); + else { + spin_lock(&pdata->netdev_lock); + res = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (res) + dev_hold(res); + spin_unlock(&pdata->netdev_lock); + } + + /* + * If we are starting to unregister expedite things by preventing + * propagation of an unregistering netdev. + */ + if (res && res->reg_state != NETREG_REGISTERED) { + dev_put(res); + return NULL; + } + + return res; +} + +/** + * ib_device_get_by_netdev - Find an IB device associated with a netdev + * @ndev: netdev to locate + * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) + * + * Find and hold an ib_device that is associated with a netdev via + * ib_device_set_netdev(). The caller must call ib_device_put() on the + * returned pointer. + */ +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, + enum rdma_driver_id driver_id) +{ + struct ib_device *res = NULL; + struct ib_port_data *cur; + + rcu_read_lock(); + hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, + (uintptr_t)ndev) { + if (rcu_access_pointer(cur->netdev) == ndev && + (driver_id == RDMA_DRIVER_UNKNOWN || + cur->ib_dev->driver_id == driver_id) && + ib_device_try_get(cur->ib_dev)) { + res = cur->ib_dev; + break; + } + } + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL(ib_device_get_by_netdev); + /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query @@ -965,21 +1467,12 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_callback cb, void *cookie) { - u8 port; + unsigned int port; - for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev); - port++) + rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { - struct net_device *idev = NULL; - - if (ib_dev->ops.get_netdev) - idev = ib_dev->ops.get_netdev(ib_dev, port); - - if (idev && - idev->reg_state >= NETREG_UNREGISTERED) { - dev_put(idev); - idev = NULL; - } + struct net_device *idev = + ib_device_get_netdev(ib_dev, port); if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); @@ -1006,11 +1499,12 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *cookie) { struct ib_device *dev; + unsigned long index; - down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); - up_read(&lists_rwsem); + up_read(&devices_rwsem); } /** @@ -1022,19 +1516,19 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { + unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; - down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) { + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } - - up_read(&lists_rwsem); + up_read(&devices_rwsem); return ret; } @@ -1121,13 +1615,15 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index) { union ib_gid tmp_gid; - int ret, port, i; + unsigned int port; + int ret, i; - for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { + rdma_for_each_port (device, port) { if (!rdma_protocol_ib(device, port)) continue; - for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { + for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; + ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) return ret; @@ -1159,7 +1655,8 @@ int ib_find_pkey(struct ib_device *device, u16 tmp_pkey; int partial_ix = -1; - for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) { + for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; + ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; @@ -1192,6 +1689,7 @@ EXPORT_SYMBOL(ib_find_pkey); * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. + * */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, @@ -1200,29 +1698,30 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, const struct sockaddr *addr) { struct net_device *net_dev = NULL; - struct ib_client_data *context; + unsigned long index; + void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; - down_read(&lists_rwsem); - - list_for_each_entry(context, &dev->client_data_list, list) { - struct ib_client *client = context->client; + /* + * Holding the read side guarantees that the client will not become + * unregistered while we are calling get_net_dev_by_params() + */ + down_read(&dev->client_data_rwsem); + xan_for_each_marked (&dev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); - if (context->going_down) + if (!client || !client->get_net_dev_by_params) continue; - if (client->get_net_dev_by_params) { - net_dev = client->get_net_dev_by_params(dev, port, pkey, - gid, addr, - context->data); - if (net_dev) - break; - } + net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, + addr, client_data); + if (net_dev) + break; } - - up_read(&lists_rwsem); + up_read(&dev->client_data_rwsem); return net_dev; } @@ -1238,6 +1737,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) (ptr)->name = ops->name; \ } while (0) +#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) + SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); @@ -1261,6 +1762,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_srq); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); + SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_fmr); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); @@ -1281,6 +1783,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); SET_DEVICE_OP(dev_ops, drain_sq); + SET_DEVICE_OP(dev_ops, enable_driver); + SET_DEVICE_OP(dev_ops, fill_res_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); @@ -1290,6 +1794,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_stats); + SET_DEVICE_OP(dev_ops, init_port); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_phys_fmr); SET_DEVICE_OP(dev_ops, mmap); @@ -1325,6 +1830,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, unmap_fmr); + + SET_OBJ_SIZE(dev_ops, ib_pd); + SET_OBJ_SIZE(dev_ops, ib_ucontext); } EXPORT_SYMBOL(ib_set_device_ops); @@ -1443,6 +1951,9 @@ static void __exit ib_core_cleanup(void) destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); + flush_workqueue(system_unbound_wq); + WARN_ON(!xa_empty(&clients)); + WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c @@ -87,7 +87,8 @@ static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = { [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, - [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}, + [RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb} }; static struct workqueue_struct *iwcm_wq; @@ -504,7 +505,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) { const char *devname = dev_name(&cm_id->device->dev); const char *ifname = cm_id->device->iwcm->ifname; - struct iwpm_dev_data pm_reg_msg; + struct iwpm_dev_data pm_reg_msg = {}; struct iwpm_sa_data pm_msg; int status; @@ -515,8 +516,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->m_local_addr = cm_id->local_addr; cm_id->m_remote_addr = cm_id->remote_addr; - strncpy(pm_reg_msg.dev_name, devname, sizeof(pm_reg_msg.dev_name)); - strncpy(pm_reg_msg.if_name, ifname, sizeof(pm_reg_msg.if_name)); + strcpy(pm_reg_msg.dev_name, devname); + strcpy(pm_reg_msg.if_name, ifname); if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) || !iwpm_valid_pid()) @@ -525,6 +526,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->mapped = true; pm_msg.loc_addr = cm_id->local_addr; pm_msg.rem_addr = cm_id->remote_addr; + pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ? + IWPM_FLAGS_NO_PORT_MAP : 0; if (active) status = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_IWCM); @@ -543,7 +546,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) return iwpm_create_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr, - RDMA_NL_IWCM); + RDMA_NL_IWCM, pm_msg.flags); } /* diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c @@ -34,18 +34,25 @@ #include "iwpm_util.h" static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser"; -static int iwpm_ulib_version = 3; +u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN; static int iwpm_user_pid = IWPM_PID_UNDEFINED; static atomic_t echo_nlmsg_seq; +/** + * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid + * + * Returns true if the pid is greater than zero, otherwise returns false + */ int iwpm_valid_pid(void) { return iwpm_user_pid > 0; } -/* - * iwpm_register_pid - Send a netlink query to user space - * for the iwarp port mapper pid +/** + * iwpm_register_pid - Send a netlink query to userspace + * to get the iwarp port mapper pid + * @pm_msg: Contains driver info to send to the userspace port mapper + * @nl_client: The index of the netlink client * * nlmsg attributes: * [IWPM_NLA_REG_PID_SEQ] @@ -124,12 +131,19 @@ pid_query_error: return ret; } -/* - * iwpm_add_mapping - Send a netlink add mapping message - * to the port mapper +/** + * iwpm_add_mapping - Send a netlink add mapping request to + * the userspace port mapper + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * * nlmsg attributes: * [IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] + * [IWPM_NLA_MANAGE_FLAGS] + * + * If the request is successful, the pm_msg stores + * the port mapper response (mapped address info) */ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { @@ -173,6 +187,18 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) if (ret) goto add_mapping_error; + /* If flags are required and we're not V4, then return a quiet error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto add_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_MANAGE_FLAGS); + if (ret) + goto add_mapping_error; + } + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; @@ -187,6 +213,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) return ret; add_mapping_error: pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); +add_mapping_error_nowarn: if (skb) dev_kfree_skb(skb); if (nlmsg_request) @@ -194,13 +221,17 @@ add_mapping_error: return ret; } -/* - * iwpm_add_and_query_mapping - Send a netlink add and query - * mapping message to the port mapper +/** + * iwpm_add_and_query_mapping - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * * nlmsg attributes: * [IWPM_NLA_QUERY_MAPPING_SEQ] * [IWPM_NLA_QUERY_LOCAL_ADDR] * [IWPM_NLA_QUERY_REMOTE_ADDR] + * [IWPM_NLA_QUERY_FLAGS] */ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { @@ -251,6 +282,18 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) if (ret) goto query_mapping_error; + /* If flags are required and we're not V4, then return a quite error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto query_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_QUERY_FLAGS); + if (ret) + goto query_mapping_error; + } + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; @@ -264,6 +307,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) return ret; query_mapping_error: pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); +query_mapping_error_nowarn: if (skb) dev_kfree_skb(skb); if (nlmsg_request) @@ -271,9 +315,13 @@ query_mapping_error: return ret; } -/* - * iwpm_remove_mapping - Send a netlink remove mapping message - * to the port mapper +/** + * iwpm_remove_mapping - Send a netlink remove mapping request + * to the userspace port mapper + * + * @local_addr: Local ip/tcp address to remove + * @nl_client: The index of the netlink client + * * nlmsg attributes: * [IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] @@ -344,9 +392,14 @@ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_register_pid_cb - Process a port mapper response to - * iwpm_register_pid() +/** + * iwpm_register_pid_cb - Process the port mapper response to + * iwpm_register_pid query + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * If successful, the function receives the userspace port mapper pid + * which is used in future communication with the port mapper */ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -379,7 +432,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) /* check device name, ulib name and version */ if (strcmp(pm_msg->dev_name, dev_name) || strcmp(iwpm_ulib_name, iwpm_name) || - iwpm_version != iwpm_ulib_version) { + iwpm_version < IWPM_UABI_VERSION_MIN) { pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n", __func__, dev_name, iwpm_name, iwpm_version); @@ -387,6 +440,10 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) goto register_pid_response_exit; } iwpm_user_pid = cb->nlh->nlmsg_pid; + iwpm_ulib_version = iwpm_version; + if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...", + __func__, iwpm_user_pid); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", __func__, iwpm_user_pid); @@ -403,15 +460,19 @@ register_pid_response_exit: /* netlink attribute policy for the received response to add mapping request */ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { - [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, - [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } + [IWPM_NLA_RMANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RMANAGE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_add_mapping_cb - Process a port mapper response to - * iwpm_add_mapping() +/** + * iwpm_add_mapping_cb - Process the port mapper response to + * iwpm_add_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -430,7 +491,7 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); - msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]); + msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", @@ -439,9 +500,9 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]); mapped_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]); + nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]); if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; @@ -472,17 +533,23 @@ add_mapping_response_exit: /* netlink attribute policy for the response to add and query mapping request * and response with remote address info */ static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { - [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 }, - [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RQUERY_LOCAL_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_REMOTE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_add_and_query_mapping_cb - Process a port mapper response to - * iwpm_add_and_query_mapping() +/** + * iwpm_add_and_query_mapping_cb - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) @@ -502,7 +569,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, return -EINVAL; atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); - msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]); + msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", @@ -511,9 +578,9 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) @@ -560,9 +627,13 @@ query_mapping_response_exit: return 0; } -/* - * iwpm_remote_info_cb - Process a port mapper message, containing - * the remote connecting peer address info +/** + * iwpm_remote_info_cb - Process remote connecting peer address info, which + * the port mapper has received from the connecting peer + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Stores the IPv4/IPv6 address info in a hash table */ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -588,9 +659,9 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) @@ -635,8 +706,14 @@ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 } }; -/* - * iwpm_mapping_info_cb - Process a port mapper request for mapping info +/** + * iwpm_mapping_info_cb - Process a notification that the userspace + * port mapper daemon is started + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send all the local mapping + * info records to the userspace port mapper */ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -655,7 +732,7 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); if (strcmp(iwpm_ulib_name, iwpm_name) || - iwpm_version != iwpm_ulib_version) { + iwpm_version < IWPM_UABI_VERSION_MIN) { pr_info("%s: Invalid port mapper name = %s version = %d\n", __func__, iwpm_name, iwpm_version); return ret; @@ -669,6 +746,11 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); iwpm_user_pid = cb->nlh->nlmsg_pid; + + if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...", + __func__, iwpm_user_pid); + if (!iwpm_mapinfo_available()) return 0; pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", @@ -684,9 +766,11 @@ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 } }; -/* - * iwpm_ack_mapping_info_cb - Process a port mapper ack for - * the provided mapping info records +/** + * iwpm_ack_mapping_info_cb - Process the port mapper ack for + * the provided local mapping info records + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -712,8 +796,11 @@ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 }, }; -/* - * iwpm_mapping_error_cb - Process a port mapper error message +/** + * iwpm_mapping_error_cb - Process port mapper notification for error + * + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -748,3 +835,46 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) up(&nlmsg_request->sem); return 0; } + +/* netlink attribute policy for the received hello request */ +static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = { + [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 } +}; + +/** + * iwpm_hello_cb - Process a hello message from iwpmd + * + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send the kernel's abi_version + * after adjusting it to support the iwpmd version. + */ +int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_HELLO_MAX]; + const char *msg_type = "Hello request"; + u8 nl_client; + u16 abi_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb, + msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]); + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + if (!iwpm_valid_client(nl_client)) { + pr_info("%s: Invalid port mapper client = %d\n", + __func__, nl_client); + return ret; + } + iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version); + pr_debug("Using ABI version %u\n", iwpm_ulib_version); + iwpm_user_pid = cb->nlh->nlmsg_pid; + ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version); + return ret; +} diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c @@ -51,6 +51,12 @@ static DEFINE_SPINLOCK(iwpm_reminfo_lock); static DEFINE_MUTEX(iwpm_admin_lock); static struct iwpm_admin_data iwpm_admin; +/** + * iwpm_init - Allocate resources for the iwarp port mapper + * @nl_client: The index of the netlink client + * + * Should be called when network interface goes up. + */ int iwpm_init(u8 nl_client) { int ret = 0; @@ -87,6 +93,12 @@ init_exit: static void free_hash_bucket(void); static void free_reminfo_bucket(void); +/** + * iwpm_exit - Deallocate resources for the iwarp port mapper + * @nl_client: The index of the netlink client + * + * Should be called when network interface goes down. + */ int iwpm_exit(u8 nl_client) { @@ -112,9 +124,17 @@ int iwpm_exit(u8 nl_client) static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); +/** + * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address + * info in a hash table + * @local_addr: Local ip/tcp address + * @mapped_addr: Mapped local ip/tcp address + * @nl_client: The index of the netlink client + * @map_flags: IWPM mapping flags + */ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr, - u8 nl_client) + u8 nl_client, u32 map_flags) { struct hlist_head *hash_bucket_head = NULL; struct iwpm_mapping_info *map_info; @@ -132,6 +152,7 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, sizeof(struct sockaddr_storage)); map_info->nl_client = nl_client; + map_info->map_flags = map_flags; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { @@ -150,6 +171,15 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, return ret; } +/** + * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address + * info from the hash table + * @local_addr: Local ip/tcp address + * @mapped_local_addr: Mapped local ip/tcp address + * + * Returns err code if mapping info is not found in the hash table, + * otherwise returns 0 + */ int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_local_addr) { @@ -250,6 +280,17 @@ void iwpm_add_remote_info(struct iwpm_remote_info *rem_info) spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } +/** + * iwpm_get_remote_info - Get the remote connecting peer address info + * + * @mapped_loc_addr: Mapped local address of the listening peer + * @mapped_rem_addr: Mapped remote address of the connecting peer + * @remote_addr: To store the remote address of the connecting peer + * @nl_client: The index of the netlink client + * + * The remote address info is retrieved and provided to the client in + * the remote_addr. After that it is removed from the hash table + */ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, struct sockaddr_storage *mapped_rem_addr, struct sockaddr_storage *remote_addr, @@ -686,6 +727,14 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) if (ret) goto send_mapping_info_unlock; + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &map_info->map_flags, + IWPM_NLA_MAPINFO_FLAGS); + if (ret) + goto send_mapping_info_unlock; + } + nlmsg_end(skb, nlh); iwpm_print_sockaddr(&map_info->local_sockaddr, @@ -754,3 +803,38 @@ int iwpm_mapinfo_available(void) spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return full_bucket; } + +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + const char *err_str = ""; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto hello_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + err_str = "Unable to put attribute of abi_version into nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version, + IWPM_NLA_HELLO_ABI_VERSION); + if (ret) + goto hello_num_error; + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast(skb, iwpm_pid); + if (ret) { + skb = NULL; + err_str = "Unable to send a nlmsg"; + goto hello_num_error; + } + pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version); + return 0; +hello_num_error: + pr_info("%s: %s\n", __func__, err_str); + if (skb) + dev_kfree_skb(skb); + return ret; +} diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h @@ -78,6 +78,7 @@ struct iwpm_mapping_info { struct sockaddr_storage local_sockaddr; struct sockaddr_storage mapped_sockaddr; u8 nl_client; + u32 map_flags; }; struct iwpm_remote_info { @@ -266,4 +267,15 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, * @msg: Message to print */ void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); + +/** + * iwpm_send_hello - Send hello response to iwpmd + * + * @nl_client: The index of the netlink client + * @abi_version: The kernel's abi_version + * + * Returns 0 on success or a negative error code + */ +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version); +extern u16 iwpm_ulib_version; #endif diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c @@ -3326,9 +3326,9 @@ error: static void ib_mad_remove_device(struct ib_device *device, void *client_data) { - int i; + unsigned int i; - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + rdma_for_each_port (device, i) { if (!rdma_cap_ib_mad(device, i)) continue; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c @@ -56,7 +56,6 @@ EXPORT_SYMBOL(rdma_nl_chk_listeners); static bool is_nl_msg_valid(unsigned int type, unsigned int op) { static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = { - [RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS, [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS, [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS, [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS, @@ -181,8 +180,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } /* FIXME: Convert IWCM to properly handle doit callbacks */ - if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM || - index == RDMA_NL_IWCM) { + if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) { struct netlink_dump_control c = { .dump = cb_table[op].dump, }; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c @@ -33,12 +33,14 @@ #include <linux/module.h> #include <linux/pid.h> #include <linux/pid_namespace.h> +#include <linux/mutex.h> #include <net/netlink.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" #include "cma_priv.h" +#include "restrack.h" static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, @@ -107,6 +109,13 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -262,9 +271,7 @@ static int fill_port_info(struct sk_buff *msg, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) return -EMSGSIZE; - if (device->ops.get_netdev) - netdev = device->ops.get_netdev(device, port); - + netdev = ib_device_get_netdev(device, port); if (netdev && net_eq(dev_net(netdev), net)) { ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); @@ -314,7 +321,6 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) [RDMA_RESTRACK_CTX] = "ctx", }; - struct rdma_restrack_root *res = &device->res; struct nlattr *table_attr; int ret, i, curr; @@ -328,7 +334,8 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) for (i = 0; i < RDMA_RESTRACK_MAX; i++) { if (!names[i]) continue; - curr = rdma_restrack_count(res, i, task_active_pid_ns(current)); + curr = rdma_restrack_count(device, i, + task_active_pid_ns(current)); ret = fill_res_info_entry(msg, names[i], curr); if (ret) goto err; @@ -361,13 +368,20 @@ static int fill_res_name_pid(struct sk_buff *msg, return 0; } -static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, +static bool fill_res_entry(struct ib_device *dev, struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + if (!dev->ops.fill_res_entry) + return false; + return dev->ops.fill_res_entry(msg, res); +} + +static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_qp *qp = container_of(res, struct ib_qp, res); - struct rdma_restrack_root *resroot = &qp->device->res; + struct ib_device *dev = qp->device; struct ib_qp_init_attr qp_init_attr; - struct nlattr *entry_attr; struct ib_qp_attr qp_attr; int ret; @@ -376,11 +390,7 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, return ret; if (port && port != qp_attr.port_num) - return 0; - - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); - if (!entry_attr) - goto out; + return -EAGAIN; /* In create_qp() port is not set yet */ if (qp_attr.port_num && @@ -412,38 +422,32 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; - nla_nest_end(msg, entry_attr); return 0; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; +err: return -EMSGSIZE; } -static int fill_res_cm_id_entry(struct sk_buff *msg, - struct netlink_callback *cb, +static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); - struct rdma_restrack_root *resroot = &id_priv->id.device->res; + struct ib_device *dev = id_priv->id.device; struct rdma_cm_id *cm_id = &id_priv->id; - struct nlattr *entry_attr; if (port && port != cm_id->port_num) return 0; - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY); - if (!entry_attr) - goto out; - if (cm_id->port_num && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num)) goto err; @@ -472,31 +476,25 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, &cm_id->route.addr.dst_addr)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; - nla_nest_end(msg, entry_attr); return 0; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; +err: return -EMSGSIZE; } -static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb, +static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_cq *cq = container_of(res, struct ib_cq, res); - struct rdma_restrack_root *resroot = &cq->device->res; - struct nlattr *entry_attr; - - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY); - if (!entry_attr) - goto out; + struct ib_device *dev = cq->device; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) goto err; @@ -509,33 +507,31 @@ static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb, nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) + goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + cq->uobject->context->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; - nla_nest_end(msg, entry_attr); return 0; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; +err: return -EMSGSIZE; } -static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb, +static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); - struct rdma_restrack_root *resroot = &mr->pd->device->res; - struct nlattr *entry_attr; - - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY); - if (!entry_attr) - goto out; + struct ib_device *dev = mr->pd->device; - if (netlink_capable(cb->skb, CAP_NET_ADMIN)) { + if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey)) goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) @@ -546,33 +542,31 @@ static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb, RDMA_NLDEV_ATTR_PAD)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) + goto err; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; - nla_nest_end(msg, entry_attr); return 0; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; +err: return -EMSGSIZE; } -static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, +static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_pd *pd = container_of(res, struct ib_pd, res); - struct rdma_restrack_root *resroot = &pd->device->res; - struct nlattr *entry_attr; + struct ib_device *dev = pd->device; - entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY); - if (!entry_attr) - goto out; - - if (netlink_capable(cb->skb, CAP_NET_ADMIN)) { + if (has_cap_net_admin) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, pd->local_dma_lkey)) goto err; @@ -585,19 +579,23 @@ static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id)) + goto err; + + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + pd->uobject->context->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; - nla_nest_end(msg, entry_attr); return 0; -err: - nla_nest_cancel(msg, entry_attr); -out: - return -EMSGSIZE; +err: return -EMSGSIZE; } static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -777,7 +775,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, u32 idx = 0; u32 ifindex; int err; - u32 p; + unsigned int p; err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NULL); @@ -789,7 +787,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, if (!device) return -EINVAL; - for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { + rdma_for_each_port (device, p) { /* * The dumpit function returns all information from specific * index. This specific index is taken from the netlink @@ -905,10 +903,17 @@ static int nldev_res_get_dumpit(struct sk_buff *skb, } struct nldev_fill_res_entry { - int (*fill_res_func)(struct sk_buff *msg, struct netlink_callback *cb, + int (*fill_res_func)(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, u32 port); enum rdma_nldev_attr nldev_attr; enum rdma_nldev_command nldev_cmd; + u8 flags; + u32 entry; + u32 id; +}; + +enum nldev_res_flags { + NLDEV_PER_DEV = 1 << 0, }; static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { @@ -916,29 +921,136 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .fill_res_func = fill_res_qp_entry, .nldev_cmd = RDMA_NLDEV_CMD_RES_QP_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_QP, + .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_LQPN, }, [RDMA_RESTRACK_CM_ID] = { .fill_res_func = fill_res_cm_id_entry, .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, + .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CM_IDN, }, [RDMA_RESTRACK_CQ] = { .fill_res_func = fill_res_cq_entry, .nldev_cmd = RDMA_NLDEV_CMD_RES_CQ_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CQN, }, [RDMA_RESTRACK_MR] = { .fill_res_func = fill_res_mr_entry, .nldev_cmd = RDMA_NLDEV_CMD_RES_MR_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_MRN, }, [RDMA_RESTRACK_PD] = { .fill_res_func = fill_res_pd_entry, .nldev_cmd = RDMA_NLDEV_CMD_RES_PD_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, + .flags = NLDEV_PER_DEV, + .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_PDN, }, }; +static bool is_visible_in_pid_ns(struct rdma_restrack_entry *res) +{ + /* + * 1. Kern resources should be visible in init name space only + * 2. Present only resources visible in the current namespace + */ + if (rdma_is_kernel_res(res)) + return task_active_pid_ns(current) == &init_pid_ns; + return task_active_pid_ns(current) == task_active_pid_ns(res->task); +} + +static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + enum rdma_restrack_type res_type) +{ + const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct rdma_restrack_entry *res; + struct ib_device *device; + u32 index, id, port = 0; + bool has_cap_net_admin; + struct sk_buff *msg; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + } + + if ((port && fe->flags & NLDEV_PER_DEV) || + (!port && ~fe->flags & NLDEV_PER_DEV)) { + ret = -EINVAL; + goto err; + } + + id = nla_get_u32(tb[fe->id]); + res = rdma_restrack_get_byid(device, res_type, id); + if (IS_ERR(res)) { + ret = PTR_ERR(res); + goto err; + } + + if (!is_visible_in_pid_ns(res)) { + ret = -ENOENT; + goto err_get; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, fe->nldev_cmd), + 0, 0); + + if (fill_nldev_handle(msg, device)) { + ret = -EMSGSIZE; + goto err_free; + } + + has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN); + ret = fe->fill_res_func(msg, has_cap_net_admin, res, port); + rdma_restrack_put(res); + if (ret) + goto err_free; + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err_get: + rdma_restrack_put(res); +err: + ib_device_put(device); + return ret; +} + static int res_get_common_dumpit(struct sk_buff *skb, struct netlink_callback *cb, enum rdma_restrack_type res_type) @@ -946,11 +1058,15 @@ static int res_get_common_dumpit(struct sk_buff *skb, const struct nldev_fill_res_entry *fe = &fill_entries[res_type]; struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; int err, ret = 0, idx = 0; struct nlattr *table_attr; + struct nlattr *entry_attr; struct ib_device *device; int start = cb->args[0]; + bool has_cap_net_admin; struct nlmsghdr *nlh; + unsigned long id; u32 index, port = 0; bool filled = false; @@ -998,55 +1114,51 @@ static int res_get_common_dumpit(struct sk_buff *skb, goto err; } - down_read(&device->res.rwsem); - hash_for_each_possible(device->res.hash, res, node, res_type) { - if (idx < start) - goto next; + has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN); - if ((rdma_is_kernel_res(res) && - task_active_pid_ns(current) != &init_pid_ns) || - (!rdma_is_kernel_res(res) && task_active_pid_ns(current) != - task_active_pid_ns(res->task))) - /* - * 1. Kern resources should be visible in init - * namspace only - * 2. Present only resources visible in the current - * namespace - */ - goto next; + rt = &device->res[res_type]; + xa_lock(&rt->xa); + /* + * FIXME: if the skip ahead is something common this loop should + * use xas_for_each & xas_pause to optimize, we can have a lot of + * objects. + */ + xa_for_each(&rt->xa, id, res) { + if (!is_visible_in_pid_ns(res)) + continue; - if (!rdma_restrack_get(res)) - /* - * Resource is under release now, but we are not - * relesing lock now, so it will be released in - * our next pass, once we will get ->next pointer. - */ + if (idx < start || !rdma_restrack_get(res)) goto next; + xa_unlock(&rt->xa); + filled = true; - up_read(&device->res.rwsem); - ret = fe->fill_res_func(skb, cb, res, port); - down_read(&device->res.rwsem); - /* - * Return resource back, but it won't be released till - * the &device->res.rwsem will be released for write. - */ + entry_attr = nla_nest_start(skb, fe->entry); + if (!entry_attr) { + ret = -EMSGSIZE; + rdma_restrack_put(res); + goto msg_full; + } + + ret = fe->fill_res_func(skb, has_cap_net_admin, res, port); rdma_restrack_put(res); - if (ret == -EMSGSIZE) - /* - * There is a chance to optimize here. - * It can be done by using list_prepare_entry - * and list_for_each_entry_continue afterwards. - */ - break; - if (ret) + if (ret) { + nla_nest_cancel(skb, entry_attr); + if (ret == -EMSGSIZE) + goto msg_full; + if (ret == -EAGAIN) + goto again; goto res_err; + } + nla_nest_end(skb, entry_attr); +again: xa_lock(&rt->xa); next: idx++; } - up_read(&device->res.rwsem); + xa_unlock(&rt->xa); +msg_full: nla_nest_end(skb, table_attr); nlmsg_end(skb, nlh); cb->args[0] = idx; @@ -1063,7 +1175,6 @@ next: idx++; res_err: nla_nest_cancel(skb, table_attr); - up_read(&device->res.rwsem); err: nlmsg_cancel(skb, nlh); @@ -1073,34 +1184,132 @@ err_index: return ret; } -static int nldev_res_get_qp_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +#define RES_GET_FUNCS(name, type) \ + static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \ + struct netlink_callback *cb) \ + { \ + return res_get_common_dumpit(skb, cb, type); \ + } \ + static int nldev_res_get_##name##_doit(struct sk_buff *skb, \ + struct nlmsghdr *nlh, \ + struct netlink_ext_ack *extack) \ + { \ + return res_get_common_doit(skb, nlh, extack, type); \ + } + +RES_GET_FUNCS(qp, RDMA_RESTRACK_QP); +RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); +RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); +RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); +RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); + +static LIST_HEAD(link_ops); +static DECLARE_RWSEM(link_ops_rwsem); + +static const struct rdma_link_ops *link_ops_get(const char *type) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_QP); + const struct rdma_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->type, type)) + goto out; + } + ops = NULL; +out: + return ops; } -static int nldev_res_get_cm_id_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +void rdma_link_register(struct rdma_link_ops *ops) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CM_ID); + down_write(&link_ops_rwsem); + if (WARN_ON_ONCE(link_ops_get(ops->type))) + goto out; + list_add(&ops->list, &link_ops); +out: + up_write(&link_ops_rwsem); } +EXPORT_SYMBOL(rdma_link_register); -static int nldev_res_get_cq_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +void rdma_link_unregister(struct rdma_link_ops *ops) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_CQ); + down_write(&link_ops_rwsem); + list_del(&ops->list); + up_write(&link_ops_rwsem); } +EXPORT_SYMBOL(rdma_link_unregister); -static int nldev_res_get_mr_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR); + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char ibdev_name[IB_DEVICE_NAME_MAX]; + const struct rdma_link_ops *ops; + char ndev_name[IFNAMSIZ]; + struct net_device *ndev; + char type[IFNAMSIZ]; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || + !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) + return -EINVAL; + + nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + sizeof(ibdev_name)); + if (strchr(ibdev_name, '%')) + return -EINVAL; + + nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); + nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], + sizeof(ndev_name)); + + ndev = dev_get_by_name(&init_net, ndev_name); + if (!ndev) + return -ENODEV; + + down_read(&link_ops_rwsem); + ops = link_ops_get(type); +#ifdef CONFIG_MODULES + if (!ops) { + up_read(&link_ops_rwsem); + request_module("rdma-link-%s", type); + down_read(&link_ops_rwsem); + ops = link_ops_get(type); + } +#endif + err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL; + up_read(&link_ops_rwsem); + dev_put(ndev); + + return err; } -static int nldev_res_get_pd_dumpit(struct sk_buff *skb, - struct netlink_callback *cb) +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - return res_get_common_dumpit(skb, cb, RDMA_RESTRACK_PD); + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) { + ib_device_put(device); + return -EINVAL; + } + + ib_unregister_device_and_put(device); + return 0; } static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { @@ -1112,6 +1321,14 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_NEWLINK] = { + .doit = nldev_newlink, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_DELLINK] = { + .doit = nldev_dellink, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, @@ -1121,28 +1338,23 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .dump = nldev_res_get_dumpit, }, [RDMA_NLDEV_CMD_RES_QP_GET] = { + .doit = nldev_res_get_qp_doit, .dump = nldev_res_get_qp_dumpit, - /* - * .doit is not implemented yet for two reasons: - * 1. It is not needed yet. - * 2. There is a need to provide identifier, while it is easy - * for the QPs (device index + port index + LQPN), it is not - * the case for the rest of resources (PD and CQ). Because it - * is better to provide similar interface for all resources, - * let's wait till we will have other resources implemented - * too. - */ }, [RDMA_NLDEV_CMD_RES_CM_ID_GET] = { + .doit = nldev_res_get_cm_id_doit, .dump = nldev_res_get_cm_id_dumpit, }, [RDMA_NLDEV_CMD_RES_CQ_GET] = { + .doit = nldev_res_get_cq_doit, .dump = nldev_res_get_cq_dumpit, }, [RDMA_NLDEV_CMD_RES_MR_GET] = { + .doit = nldev_res_get_mr_doit, .dump = nldev_res_get_mr_dumpit, }, [RDMA_NLDEV_CMD_RES_PD_GET] = { + .doit = nldev_res_get_pd_doit, .dump = nldev_res_get_pd_dumpit, }, }; diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c @@ -438,6 +438,38 @@ free: uverbs_uobject_put(uobj); return ERR_PTR(ret); } +struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type, + u32 object_id, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj; + + uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile, + object_id, UVERBS_LOOKUP_READ); + if (IS_ERR(uobj)) + return uobj; + + attrs->context = uobj->context; + + return uobj; +} + +struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type, + u32 object_id, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj; + + uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile, + object_id, UVERBS_LOOKUP_WRITE); + + if (IS_ERR(uobj)) + return uobj; + + attrs->context = uobj->context; + + return uobj; +} static struct ib_uobject * alloc_begin_idr_uobject(const struct uverbs_api_object *obj, @@ -801,6 +833,7 @@ void uverbs_close_fd(struct file *f) /* Pairs with filp->private_data in alloc_begin_fd_uobject */ uverbs_uobject_put(uobj); } +EXPORT_SYMBOL(uverbs_close_fd); /* * Drop the ucontext off the ufile and completely disconnect it from the @@ -811,7 +844,6 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, { struct ib_ucontext *ucontext = ufile->ucontext; struct ib_device *ib_dev = ucontext->device; - int ret; /* * If we are closing the FD then the user mmap VMAs must have @@ -829,12 +861,8 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, rdma_restrack_del(&ucontext->res); - /* - * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove - * the error return. - */ - ret = ib_dev->ops.dealloc_ucontext(ucontext); - WARN_ON(ret); + ib_dev->ops.dealloc_ucontext(ucontext); + kfree(ucontext); ufile->ucontext = NULL; } diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c @@ -11,17 +11,51 @@ #include <linux/pid_namespace.h> #include "cma_priv.h" +#include "restrack.h" -static int fill_res_noop(struct sk_buff *msg, - struct rdma_restrack_entry *entry) +static int rt_xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, + u32 *next) { - return 0; + int err; + + *id = *next; + if (*next == U32_MAX) + *id = 0; + + xa_lock(xa); + err = __xa_alloc(xa, id, U32_MAX, entry, GFP_KERNEL); + if (err && *next != U32_MAX) { + *id = 0; + err = __xa_alloc(xa, id, *next, entry, GFP_KERNEL); + } + + if (!err) + *next = *id + 1; + xa_unlock(xa); + return err; } -void rdma_restrack_init(struct rdma_restrack_root *res) +/** + * rdma_restrack_init() - initialize and allocate resource tracking + * @dev: IB device + * + * Return: 0 on success + */ +int rdma_restrack_init(struct ib_device *dev) { - init_rwsem(&res->rwsem); - res->fill_res_entry = fill_res_noop; + struct rdma_restrack_root *rt; + int i; + + dev->res = kcalloc(RDMA_RESTRACK_MAX, sizeof(*rt), GFP_KERNEL); + if (!dev->res) + return -ENOMEM; + + rt = dev->res; + + for (i = 0; i < RDMA_RESTRACK_MAX; i++) + xa_init_flags(&rt[i].xa, XA_FLAGS_ALLOC); + + return 0; } static const char *type2str(enum rdma_restrack_type type) @@ -38,55 +72,79 @@ static const char *type2str(enum rdma_restrack_type type) return names[type]; }; -void rdma_restrack_clean(struct rdma_restrack_root *res) +/** + * rdma_restrack_clean() - clean resource tracking + * @dev: IB device + */ +void rdma_restrack_clean(struct ib_device *dev) { + struct rdma_restrack_root *rt = dev->res; struct rdma_restrack_entry *e; char buf[TASK_COMM_LEN]; - struct ib_device *dev; + bool found = false; const char *owner; - int bkt; - - if (hash_empty(res->hash)) - return; - - dev = container_of(res, struct ib_device, res); - pr_err("restrack: %s", CUT_HERE); - dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); - hash_for_each(res->hash, bkt, e, node) { - if (rdma_is_kernel_res(e)) { - owner = e->kern_name; - } else { - /* - * There is no need to call get_task_struct here, - * because we can be here only if there are more - * get_task_struct() call than put_task_struct(). - */ - get_task_comm(buf, e->task); - owner = buf; + int i; + + for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) { + struct xarray *xa = &dev->res[i].xa; + + if (!xa_empty(xa)) { + unsigned long index; + + if (!found) { + pr_err("restrack: %s", CUT_HERE); + dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); + } + xa_for_each(xa, index, e) { + if (rdma_is_kernel_res(e)) { + owner = e->kern_name; + } else { + /* + * There is no need to call get_task_struct here, + * because we can be here only if there are more + * get_task_struct() call than put_task_struct(). + */ + get_task_comm(buf, e->task); + owner = buf; + } + + pr_err("restrack: %s %s object allocated by %s is not freed\n", + rdma_is_kernel_res(e) ? "Kernel" : + "User", + type2str(e->type), owner); + } + found = true; } - - pr_err("restrack: %s %s object allocated by %s is not freed\n", - rdma_is_kernel_res(e) ? "Kernel" : "User", - type2str(e->type), owner); + xa_destroy(xa); } - pr_err("restrack: %s", CUT_HERE); + if (found) + pr_err("restrack: %s", CUT_HERE); + + kfree(rt); } -int rdma_restrack_count(struct rdma_restrack_root *res, - enum rdma_restrack_type type, +/** + * rdma_restrack_count() - the current usage of specific object + * @dev: IB device + * @type: actual type of object to operate + * @ns: PID namespace + */ +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, struct pid_namespace *ns) { + struct rdma_restrack_root *rt = &dev->res[type]; struct rdma_restrack_entry *e; + XA_STATE(xas, &rt->xa, 0); u32 cnt = 0; - down_read(&res->rwsem); - hash_for_each_possible(res->hash, e, node, type) { + xa_lock(&rt->xa); + xas_for_each(&xas, e, U32_MAX) { if (ns == &init_pid_ns || (!rdma_is_kernel_res(e) && ns == task_active_pid_ns(e->task))) cnt++; } - up_read(&res->rwsem); + xa_unlock(&rt->xa); return cnt; } EXPORT_SYMBOL(rdma_restrack_count); @@ -157,28 +215,28 @@ EXPORT_SYMBOL(rdma_restrack_set_task); static void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); + struct rdma_restrack_root *rt; + int ret; if (!dev) return; - if (res->type != RDMA_RESTRACK_CM_ID || rdma_is_kernel_res(res)) - res->task = NULL; - - if (!rdma_is_kernel_res(res)) { - if (!res->task) - rdma_restrack_set_task(res, NULL); - res->kern_name = NULL; - } else { - set_kern_name(res); - } + rt = &dev->res[res->type]; kref_init(&res->kref); init_completion(&res->comp); - res->valid = true; + if (res->type != RDMA_RESTRACK_QP) + ret = rt_xa_alloc_cyclic(&rt->xa, &res->id, res, &rt->next_id); + else { + /* Special case to ensure that LQPN points to right QP */ + struct ib_qp *qp = container_of(res, struct ib_qp, res); + + ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL); + res->id = ret ? 0 : qp->qp_num; + } - down_write(&dev->res.rwsem); - hash_add(dev->res.hash, &res->node, res->type); - up_write(&dev->res.rwsem); + if (!ret) + res->valid = true; } /** @@ -187,6 +245,8 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res) */ void rdma_restrack_kadd(struct rdma_restrack_entry *res) { + res->task = NULL; + set_kern_name(res); res->user = false; rdma_restrack_add(res); } @@ -198,6 +258,13 @@ EXPORT_SYMBOL(rdma_restrack_kadd); */ void rdma_restrack_uadd(struct rdma_restrack_entry *res) { + if (res->type != RDMA_RESTRACK_CM_ID) + res->task = NULL; + + if (!res->task) + rdma_restrack_set_task(res, NULL); + res->kern_name = NULL; + res->user = true; rdma_restrack_add(res); } @@ -209,6 +276,31 @@ int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) } EXPORT_SYMBOL(rdma_restrack_get); +/** + * rdma_restrack_get_byid() - translate from ID to restrack object + * @dev: IB device + * @type: resource track type + * @id: ID to take a look + * + * Return: Pointer to restrack entry or -ENOENT in case of error. + */ +struct rdma_restrack_entry * +rdma_restrack_get_byid(struct ib_device *dev, + enum rdma_restrack_type type, u32 id) +{ + struct rdma_restrack_root *rt = &dev->res[type]; + struct rdma_restrack_entry *res; + + xa_lock(&rt->xa); + res = xa_load(&rt->xa, id); + if (!res || !rdma_restrack_get(res)) + res = ERR_PTR(-ENOENT); + xa_unlock(&rt->xa); + + return res; +} +EXPORT_SYMBOL(rdma_restrack_get_byid); + static void restrack_release(struct kref *kref) { struct rdma_restrack_entry *res; @@ -225,23 +317,25 @@ EXPORT_SYMBOL(rdma_restrack_put); void rdma_restrack_del(struct rdma_restrack_entry *res) { + struct rdma_restrack_entry *old; + struct rdma_restrack_root *rt; struct ib_device *dev; if (!res->valid) goto out; dev = res_to_dev(res); - if (!dev) + if (WARN_ON(!dev)) return; - rdma_restrack_put(res); + rt = &dev->res[res->type]; - wait_for_completion(&res->comp); - - down_write(&dev->res.rwsem); - hash_del(&res->node); + old = xa_erase(&rt->xa, res->id); + WARN_ON(old != res); res->valid = false; - up_write(&dev->res.rwsem); + + rdma_restrack_put(res); + wait_for_completion(&res->comp); out: if (res->task) { diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved. + */ + +#ifndef _RDMA_CORE_RESTRACK_H_ +#define _RDMA_CORE_RESTRACK_H_ + +#include <linux/mutex.h> + +/** + * struct rdma_restrack_root - main resource tracking management + * entity, per-device + */ +struct rdma_restrack_root { + /** + * @xa: Array of XArray structure to hold restrack entries. + */ + struct xarray xa; + /** + * @next_id: Next ID to support cyclic allocation + */ + u32 next_id; +}; + +int rdma_restrack_init(struct ib_device *dev); +void rdma_restrack_clean(struct ib_device *dev); +#endif /* _RDMA_CORE_RESTRACK_H_ */ diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c @@ -179,7 +179,6 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { - struct ib_device *dev = qp->pd->device; u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge : qp->max_read_sge; struct ib_sge *sge; @@ -209,8 +208,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, rdma_wr->wr.sg_list = sge; for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { - sge->addr = ib_sg_dma_address(dev, sg) + offset; - sge->length = ib_sg_dma_len(dev, sg) - offset; + sge->addr = sg_dma_address(sg) + offset; + sge->length = sg_dma_len(sg) - offset; sge->lkey = qp->pd->local_dma_lkey; total_len += sge->length; @@ -236,14 +235,13 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { - struct ib_device *dev = qp->pd->device; struct ib_rdma_wr *rdma_wr = &ctx->single.wr; ctx->nr_ops = 1; ctx->single.sge.lkey = qp->pd->local_dma_lkey; - ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset; - ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset; + ctx->single.sge.addr = sg_dma_address(sg) + offset; + ctx->single.sge.length = sg_dma_len(sg) - offset; memset(rdma_wr, 0, sizeof(*rdma_wr)); if (dir == DMA_TO_DEVICE) @@ -294,7 +292,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, * Skip to the S/G entry that sg_offset falls into: */ for (;;) { - u32 len = ib_sg_dma_len(dev, sg); + u32 len = sg_dma_len(sg); if (sg_offset < len) break; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c @@ -2342,9 +2342,7 @@ static void ib_sa_add_one(struct ib_device *device) s = rdma_start_port(device); e = rdma_end_port(device); - sa_dev = kzalloc(sizeof *sa_dev + - (e - s + 1) * sizeof (struct ib_sa_port), - GFP_KERNEL); + sa_dev = kzalloc(struct_size(sa_dev, port, e - s + 1), GFP_KERNEL); if (!sa_dev) return; diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c @@ -39,22 +39,25 @@ #include "core_priv.h" #include "mad_priv.h" +static LIST_HEAD(mad_agent_list); +/* Lock to protect mad_agent_list */ +static DEFINE_SPINLOCK(mad_agent_list_lock); + static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp) { struct pkey_index_qp_list *pkey = NULL; struct pkey_index_qp_list *tmp_pkey; struct ib_device *dev = pp->sec->dev; - spin_lock(&dev->port_pkey_list[pp->port_num].list_lock); - list_for_each_entry(tmp_pkey, - &dev->port_pkey_list[pp->port_num].pkey_list, - pkey_index_list) { + spin_lock(&dev->port_data[pp->port_num].pkey_list_lock); + list_for_each_entry (tmp_pkey, &dev->port_data[pp->port_num].pkey_list, + pkey_index_list) { if (tmp_pkey->pkey_index == pp->pkey_index) { pkey = tmp_pkey; break; } } - spin_unlock(&dev->port_pkey_list[pp->port_num].list_lock); + spin_unlock(&dev->port_data[pp->port_num].pkey_list_lock); return pkey; } @@ -259,12 +262,12 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp) if (!pkey) return -ENOMEM; - spin_lock(&dev->port_pkey_list[port_num].list_lock); + spin_lock(&dev->port_data[port_num].pkey_list_lock); /* Check for the PKey again. A racing process may * have created it. */ list_for_each_entry(tmp_pkey, - &dev->port_pkey_list[port_num].pkey_list, + &dev->port_data[port_num].pkey_list, pkey_index_list) { if (tmp_pkey->pkey_index == pp->pkey_index) { kfree(pkey); @@ -279,9 +282,9 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp) spin_lock_init(&pkey->qp_list_lock); INIT_LIST_HEAD(&pkey->qp_list); list_add(&pkey->pkey_index_list, - &dev->port_pkey_list[port_num].pkey_list); + &dev->port_data[port_num].pkey_list); } - spin_unlock(&dev->port_pkey_list[port_num].list_lock); + spin_unlock(&dev->port_data[port_num].pkey_list_lock); } spin_lock(&pkey->qp_list_lock); @@ -418,12 +421,15 @@ void ib_close_shared_qp_security(struct ib_qp_security *sec) int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) { - u8 i = rdma_start_port(dev); + unsigned int i; bool is_ib = false; int ret; - while (i <= rdma_end_port(dev) && !is_ib) + rdma_for_each_port (dev, i) { is_ib = rdma_protocol_ib(dev, i++); + if (is_ib) + break; + } /* If this isn't an IB device don't create the security context */ if (!is_ib) @@ -544,9 +550,8 @@ void ib_security_cache_change(struct ib_device *device, { struct pkey_index_qp_list *pkey; - list_for_each_entry(pkey, - &device->port_pkey_list[port_num].pkey_list, - pkey_index_list) { + list_for_each_entry (pkey, &device->port_data[port_num].pkey_list, + pkey_index_list) { check_pkey_qps(pkey, device, port_num, @@ -554,21 +559,19 @@ void ib_security_cache_change(struct ib_device *device, } } -void ib_security_destroy_port_pkey_list(struct ib_device *device) +void ib_security_release_port_pkey_list(struct ib_device *device) { struct pkey_index_qp_list *pkey, *tmp_pkey; - int i; + unsigned int i; - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { - spin_lock(&device->port_pkey_list[i].list_lock); + rdma_for_each_port (device, i) { list_for_each_entry_safe(pkey, tmp_pkey, - &device->port_pkey_list[i].pkey_list, + &device->port_data[i].pkey_list, pkey_index_list) { list_del(&pkey->pkey_index_list); kfree(pkey); } - spin_unlock(&device->port_pkey_list[i].list_lock); } } @@ -676,19 +679,18 @@ static int ib_security_pkey_access(struct ib_device *dev, return security_ib_pkey_access(sec, subnet_prefix, pkey); } -static int ib_mad_agent_security_change(struct notifier_block *nb, - unsigned long event, - void *data) +void ib_mad_agent_security_change(void) { - struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb); - - if (event != LSM_POLICY_CHANGE) - return NOTIFY_DONE; - - ag->smp_allowed = !security_ib_endport_manage_subnet( - ag->security, dev_name(&ag->device->dev), ag->port_num); - - return NOTIFY_OK; + struct ib_mad_agent *ag; + + spin_lock(&mad_agent_list_lock); + list_for_each_entry(ag, + &mad_agent_list, + mad_agent_sec_list) + WRITE_ONCE(ag->smp_allowed, + !security_ib_endport_manage_subnet(ag->security, + dev_name(&ag->device->dev), ag->port_num)); + spin_unlock(&mad_agent_list_lock); } int ib_mad_agent_security_setup(struct ib_mad_agent *agent, @@ -699,6 +701,8 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, if (!rdma_protocol_ib(agent->device, agent->port_num)) return 0; + INIT_LIST_HEAD(&agent->mad_agent_sec_list); + ret = security_ib_alloc_security(&agent->security); if (ret) return ret; @@ -706,20 +710,22 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, if (qp_type != IB_QPT_SMI) return 0; + spin_lock(&mad_agent_list_lock); ret = security_ib_endport_manage_subnet(agent->security, dev_name(&agent->device->dev), agent->port_num); if (ret) - return ret; + goto free_security; - agent->lsm_nb.notifier_call = ib_mad_agent_security_change; - ret = register_lsm_notifier(&agent->lsm_nb); - if (ret) - return ret; - - agent->smp_allowed = true; - agent->lsm_nb_reg = true; + WRITE_ONCE(agent->smp_allowed, true); + list_add(&agent->mad_agent_sec_list, &mad_agent_list); + spin_unlock(&mad_agent_list_lock); return 0; + +free_security: + spin_unlock(&mad_agent_list_lock); + security_ib_free_security(agent->security); + return ret; } void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) @@ -727,9 +733,13 @@ void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) if (!rdma_protocol_ib(agent->device, agent->port_num)) return; + if (agent->qp->qp_type == IB_QPT_SMI) { + spin_lock(&mad_agent_list_lock); + list_del(&agent->mad_agent_sec_list); + spin_unlock(&mad_agent_list_lock); + } + security_ib_free_security(agent->security); - if (agent->lsm_nb_reg) - unregister_lsm_notifier(&agent->lsm_nb); } int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) @@ -738,7 +748,7 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) return 0; if (map->agent.qp->qp_type == IB_QPT_SMI) { - if (!map->agent.smp_allowed) + if (!READ_ONCE(map->agent.smp_allowed)) return -EACCES; return 0; } diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c @@ -1015,9 +1015,7 @@ err_free_stats: return; } -static int add_port(struct ib_device *device, int port_num, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +static int add_port(struct ib_device *device, int port_num) { struct ib_port *p; struct ib_port_attr attr; @@ -1113,8 +1111,8 @@ static int add_port(struct ib_device *device, int port_num, if (ret) goto err_free_pkey; - if (port_callback) { - ret = port_callback(device, port_num, &p->kobj); + if (device->ops.init_port) { + ret = device->ops.init_port(device, port_num, &p->kobj); if (ret) goto err_remove_pkey; } @@ -1189,7 +1187,7 @@ err_put: static ssize_t node_type_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); switch (dev->node_type) { case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); @@ -1206,7 +1204,7 @@ static DEVICE_ATTR_RO(node_type); static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%04x:%04x:%04x:%04x\n", be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), @@ -1219,7 +1217,7 @@ static DEVICE_ATTR_RO(sys_image_guid); static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%04x:%04x:%04x:%04x\n", be16_to_cpu(((__be16 *) &dev->node_guid)[0]), @@ -1232,7 +1230,7 @@ static DEVICE_ATTR_RO(node_guid); static ssize_t node_desc_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%.64s\n", dev->node_desc); } @@ -1241,7 +1239,7 @@ static ssize_t node_desc_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); struct ib_device_modify desc = {}; int ret; @@ -1260,7 +1258,7 @@ static DEVICE_ATTR_RW(node_desc); static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); ib_get_device_fw_str(dev, buf); strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); @@ -1277,21 +1275,21 @@ static struct attribute *ib_dev_attrs[] = { NULL, }; -static const struct attribute_group dev_attr_group = { +const struct attribute_group ib_dev_attr_group = { .attrs = ib_dev_attrs, }; -static void free_port_list_attributes(struct ib_device *device) +static void ib_free_port_attrs(struct ib_device *device) { struct kobject *p, *t; list_for_each_entry_safe(p, t, &device->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); + list_del(&p->entry); - if (port->hw_stats) { - kfree(port->hw_stats); + if (port->hw_stats_ag) free_hsag(&port->kobj, port->hw_stats_ag); - } + kfree(port->hw_stats); if (port->pma_table) sysfs_remove_group(p, port->pma_table); @@ -1308,62 +1306,47 @@ static void free_port_list_attributes(struct ib_device *device) kobject_put(device->ports_kobj); } -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +static int ib_setup_port_attrs(struct ib_device *device) { - struct device *class_dev = &device->dev; + unsigned int port; int ret; - int i; - - device->groups[0] = &dev_attr_group; - class_dev->groups = device->groups; - ret = device_add(class_dev); - if (ret) - goto err; - - device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj); - if (!device->ports_kobj) { - ret = -ENOMEM; - goto err_put; - } + device->ports_kobj = kobject_create_and_add("ports", &device->dev.kobj); + if (!device->ports_kobj) + return -ENOMEM; - if (rdma_cap_ib_switch(device)) { - ret = add_port(device, 0, port_callback); + rdma_for_each_port (device, port) { + ret = add_port(device, port); if (ret) goto err_put; - } else { - for (i = 1; i <= device->phys_port_cnt; ++i) { - ret = add_port(device, i, port_callback); - if (ret) - goto err_put; - } } - if (device->ops.alloc_hw_stats) - setup_hw_stats(device, NULL, 0); - return 0; err_put: - free_port_list_attributes(device); - device_del(class_dev); -err: + ib_free_port_attrs(device); return ret; } -void ib_device_unregister_sysfs(struct ib_device *device) +int ib_device_register_sysfs(struct ib_device *device) { - /* Hold device until ib_dealloc_device() */ - get_device(&device->dev); + int ret; + + ret = ib_setup_port_attrs(device); + if (ret) + return ret; + + if (device->ops.alloc_hw_stats) + setup_hw_stats(device, NULL, 0); - free_port_list_attributes(device); + return 0; +} - if (device->hw_stats) { - kfree(device->hw_stats); +void ib_device_unregister_sysfs(struct ib_device *device) +{ + if (device->hw_stats_ag) free_hsag(&device->dev.kobj, device->hw_stats_ag); - } + kfree(device->hw_stats); - device_unregister(&device->dev); + ib_free_port_attrs(device); } diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c @@ -1236,6 +1236,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, } ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0); break; + case RDMA_OPTION_ID_ACK_TIMEOUT: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval)); + break; default: ret = -ENOSYS; } diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c @@ -72,15 +72,16 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d * If access flags indicate ODP memory, avoid pinning. Instead, stores * the mm for future page fault handling in conjunction with MMU notifiers. * - * @context: userspace context to pin memory for + * @udata: userspace context to pin memory for * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned * @dmasync: flush in-flight DMA when the memory region is written */ -struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, +struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, size_t size, int access, int dmasync) { + struct ib_ucontext *context; struct ib_umem *umem; struct page **page_list; struct vm_area_struct **vma_list; @@ -95,6 +96,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct scatterlist *sg, *sg_list_start; unsigned int gup_flags = FOLL_WRITE; + if (!udata) + return ERR_PTR(-EIO); + + context = container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->context; + if (!context) + return ERR_PTR(-EIO); + if (dmasync) dma_attrs |= DMA_ATTR_WRITE_BARRIER; @@ -160,15 +169,12 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - down_write(&mm->mmap_sem); - if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) || - (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) { - up_write(&mm->mmap_sem); + new_pinned = atomic64_add_return(npages, &mm->pinned_vm); + if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) { + atomic64_sub(npages, &mm->pinned_vm); ret = -ENOMEM; goto out; } - mm->pinned_vm = new_pinned; - up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; @@ -228,9 +234,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem_release: __ib_umem_release(context->device, umem, 0); vma: - down_write(&mm->mmap_sem); - mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(&mm->mmap_sem); + atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); out: if (vma_list) free_page((unsigned long) vma_list); @@ -253,25 +257,12 @@ static void __ib_umem_release_tail(struct ib_umem *umem) kfree(umem); } -static void ib_umem_release_defer(struct work_struct *work) -{ - struct ib_umem *umem = container_of(work, struct ib_umem, work); - - down_write(&umem->owning_mm->mmap_sem); - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(&umem->owning_mm->mmap_sem); - - __ib_umem_release_tail(umem); -} - /** * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release */ void ib_umem_release(struct ib_umem *umem) { - struct ib_ucontext *context = umem->context; - if (umem->is_odp) { ib_umem_odp_release(to_ib_umem_odp(umem)); __ib_umem_release_tail(umem); @@ -280,26 +271,7 @@ void ib_umem_release(struct ib_umem *umem) __ib_umem_release(umem->context->device, umem, 1); - /* - * We may be called with the mm's mmap_sem already held. This - * can happen when a userspace munmap() is the call that drops - * the last reference to our file and calls our release - * method. If there are memory regions to destroy, we'll end - * up here and not be able to take the mmap_sem. In that case - * we defer the vm_locked accounting a workqueue. - */ - if (context->closing) { - if (!down_write_trylock(&umem->owning_mm->mmap_sem)) { - INIT_WORK(&umem->work, ib_umem_release_defer); - queue_work(ib_wq, &umem->work); - return; - } - } else { - down_write(&umem->owning_mm->mmap_sem); - } - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(&umem->owning_mm->mmap_sem); - + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); __ib_umem_release_tail(umem); } EXPORT_SYMBOL(ib_umem_release); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c @@ -40,6 +40,7 @@ #include <linux/vmalloc.h> #include <linux/hugetlb.h> #include <linux/interval_tree_generic.h> +#include <linux/pagemap.h> #include <rdma/ib_verbs.h> #include <rdma/ib_umem.h> @@ -299,7 +300,7 @@ static void free_per_mm(struct rcu_head *rcu) kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); } -void put_per_mm(struct ib_umem_odp *umem_odp) +static void put_per_mm(struct ib_umem_odp *umem_odp) { struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; struct ib_ucontext *ctx = umem_odp->umem.context; @@ -332,9 +333,10 @@ void put_per_mm(struct ib_umem_odp *umem_odp) mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); } -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, unsigned long addr, size_t size) { + struct ib_ucontext_per_mm *per_mm = root->per_mm; struct ib_ucontext *ctx = per_mm->context; struct ib_umem_odp *odp_data; struct ib_umem *umem; @@ -349,7 +351,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; - umem->writable = 1; + umem->writable = root->umem.writable; umem->is_odp = 1; odp_data->per_mm = per_mm; umem->owning_mm = per_mm->mm; @@ -617,7 +619,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, * mmget_not_zero will fail in this case. */ owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID); - if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) { + if (!owning_process || !mmget_not_zero(owning_mm)) { ret = -EINVAL; goto out_put_task; } @@ -684,9 +686,14 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, mutex_unlock(&umem_odp->umem_mutex); if (ret < 0) { - /* Release left over pages when handling errors. */ - for (++j; j < npages; ++j) - put_page(local_page_list[j]); + /* + * Release pages, remembering that the first page + * to hit an error was already released by + * ib_umem_odp_map_dma_single_page(). + */ + if (npages - (j + 1) > 0) + release_pages(&local_page_list[j+1], + npages - (j + 1)); break; } } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c @@ -957,19 +957,22 @@ static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_umad_file *file; - int ret = -ENXIO; + int ret = 0; port = container_of(inode->i_cdev, struct ib_umad_port, cdev); mutex_lock(&port->file_mutex); - if (!port->ib_dev) + if (!port->ib_dev) { + ret = -ENXIO; goto out; + } - ret = -ENOMEM; - file = kzalloc(sizeof *file, GFP_KERNEL); - if (!file) + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) { + ret = -ENOMEM; goto out; + } mutex_init(&file->mutex); spin_lock_init(&file->send_lock); @@ -982,14 +985,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp) list_add_tail(&file->port_list, &port->file_list); - ret = nonseekable_open(inode, filp); - if (ret) { - list_del(&file->port_list); - kfree(file); - goto out; - } - - ib_umad_dev_get(port->umad_dev); + nonseekable_open(inode, filp); out: mutex_unlock(&port->file_mutex); return ret; @@ -998,7 +994,6 @@ out: static int ib_umad_close(struct inode *inode, struct file *filp) { struct ib_umad_file *file = filp->private_data; - struct ib_umad_device *dev = file->port->umad_dev; struct ib_umad_packet *packet, *tmp; int already_dead; int i; @@ -1027,7 +1022,6 @@ static int ib_umad_close(struct inode *inode, struct file *filp) mutex_unlock(&file->port->file_mutex); kfree(file); - ib_umad_dev_put(dev); return 0; } @@ -1073,17 +1067,9 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) filp->private_data = port; - ret = nonseekable_open(inode, filp); - if (ret) - goto err_clr_sm_cap; - - ib_umad_dev_get(port->umad_dev); + nonseekable_open(inode, filp); return 0; -err_clr_sm_cap: - swap(props.set_port_cap_mask, props.clr_port_cap_mask); - ib_modify_port(port->ib_dev, port->port_num, 0, &props); - err_up_sem: up(&port->sm_sem); @@ -1106,7 +1092,6 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp) up(&port->sm_sem); - ib_umad_dev_put(port->umad_dev); return ret; } @@ -1283,10 +1268,12 @@ static void ib_umad_kill_port(struct ib_umad_port *port) mutex_unlock(&port->file_mutex); cdev_device_del(&port->sm_cdev, &port->sm_dev); - put_device(&port->sm_dev); cdev_device_del(&port->cdev, &port->dev); - put_device(&port->dev); ida_free(&umad_ida, port->dev_num); + + /* balances device_initialize() */ + put_device(&port->sm_dev); + put_device(&port->dev); } static void ib_umad_add_one(struct ib_device *device) @@ -1329,21 +1316,24 @@ err: ib_umad_kill_port(&umad_dev->ports[i - s]); } free: + /* balances kref_init */ ib_umad_dev_put(umad_dev); } static void ib_umad_remove_one(struct ib_device *device, void *client_data) { struct ib_umad_device *umad_dev = client_data; - int i; + unsigned int i; if (!umad_dev) return; - for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) { - if (rdma_cap_ib_mad(device, i + rdma_start_port(device))) - ib_umad_kill_port(&umad_dev->ports[i]); + rdma_for_each_port (device, i) { + if (rdma_cap_ib_mad(device, i)) + ib_umad_kill_port( + &umad_dev->ports[i - rdma_start_port(device)]); } + /* balances kref_init() */ ib_umad_dev_put(umad_dev); } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c @@ -224,12 +224,13 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) if (ret) goto err; - ucontext = ib_dev->ops.alloc_ucontext(ib_dev, &attrs->driver_udata); - if (IS_ERR(ucontext)) { - ret = PTR_ERR(ucontext); + ucontext = rdma_zalloc_drv_obj(ib_dev, ib_ucontext); + if (!ucontext) { + ret = -ENOMEM; goto err_alloc; } + ucontext->res.type = RDMA_RESTRACK_CTX; ucontext->device = ib_dev; ucontext->cg_obj = cg_obj; /* ufile is required when some objects are released */ @@ -238,15 +239,8 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) ucontext->closing = false; ucontext->cleanup_retryable = false; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING mutex_init(&ucontext->per_mm_list_lock); INIT_LIST_HEAD(&ucontext->per_mm_list); - if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) - ucontext->invalidate_range = NULL; - -#endif - - resp.num_comp_vectors = file->device->num_comp_vectors; ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) @@ -259,15 +253,22 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) goto err_fd; } + resp.num_comp_vectors = file->device->num_comp_vectors; + ret = uverbs_response(attrs, &resp, sizeof(resp)); if (ret) goto err_file; - fd_install(resp.async_fd, filp); + ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata); + if (ret) + goto err_file; + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + ucontext->invalidate_range = NULL; - ucontext->res.type = RDMA_RESTRACK_CTX; rdma_restrack_uadd(&ucontext->res); + fd_install(resp.async_fd, filp); + /* * Make sure that ib_uverbs_get_ucontext() sees the pointer update * only after all writes to setup the ucontext have completed @@ -286,7 +287,7 @@ err_fd: put_unused_fd(resp.async_fd); err_free: - ib_dev->ops.dealloc_ucontext(ucontext); + kfree(ucontext); err_alloc: ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); @@ -410,9 +411,9 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = ib_dev->ops.alloc_pd(ib_dev, uobj->context, &attrs->driver_udata); - if (IS_ERR(pd)) { - ret = PTR_ERR(pd); + pd = rdma_zalloc_drv_obj(ib_dev, ib_pd); + if (!pd) { + ret = -ENOMEM; goto err; } @@ -420,11 +421,15 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) pd->uobject = uobj; pd->__internal_mr = NULL; atomic_set(&pd->usecnt, 0); + pd->res.type = RDMA_RESTRACK_PD; + + ret = ib_dev->ops.alloc_pd(pd, uobj->context, &attrs->driver_udata); + if (ret) + goto err_alloc; uobj->object = pd; memset(&resp, 0, sizeof resp); resp.pd_handle = uobj->id; - pd->res.type = RDMA_RESTRACK_PD; rdma_restrack_uadd(&pd->res); ret = uverbs_response(attrs, &resp, sizeof(resp)); @@ -435,7 +440,9 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) err_copy: ib_dealloc_pd(pd); - + pd = NULL; +err_alloc: + kfree(pd); err: uobj_alloc_abort(uobj); return ret; @@ -822,14 +829,13 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) cmd.length, cmd.hca_va, cmd.access_flags, pd, &attrs->driver_udata); - if (!ret) { - if (cmd.flags & IB_MR_REREG_PD) { - atomic_inc(&pd->usecnt); - mr->pd = pd; - atomic_dec(&old_pd->usecnt); - } - } else { + if (ret) goto put_uobj_pd; + + if (cmd.flags & IB_MR_REREG_PD) { + atomic_inc(&pd->usecnt); + mr->pd = pd; + atomic_dec(&old_pd->usecnt); } memset(&resp, 0, sizeof(resp)); @@ -884,6 +890,11 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) goto err_free; } + if (cmd.mw_type != IB_MW_TYPE_1 && cmd.mw_type != IB_MW_TYPE_2) { + ret = -EINVAL; + goto err_put; + } + mw = pd->device->ops.alloc_mw(pd, cmd.mw_type, &attrs->driver_udata); if (IS_ERR(mw)) { ret = PTR_ERR(mw); @@ -1184,12 +1195,11 @@ static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs) ret = -EFAULT; goto out_put; } + ret = 0; if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT)) ret = uverbs_output_written(attrs, UVERBS_ATTR_CORE_OUT); - ret = 0; - out_put: uobj_put_obj_read(cq); return ret; @@ -2632,7 +2642,7 @@ void flow_resources_add(struct ib_uflow_resources *uflow_res, } EXPORT_SYMBOL(flow_resources_add); -static int kern_spec_to_ib_spec_action(const struct uverbs_attr_bundle *attrs, +static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec, struct ib_uflow_resources *uflow_res) @@ -3618,7 +3628,6 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs) copy_query_dev_fields(ucontext, &resp.base, &attr); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING resp.odp_caps.general_caps = attr.odp_caps.general_caps; resp.odp_caps.per_transport_caps.rc_odp_caps = attr.odp_caps.per_transport_caps.rc_odp_caps; @@ -3626,7 +3635,7 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs) attr.odp_caps.per_transport_caps.uc_odp_caps; resp.odp_caps.per_transport_caps.ud_odp_caps = attr.odp_caps.per_transport_caps.ud_odp_caps; -#endif + resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps; resp.timestamp_mask = attr.timestamp_mask; resp.hca_core_clock = attr.hca_core_clock; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c @@ -213,6 +213,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle, ret = PTR_ERR(attr->uobjects[i]); break; } + pbundle->bundle.context = attr->uobjects[i]->context; } attr->len = i; @@ -330,6 +331,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, uattr->data_s64); if (IS_ERR(o_attr->uobject)) return PTR_ERR(o_attr->uobject); + pbundle->bundle.context = o_attr->uobject->context; __set_bit(attr_bkey, pbundle->uobj_finalize); if (spec->u.obj.access == UVERBS_ACCESS_NEW) { @@ -592,6 +594,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, pbundle->method_elm = method_elm; pbundle->method_key = attrs_iter.index; pbundle->bundle.ufile = ufile; + pbundle->bundle.context = NULL; /* only valid if bundle has uobject */ pbundle->radix = &uapi->radix; pbundle->radix_slots = slot; pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c @@ -695,6 +695,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, memset(bundle.attr_present, 0, sizeof(bundle.attr_present)); bundle.ufile = file; + bundle.context = NULL; /* only valid if bundle has uobject */ if (!method_elm->is_ex) { size_t in_len = hdr.in_words * 4 - sizeof(hdr); size_t out_len = hdr.out_words * 4; @@ -1135,6 +1136,7 @@ static const struct file_operations uverbs_mmap_fops = { static struct ib_client uverbs_client = { .name = "uverbs", + .no_kverbs_req = true, .add = ib_uverbs_add_one, .remove = ib_uverbs_remove_one }; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c @@ -188,7 +188,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject, if (ret) return ret; - ib_dealloc_pd((struct ib_pd *)uobject->object); + ib_dealloc_pd(pd); return 0; } diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c @@ -188,13 +188,18 @@ static int uapi_merge_obj_tree(struct uverbs_api *uapi, obj_elm->type_attrs = obj->type_attrs; obj_elm->type_class = obj->type_attrs->type_class; /* - * Today drivers are only permitted to use idr_class - * types. They cannot use FD types because we currently have - * no way to revoke the fops pointer after device - * disassociation. + * Today drivers are only permitted to use idr_class and + * fd_class types. We can revoke the IDR types during + * disassociation, and the FD types require the driver to use + * struct file_operations.owner to prevent the driver module + * code from unloading while the file is open. This provides + * enough safety that uverbs_close_fd() will continue to work. + * Drivers using FD are responsible to handle disassociation of + * the device on their own. */ if (WARN_ON(is_driver && - obj->type_attrs->type_class != &uverbs_idr_class)) + obj->type_attrs->type_class != &uverbs_idr_class && + obj->type_attrs->type_class != &uverbs_fd_class)) return -EINVAL; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c @@ -254,10 +254,11 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, { struct ib_pd *pd; int mr_access_flags = 0; + int ret; - pd = device->ops.alloc_pd(device, NULL, NULL); - if (IS_ERR(pd)) - return pd; + pd = rdma_zalloc_drv_obj(device, ib_pd); + if (!pd) + return ERR_PTR(-ENOMEM); pd->device = device; pd->uobject = NULL; @@ -265,6 +266,16 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, atomic_set(&pd->usecnt, 0); pd->flags = flags; + pd->res.type = RDMA_RESTRACK_PD; + rdma_restrack_set_task(&pd->res, caller); + + ret = device->ops.alloc_pd(pd, NULL, NULL); + if (ret) { + kfree(pd); + return ERR_PTR(ret); + } + rdma_restrack_kadd(&pd->res); + if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else @@ -275,10 +286,6 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; } - pd->res.type = RDMA_RESTRACK_PD; - rdma_restrack_set_task(&pd->res, caller); - rdma_restrack_kadd(&pd->res); - if (mr_access_flags) { struct ib_mr *mr; @@ -329,10 +336,8 @@ void ib_dealloc_pd(struct ib_pd *pd) WARN_ON(atomic_read(&pd->usecnt)); rdma_restrack_del(&pd->res); - /* Making delalloc_pd a void return is a WIP, no driver should return - an error here. */ - ret = pd->device->ops.dealloc_pd(pd); - WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); + pd->device->ops.dealloc_pd(pd); + kfree(pd); } EXPORT_SYMBOL(ib_dealloc_pd); @@ -1106,8 +1111,8 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, } EXPORT_SYMBOL(ib_open_qp); -static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, - struct ib_qp_init_attr *qp_init_attr) +static struct ib_qp *create_xrc_qp(struct ib_qp *qp, + struct ib_qp_init_attr *qp_init_attr) { struct ib_qp *real_qp = qp; @@ -1122,10 +1127,10 @@ static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, qp_init_attr->qp_context); - if (!IS_ERR(qp)) - __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); - else - real_qp->device->ops.destroy_qp(real_qp); + if (IS_ERR(qp)) + return qp; + + __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); return qp; } @@ -1156,10 +1161,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, return qp; ret = ib_create_qp_security(qp, device); - if (ret) { - ib_destroy_qp(qp); - return ERR_PTR(ret); - } + if (ret) + goto err; qp->real_qp = qp; qp->qp_type = qp_init_attr->qp_type; @@ -1172,8 +1175,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, INIT_LIST_HEAD(&qp->sig_mrs); qp->port = 0; - if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) - return ib_create_xrc_qp(qp, qp_init_attr); + if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { + struct ib_qp *xrc_qp = create_xrc_qp(qp, qp_init_attr); + + if (IS_ERR(xrc_qp)) { + ret = PTR_ERR(xrc_qp); + goto err; + } + return xrc_qp; + } qp->event_handler = qp_init_attr->event_handler; qp->qp_context = qp_init_attr->qp_context; @@ -1200,11 +1210,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, if (qp_init_attr->cap.max_rdma_ctxs) { ret = rdma_rw_init_mrs(qp, qp_init_attr); - if (ret) { - pr_err("failed to init MR pool ret= %d\n", ret); - ib_destroy_qp(qp); - return ERR_PTR(ret); - } + if (ret) + goto err; } /* @@ -1217,6 +1224,11 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, device->attrs.max_sge_rd); return qp; + +err: + ib_destroy_qp(qp); + return ERR_PTR(ret); + } EXPORT_SYMBOL(ib_create_qp); @@ -1711,10 +1723,7 @@ int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) return -EINVAL; - if (!dev->ops.get_netdev) - return -EOPNOTSUPP; - - netdev = dev->ops.get_netdev(dev, port_num); + netdev = ib_device_get_netdev(dev, port_num); if (!netdev) return -ENODEV; diff --git a/drivers/infiniband/hw/bnxt_re/Kconfig b/drivers/infiniband/hw/bnxt_re/Kconfig @@ -1,5 +1,6 @@ config INFINIBAND_BNXT_RE tristate "Broadcom Netxtreme HCA support" + depends on 64BIT depends on ETHERNET && NETDEVICES && PCI && INET && DCB select NET_VENDOR_BROADCOM select BNXT diff --git a/drivers/infiniband/hw/bnxt_re/Makefile b/drivers/infiniband/hw/bnxt_re/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y := -Idrivers/net/ethernet/broadcom/bnxt +ccflags-y := -I $(srctree)/drivers/net/ethernet/broadcom/bnxt obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re.o bnxt_re-y := main.o ib_verbs.o \ qplib_res.o qplib_rcfw.o \ diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -124,6 +124,7 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; unsigned int version, major, minor; + struct bnxt_qplib_chip_ctx chip_ctx; struct bnxt_en_dev *en_dev; struct bnxt_msix_entry msix_entries[BNXT_RE_MAX_MSIX]; int num_msix; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -48,6 +48,7 @@ #include <rdma/ib_addr.h> #include <rdma/ib_mad.h> #include <rdma/ib_cache.h> +#include <rdma/uverbs_ioctl.h> #include "bnxt_ulp.h" @@ -563,41 +564,29 @@ fail: } /* Protection Domains */ -int bnxt_re_dealloc_pd(struct ib_pd *ib_pd) +void bnxt_re_dealloc_pd(struct ib_pd *ib_pd) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); struct bnxt_re_dev *rdev = pd->rdev; - int rc; bnxt_re_destroy_fence_mr(pd); - if (pd->qplib_pd.id) { - rc = bnxt_qplib_dealloc_pd(&rdev->qplib_res, - &rdev->qplib_res.pd_tbl, - &pd->qplib_pd); - if (rc) - dev_err(rdev_to_dev(rdev), "Failed to deallocate HW PD"); - } - - kfree(pd); - return 0; + if (pd->qplib_pd.id) + bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl, + &pd->qplib_pd); } -struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *ucontext, - struct ib_udata *udata) +int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *ucontext, + struct ib_udata *udata) { + struct ib_device *ibdev = ibpd->device; struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_re_ucontext *ucntx = container_of(ucontext, struct bnxt_re_ucontext, ib_uctx); - struct bnxt_re_pd *pd; + struct bnxt_re_pd *pd = container_of(ibpd, struct bnxt_re_pd, ib_pd); int rc; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - pd->rdev = rdev; if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) { dev_err(rdev_to_dev(rdev), "Failed to allocate HW PD"); @@ -637,13 +626,12 @@ struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, if (bnxt_re_create_fence_mr(pd)) dev_warn(rdev_to_dev(rdev), "Failed to create Fence-MR\n"); - return &pd->ib_pd; + return 0; dbfail: - (void)bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl, - &pd->qplib_pd); + bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl, + &pd->qplib_pd); fail: - kfree(pd); - return ERR_PTR(rc); + return rc; } /* Address Handles */ @@ -663,17 +651,36 @@ int bnxt_re_destroy_ah(struct ib_ah *ib_ah, u32 flags) return 0; } +static u8 bnxt_re_stack_to_dev_nw_type(enum rdma_network_type ntype) +{ + u8 nw_type; + + switch (ntype) { + case RDMA_NETWORK_IPV4: + nw_type = CMDQ_CREATE_AH_TYPE_V2IPV4; + break; + case RDMA_NETWORK_IPV6: + nw_type = CMDQ_CREATE_AH_TYPE_V2IPV6; + break; + default: + nw_type = CMDQ_CREATE_AH_TYPE_V1; + break; + } + return nw_type; +} + struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd, struct rdma_ah_attr *ah_attr, u32 flags, struct ib_udata *udata) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); struct bnxt_re_dev *rdev = pd->rdev; + const struct ib_gid_attr *sgid_attr; struct bnxt_re_ah *ah; - const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); - int rc; u8 nw_type; + int rc; if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) { dev_err(rdev_to_dev(rdev), "Failed to alloc AH: GRH not set"); @@ -700,28 +707,11 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd, ah->qplib_ah.flow_label = grh->flow_label; ah->qplib_ah.hop_limit = grh->hop_limit; ah->qplib_ah.sl = rdma_ah_get_sl(ah_attr); - if (udata && - !rdma_is_multicast_addr((struct in6_addr *) - grh->dgid.raw) && - !rdma_link_local_addr((struct in6_addr *) - grh->dgid.raw)) { - const struct ib_gid_attr *sgid_attr; - sgid_attr = grh->sgid_attr; - /* Get network header type for this GID */ - nw_type = rdma_gid_attr_network_type(sgid_attr); - switch (nw_type) { - case RDMA_NETWORK_IPV4: - ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV4; - break; - case RDMA_NETWORK_IPV6: - ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV6; - break; - default: - ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V1; - break; - } - } + sgid_attr = grh->sgid_attr; + /* Get network header type for this GID */ + nw_type = rdma_gid_attr_network_type(sgid_attr); + ah->qplib_ah.nw_type = bnxt_re_stack_to_dev_nw_type(nw_type); memcpy(ah->qplib_ah.dmac, ah_attr->roce.dmac, ETH_ALEN); rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah, @@ -733,12 +723,11 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd, /* Write AVID to shared page. */ if (udata) { - struct ib_ucontext *ib_uctx = ib_pd->uobject->context; - struct bnxt_re_ucontext *uctx; + struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context( + udata, struct bnxt_re_ucontext, ib_uctx); unsigned long flag; u32 *wrptr; - uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); spin_lock_irqsave(&uctx->sh_lock, flag); wrptr = (u32 *)(uctx->shpg + BNXT_RE_AVID_OFFT); *wrptr = ah->qplib_ah.id; @@ -804,8 +793,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp) { struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); struct bnxt_re_dev *rdev = qp->rdev; - int rc; unsigned int flags; + int rc; bnxt_qplib_flush_cqn_wq(&qp->qplib_qp); rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); @@ -814,9 +803,12 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp) return rc; } - flags = bnxt_re_lock_cqs(qp); - bnxt_qplib_clean_qp(&qp->qplib_qp); - bnxt_re_unlock_cqs(qp, flags); + if (rdma_is_kernel_res(&qp->ib_qp.res)) { + flags = bnxt_re_lock_cqs(qp); + bnxt_qplib_clean_qp(&qp->qplib_qp); + bnxt_re_unlock_cqs(qp, flags); + } + bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp); if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp) { @@ -882,21 +874,23 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, struct bnxt_re_qp_req ureq; struct bnxt_qplib_qp *qplib_qp = &qp->qplib_qp; struct ib_umem *umem; - int bytes = 0; - struct ib_ucontext *context = pd->ib_pd.uobject->context; - struct bnxt_re_ucontext *cntx = container_of(context, - struct bnxt_re_ucontext, - ib_uctx); + int bytes = 0, psn_sz; + struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context( + udata, struct bnxt_re_ucontext, ib_uctx); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) return -EFAULT; bytes = (qplib_qp->sq.max_wqe * BNXT_QPLIB_MAX_SQE_ENTRY_SIZE); /* Consider mapping PSN search memory only for RC QPs. */ - if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) - bytes += (qplib_qp->sq.max_wqe * sizeof(struct sq_psn_search)); + if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) { + psn_sz = bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ? + sizeof(struct sq_psn_search_ext) : + sizeof(struct sq_psn_search); + bytes += (qplib_qp->sq.max_wqe * psn_sz); + } bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(context, ureq.qpsva, bytes, - IB_ACCESS_LOCAL_WRITE, 1); + umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -908,7 +902,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, if (!qp->qplib_qp.srq) { bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(context, ureq.qprva, bytes, + umem = ib_umem_get(udata, ureq.qprva, bytes, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(umem)) goto rqfail; @@ -1066,12 +1060,17 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd, qp->qplib_qp.pd = &pd->qplib_pd; qp->qplib_qp.qp_handle = (u64)(unsigned long)(&qp->qplib_qp); qp->qplib_qp.type = __from_ib_qp_type(qp_init_attr->qp_type); + + if (qp_init_attr->qp_type == IB_QPT_GSI && + bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx)) + qp->qplib_qp.type = CMDQ_CREATE_QP_TYPE_GSI; if (qp->qplib_qp.type == IB_QPT_MAX) { dev_err(rdev_to_dev(rdev), "QP type 0x%x not supported", qp->qplib_qp.type); rc = -EINVAL; goto fail; } + qp->qplib_qp.max_inline_data = qp_init_attr->cap.max_inline_data; qp->qplib_qp.sig_type = ((qp_init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) ? true : false); @@ -1132,7 +1131,8 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd, qp->qplib_qp.mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu)); - if (qp_init_attr->qp_type == IB_QPT_GSI) { + if (qp_init_attr->qp_type == IB_QPT_GSI && + !(bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))) { /* Allocate 1 more than what's provided */ entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1); qp->qplib_qp.sq.max_wqe = min_t(u32, entries, @@ -1361,17 +1361,15 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev, struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq; struct ib_umem *umem; int bytes = 0; - struct ib_ucontext *context = pd->ib_pd.uobject->context; - struct bnxt_re_ucontext *cntx = container_of(context, - struct bnxt_re_ucontext, - ib_uctx); + struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context( + udata, struct bnxt_re_ucontext, ib_uctx); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) return -EFAULT; bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(context, ureq.srqva, bytes, - IB_ACCESS_LOCAL_WRITE, 1); + umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -1646,6 +1644,9 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, __from_ib_access_flags(qp_attr->qp_access_flags); /* LOCAL_WRITE access must be set to allow RC receive */ qp->qplib_qp.access |= BNXT_QPLIB_ACCESS_LOCAL_WRITE; + /* Temp: Set all params on QP as of now */ + qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_WRITE; + qp->qplib_qp.access |= CMDQ_MODIFY_QP_ACCESS_REMOTE_READ; } if (qp_attr_mask & IB_QP_PKEY_INDEX) { qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_PKEY; @@ -2093,7 +2094,8 @@ static int bnxt_re_build_qp1_shadow_qp_recv(struct bnxt_re_qp *qp, static int is_ud_qp(struct bnxt_re_qp *qp) { - return qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD; + return (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD || + qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_GSI); } static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp, @@ -2397,7 +2399,7 @@ int bnxt_re_post_send(struct ib_qp *ib_qp, const struct ib_send_wr *wr, switch (wr->opcode) { case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: - if (ib_qp->qp_type == IB_QPT_GSI) { + if (qp->qplib_qp.type == CMDQ_CREATE_QP1_TYPE_GSI) { rc = bnxt_re_build_qp1_send_v2(qp, wr, &wqe, payload_sz); if (rc) @@ -2527,7 +2529,8 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr, wqe.wr_id = wr->wr_id; wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV; - if (ib_qp->qp_type == IB_QPT_GSI) + if (ib_qp->qp_type == IB_QPT_GSI && + qp->qplib_qp.type != CMDQ_CREATE_QP_TYPE_GSI) rc = bnxt_re_build_qp1_shadow_qp_recv(qp, wr, &wqe, payload_sz); if (!rc) @@ -2622,7 +2625,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, goto fail; } - cq->umem = ib_umem_get(context, req.cq_va, + cq->umem = ib_umem_get(udata, req.cq_va, entries * sizeof(struct cq_base), IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(cq->umem)) { @@ -3122,19 +3125,33 @@ static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *qp, } } -static void bnxt_re_process_res_ud_wc(struct ib_wc *wc, +static void bnxt_re_process_res_ud_wc(struct bnxt_re_qp *qp, + struct ib_wc *wc, struct bnxt_qplib_cqe *cqe) { + u8 nw_type; + wc->opcode = IB_WC_RECV; wc->status = __rc_to_ib_wc_status(cqe->status); - if (cqe->flags & CQ_RES_RC_FLAGS_IMM) + if (cqe->flags & CQ_RES_UD_FLAGS_IMM) wc->wc_flags |= IB_WC_WITH_IMM; - if (cqe->flags & CQ_RES_RC_FLAGS_INV) - wc->wc_flags |= IB_WC_WITH_INVALIDATE; - if ((cqe->flags & (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) == - (CQ_RES_RC_FLAGS_RDMA | CQ_RES_RC_FLAGS_IMM)) - wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + /* report only on GSI QP for Thor */ + if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_GSI) { + wc->wc_flags |= IB_WC_GRH; + memcpy(wc->smac, cqe->smac, ETH_ALEN); + wc->wc_flags |= IB_WC_WITH_SMAC; + if (cqe->flags & CQ_RES_UD_FLAGS_META_FORMAT_VLAN) { + wc->vlan_id = (cqe->cfa_meta & 0xFFF); + if (wc->vlan_id < 0x1000) + wc->wc_flags |= IB_WC_WITH_VLAN; + } + nw_type = (cqe->flags & CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK) >> + CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT; + wc->network_hdr_type = bnxt_re_to_ib_nw_type(nw_type); + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; + } + } static int send_phantom_wqe(struct bnxt_re_qp *qp) @@ -3226,7 +3243,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) switch (cqe->opcode) { case CQ_BASE_CQE_TYPE_REQ: - if (qp->qplib_qp.id == + if (qp->rdev->qp1_sqp && qp->qplib_qp.id == qp->rdev->qp1_sqp->qplib_qp.id) { /* Handle this completion with * the stored completion @@ -3261,7 +3278,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) bnxt_re_process_res_rc_wc(wc, cqe); break; case CQ_BASE_CQE_TYPE_RES_UD: - if (qp->qplib_qp.id == + if (qp->rdev->qp1_sqp && qp->qplib_qp.id == qp->rdev->qp1_sqp->qplib_qp.id) { /* Handle this completion with * the stored completion @@ -3274,7 +3291,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc) break; } } - bnxt_re_process_res_ud_wc(wc, cqe); + bnxt_re_process_res_ud_wc(qp, wc, cqe); break; default: dev_err(rdev_to_dev(cq->rdev), @@ -3301,10 +3318,10 @@ int bnxt_re_req_notify_cq(struct ib_cq *ib_cq, spin_lock_irqsave(&cq->cq_lock, flags); /* Trigger on the very next completion */ if (ib_cqn_flags & IB_CQ_NEXT_COMP) - type = DBR_DBR_TYPE_CQ_ARMALL; + type = DBC_DBC_TYPE_CQ_ARMALL; /* Trigger on the next solicited completion */ else if (ib_cqn_flags & IB_CQ_SOLICITED) - type = DBR_DBR_TYPE_CQ_ARMSE; + type = DBC_DBC_TYPE_CQ_ARMSE; /* Poll to see if there are missed events */ if ((ib_cqn_flags & IB_CQ_REPORT_MISSED_EVENTS) && @@ -3537,19 +3554,14 @@ static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig, u64 *pbl_tbl = pbl_tbl_orig; u64 paddr; u64 page_mask = (1ULL << page_shift) - 1; - int i, pages; - struct scatterlist *sg; - int entry; - - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - pages = sg_dma_len(sg) >> PAGE_SHIFT; - for (i = 0; i < pages; i++) { - paddr = sg_dma_address(sg) + (i << PAGE_SHIFT); - if (pbl_tbl == pbl_tbl_orig) - *pbl_tbl++ = paddr & ~page_mask; - else if ((paddr & page_mask) == 0) - *pbl_tbl++ = paddr; - } + struct sg_dma_page_iter sg_iter; + + for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + paddr = sg_page_iter_dma_address(&sg_iter); + if (pbl_tbl == pbl_tbl_orig) + *pbl_tbl++ = paddr & ~page_mask; + else if ((paddr & page_mask) == 0) + *pbl_tbl++ = paddr; } return pbl_tbl - pbl_tbl_orig; } @@ -3589,8 +3601,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, /* The fixed portion of the rkey is the same as the lkey */ mr->ib_mr.rkey = mr->qplib_mr.rkey; - umem = ib_umem_get(ib_pd->uobject->context, start, length, - mr_access_flags, 0); + umem = ib_umem_get(udata, start, length, mr_access_flags, 0); if (IS_ERR(umem)) { dev_err(rdev_to_dev(rdev), "Failed to get umem"); rc = -EFAULT; @@ -3613,7 +3624,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, goto free_umem; } - page_shift = umem->page_shift; + page_shift = PAGE_SHIFT; if (!bnxt_re_page_size_ok(page_shift)) { dev_err(rdev_to_dev(rdev), "umem page size unsupported!"); @@ -3660,13 +3671,15 @@ free_mr: return ERR_PTR(rc); } -struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) { + struct ib_device *ibdev = ctx->device; + struct bnxt_re_ucontext *uctx = + container_of(ctx, struct bnxt_re_ucontext, ib_uctx); struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_re_uctx_resp resp; - struct bnxt_re_ucontext *uctx; struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_re_uctx_resp resp; + u32 chip_met_rev_num = 0; int rc; dev_dbg(rdev_to_dev(rdev), "ABI version requested %d", @@ -3675,13 +3688,9 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) { dev_dbg(rdev_to_dev(rdev), " is different from the device %d ", BNXT_RE_ABI_VERSION); - return ERR_PTR(-EPERM); + return -EPERM; } - uctx = kzalloc(sizeof(*uctx), GFP_KERNEL); - if (!uctx) - return ERR_PTR(-ENOMEM); - uctx->rdev = rdev; uctx->shpg = (void *)__get_free_page(GFP_KERNEL); @@ -3691,37 +3700,45 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, } spin_lock_init(&uctx->sh_lock); - resp.dev_id = rdev->en_dev->pdev->devfn; /*Temp, Use idr_alloc instead*/ + resp.comp_mask = BNXT_RE_UCNTX_CMASK_HAVE_CCTX; + chip_met_rev_num = rdev->chip_ctx.chip_num; + chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_rev & 0xFF) << + BNXT_RE_CHIP_ID0_CHIP_REV_SFT; + chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_metal & 0xFF) << + BNXT_RE_CHIP_ID0_CHIP_MET_SFT; + resp.chip_id0 = chip_met_rev_num; + /* Future extension of chip info */ + resp.chip_id1 = 0; + /*Temp, Use idr_alloc instead */ + resp.dev_id = rdev->en_dev->pdev->devfn; resp.max_qp = rdev->qplib_ctx.qpc_count; resp.pg_size = PAGE_SIZE; resp.cqe_sz = sizeof(struct cq_base); resp.max_cqd = dev_attr->max_cq_wqes; resp.rsvd = 0; - rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to copy user context"); rc = -EFAULT; goto cfail; } - return &uctx->ib_uctx; + return 0; cfail: free_page((unsigned long)uctx->shpg); uctx->shpg = NULL; fail: - kfree(uctx); - return ERR_PTR(rc); + return rc; } -int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) +void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) { struct bnxt_re_ucontext *uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); struct bnxt_re_dev *rdev = uctx->rdev; - int rc = 0; if (uctx->shpg) free_page((unsigned long)uctx->shpg); @@ -3730,17 +3747,10 @@ int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) /* Free DPI only if this is the first PD allocated by the * application and mark the context dpi as NULL */ - rc = bnxt_qplib_dealloc_dpi(&rdev->qplib_res, - &rdev->qplib_res.dpi_tbl, - &uctx->dpi); - if (rc) - dev_err(rdev_to_dev(rdev), "Deallocate HW DPI failed!"); - /* Don't fail, continue*/ + bnxt_qplib_dealloc_dpi(&rdev->qplib_res, + &rdev->qplib_res.dpi_tbl, &uctx->dpi); uctx->dpi.dbr = NULL; } - - kfree(uctx); - return 0; } /* Helper function to mmap the virtual memory from user app */ diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -56,8 +56,8 @@ struct bnxt_re_fence_data { }; struct bnxt_re_pd { + struct ib_pd ib_pd; struct bnxt_re_dev *rdev; - struct ib_pd ib_pd; struct bnxt_qplib_pd qplib_pd; struct bnxt_re_fence_data fence; }; @@ -135,8 +135,8 @@ struct bnxt_re_mw { }; struct bnxt_re_ucontext { + struct ib_ucontext ib_uctx; struct bnxt_re_dev *rdev; - struct ib_ucontext ib_uctx; struct bnxt_qplib_dpi dpi; void *shpg; spinlock_t sh_lock; /* protect shpg */ @@ -163,10 +163,9 @@ int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num, int index, union ib_gid *gid); enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev, u8 port_num); -struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata); -int bnxt_re_dealloc_pd(struct ib_pd *pd); +int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); +void bnxt_re_dealloc_pd(struct ib_pd *pd); struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags, @@ -216,9 +215,8 @@ int bnxt_re_dealloc_mw(struct ib_mw *mw); struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); -struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata); -int bnxt_re_dealloc_ucontext(struct ib_ucontext *context); +int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); +void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c @@ -80,6 +80,29 @@ static DEFINE_MUTEX(bnxt_re_dev_lock); static struct workqueue_struct *bnxt_re_wq; static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev); +static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) +{ + rdev->rcfw.res = NULL; + rdev->qplib_res.cctx = NULL; +} + +static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) +{ + struct bnxt_en_dev *en_dev; + struct bnxt *bp; + + en_dev = rdev->en_dev; + bp = netdev_priv(en_dev->net); + + rdev->chip_ctx.chip_num = bp->chip_num; + /* rest members to follow eventually */ + + rdev->qplib_res.cctx = &rdev->chip_ctx; + rdev->rcfw.res = &rdev->qplib_res; + + return 0; +} + /* SR-IOV helper functions */ static void bnxt_re_get_sriov_func_type(struct bnxt_re_dev *rdev) @@ -278,6 +301,7 @@ static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) rc = en_dev->en_ops->bnxt_register_device(en_dev, BNXT_ROCE_ULP, &bnxt_re_ulp_ops, rdev); + rdev->qplib_res.pdev = rdev->en_dev->pdev; return rc; } @@ -345,7 +369,8 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, fw_msg->timeout = timeout; } -static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id) +static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, + u16 fw_ring_id, int type) { struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_ring_free_input req = {0}; @@ -359,7 +384,7 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id) memset(&fw_msg, 0, sizeof(fw_msg)); bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1); - req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL; + req.ring_type = type; req.ring_id = cpu_to_le16(fw_ring_id); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); @@ -396,7 +421,7 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr, /* Association of ring index with doorbell index and MSIX number */ req.logical_id = cpu_to_le16(map_index); req.length = cpu_to_le32(ring_mask + 1); - req.ring_type = RING_ALLOC_REQ_RING_TYPE_L2_CMPL; + req.ring_type = type; req.int_mode = RING_ALLOC_REQ_INT_MODE_MSIX; bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); @@ -538,7 +563,8 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev) static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + struct bnxt_re_dev *rdev = + rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor); } @@ -547,7 +573,8 @@ static DEVICE_ATTR_RO(hw_rev); static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + struct bnxt_re_dev *rdev = + rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc); } @@ -610,6 +637,8 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .query_srq = bnxt_re_query_srq, .reg_user_mr = bnxt_re_reg_user_mr, .req_notify_cq = bnxt_re_req_notify_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx), }; static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) @@ -662,7 +691,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group); ibdev->driver_id = RDMA_DRIVER_BNXT_RE; ib_set_device_ops(ibdev, &bnxt_re_dev_ops); - return ib_register_device(ibdev, "bnxt_re%d", NULL); + return ib_register_device(ibdev, "bnxt_re%d"); } static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev) @@ -686,7 +715,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev, struct bnxt_re_dev *rdev; /* Allocate bnxt_re_dev instance here */ - rdev = (struct bnxt_re_dev *)ib_alloc_device(sizeof(*rdev)); + rdev = ib_alloc_device(bnxt_re_dev, ibdev); if (!rdev) { dev_err(NULL, "%s: bnxt_re_dev allocation failure!", ROCE_DRV_MODULE_NAME); @@ -858,6 +887,12 @@ static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq, return 0; } +static u32 bnxt_re_get_nqdb_offset(struct bnxt_re_dev *rdev, u16 indx) +{ + return bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ? + 0x10000 : rdev->msix_entries[indx].db_offset; +} + static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) { int i; @@ -871,18 +906,18 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) static int bnxt_re_init_res(struct bnxt_re_dev *rdev) { - int rc = 0, i; int num_vec_enabled = 0; + int rc = 0, i; + u32 db_offt; bnxt_qplib_init_res(&rdev->qplib_res); for (i = 1; i < rdev->num_msix ; i++) { + db_offt = bnxt_re_get_nqdb_offset(rdev, i); rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1], i - 1, rdev->msix_entries[i].vector, - rdev->msix_entries[i].db_offset, - &bnxt_re_cqn_handler, + db_offt, &bnxt_re_cqn_handler, &bnxt_re_srqn_handler); - if (rc) { dev_err(rdev_to_dev(rdev), "Failed to enable NQ with rc = 0x%x", rc); @@ -894,16 +929,18 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) fail: for (i = num_vec_enabled; i >= 0; i--) bnxt_qplib_disable_nq(&rdev->nq[i]); - return rc; } static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev) { + u8 type; int i; for (i = 0; i < rdev->num_msix - 1; i++) { - bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id); + type = bnxt_qplib_get_ring_type(&rdev->chip_ctx); + bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type); + rdev->nq[i].res = NULL; bnxt_qplib_free_nq(&rdev->nq[i]); } } @@ -925,8 +962,11 @@ static void bnxt_re_free_res(struct bnxt_re_dev *rdev) static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) { - int rc = 0, i; int num_vec_created = 0; + dma_addr_t *pg_map; + int rc = 0, i; + int pages; + u8 type; /* Configure and allocate resources for qplib */ rdev->qplib_res.rcfw = &rdev->rcfw; @@ -947,6 +987,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) goto dealloc_res; for (i = 0; i < rdev->num_msix - 1; i++) { + rdev->nq[i].res = &rdev->qplib_res; rdev->nq[i].hwq.max_elements = BNXT_RE_MAX_CQ_COUNT + BNXT_RE_MAX_SRQC_COUNT + 2; rc = bnxt_qplib_alloc_nq(rdev->en_dev->pdev, &rdev->nq[i]); @@ -955,13 +996,13 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) i, rc); goto free_nq; } - rc = bnxt_re_net_ring_alloc - (rdev, rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr, - rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count, - HWRM_RING_ALLOC_CMPL, - BNXT_QPLIB_NQE_MAX_CNT - 1, - rdev->msix_entries[i + 1].ring_idx, - &rdev->nq[i].ring_id); + type = bnxt_qplib_get_ring_type(&rdev->chip_ctx); + pg_map = rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr; + pages = rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count; + rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type, + BNXT_QPLIB_NQE_MAX_CNT - 1, + rdev->msix_entries[i + 1].ring_idx, + &rdev->nq[i].ring_id); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to allocate NQ fw id with rc = 0x%x", @@ -974,7 +1015,8 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) return 0; free_nq: for (i = num_vec_created; i >= 0; i--) { - bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id); + type = bnxt_qplib_get_ring_type(&rdev->chip_ctx); + bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type); bnxt_qplib_free_nq(&rdev->nq[i]); } bnxt_qplib_dealloc_dpi(&rdev->qplib_res, @@ -1228,6 +1270,7 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev) { + u8 type; int rc; if (test_and_clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) { @@ -1251,7 +1294,8 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev) bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id); bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx); bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); - bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id); + type = bnxt_qplib_get_ring_type(&rdev->chip_ctx); + bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type); bnxt_qplib_free_rcfw_channel(&rdev->rcfw); } if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) { @@ -1260,6 +1304,8 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev) dev_warn(rdev_to_dev(rdev), "Failed to free MSI-X vectors: %#x", rc); } + + bnxt_re_destroy_chip_ctx(rdev); if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) { rc = bnxt_re_unregister_netdev(rdev); if (rc) @@ -1280,9 +1326,12 @@ static void bnxt_re_worker(struct work_struct *work) static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) { - int rc; - + dma_addr_t *pg_map; + u32 db_offt, ridx; + int pages, vid; bool locked; + u8 type; + int rc; /* Acquire rtnl lock through out this function */ rtnl_lock(); @@ -1297,6 +1346,12 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) } set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + rc = bnxt_re_setup_chip_ctx(rdev); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to get chip context\n"); + return -EINVAL; + } + /* Check whether VF or PF */ bnxt_re_get_sriov_func_type(rdev); @@ -1320,21 +1375,22 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) pr_err("Failed to allocate RCFW Channel: %#x\n", rc); goto fail; } - rc = bnxt_re_net_ring_alloc - (rdev, rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr, - rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count, - HWRM_RING_ALLOC_CMPL, BNXT_QPLIB_CREQE_MAX_CNT - 1, - rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx, - &rdev->rcfw.creq_ring_id); + type = bnxt_qplib_get_ring_type(&rdev->chip_ctx); + pg_map = rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr; + pages = rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count; + ridx = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx; + rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type, + BNXT_QPLIB_CREQE_MAX_CNT - 1, + ridx, &rdev->rcfw.creq_ring_id); if (rc) { pr_err("Failed to allocate CREQ: %#x\n", rc); goto free_rcfw; } - rc = bnxt_qplib_enable_rcfw_channel - (rdev->en_dev->pdev, &rdev->rcfw, - rdev->msix_entries[BNXT_RE_AEQ_IDX].vector, - rdev->msix_entries[BNXT_RE_AEQ_IDX].db_offset, - rdev->is_virtfn, &bnxt_re_aeq_handler); + db_offt = bnxt_re_get_nqdb_offset(rdev, BNXT_RE_AEQ_IDX); + vid = rdev->msix_entries[BNXT_RE_AEQ_IDX].vector; + rc = bnxt_qplib_enable_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw, + vid, db_offt, rdev->is_virtfn, + &bnxt_re_aeq_handler); if (rc) { pr_err("Failed to enable RCFW channel: %#x\n", rc); goto free_ring; @@ -1347,7 +1403,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) if (!rdev->is_virtfn) bnxt_re_set_resource_limits(rdev); - rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0); + rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0, + bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx)); if (rc) { pr_err("Failed to allocate QPLIB context: %#x\n", rc); goto disable_rcfw; @@ -1418,7 +1475,8 @@ free_ctx: disable_rcfw: bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); free_ring: - bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id); + type = bnxt_qplib_get_ring_type(&rdev->chip_ctx); + bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type); free_rcfw: bnxt_qplib_free_rcfw_channel(&rdev->rcfw); fail: diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -44,6 +44,7 @@ #include <linux/slab.h> #include <linux/pci.h> #include <linux/prefetch.h> +#include <linux/if_ether.h> #include "roce_hsi.h" @@ -244,6 +245,7 @@ static void bnxt_qplib_service_nq(unsigned long data) u16 type; int budget = nq->budget; uintptr_t q_handle; + bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx); /* Service the NQ until empty */ raw_cons = hwq->cons; @@ -290,7 +292,7 @@ static void bnxt_qplib_service_nq(unsigned long data) q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high) << 32; bnxt_qplib_arm_srq((struct bnxt_qplib_srq *)q_handle, - DBR_DBR_TYPE_SRQ_ARMENA); + DBC_DBC_TYPE_SRQ_ARMENA); if (!nq->srqn_handler(nq, (struct bnxt_qplib_srq *)q_handle, nqsrqe->event)) @@ -312,7 +314,9 @@ static void bnxt_qplib_service_nq(unsigned long data) } if (hwq->cons != raw_cons) { hwq->cons = raw_cons; - NQ_DB_REARM(nq->bar_reg_iomem, hwq->cons, hwq->max_elements); + bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, hwq->cons, + hwq->max_elements, nq->ring_id, + gen_p5); } } @@ -336,9 +340,11 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance) void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill) { + bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx); tasklet_disable(&nq->worker); /* Mask h/w interrupt */ - NQ_DB(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements); + bnxt_qplib_ring_nq_db(nq->bar_reg_iomem, nq->hwq.cons, + nq->hwq.max_elements, nq->ring_id, gen_p5); /* Sync with last running IRQ handler */ synchronize_irq(nq->vector); if (kill) @@ -373,6 +379,7 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq) int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, int msix_vector, bool need_init) { + bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx); int rc; if (nq->requested) @@ -399,7 +406,8 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, nq->vector, nq_indx); } nq->requested = true; - NQ_DB_REARM(nq->bar_reg_iomem, nq->hwq.cons, nq->hwq.max_elements); + bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, nq->hwq.cons, + nq->hwq.max_elements, nq->ring_id, gen_p5); return rc; } @@ -433,7 +441,8 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, rc = -ENOMEM; goto fail; } - nq->bar_reg_iomem = ioremap_nocache(nq_base + nq->bar_reg_off, 4); + /* Unconditionally map 8 bytes to support 57500 series */ + nq->bar_reg_iomem = ioremap_nocache(nq_base + nq->bar_reg_off, 8); if (!nq->bar_reg_iomem) { rc = -ENOMEM; goto fail; @@ -462,15 +471,17 @@ void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq) int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq) { + u8 hwq_type; + nq->pdev = pdev; if (!nq->hwq.max_elements || nq->hwq.max_elements > BNXT_QPLIB_NQE_MAX_CNT) nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT; - + hwq_type = bnxt_qplib_get_hwq_type(nq->res); if (bnxt_qplib_alloc_init_hwq(nq->pdev, &nq->hwq, NULL, 0, &nq->hwq.max_elements, BNXT_QPLIB_MAX_NQE_ENTRY_SIZE, 0, - PAGE_SIZE, HWQ_TYPE_L2_CMPL)) + PAGE_SIZE, hwq_type)) return -ENOMEM; nq->budget = 8; @@ -481,21 +492,19 @@ int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq) static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type) { struct bnxt_qplib_hwq *srq_hwq = &srq->hwq; - struct dbr_dbr db_msg = { 0 }; void __iomem *db; - u32 sw_prod = 0; + u32 sw_prod; + u64 val = 0; /* Ring DB */ - sw_prod = (arm_type == DBR_DBR_TYPE_SRQ_ARM) ? srq->threshold : - HWQ_CMP(srq_hwq->prod, srq_hwq); - db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) & - DBR_DBR_INDEX_MASK); - db_msg.type_xid = cpu_to_le32(((srq->id << DBR_DBR_XID_SFT) & - DBR_DBR_XID_MASK) | arm_type); - db = (arm_type == DBR_DBR_TYPE_SRQ_ARMENA) ? - srq->dbr_base : srq->dpi->dbr; - wmb(); /* barrier before db ring */ - __iowrite64_copy(db, &db_msg, sizeof(db_msg) / sizeof(u64)); + sw_prod = (arm_type == DBC_DBC_TYPE_SRQ_ARM) ? + srq->threshold : HWQ_CMP(srq_hwq->prod, srq_hwq); + db = (arm_type == DBC_DBC_TYPE_SRQ_ARMENA) ? srq->dbr_base : + srq->dpi->dbr; + val = ((srq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type; + val <<= 32; + val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK; + writeq(val, db); } int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res, @@ -590,7 +599,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, srq->id = le32_to_cpu(resp.xid); srq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem; if (srq->threshold) - bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARMENA); + bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARMENA); srq->arm_req = false; return 0; @@ -614,7 +623,7 @@ int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res, srq_hwq->max_elements - sw_cons + sw_prod; if (count > srq->threshold) { srq->arm_req = false; - bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM); + bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM); } else { /* Deferred arming */ srq->arm_req = true; @@ -702,10 +711,10 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, srq_hwq->max_elements - sw_cons + sw_prod; spin_unlock(&srq_hwq->lock); /* Ring DB */ - bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ); + bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ); if (srq->arm_req == true && count > srq->threshold) { srq->arm_req = false; - bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM); + bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM); } done: return rc; @@ -853,18 +862,19 @@ exit: int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; - struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr; - struct cmdq_create_qp req; - struct creq_create_qp_resp resp; - struct bnxt_qplib_pbl *pbl; - struct sq_psn_search **psn_search_ptr; unsigned long int psn_search, poff = 0; + struct sq_psn_search **psn_search_ptr; struct bnxt_qplib_q *sq = &qp->sq; struct bnxt_qplib_q *rq = &qp->rq; + int i, rc, req_size, psn_sz = 0; + struct sq_send **hw_sq_send_ptr; + struct creq_create_qp_resp resp; struct bnxt_qplib_hwq *xrrq; - int i, rc, req_size, psn_sz; u16 cmd_flags = 0, max_ssge; - u32 sw_prod, qp_flags = 0; + struct cmdq_create_qp req; + struct bnxt_qplib_pbl *pbl; + u32 qp_flags = 0; + u16 max_rsge; RCFW_CMD_PREP(req, CREATE_QP, cmd_flags); @@ -874,8 +884,11 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) req.qp_handle = cpu_to_le64(qp->qp_handle); /* SQ */ - psn_sz = (qp->type == CMDQ_CREATE_QP_TYPE_RC) ? - sizeof(struct sq_psn_search) : 0; + if (qp->type == CMDQ_CREATE_QP_TYPE_RC) { + psn_sz = bnxt_qplib_is_chip_gen_p5(res->cctx) ? + sizeof(struct sq_psn_search_ext) : + sizeof(struct sq_psn_search); + } sq->hwq.max_elements = sq->max_wqe; rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, sq->sglist, sq->nmap, &sq->hwq.max_elements, @@ -905,10 +918,16 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) poff = (psn_search & ~PAGE_MASK) / BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE; } - for (i = 0; i < sq->hwq.max_elements; i++) + for (i = 0; i < sq->hwq.max_elements; i++) { sq->swq[i].psn_search = &psn_search_ptr[get_psne_pg(i + poff)] [get_psne_idx(i + poff)]; + /*psns_ext will be used only for P5 chips. */ + sq->swq[i].psn_ext = + (struct sq_psn_search_ext *) + &psn_search_ptr[get_psne_pg(i + poff)] + [get_psne_idx(i + poff)]; + } } pbl = &sq->hwq.pbl[PBL_LVL_0]; req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); @@ -929,14 +948,6 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) CMDQ_CREATE_QP_SQ_PG_SIZE_PG_1G : CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K); - /* initialize all SQ WQEs to LOCAL_INVALID (sq prep for hw fetch) */ - hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr; - for (sw_prod = 0; sw_prod < sq->hwq.max_elements; sw_prod++) { - hw_sq_send_hdr = &hw_sq_send_ptr[get_sqe_pg(sw_prod)] - [get_sqe_idx(sw_prod)]; - hw_sq_send_hdr->wqe_type = SQ_BASE_WQE_TYPE_LOCAL_INVALID; - } - if (qp->scq) req.scq_cid = cpu_to_le32(qp->scq->id); @@ -1007,8 +1018,9 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) req.sq_fwo_sq_sge = cpu_to_le16( ((max_ssge & CMDQ_CREATE_QP_SQ_SGE_MASK) << CMDQ_CREATE_QP_SQ_SGE_SFT) | 0); + max_rsge = bnxt_qplib_is_chip_gen_p5(res->cctx) ? 6 : rq->max_sge; req.rq_fwo_rq_sge = cpu_to_le16( - ((rq->max_sge & CMDQ_CREATE_QP_RQ_SGE_MASK) + ((max_rsge & CMDQ_CREATE_QP_RQ_SGE_MASK) << CMDQ_CREATE_QP_RQ_SGE_SFT) | 0); /* ORRQ and IRRQ */ if (psn_sz) { @@ -1053,6 +1065,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp->id = le32_to_cpu(resp.xid); qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET; + qp->cctx = res->cctx; INIT_LIST_HEAD(&qp->sq_flush); INIT_LIST_HEAD(&qp->rq_flush); rcfw->qp_tbl[qp->id].qp_id = qp->id; @@ -1494,19 +1507,16 @@ void *bnxt_qplib_get_qp1_rq_buf(struct bnxt_qplib_qp *qp, void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp) { struct bnxt_qplib_q *sq = &qp->sq; - struct dbr_dbr db_msg = { 0 }; u32 sw_prod; + u64 val = 0; + val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | + DBC_DBC_TYPE_SQ); + val <<= 32; sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq); - - db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) & - DBR_DBR_INDEX_MASK); - db_msg.type_xid = - cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | - DBR_DBR_TYPE_SQ); + val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK; /* Flush all the WQE writes to HW */ - wmb(); - __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64)); + writeq(val, qp->dpi->dbr); } int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, @@ -1617,7 +1627,8 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, ((offsetof(typeof(*sqe), data) + 15) >> 4); sqe->inv_key_or_imm_data = cpu_to_le32( wqe->send.inv_key); - if (qp->type == CMDQ_CREATE_QP_TYPE_UD) { + if (qp->type == CMDQ_CREATE_QP_TYPE_UD || + qp->type == CMDQ_CREATE_QP_TYPE_GSI) { sqe->q_key = cpu_to_le32(wqe->send.q_key); sqe->dst_qp = cpu_to_le32( wqe->send.dst_qp & SQ_SEND_DST_QP_MASK); @@ -1741,14 +1752,26 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, } swq->next_psn = sq->psn & BTH_PSN_MASK; if (swq->psn_search) { - swq->psn_search->opcode_start_psn = cpu_to_le32( - ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) & - SQ_PSN_SEARCH_START_PSN_MASK) | - ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) & - SQ_PSN_SEARCH_OPCODE_MASK)); - swq->psn_search->flags_next_psn = cpu_to_le32( - ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) & - SQ_PSN_SEARCH_NEXT_PSN_MASK)); + u32 opcd_spsn; + u32 flg_npsn; + + opcd_spsn = ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) & + SQ_PSN_SEARCH_START_PSN_MASK); + opcd_spsn |= ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) & + SQ_PSN_SEARCH_OPCODE_MASK); + flg_npsn = ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) & + SQ_PSN_SEARCH_NEXT_PSN_MASK); + if (bnxt_qplib_is_chip_gen_p5(qp->cctx)) { + swq->psn_ext->opcode_start_psn = + cpu_to_le32(opcd_spsn); + swq->psn_ext->flags_next_psn = + cpu_to_le32(flg_npsn); + } else { + swq->psn_search->opcode_start_psn = + cpu_to_le32(opcd_spsn); + swq->psn_search->flags_next_psn = + cpu_to_le32(flg_npsn); + } } queue_err: if (sch_handler) { @@ -1785,19 +1808,16 @@ done: void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp) { struct bnxt_qplib_q *rq = &qp->rq; - struct dbr_dbr db_msg = { 0 }; u32 sw_prod; + u64 val = 0; + val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | + DBC_DBC_TYPE_RQ); + val <<= 32; sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); - db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) & - DBR_DBR_INDEX_MASK); - db_msg.type_xid = - cpu_to_le32(((qp->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | - DBR_DBR_TYPE_RQ); - + val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK; /* Flush the writes to HW Rx WQE before the ringing Rx DB */ - wmb(); - __iowrite64_copy(qp->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64)); + writeq(val, qp->dpi->dbr); } int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, @@ -1881,32 +1901,28 @@ done: /* Spinlock must be held */ static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq) { - struct dbr_dbr db_msg = { 0 }; + u64 val = 0; - db_msg.type_xid = - cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | - DBR_DBR_TYPE_CQ_ARMENA); + val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | + DBC_DBC_TYPE_CQ_ARMENA; + val <<= 32; /* Flush memory writes before enabling the CQ */ - wmb(); - __iowrite64_copy(cq->dbr_base, &db_msg, sizeof(db_msg) / sizeof(u64)); + writeq(val, cq->dbr_base); } static void bnxt_qplib_arm_cq(struct bnxt_qplib_cq *cq, u32 arm_type) { struct bnxt_qplib_hwq *cq_hwq = &cq->hwq; - struct dbr_dbr db_msg = { 0 }; u32 sw_cons; + u64 val = 0; /* Ring DB */ + val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type; + val <<= 32; sw_cons = HWQ_CMP(cq_hwq->cons, cq_hwq); - db_msg.index = cpu_to_le32((sw_cons << DBR_DBR_INDEX_SFT) & - DBR_DBR_INDEX_MASK); - db_msg.type_xid = - cpu_to_le32(((cq->id << DBR_DBR_XID_SFT) & DBR_DBR_XID_MASK) | - arm_type); + val |= (sw_cons << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK; /* flush memory writes before arming the CQ */ - wmb(); - __iowrite64_copy(cq->dpi->dbr, &db_msg, sizeof(db_msg) / sizeof(u64)); + writeq(val, cq->dpi->dbr); } int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) @@ -2053,6 +2069,7 @@ static int __flush_rq(struct bnxt_qplib_q *rq, struct bnxt_qplib_qp *qp, opcode = CQ_BASE_CQE_TYPE_RES_RC; break; case CMDQ_CREATE_QP_TYPE_UD: + case CMDQ_CREATE_QP_TYPE_GSI: opcode = CQ_BASE_CQE_TYPE_RES_UD; break; } @@ -2125,7 +2142,7 @@ static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq, sq->send_phantom = true; /* TODO: Only ARM if the previous SQE is ARMALL */ - bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ_ARMALL); + bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ_ARMALL); rc = -EAGAIN; goto out; @@ -2410,12 +2427,14 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, } cqe = *pcqe; cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; - cqe->length = le32_to_cpu(hwcqe->length); + cqe->length = (u32)le16_to_cpu(hwcqe->length); + cqe->cfa_meta = le16_to_cpu(hwcqe->cfa_metadata); cqe->invrkey = le32_to_cpu(hwcqe->imm_data); cqe->flags = le16_to_cpu(hwcqe->flags); cqe->status = hwcqe->status; cqe->qp_handle = (u64)(unsigned long)qp; - memcpy(cqe->smac, hwcqe->src_mac, 6); + /*FIXME: Endianness fix needed for smace */ + memcpy(cqe->smac, hwcqe->src_mac, ETH_ALEN); wr_id_idx = le32_to_cpu(hwcqe->src_qp_high_srq_or_rq_wr_id) & CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK; cqe->src_qp = le16_to_cpu(hwcqe->src_qp_low) | @@ -2794,7 +2813,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, } if (cq->hwq.cons != raw_cons) { cq->hwq.cons = raw_cons; - bnxt_qplib_arm_cq(cq, DBR_DBR_TYPE_CQ); + bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ); } exit: return num_cqes - budget; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -106,6 +106,7 @@ struct bnxt_qplib_swq { u32 start_psn; u32 next_psn; struct sq_psn_search *psn_search; + struct sq_psn_search_ext *psn_ext; }; struct bnxt_qplib_swqe { @@ -254,6 +255,7 @@ struct bnxt_qplib_q { struct bnxt_qplib_qp { struct bnxt_qplib_pd *pd; struct bnxt_qplib_dpi *dpi; + struct bnxt_qplib_chip_ctx *cctx; u64 qp_handle; #define BNXT_QPLIB_QP_ID_INVALID 0xFFFFFFFF u32 id; @@ -347,6 +349,7 @@ struct bnxt_qplib_cqe { u8 type; u8 opcode; u32 length; + u16 cfa_meta; u64 wr_id; union { __be32 immdata; @@ -432,13 +435,47 @@ struct bnxt_qplib_cq { #define NQ_DB_CP_FLAGS (NQ_DB_KEY_CP | \ NQ_DB_IDX_VALID | \ NQ_DB_IRQ_DIS) -#define NQ_DB_REARM(db, raw_cons, cp_bit) \ - writel(NQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db) -#define NQ_DB(db, raw_cons, cp_bit) \ - writel(NQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db) + +static inline void bnxt_qplib_ring_nq_db64(void __iomem *db, u32 index, + u32 xid, bool arm) +{ + u64 val; + + val = xid & DBC_DBC_XID_MASK; + val |= DBC_DBC_PATH_ROCE; + val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ; + val <<= 32; + val |= index & DBC_DBC_INDEX_MASK; + writeq(val, db); +} + +static inline void bnxt_qplib_ring_nq_db_rearm(void __iomem *db, u32 raw_cons, + u32 max_elements, u32 xid, + bool gen_p5) +{ + u32 index = raw_cons & (max_elements - 1); + + if (gen_p5) + bnxt_qplib_ring_nq_db64(db, index, xid, true); + else + writel(NQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK), db); +} + +static inline void bnxt_qplib_ring_nq_db(void __iomem *db, u32 raw_cons, + u32 max_elements, u32 xid, + bool gen_p5) +{ + u32 index = raw_cons & (max_elements - 1); + + if (gen_p5) + bnxt_qplib_ring_nq_db64(db, index, xid, false); + else + writel(NQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK), db); +} struct bnxt_qplib_nq { struct pci_dev *pdev; + struct bnxt_qplib_res *res; int vector; cpumask_t mask; @@ -448,7 +485,7 @@ struct bnxt_qplib_nq { struct bnxt_qplib_hwq hwq; u16 bar_reg; - u16 bar_reg_off; + u32 bar_reg_off; u16 ring_id; void __iomem *bar_reg_iomem; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -359,11 +359,12 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, static void bnxt_qplib_service_creq(unsigned long data) { struct bnxt_qplib_rcfw *rcfw = (struct bnxt_qplib_rcfw *)data; + bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx); struct bnxt_qplib_hwq *creq = &rcfw->creq; + u32 type, budget = CREQ_ENTRY_POLL_BUDGET; struct creq_base *creqe, **creq_ptr; u32 sw_cons, raw_cons; unsigned long flags; - u32 type, budget = CREQ_ENTRY_POLL_BUDGET; /* Service the CREQ until budget is over */ spin_lock_irqsave(&creq->lock, flags); @@ -407,8 +408,9 @@ static void bnxt_qplib_service_creq(unsigned long data) if (creq->cons != raw_cons) { creq->cons = raw_cons; - CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, raw_cons, - creq->max_elements); + bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem, + raw_cons, creq->max_elements, + rcfw->creq_ring_id, gen_p5); } spin_unlock_irqrestore(&creq->lock, flags); } @@ -480,11 +482,13 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, req.log2_dbr_pg_size = cpu_to_le16(PAGE_SHIFT - RCFW_DBR_BASE_PAGE_SHIFT); /* - * VFs need not setup the HW context area, PF + * Gen P5 devices doesn't require this allocation + * as the L2 driver does the same for RoCE also. + * Also, VFs need not setup the HW context area, PF * shall setup this area for VF. Skipping the * HW programming */ - if (is_virtfn) + if (is_virtfn || bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx)) goto skip_ctx_setup; level = ctx->qpc_tbl.level; @@ -560,12 +564,15 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, struct bnxt_qplib_ctx *ctx, int qp_tbl_sz) { + u8 hwq_type; + rcfw->pdev = pdev; rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT; + hwq_type = bnxt_qplib_get_hwq_type(rcfw->res); if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->creq, NULL, 0, &rcfw->creq.max_elements, - BNXT_QPLIB_CREQE_UNITS, 0, PAGE_SIZE, - HWQ_TYPE_L2_CMPL)) { + BNXT_QPLIB_CREQE_UNITS, + 0, PAGE_SIZE, hwq_type)) { dev_err(&rcfw->pdev->dev, "HW channel CREQ allocation failed\n"); goto fail; @@ -607,10 +614,13 @@ fail: void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill) { + bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx); + tasklet_disable(&rcfw->worker); /* Mask h/w interrupts */ - CREQ_DB(rcfw->creq_bar_reg_iomem, rcfw->creq.cons, - rcfw->creq.max_elements); + bnxt_qplib_ring_creq_db(rcfw->creq_bar_reg_iomem, rcfw->creq.cons, + rcfw->creq.max_elements, rcfw->creq_ring_id, + gen_p5); /* Sync with last running IRQ-handler */ synchronize_irq(rcfw->vector); if (kill) @@ -647,6 +657,7 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, bool need_init) { + bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx); int rc; if (rcfw->requested) @@ -663,8 +674,9 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, if (rc) return rc; rcfw->requested = true; - CREQ_DB_REARM(rcfw->creq_bar_reg_iomem, rcfw->creq.cons, - rcfw->creq.max_elements); + bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem, + rcfw->creq.cons, rcfw->creq.max_elements, + rcfw->creq_ring_id, gen_p5); return 0; } @@ -684,8 +696,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, /* General */ rcfw->seq_num = 0; set_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags); - bmap_size = BITS_TO_LONGS(rcfw->cmdq_depth * - sizeof(unsigned long)); + bmap_size = BITS_TO_LONGS(rcfw->cmdq_depth) * sizeof(unsigned long); rcfw->cmdq_bitmap = kzalloc(bmap_size, GFP_KERNEL); if (!rcfw->cmdq_bitmap) return -ENOMEM; @@ -718,8 +729,9 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, dev_err(&rcfw->pdev->dev, "CREQ BAR region %d resc start is 0!\n", rcfw->creq_bar_reg); + /* Unconditionally map 8 bytes to support 57500 series */ rcfw->creq_bar_reg_iomem = ioremap_nocache(res_base + cp_bar_reg_off, - 4); + 8); if (!rcfw->creq_bar_reg_iomem) { dev_err(&rcfw->pdev->dev, "CREQ BAR region %d mapping failed\n", rcfw->creq_bar_reg); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -157,10 +157,46 @@ static inline u32 get_creq_idx(u32 val) #define CREQ_DB_CP_FLAGS (CREQ_DB_KEY_CP | \ CREQ_DB_IDX_VALID | \ CREQ_DB_IRQ_DIS) -#define CREQ_DB_REARM(db, raw_cons, cp_bit) \ - writel(CREQ_DB_CP_FLAGS_REARM | ((raw_cons) & ((cp_bit) - 1)), db) -#define CREQ_DB(db, raw_cons, cp_bit) \ - writel(CREQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db) + +static inline void bnxt_qplib_ring_creq_db64(void __iomem *db, u32 index, + u32 xid, bool arm) +{ + u64 val = 0; + + val = xid & DBC_DBC_XID_MASK; + val |= DBC_DBC_PATH_ROCE; + val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ; + val <<= 32; + val |= index & DBC_DBC_INDEX_MASK; + + writeq(val, db); +} + +static inline void bnxt_qplib_ring_creq_db_rearm(void __iomem *db, u32 raw_cons, + u32 max_elements, u32 xid, + bool gen_p5) +{ + u32 index = raw_cons & (max_elements - 1); + + if (gen_p5) + bnxt_qplib_ring_creq_db64(db, index, xid, true); + else + writel(CREQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK), + db); +} + +static inline void bnxt_qplib_ring_creq_db(void __iomem *db, u32 raw_cons, + u32 max_elements, u32 xid, + bool gen_p5) +{ + u32 index = raw_cons & (max_elements - 1); + + if (gen_p5) + bnxt_qplib_ring_creq_db64(db, index, xid, true); + else + writel(CREQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK), + db); +} #define CREQ_ENTRY_POLL_BUDGET 0x100 @@ -187,6 +223,7 @@ struct bnxt_qplib_qp_node { /* RCFW Communication Channels */ struct bnxt_qplib_rcfw { struct pci_dev *pdev; + struct bnxt_qplib_res *res; int vector; struct tasklet_struct worker; bool requested; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -85,7 +85,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, struct scatterlist *sghead, u32 pages, u32 pg_size) { - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; bool is_umem = false; int i; @@ -116,13 +116,11 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, } else { i = 0; is_umem = true; - for_each_sg(sghead, sg, pages, i) { - pbl->pg_map_arr[i] = sg_dma_address(sg); - pbl->pg_arr[i] = sg_virt(sg); - if (!pbl->pg_arr[i]) - goto fail; - + for_each_sg_dma_page (sghead, &sg_iter, pages, 0) { + pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter); + pbl->pg_arr[i] = NULL; pbl->pg_count++; + i++; } } @@ -330,13 +328,13 @@ void bnxt_qplib_free_ctx(struct pci_dev *pdev, */ int bnxt_qplib_alloc_ctx(struct pci_dev *pdev, struct bnxt_qplib_ctx *ctx, - bool virt_fn) + bool virt_fn, bool is_p5) { int i, j, k, rc = 0; int fnz_idx = -1; __le64 **pbl_ptr; - if (virt_fn) + if (virt_fn || is_p5) goto stats_alloc; /* QPC Tables */ @@ -762,7 +760,11 @@ static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, { memset(stats, 0, sizeof(*stats)); stats->fw_id = -1; - stats->size = sizeof(struct ctx_hw_stats); + /* 128 byte aligned context memory is required only for 57500. + * However making this unconditional, it does not harm previous + * generation. + */ + stats->size = ALIGN(sizeof(struct ctx_hw_stats), 128); stats->dma = dma_alloc_coherent(&pdev->dev, stats->size, &stats->dma_map, GFP_KERNEL); if (!stats->dma) { diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -180,12 +180,20 @@ struct bnxt_qplib_ctx { u64 hwrm_intf_ver; }; +struct bnxt_qplib_chip_ctx { + u16 chip_num; + u8 chip_rev; + u8 chip_metal; +}; + +#define CHIP_NUM_57500 0x1750 + struct bnxt_qplib_res { struct pci_dev *pdev; + struct bnxt_qplib_chip_ctx *cctx; struct net_device *netdev; struct bnxt_qplib_rcfw *rcfw; - struct bnxt_qplib_pd_tbl pd_tbl; struct bnxt_qplib_sgid_tbl sgid_tbl; struct bnxt_qplib_pkey_tbl pkey_tbl; @@ -193,6 +201,24 @@ struct bnxt_qplib_res { bool prio; }; +static inline bool bnxt_qplib_is_chip_gen_p5(struct bnxt_qplib_chip_ctx *cctx) +{ + return (cctx->chip_num == CHIP_NUM_57500); +} + +static inline u8 bnxt_qplib_get_hwq_type(struct bnxt_qplib_res *res) +{ + return bnxt_qplib_is_chip_gen_p5(res->cctx) ? + HWQ_TYPE_QUEUE : HWQ_TYPE_L2_CMPL; +} + +static inline u8 bnxt_qplib_get_ring_type(struct bnxt_qplib_chip_ctx *cctx) +{ + return bnxt_qplib_is_chip_gen_p5(cctx) ? + RING_ALLOC_REQ_RING_TYPE_NQ : + RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL; +} + #define to_bnxt_qplib(ptr, type, member) \ container_of(ptr, type, member) @@ -226,5 +252,5 @@ void bnxt_qplib_free_ctx(struct pci_dev *pdev, struct bnxt_qplib_ctx *ctx); int bnxt_qplib_alloc_ctx(struct pci_dev *pdev, struct bnxt_qplib_ctx *ctx, - bool virt_fn); + bool virt_fn, bool is_p5); #endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -119,7 +119,8 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, * reporting the max number */ attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS; - attr->max_qp_sges = sb->max_sge; + attr->max_qp_sges = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx) ? + 6 : sb->max_sge; attr->max_cq = le32_to_cpu(sb->max_cq); attr->max_cq_wqes = le32_to_cpu(sb->max_cqe); attr->max_cq_sges = attr->max_qp_sges; diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -49,11 +49,11 @@ struct cmpl_doorbell { #define CMPL_DOORBELL_IDX_SFT 0 #define CMPL_DOORBELL_RESERVED_MASK 0x3000000UL #define CMPL_DOORBELL_RESERVED_SFT 24 - #define CMPL_DOORBELL_IDX_VALID 0x4000000UL + #define CMPL_DOORBELL_IDX_VALID 0x4000000UL #define CMPL_DOORBELL_MASK 0x8000000UL #define CMPL_DOORBELL_KEY_MASK 0xf0000000UL #define CMPL_DOORBELL_KEY_SFT 28 - #define CMPL_DOORBELL_KEY_CMPL (0x2UL << 28) + #define CMPL_DOORBELL_KEY_CMPL (0x2UL << 28) }; /* Status Door Bell Format (4 bytes) */ @@ -71,46 +71,56 @@ struct status_doorbell { /* RoCE Host Structures */ /* Doorbell Structures */ -/* 64b Doorbell Format (8 bytes) */ -struct dbr_dbr { - __le32 index; - #define DBR_DBR_INDEX_MASK 0xfffffUL - #define DBR_DBR_INDEX_SFT 0 - #define DBR_DBR_RESERVED12_MASK 0xfff00000UL - #define DBR_DBR_RESERVED12_SFT 20 - __le32 type_xid; - #define DBR_DBR_XID_MASK 0xfffffUL - #define DBR_DBR_XID_SFT 0 - #define DBR_DBR_RESERVED8_MASK 0xff00000UL - #define DBR_DBR_RESERVED8_SFT 20 - #define DBR_DBR_TYPE_MASK 0xf0000000UL - #define DBR_DBR_TYPE_SFT 28 - #define DBR_DBR_TYPE_SQ (0x0UL << 28) - #define DBR_DBR_TYPE_RQ (0x1UL << 28) - #define DBR_DBR_TYPE_SRQ (0x2UL << 28) - #define DBR_DBR_TYPE_SRQ_ARM (0x3UL << 28) - #define DBR_DBR_TYPE_CQ (0x4UL << 28) - #define DBR_DBR_TYPE_CQ_ARMSE (0x5UL << 28) - #define DBR_DBR_TYPE_CQ_ARMALL (0x6UL << 28) - #define DBR_DBR_TYPE_CQ_ARMENA (0x7UL << 28) - #define DBR_DBR_TYPE_SRQ_ARMENA (0x8UL << 28) - #define DBR_DBR_TYPE_CQ_CUTOFF_ACK (0x9UL << 28) - #define DBR_DBR_TYPE_NULL (0xfUL << 28) -}; - -/* 32b Doorbell Format (4 bytes) */ -struct dbr_dbr32 { - __le32 type_abs_incr_xid; - #define DBR_DBR32_XID_MASK 0xfffffUL - #define DBR_DBR32_XID_SFT 0 - #define DBR_DBR32_RESERVED4_MASK 0xf00000UL - #define DBR_DBR32_RESERVED4_SFT 20 - #define DBR_DBR32_INCR_MASK 0xf000000UL - #define DBR_DBR32_INCR_SFT 24 - #define DBR_DBR32_ABS 0x10000000UL - #define DBR_DBR32_TYPE_MASK 0xe0000000UL - #define DBR_DBR32_TYPE_SFT 29 - #define DBR_DBR32_TYPE_SQ (0x0UL << 29) +/* dbc_dbc (size:64b/8B) */ +struct dbc_dbc { + __le32 index; + #define DBC_DBC_INDEX_MASK 0xffffffUL + #define DBC_DBC_INDEX_SFT 0 + __le32 type_path_xid; + #define DBC_DBC_XID_MASK 0xfffffUL + #define DBC_DBC_XID_SFT 0 + #define DBC_DBC_PATH_MASK 0x3000000UL + #define DBC_DBC_PATH_SFT 24 + #define DBC_DBC_PATH_ROCE (0x0UL << 24) + #define DBC_DBC_PATH_L2 (0x1UL << 24) + #define DBC_DBC_PATH_ENGINE (0x2UL << 24) + #define DBC_DBC_PATH_LAST DBC_DBC_PATH_ENGINE + #define DBC_DBC_DEBUG_TRACE 0x8000000UL + #define DBC_DBC_TYPE_MASK 0xf0000000UL + #define DBC_DBC_TYPE_SFT 28 + #define DBC_DBC_TYPE_SQ (0x0UL << 28) + #define DBC_DBC_TYPE_RQ (0x1UL << 28) + #define DBC_DBC_TYPE_SRQ (0x2UL << 28) + #define DBC_DBC_TYPE_SRQ_ARM (0x3UL << 28) + #define DBC_DBC_TYPE_CQ (0x4UL << 28) + #define DBC_DBC_TYPE_CQ_ARMSE (0x5UL << 28) + #define DBC_DBC_TYPE_CQ_ARMALL (0x6UL << 28) + #define DBC_DBC_TYPE_CQ_ARMENA (0x7UL << 28) + #define DBC_DBC_TYPE_SRQ_ARMENA (0x8UL << 28) + #define DBC_DBC_TYPE_CQ_CUTOFF_ACK (0x9UL << 28) + #define DBC_DBC_TYPE_NQ (0xaUL << 28) + #define DBC_DBC_TYPE_NQ_ARM (0xbUL << 28) + #define DBC_DBC_TYPE_NULL (0xfUL << 28) + #define DBC_DBC_TYPE_LAST DBC_DBC_TYPE_NULL +}; + +/* dbc_dbc32 (size:32b/4B) */ +struct dbc_dbc32 { + __le32 type_abs_incr_xid; + #define DBC_DBC32_XID_MASK 0xfffffUL + #define DBC_DBC32_XID_SFT 0 + #define DBC_DBC32_PATH_MASK 0xc00000UL + #define DBC_DBC32_PATH_SFT 22 + #define DBC_DBC32_PATH_ROCE (0x0UL << 22) + #define DBC_DBC32_PATH_L2 (0x1UL << 22) + #define DBC_DBC32_PATH_LAST DBC_DBC32_PATH_L2 + #define DBC_DBC32_INCR_MASK 0xf000000UL + #define DBC_DBC32_INCR_SFT 24 + #define DBC_DBC32_ABS 0x10000000UL + #define DBC_DBC32_TYPE_MASK 0xe0000000UL + #define DBC_DBC32_TYPE_SFT 29 + #define DBC_DBC32_TYPE_SQ (0x0UL << 29) + #define DBC_DBC32_TYPE_LAST DBC_DBC32_TYPE_SQ }; /* SQ WQE Structures */ @@ -149,7 +159,24 @@ struct sq_psn_search { #define SQ_PSN_SEARCH_NEXT_PSN_MASK 0xffffffUL #define SQ_PSN_SEARCH_NEXT_PSN_SFT 0 #define SQ_PSN_SEARCH_FLAGS_MASK 0xff000000UL - #define SQ_PSN_SEARCH_FLAGS_SFT 24 + #define SQ_PSN_SEARCH_FLAGS_SFT 24 +}; + +/* sq_psn_search_ext (size:128b/16B) */ +struct sq_psn_search_ext { + __le32 opcode_start_psn; + #define SQ_PSN_SEARCH_EXT_START_PSN_MASK 0xffffffUL + #define SQ_PSN_SEARCH_EXT_START_PSN_SFT 0 + #define SQ_PSN_SEARCH_EXT_OPCODE_MASK 0xff000000UL + #define SQ_PSN_SEARCH_EXT_OPCODE_SFT 24 + __le32 flags_next_psn; + #define SQ_PSN_SEARCH_EXT_NEXT_PSN_MASK 0xffffffUL + #define SQ_PSN_SEARCH_EXT_NEXT_PSN_SFT 0 + #define SQ_PSN_SEARCH_EXT_FLAGS_MASK 0xff000000UL + #define SQ_PSN_SEARCH_EXT_FLAGS_SFT 24 + __le16 start_slot_idx; + __le16 reserved16; + __le32 reserved32; }; /* Send SQ WQE (40 bytes) */ @@ -505,22 +532,24 @@ struct cq_res_rc { /* Responder UD CQE (32 bytes) */ struct cq_res_ud { - __le32 length; + __le16 length; #define CQ_RES_UD_LENGTH_MASK 0x3fffUL #define CQ_RES_UD_LENGTH_SFT 0 - #define CQ_RES_UD_RESERVED18_MASK 0xffffc000UL - #define CQ_RES_UD_RESERVED18_SFT 14 + __le16 cfa_metadata; + #define CQ_RES_UD_CFA_METADATA_VID_MASK 0xfffUL + #define CQ_RES_UD_CFA_METADATA_VID_SFT 0 + #define CQ_RES_UD_CFA_METADATA_DE 0x1000UL + #define CQ_RES_UD_CFA_METADATA_PRI_MASK 0xe000UL + #define CQ_RES_UD_CFA_METADATA_PRI_SFT 13 __le32 imm_data; __le64 qp_handle; __le16 src_mac[3]; __le16 src_qp_low; u8 cqe_type_toggle; - #define CQ_RES_UD_TOGGLE 0x1UL - #define CQ_RES_UD_CQE_TYPE_MASK 0x1eUL - #define CQ_RES_UD_CQE_TYPE_SFT 1 + #define CQ_RES_UD_TOGGLE 0x1UL + #define CQ_RES_UD_CQE_TYPE_MASK 0x1eUL + #define CQ_RES_UD_CQE_TYPE_SFT 1 #define CQ_RES_UD_CQE_TYPE_RES_UD (0x2UL << 1) - #define CQ_RES_UD_RESERVED3_MASK 0xe0UL - #define CQ_RES_UD_RESERVED3_SFT 5 u8 status; #define CQ_RES_UD_STATUS_OK 0x0UL #define CQ_RES_UD_STATUS_LOCAL_ACCESS_ERROR 0x1UL @@ -536,18 +565,30 @@ struct cq_res_ud { #define CQ_RES_UD_FLAGS_SRQ_SRQ (0x1UL << 0) #define CQ_RES_UD_FLAGS_SRQ_LAST CQ_RES_UD_FLAGS_SRQ_SRQ #define CQ_RES_UD_FLAGS_IMM 0x2UL - #define CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK 0xcUL - #define CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT 2 - #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V1 (0x0UL << 2) - #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV4 (0x2UL << 2) - #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6 (0x3UL << 2) + #define CQ_RES_UD_FLAGS_UNUSED_MASK 0xcUL + #define CQ_RES_UD_FLAGS_UNUSED_SFT 2 + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK 0x30UL + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT 4 + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V1 (0x0UL << 4) + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV4 (0x2UL << 4) + #define CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6 (0x3UL << 4) #define CQ_RES_UD_FLAGS_ROCE_IP_VER_LAST \ CQ_RES_UD_FLAGS_ROCE_IP_VER_V2IPV6 + #define CQ_RES_UD_FLAGS_META_FORMAT_MASK 0x3c0UL + #define CQ_RES_UD_FLAGS_META_FORMAT_SFT 6 + #define CQ_RES_UD_FLAGS_META_FORMAT_NONE (0x0UL << 6) + #define CQ_RES_UD_FLAGS_META_FORMAT_VLAN (0x1UL << 6) + #define CQ_RES_UD_FLAGS_META_FORMAT_TUNNEL_ID (0x2UL << 6) + #define CQ_RES_UD_FLAGS_META_FORMAT_CHDR_DATA (0x3UL << 6) + #define CQ_RES_UD_FLAGS_META_FORMAT_HDR_OFFSET (0x4UL << 6) + #define CQ_RES_UD_FLAGS_META_FORMAT_LAST \ + CQ_RES_UD_FLAGS_META_FORMAT_HDR_OFFSET + #define CQ_RES_UD_FLAGS_EXT_META_FORMAT_MASK 0xc00UL + #define CQ_RES_UD_FLAGS_EXT_META_FORMAT_SFT 10 + __le32 src_qp_high_srq_or_rq_wr_id; #define CQ_RES_UD_SRQ_OR_RQ_WR_ID_MASK 0xfffffUL #define CQ_RES_UD_SRQ_OR_RQ_WR_ID_SFT 0 - #define CQ_RES_UD_RESERVED4_MASK 0xf00000UL - #define CQ_RES_UD_RESERVED4_SFT 20 #define CQ_RES_UD_SRC_QP_HIGH_MASK 0xff000000UL #define CQ_RES_UD_SRC_QP_HIGH_SFT 24 }; @@ -983,6 +1024,7 @@ struct cmdq_create_qp { #define CMDQ_CREATE_QP_TYPE_RC 0x2UL #define CMDQ_CREATE_QP_TYPE_UD 0x4UL #define CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE 0x6UL + #define CMDQ_CREATE_QP_TYPE_GSI 0x7UL u8 sq_pg_size_sq_lvl; #define CMDQ_CREATE_QP_SQ_LVL_MASK 0xfUL #define CMDQ_CREATE_QP_SQ_LVL_SFT 0 @@ -2719,6 +2761,8 @@ struct creq_query_func_resp_sb { __le16 max_srq; __le32 max_gid; __le32 tqm_alloc_reqs[12]; + __le32 max_dpi; + __le32 reserved_32; }; /* Set resources command response (16 bytes) */ diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb3 +ccflags-y := -I $(srctree)/drivers/net/ethernet/chelsio/cxgb3 obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c @@ -146,7 +146,7 @@ static void open_rnic_dev(struct t3cdev *tdev) pr_debug("%s t3cdev %p\n", __func__, tdev); pr_info_once("Chelsio T3 RDMA Driver - version %s\n", DRV_VERSION); - rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); + rnicp = ib_alloc_device(iwch_dev, ibdev); if (!rnicp) { pr_err("Cannot allocate ib device\n"); return; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -53,6 +53,7 @@ #include <rdma/ib_smi.h> #include <rdma/ib_umem.h> #include <rdma/ib_user_verbs.h> +#include <rdma/uverbs_ioctl.h> #include "cxio_hal.h" #include "iwch.h" @@ -61,7 +62,7 @@ #include <rdma/cxgb3-abi.h> #include "common.h" -static int iwch_dealloc_ucontext(struct ib_ucontext *context) +static void iwch_dealloc_ucontext(struct ib_ucontext *context) { struct iwch_dev *rhp = to_iwch_dev(context->device); struct iwch_ucontext *ucontext = to_iwch_ucontext(context); @@ -71,24 +72,20 @@ static int iwch_dealloc_ucontext(struct ib_ucontext *context) list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) kfree(mm); cxio_release_ucontext(&rhp->rdev, &ucontext->uctx); - kfree(ucontext); - return 0; } -static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +static int iwch_alloc_ucontext(struct ib_ucontext *ucontext, + struct ib_udata *udata) { - struct iwch_ucontext *context; + struct ib_device *ibdev = ucontext->device; + struct iwch_ucontext *context = to_iwch_ucontext(ucontext); struct iwch_dev *rhp = to_iwch_dev(ibdev); pr_debug("%s ibdev %p\n", __func__, ibdev); - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); cxio_init_ucontext(&rhp->rdev, &context->uctx); INIT_LIST_HEAD(&context->mmaps); spin_lock_init(&context->mmap_lock); - return &context->ibucontext; + return 0; } static int iwch_destroy_cq(struct ib_cq *ib_cq) @@ -370,7 +367,7 @@ static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) return ret; } -static int iwch_deallocate_pd(struct ib_pd *pd) +static void iwch_deallocate_pd(struct ib_pd *pd) { struct iwch_dev *rhp; struct iwch_pd *php; @@ -379,15 +376,13 @@ static int iwch_deallocate_pd(struct ib_pd *pd) rhp = php->rhp; pr_debug("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid); cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid); - kfree(php); - return 0; } -static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int iwch_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct iwch_pd *php; + struct iwch_pd *php = to_iwch_pd(pd); + struct ib_device *ibdev = pd->device; u32 pdid; struct iwch_dev *rhp; @@ -395,12 +390,8 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, rhp = (struct iwch_dev *) ibdev; pdid = cxio_hal_get_pdid(rhp->rdev.rscp); if (!pdid) - return ERR_PTR(-EINVAL); - php = kzalloc(sizeof(*php), GFP_KERNEL); - if (!php) { - cxio_hal_put_pdid(rhp->rdev.rscp, pdid); - return ERR_PTR(-ENOMEM); - } + return -EINVAL; + php->pdid = pdid; php->rhp = rhp; if (context) { @@ -408,11 +399,11 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { iwch_deallocate_pd(&php->ibpd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } pr_debug("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php); - return &php->ibpd; + return 0; } static int iwch_dereg_mr(struct ib_mr *ib_mr) @@ -522,14 +513,13 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { __be64 *pages; - int shift, n, len; - int i, k, entry; + int shift, n, i; int err = 0; struct iwch_dev *rhp; struct iwch_pd *php; struct iwch_mr *mhp; struct iwch_reg_user_mr_resp uresp; - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; pr_debug("%s ib_pd %p\n", __func__, pd); php = to_iwch_pd(pd); @@ -540,14 +530,14 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mhp->rhp = rhp; - mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + mhp->umem = ib_umem_get(udata, start, length, acc, 0); if (IS_ERR(mhp->umem)) { err = PTR_ERR(mhp->umem); kfree(mhp); return ERR_PTR(err); } - shift = mhp->umem->page_shift; + shift = PAGE_SHIFT; n = mhp->umem->nmap; @@ -563,19 +553,15 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, i = n = 0; - for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { - len = sg_dma_len(sg) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = cpu_to_be64(sg_dma_address(sg) + - (k << shift)); - if (i == PAGE_SIZE / sizeof *pages) { - err = iwch_write_pbl(mhp, pages, i, n); - if (err) - goto pbl_done; - n += i; - i = 0; - } - } + for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) { + pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter)); + if (i == PAGE_SIZE / sizeof *pages) { + err = iwch_write_pbl(mhp, pages, i, n); + if (err) + goto pbl_done; + n += i; + i = 0; + } } if (i) @@ -836,7 +822,8 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd, * Kernel users need more wq space for fastreg WRs which can take * 2 WR fragments. */ - ucontext = udata ? to_iwch_ucontext(pd->uobject->context) : NULL; + ucontext = rdma_udata_to_drv_context(udata, struct iwch_ucontext, + ibucontext); if (!ucontext && wqsize < (rqsize + (2 * sqsize))) wqsize = roundup_pow_of_two(rqsize + roundup_pow_of_two(attrs->cap.max_send_wr * 2)); @@ -1130,8 +1117,9 @@ static int iwch_query_port(struct ib_device *ibdev, static ssize_t hw_rev_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, - ibdev.dev); + struct iwch_dev *iwch_dev = + rdma_device_to_drv_device(dev, struct iwch_dev, ibdev); + pr_debug("%s dev 0x%p\n", __func__, dev); return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type); } @@ -1140,8 +1128,8 @@ static DEVICE_ATTR_RO(hw_rev); static ssize_t hca_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, - ibdev.dev); + struct iwch_dev *iwch_dev = + rdma_device_to_drv_device(dev, struct iwch_dev, ibdev); struct ethtool_drvinfo info; struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; @@ -1154,8 +1142,9 @@ static DEVICE_ATTR_RO(hca_type); static ssize_t board_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, - ibdev.dev); + struct iwch_dev *iwch_dev = + rdma_device_to_drv_device(dev, struct iwch_dev, ibdev); + pr_debug("%s dev 0x%p\n", __func__, dev); return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor, iwch_dev->rdev.rnic_info.pdev->device); @@ -1348,6 +1337,8 @@ static const struct ib_device_ops iwch_dev_ops = { .reg_user_mr = iwch_reg_user_mr, .req_notify_cq = iwch_arm_cq, .resize_cq = iwch_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext), }; int iwch_register_device(struct iwch_dev *dev) @@ -1391,7 +1382,7 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.dev.parent = &dev->rdev.rnic_info.pdev->dev; dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; - dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); + dev->ibdev.iwcm = kzalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); if (!dev->ibdev.iwcm) return -ENOMEM; @@ -1409,7 +1400,7 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.driver_id = RDMA_DRIVER_CXGB3; rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group); ib_set_device_ops(&dev->ibdev, &iwch_dev_ops); - ret = ib_register_device(&dev->ibdev, "cxgb3_%d", NULL); + ret = ib_register_device(&dev->ibdev, "cxgb3_%d"); if (ret) kfree(dev->ibdev.iwcm); return ret; diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile @@ -1,5 +1,5 @@ -ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4 -ccflags-y += -Idrivers/net/ethernet/chelsio/libcxgb +ccflags-y := -I $(srctree)/drivers/net/ethernet/chelsio/cxgb4 +ccflags-y += -I $(srctree)/drivers/net/ethernet/chelsio/libcxgb obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c @@ -655,7 +655,33 @@ static int send_halfclose(struct c4iw_ep *ep) return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } -static int send_abort(struct c4iw_ep *ep) +static void read_tcb(struct c4iw_ep *ep) +{ + struct sk_buff *skb; + struct cpl_get_tcb *req; + int wrlen = roundup(sizeof(*req), 16); + + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (WARN_ON(!skb)) + return; + + set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx); + req = (struct cpl_get_tcb *) skb_put(skb, wrlen); + memset(req, 0, wrlen); + INIT_TP_WR(req, ep->hwtid); + OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_GET_TCB, ep->hwtid)); + req->reply_ctrl = htons(REPLY_CHAN_V(0) | QUEUENO_V(ep->rss_qid)); + + /* + * keep a ref on the ep so the tcb is not unlocked before this + * cpl completes. The ref is released in read_tcb_rpl(). + */ + c4iw_get_ep(&ep->com); + if (WARN_ON(c4iw_ofld_send(&ep->com.dev->rdev, skb))) + c4iw_put_ep(&ep->com); +} + +static int send_abort_req(struct c4iw_ep *ep) { u32 wrlen = roundup(sizeof(struct cpl_abort_req), 16); struct sk_buff *req_skb = skb_dequeue(&ep->com.ep_skb_list); @@ -670,6 +696,17 @@ static int send_abort(struct c4iw_ep *ep) return c4iw_l2t_send(&ep->com.dev->rdev, req_skb, ep->l2t); } +static int send_abort(struct c4iw_ep *ep) +{ + if (!ep->com.qp || !ep->com.qp->srq) { + send_abort_req(ep); + return 0; + } + set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags); + read_tcb(ep); + return 0; +} + static int send_connect(struct c4iw_ep *ep) { struct cpl_act_open_req *req = NULL; @@ -1851,14 +1888,11 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } -static void complete_cached_srq_buffers(struct c4iw_ep *ep, - __be32 srqidx_status) +static void complete_cached_srq_buffers(struct c4iw_ep *ep, u32 srqidx) { enum chip_type adapter_type; - u32 srqidx; adapter_type = ep->com.dev->rdev.lldi.adapter_type; - srqidx = ABORT_RSS_SRQIDX_G(be32_to_cpu(srqidx_status)); /* * If this TCB had a srq buffer cached, then we must complete @@ -1876,6 +1910,7 @@ static void complete_cached_srq_buffers(struct c4iw_ep *ep, static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) { + u32 srqidx; struct c4iw_ep *ep; struct cpl_abort_rpl_rss6 *rpl = cplhdr(skb); int release = 0; @@ -1887,7 +1922,10 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } - complete_cached_srq_buffers(ep, rpl->srqidx_status); + if (ep->com.qp && ep->com.qp->srq) { + srqidx = ABORT_RSS_SRQIDX_G(be32_to_cpu(rpl->srqidx_status)); + complete_cached_srq_buffers(ep, srqidx ? srqidx : ep->srqe_idx); + } pr_debug("ep %p tid %u\n", ep, ep->hwtid); mutex_lock(&ep->com.mutex); @@ -1903,8 +1941,10 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) } mutex_unlock(&ep->com.mutex); - if (release) + if (release) { + close_complete_upcall(ep, -ECONNRESET); release_ep_resources(ep); + } c4iw_put_ep(&ep->com); return 0; } @@ -2072,7 +2112,7 @@ static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip, } else { pdev = get_real_dev(n->dev); ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t, - n, pdev, 0); + n, pdev, rt_tos2priority(tos)); if (!ep->l2t) goto out; ep->mtu = dst_mtu(dst); @@ -2161,7 +2201,8 @@ static int c4iw_reconnect(struct c4iw_ep *ep) laddr6->sin6_addr.s6_addr, raddr6->sin6_addr.s6_addr, laddr6->sin6_port, - raddr6->sin6_port, 0, + raddr6->sin6_port, + ep->com.cm_id->tos, raddr6->sin6_scope_id); iptype = 6; ra = (__u8 *)&raddr6->sin6_addr; @@ -2476,7 +2517,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) u16 peer_mss = ntohs(req->tcpopt.mss); int iptype; unsigned short hdrs; - u8 tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid)); + u8 tos; parent_ep = (struct c4iw_ep *)get_ep_from_stid(dev, stid); if (!parent_ep) { @@ -2490,6 +2531,11 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) goto reject; } + if (parent_ep->com.cm_id->tos_set) + tos = parent_ep->com.cm_id->tos; + else + tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid)); + cxgb_get_4tuple(req, parent_ep->com.dev->rdev.lldi.adapter_type, &iptype, local_ip, peer_ip, &local_port, &peer_port); @@ -2509,7 +2555,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) ntohs(peer_port), peer_mss); dst = cxgb_find_route6(&dev->rdev.lldi, get_real_dev, local_ip, peer_ip, local_port, peer_port, - PASS_OPEN_TOS_G(ntohl(req->tos_stid)), + tos, ((struct sockaddr_in6 *) &parent_ep->com.local_addr)->sin6_scope_id); } @@ -2740,6 +2786,21 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } +static void finish_peer_abort(struct c4iw_dev *dev, struct c4iw_ep *ep) +{ + complete_cached_srq_buffers(ep, ep->srqe_idx); + if (ep->com.cm_id && ep->com.qp) { + struct c4iw_qp_attributes attrs; + + attrs.next_state = C4IW_QP_STATE_ERROR; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + } + peer_abort_upcall(ep); + release_ep_resources(ep); + c4iw_put_ep(&ep->com); +} + static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_abort_req_rss6 *req = cplhdr(skb); @@ -2750,6 +2811,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) int release = 0; unsigned int tid = GET_TID(req); u8 status; + u32 srqidx; u32 len = roundup(sizeof(struct cpl_abort_rpl), 16); @@ -2769,8 +2831,6 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) goto deref_ep; } - complete_cached_srq_buffers(ep, req->srqidx_status); - pr_debug("ep %p tid %u state %u\n", ep, ep->hwtid, ep->com.state); set_bit(PEER_ABORT, &ep->com.history); @@ -2819,6 +2879,23 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) stop_ep_timer(ep); /*FALLTHROUGH*/ case FPDU_MODE: + if (ep->com.qp && ep->com.qp->srq) { + srqidx = ABORT_RSS_SRQIDX_G( + be32_to_cpu(req->srqidx_status)); + if (srqidx) { + complete_cached_srq_buffers(ep, + req->srqidx_status); + } else { + /* Hold ep ref until finish_peer_abort() */ + c4iw_get_ep(&ep->com); + __state_set(&ep->com, ABORTING); + set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags); + read_tcb(ep); + break; + + } + } + if (ep->com.cm_id && ep->com.qp) { attrs.next_state = C4IW_QP_STATE_ERROR; ret = c4iw_modify_qp(ep->com.qp->rhp, @@ -2942,15 +3019,18 @@ static int terminate(struct c4iw_dev *dev, struct sk_buff *skb) ep = get_ep_from_tid(dev, tid); - if (ep && ep->com.qp) { - pr_warn("TERM received tid %u qpid %u\n", - tid, ep->com.qp->wq.sq.qid); - attrs.next_state = C4IW_QP_STATE_TERMINATE; - c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, - C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + if (ep) { + if (ep->com.qp) { + pr_warn("TERM received tid %u qpid %u\n", tid, + ep->com.qp->wq.sq.qid); + attrs.next_state = C4IW_QP_STATE_TERMINATE; + c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); + } + + c4iw_put_ep(&ep->com); } else pr_warn("TERM received tid %u no ep/qp\n", tid); - c4iw_put_ep(&ep->com); return 0; } @@ -3318,7 +3398,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) laddr6->sin6_addr.s6_addr, raddr6->sin6_addr.s6_addr, laddr6->sin6_port, - raddr6->sin6_port, 0, + raddr6->sin6_port, cm_id->tos, raddr6->sin6_scope_id); } if (!ep->dst) { @@ -3606,7 +3686,6 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) if (close) { if (abrupt) { set_bit(EP_DISC_ABORT, &ep->com.history); - close_complete_upcall(ep, -ECONNRESET); ret = send_abort(ep); } else { set_bit(EP_DISC_CLOSE, &ep->com.history); @@ -3717,6 +3796,80 @@ static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb, return; } +static inline u64 t4_tcb_get_field64(__be64 *tcb, u16 word) +{ + u64 tlo = be64_to_cpu(tcb[((31 - word) / 2)]); + u64 thi = be64_to_cpu(tcb[((31 - word) / 2) - 1]); + u64 t; + u32 shift = 32; + + t = (thi << shift) | (tlo >> shift); + + return t; +} + +static inline u32 t4_tcb_get_field32(__be64 *tcb, u16 word, u32 mask, u32 shift) +{ + u32 v; + u64 t = be64_to_cpu(tcb[(31 - word) / 2]); + + if (word & 0x1) + shift += 32; + v = (t >> shift) & mask; + return v; +} + +static int read_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct cpl_get_tcb_rpl *rpl = cplhdr(skb); + __be64 *tcb = (__be64 *)(rpl + 1); + unsigned int tid = GET_TID(rpl); + struct c4iw_ep *ep; + u64 t_flags_64; + u32 rx_pdu_out; + + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + /* Examine the TF_RX_PDU_OUT (bit 49 of the t_flags) in order to + * determine if there's a rx PDU feedback event pending. + * + * If that bit is set, it means we'll need to re-read the TCB's + * rq_start value. The final value is the one present in a TCB + * with the TF_RX_PDU_OUT bit cleared. + */ + + t_flags_64 = t4_tcb_get_field64(tcb, TCB_T_FLAGS_W); + rx_pdu_out = (t_flags_64 & TF_RX_PDU_OUT_V(1)) >> TF_RX_PDU_OUT_S; + + c4iw_put_ep(&ep->com); /* from get_ep_from_tid() */ + c4iw_put_ep(&ep->com); /* from read_tcb() */ + + /* If TF_RX_PDU_OUT bit is set, re-read the TCB */ + if (rx_pdu_out) { + if (++ep->rx_pdu_out_cnt >= 2) { + WARN_ONCE(1, "tcb re-read() reached the guard limit, finishing the cleanup\n"); + goto cleanup; + } + read_tcb(ep); + return 0; + } + + ep->srqe_idx = t4_tcb_get_field32(tcb, TCB_RQ_START_W, TCB_RQ_START_W, + TCB_RQ_START_S); +cleanup: + pr_debug("ep %p tid %u %016x\n", ep, ep->hwtid, ep->srqe_idx); + + if (test_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags)) + finish_peer_abort(dev, ep); + else if (test_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags)) + send_abort_req(ep); + else + WARN_ONCE(1, "unexpected state!"); + + return 0; +} + static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_fw6_msg *rpl = cplhdr(skb); @@ -4037,6 +4190,7 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS + NUM_FAKE_CPLS] = { [CPL_CLOSE_CON_RPL] = close_con_rpl, [CPL_RDMA_TERMINATE] = terminate, [CPL_FW4_ACK] = fw4_ack, + [CPL_GET_TCB_RPL] = read_tcb_rpl, [CPL_FW6_MSG] = deferred_fw6_msg, [CPL_RX_PKT] = rx_pkt, [FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe, @@ -4268,6 +4422,7 @@ c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = { [CPL_RDMA_TERMINATE] = sched, [CPL_FW4_ACK] = sched, [CPL_SET_TCB_RPL] = set_tcb_rpl, + [CPL_GET_TCB_RPL] = sched, [CPL_FW6_MSG] = fw6_msg, [CPL_RX_PKT] = sched }; diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c @@ -720,11 +720,8 @@ static const struct file_operations ep_debugfs_fops = { .read = debugfs_read, }; -static int setup_debugfs(struct c4iw_dev *devp) +static void setup_debugfs(struct c4iw_dev *devp) { - if (!devp->debugfs_root) - return -1; - debugfs_create_file_size("qps", S_IWUSR, devp->debugfs_root, (void *)devp, &qp_debugfs_fops, 4096); @@ -740,7 +737,6 @@ static int setup_debugfs(struct c4iw_dev *devp) if (c4iw_wr_log) debugfs_create_file_size("wr_log", S_IWUSR, devp->debugfs_root, (void *)devp, &wr_log_debugfs_fops, 4096); - return 0; } void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev, @@ -981,7 +977,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) pr_info("%s: On-Chip Queues not supported on this device\n", pci_name(infop->pdev)); - devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp)); + devp = ib_alloc_device(c4iw_dev, ibdev); if (!devp) { pr_err("Cannot allocate ib device\n"); return ERR_PTR(-ENOMEM); @@ -1564,8 +1560,6 @@ static int __init c4iw_init_module(void) return err; c4iw_debugfs_root = debugfs_create_dir(DRV_NAME, NULL); - if (!c4iw_debugfs_root) - pr_warn("could not create debugfs entry, continuing\n"); reg_workq = create_singlethread_workqueue("Register_iWARP_device"); if (!reg_workq) { diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -589,7 +589,6 @@ struct c4iw_ucontext { u32 key; spinlock_t mmap_lock; struct list_head mmaps; - struct kref kref; bool is_32b_cqe; }; @@ -598,18 +597,6 @@ static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c) return container_of(c, struct c4iw_ucontext, ibucontext); } -void _c4iw_free_ucontext(struct kref *kref); - -static inline void c4iw_put_ucontext(struct c4iw_ucontext *ucontext) -{ - kref_put(&ucontext->kref, _c4iw_free_ucontext); -} - -static inline void c4iw_get_ucontext(struct c4iw_ucontext *ucontext) -{ - kref_get(&ucontext->kref); -} - struct c4iw_mm_entry { struct list_head entry; u64 addr; @@ -982,6 +969,9 @@ struct c4iw_ep { int rcv_win; u32 snd_wscale; struct c4iw_ep_stats stats; + u32 srqe_idx; + u32 rx_pdu_out_cnt; + struct sk_buff *peer_abort_skb; }; static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id) diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c @@ -502,10 +502,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { __be64 *pages; - int shift, n, len; - int i, k, entry; + int shift, n, i; int err = -ENOMEM; - struct scatterlist *sg; + struct sg_dma_page_iter sg_iter; struct c4iw_dev *rhp; struct c4iw_pd *php; struct c4iw_mr *mhp; @@ -537,11 +536,11 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mhp->rhp = rhp; - mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + mhp->umem = ib_umem_get(udata, start, length, acc, 0); if (IS_ERR(mhp->umem)) goto err_free_skb; - shift = mhp->umem->page_shift; + shift = PAGE_SHIFT; n = mhp->umem->nmap; err = alloc_pbl(mhp, n); @@ -556,21 +555,16 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, i = n = 0; - for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) { - len = sg_dma_len(sg) >> shift; - for (k = 0; k < len; ++k) { - pages[i++] = cpu_to_be64(sg_dma_address(sg) + - (k << shift)); - if (i == PAGE_SIZE / sizeof *pages) { - err = write_pbl(&mhp->rhp->rdev, - pages, - mhp->attr.pbl_addr + (n << 3), i, - mhp->wr_waitp); - if (err) - goto pbl_done; - n += i; - i = 0; - } + for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) { + pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter)); + if (i == PAGE_SIZE / sizeof(*pages)) { + err = write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (n << 3), i, + mhp->wr_waitp); + if (err) + goto pbl_done; + n += i; + i = 0; } } @@ -684,8 +678,8 @@ int c4iw_dealloc_mw(struct ib_mw *mw) mhp->wr_waitp); kfree_skb(mhp->dereg_skb); c4iw_put_wr_wait(mhp->wr_waitp); - kfree(mhp); pr_debug("ib_mw %p mmid 0x%x ptr %p\n", mw, mmid, mhp); + kfree(mhp); return 0; } diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c @@ -58,51 +58,34 @@ static int fastreg_support = 1; module_param(fastreg_support, int, 0644); MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)"); -void _c4iw_free_ucontext(struct kref *kref) +static void c4iw_dealloc_ucontext(struct ib_ucontext *context) { - struct c4iw_ucontext *ucontext; + struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context); struct c4iw_dev *rhp; struct c4iw_mm_entry *mm, *tmp; - ucontext = container_of(kref, struct c4iw_ucontext, kref); + pr_debug("context %p\n", context); rhp = to_c4iw_dev(ucontext->ibucontext.device); - pr_debug("ucontext %p\n", ucontext); list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) kfree(mm); c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx); - kfree(ucontext); } -static int c4iw_dealloc_ucontext(struct ib_ucontext *context) +static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext, + struct ib_udata *udata) { - struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context); - - pr_debug("context %p\n", context); - c4iw_put_ucontext(ucontext); - return 0; -} - -static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) -{ - struct c4iw_ucontext *context; + struct ib_device *ibdev = ucontext->device; + struct c4iw_ucontext *context = to_c4iw_ucontext(ucontext); struct c4iw_dev *rhp = to_c4iw_dev(ibdev); struct c4iw_alloc_ucontext_resp uresp; int ret = 0; struct c4iw_mm_entry *mm = NULL; pr_debug("ibdev %p\n", ibdev); - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) { - ret = -ENOMEM; - goto err; - } - c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx); INIT_LIST_HEAD(&context->mmaps); spin_lock_init(&context->mmap_lock); - kref_init(&context->kref); if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) { pr_err_once("Warning - downlevel libcxgb4 (non-fatal), device status page disabled\n"); @@ -111,7 +94,7 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, mm = kmalloc(sizeof(*mm), GFP_KERNEL); if (!mm) { ret = -ENOMEM; - goto err_free; + goto err; } uresp.status_page_size = PAGE_SIZE; @@ -131,13 +114,11 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, mm->len = PAGE_SIZE; insert_mmap(context, mm); } - return &context->ibucontext; + return 0; err_mm: kfree(mm); -err_free: - kfree(context); err: - return ERR_PTR(ret); + return ret; } static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) @@ -209,7 +190,7 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) return ret; } -static int c4iw_deallocate_pd(struct ib_pd *pd) +static void c4iw_deallocate_pd(struct ib_pd *pd) { struct c4iw_dev *rhp; struct c4iw_pd *php; @@ -221,15 +202,13 @@ static int c4iw_deallocate_pd(struct ib_pd *pd) mutex_lock(&rhp->rdev.stats.lock); rhp->rdev.stats.pd.cur--; mutex_unlock(&rhp->rdev.stats.lock); - kfree(php); - return 0; } -static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct c4iw_pd *php; + struct c4iw_pd *php = to_c4iw_pd(pd); + struct ib_device *ibdev = pd->device; u32 pdid; struct c4iw_dev *rhp; @@ -237,12 +216,8 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, rhp = (struct c4iw_dev *) ibdev; pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table); if (!pdid) - return ERR_PTR(-EINVAL); - php = kzalloc(sizeof(*php), GFP_KERNEL); - if (!php) { - c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid); - return ERR_PTR(-ENOMEM); - } + return -EINVAL; + php->pdid = pdid; php->rhp = rhp; if (context) { @@ -250,7 +225,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { c4iw_deallocate_pd(&php->ibpd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } mutex_lock(&rhp->rdev.stats.lock); @@ -259,7 +234,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur; mutex_unlock(&rhp->rdev.stats.lock); pr_debug("pdid 0x%0x ptr 0x%p\n", pdid, php); - return &php->ibpd; + return 0; } static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index, @@ -376,8 +351,9 @@ static int c4iw_query_port(struct ib_device *ibdev, u8 port, static ssize_t hw_rev_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, - ibdev.dev); + struct c4iw_dev *c4iw_dev = + rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev); + pr_debug("dev 0x%p\n", dev); return sprintf(buf, "%d\n", CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type)); @@ -387,8 +363,8 @@ static DEVICE_ATTR_RO(hw_rev); static ssize_t hca_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, - ibdev.dev); + struct c4iw_dev *c4iw_dev = + rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev); struct ethtool_drvinfo info; struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0]; @@ -401,8 +377,9 @@ static DEVICE_ATTR_RO(hca_type); static ssize_t board_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, - ibdev.dev); + struct c4iw_dev *c4iw_dev = + rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev); + pr_debug("dev 0x%p\n", dev); return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor, c4iw_dev->rdev.lldi.pdev->device); @@ -547,6 +524,7 @@ static const struct ib_device_ops c4iw_dev_ops = { .destroy_cq = c4iw_destroy_cq, .destroy_qp = c4iw_destroy_qp, .destroy_srq = c4iw_destroy_srq, + .fill_res_entry = fill_res_entry, .get_dev_fw_str = get_dev_fw_str, .get_dma_mr = c4iw_get_dma_mr, .get_hw_stats = c4iw_get_mib, @@ -567,6 +545,8 @@ static const struct ib_device_ops c4iw_dev_ops = { .query_qp = c4iw_ib_query_qp, .reg_user_mr = c4iw_reg_user_mr, .req_notify_cq = c4iw_arm_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext), }; void c4iw_register_device(struct work_struct *work) @@ -613,7 +593,7 @@ void c4iw_register_device(struct work_struct *work) dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev; dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; - dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); + dev->ibdev.iwcm = kzalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); if (!dev->ibdev.iwcm) { ret = -ENOMEM; goto err_dealloc_ctx; @@ -627,14 +607,13 @@ void c4iw_register_device(struct work_struct *work) dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref; dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref; dev->ibdev.iwcm->get_qp = c4iw_get_qp; - dev->ibdev.res.fill_res_entry = fill_res_entry; memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name, sizeof(dev->ibdev.iwcm->ifname)); rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group); dev->ibdev.driver_id = RDMA_DRIVER_CXGB4; ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops); - ret = ib_register_device(&dev->ibdev, "cxgb4_%d", NULL); + ret = ib_register_device(&dev->ibdev, "cxgb4_%d"); if (ret) goto err_kfree_iwcm; return; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c @@ -31,6 +31,7 @@ */ #include <linux/module.h> +#include <rdma/uverbs_ioctl.h> #include "iw_cxgb4.h" @@ -632,7 +633,10 @@ static void build_rdma_write_cmpl(struct t4_sq *sq, wcwr->stag_sink = cpu_to_be32(rdma_wr(wr)->rkey); wcwr->to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr); - wcwr->stag_inv = cpu_to_be32(wr->next->ex.invalidate_rkey); + if (wr->next->opcode == IB_WR_SEND) + wcwr->stag_inv = 0; + else + wcwr->stag_inv = cpu_to_be32(wr->next->ex.invalidate_rkey); wcwr->r2 = 0; wcwr->r3 = 0; @@ -726,7 +730,10 @@ static void post_write_cmpl(struct c4iw_qp *qhp, const struct ib_send_wr *wr) /* SEND_WITH_INV swsqe */ swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; - swsqe->opcode = FW_RI_SEND_WITH_INV; + if (wr->next->opcode == IB_WR_SEND) + swsqe->opcode = FW_RI_SEND; + else + swsqe->opcode = FW_RI_SEND_WITH_INV; swsqe->idx = qhp->wq.sq.pidx; swsqe->complete = 0; swsqe->signaled = send_signaled; @@ -897,8 +904,6 @@ static void free_qp_work(struct work_struct *work) destroy_qp(&rhp->rdev, &qhp->wq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !qhp->srq); - if (ucontext) - c4iw_put_ucontext(ucontext); c4iw_put_wr_wait(qhp->wr_waitp); kfree(qhp); } @@ -1133,9 +1138,9 @@ int c4iw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, /* * Fastpath for NVMe-oF target WRITE + SEND_WITH_INV wr chain which is * the response for small NVMEe-oF READ requests. If the chain is - * exactly a WRITE->SEND_WITH_INV and the sgl depths and lengths - * meet the requirements of the fw_ri_write_cmpl_wr work request, - * then build and post the write_cmpl WR. If any of the tests + * exactly a WRITE->SEND_WITH_INV or a WRITE->SEND and the sgl depths + * and lengths meet the requirements of the fw_ri_write_cmpl_wr work + * request, then build and post the write_cmpl WR. If any of the tests * below are not true, then we continue on with the tradtional WRITE * and SEND WRs. */ @@ -1145,7 +1150,8 @@ int c4iw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, wr && wr->next && !wr->next->next && wr->opcode == IB_WR_RDMA_WRITE && wr->sg_list[0].length && wr->num_sge <= T4_WRITE_CMPL_MAX_SGL && - wr->next->opcode == IB_WR_SEND_WITH_INV && + (wr->next->opcode == IB_WR_SEND || + wr->next->opcode == IB_WR_SEND_WITH_INV) && wr->next->sg_list[0].length == T4_WRITE_CMPL_MAX_CQE && wr->next->num_sge == 1 && num_wrs >= 2) { post_write_cmpl(qhp, wr); @@ -2129,7 +2135,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, struct c4iw_cq *rchp; struct c4iw_create_qp_resp uresp; unsigned int sqsize, rqsize = 0; - struct c4iw_ucontext *ucontext; + struct c4iw_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct c4iw_ucontext, ibucontext); int ret; struct c4iw_mm_entry *sq_key_mm, *rq_key_mm = NULL, *sq_db_key_mm; struct c4iw_mm_entry *rq_db_key_mm = NULL, *ma_sync_key_mm = NULL; @@ -2163,8 +2170,6 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, if (sqsize < 8) sqsize = 8; - ucontext = udata ? to_c4iw_ucontext(pd->uobject->context) : NULL; - qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); if (!qhp) return ERR_PTR(-ENOMEM); @@ -2331,7 +2336,6 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, insert_mmap(ucontext, ma_sync_key_mm); } - c4iw_get_ucontext(ucontext); qhp->ucontext = ucontext; } if (!attrs->srq) { @@ -2589,7 +2593,7 @@ static int alloc_srq_queue(struct c4iw_srq *srq, struct c4iw_dev_ucontext *uctx, /* build fw_ri_res_wr */ wr_len = sizeof(*res_wr) + sizeof(*res); - skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL); + skb = alloc_skb(wr_len, GFP_KERNEL); if (!skb) goto err_free_queue; set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); @@ -2711,7 +2715,8 @@ struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs, rqsize = attrs->attr.max_wr + 1; rqsize = roundup_pow_of_two(max_t(u16, rqsize, 16)); - ucontext = udata ? to_c4iw_ucontext(pd->uobject->context) : NULL; + ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext, + ibucontext); srq = kzalloc(sizeof(*srq), GFP_KERNEL); if (!srq) diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h @@ -35,6 +35,7 @@ #include "t4_regs.h" #include "t4_values.h" #include "t4_msg.h" +#include "t4_tcb.h" #include "t4fw_ri_api.h" #define T4_MAX_NUM_PD 65536 diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile @@ -24,6 +24,7 @@ hfi1-y := \ mad.o \ mmu_rb.o \ msix.o \ + opfn.o \ pcie.o \ pio.o \ pio_copy.o \ diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c @@ -4253,6 +4253,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = { access_sw_pio_drain), [C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL, access_sw_kmem_wait), +[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL, + hfi1_access_sw_tid_wait), [C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL, access_sw_send_schedule), [C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn", @@ -5222,6 +5224,17 @@ int is_bx(struct hfi1_devdata *dd) return (chip_rev_minor & 0xF0) == 0x10; } +/* return true is kernel urg disabled for rcd */ +bool is_urg_masked(struct hfi1_ctxtdata *rcd) +{ + u64 mask; + u32 is = IS_RCVURGENT_START + rcd->ctxt; + u8 bit = is % 64; + + mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64))); + return !(mask & BIT_ULL(bit)); +} + /* * Append string s to buffer buf. Arguments curp and len are the current * position and remaining length, respectively. diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h @@ -1,7 +1,7 @@ #ifndef _CHIP_H #define _CHIP_H /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -804,6 +804,7 @@ void clear_linkup_counters(struct hfi1_devdata *dd); u32 hdrqempty(struct hfi1_ctxtdata *rcd); int is_ax(struct hfi1_devdata *dd); int is_bx(struct hfi1_devdata *dd); +bool is_urg_masked(struct hfi1_ctxtdata *rcd); u32 read_physical_state(struct hfi1_devdata *dd); u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate); const char *opa_lstate_name(u32 lstate); @@ -926,6 +927,7 @@ enum { C_SW_PIO_WAIT, C_SW_PIO_DRAIN, C_SW_KMEM_WAIT, + C_SW_TID_WAIT, C_SW_SEND_SCHED, C_SDMA_DESC_FETCHED_CNT, C_SDMA_INT_CNT, diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h @@ -340,6 +340,10 @@ struct diag_pkt { #define HFI1_PSM_IOC_BASE_SEQ 0x0 +/* Number of BTH.PSN bits used for sequence number in expected rcvs */ +#define HFI1_KDETH_BTH_SEQ_SHIFT 11 +#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1) + static inline __u64 rhf_to_cpu(const __le32 *rbuf) { return __le64_to_cpu(*((__le64 *)rbuf)); diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c @@ -1167,6 +1167,7 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) char link[10]; struct hfi1_devdata *dd = dd_from_dev(ibd); struct hfi1_pportdata *ppd; + struct dentry *root; int unit = dd->unit; int i, j; @@ -1174,31 +1175,29 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) return; snprintf(name, sizeof(name), "%s_%d", class_name(), unit); snprintf(link, sizeof(link), "%d", unit); - ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root); - if (!ibd->hfi1_ibdev_dbg) { - pr_warn("create of %s failed\n", name); - return; - } + root = debugfs_create_dir(name, hfi1_dbg_root); + ibd->hfi1_ibdev_dbg = root; + ibd->hfi1_ibdev_link = debugfs_create_symlink(link, hfi1_dbg_root, name); - if (!ibd->hfi1_ibdev_link) { - pr_warn("create of %s symlink failed\n", name); - return; - } - DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(tx_opcode_stats, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(rcds, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(pios, ibd->hfi1_ibdev_dbg, ibd); - DEBUGFS_SEQ_FILE_CREATE(sdma_cpu_list, ibd->hfi1_ibdev_dbg, ibd); + + debugfs_create_file("opcode_stats", 0444, root, ibd, + &_opcode_stats_file_ops); + debugfs_create_file("tx_opcode_stats", 0444, root, ibd, + &_tx_opcode_stats_file_ops); + debugfs_create_file("ctx_stats", 0444, root, ibd, &_ctx_stats_file_ops); + debugfs_create_file("qp_stats", 0444, root, ibd, &_qp_stats_file_ops); + debugfs_create_file("sdes", 0444, root, ibd, &_sdes_file_ops); + debugfs_create_file("rcds", 0444, root, ibd, &_rcds_file_ops); + debugfs_create_file("pios", 0444, root, ibd, &_pios_file_ops); + debugfs_create_file("sdma_cpu_list", 0444, root, ibd, + &_sdma_cpu_list_file_ops); + /* dev counter files */ for (i = 0; i < ARRAY_SIZE(cntr_ops); i++) - DEBUGFS_FILE_CREATE(cntr_ops[i].name, - ibd->hfi1_ibdev_dbg, - dd, - &cntr_ops[i].ops, S_IRUGO); + debugfs_create_file(cntr_ops[i].name, 0444, root, dd, + &cntr_ops[i].ops); + /* per port files */ for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++) for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) { @@ -1206,12 +1205,11 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) sizeof(name), port_cntr_ops[i].name, j + 1); - DEBUGFS_FILE_CREATE(name, - ibd->hfi1_ibdev_dbg, - ppd, - &port_cntr_ops[i].ops, + debugfs_create_file(name, !port_cntr_ops[i].ops.write ? - S_IRUGO : S_IRUGO | S_IWUSR); + S_IRUGO : + S_IRUGO | S_IWUSR, + root, ppd, &port_cntr_ops[i].ops); } hfi1_fault_init_debugfs(ibd); @@ -1341,10 +1339,10 @@ DEBUGFS_FILE_OPS(driver_stats); void hfi1_dbg_init(void) { hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL); - if (!hfi1_dbg_root) - pr_warn("init of debugfs failed\n"); - DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL); - DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL); + debugfs_create_file("driver_stats_names", 0444, hfi1_dbg_root, NULL, + &_driver_stats_names_file_ops); + debugfs_create_file("driver_stats", 0444, hfi1_dbg_root, NULL, + &_driver_stats_file_ops); } void hfi1_dbg_exit(void) diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h @@ -49,16 +49,6 @@ struct hfi1_ibdev; -#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \ -do { \ - struct dentry *ent; \ - const char *__name = name; \ - ent = debugfs_create_file(__name, mode, parent, \ - data, ops); \ - if (!ent) \ - pr_warn("create of %s failed\n", __name); \ -} while (0) - #define DEBUGFS_SEQ_FILE_OPS(name) \ static const struct seq_operations _##name##_seq_ops = { \ .start = _##name##_seq_start, \ @@ -89,8 +79,6 @@ static const struct file_operations _##name##_file_ops = { \ .release = seq_release \ } -#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \ - DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, 0444) ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos); diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c @@ -1575,25 +1575,32 @@ drop: return -EINVAL; } -void handle_eflags(struct hfi1_packet *packet) +static void show_eflags_errs(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; u32 rte = rhf_rcv_type_err(packet->rhf); + dd_dev_err(rcd->dd, + "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n", + rcd->ctxt, packet->rhf, + packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "", + packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "", + packet->rhf & RHF_DC_ERR ? "dc " : "", + packet->rhf & RHF_TID_ERR ? "tid " : "", + packet->rhf & RHF_LEN_ERR ? "len " : "", + packet->rhf & RHF_ECC_ERR ? "ecc " : "", + packet->rhf & RHF_VCRC_ERR ? "vcrc " : "", + packet->rhf & RHF_ICRC_ERR ? "icrc " : "", + rte); +} + +void handle_eflags(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + rcv_hdrerr(rcd, rcd->ppd, packet); if (rhf_err_flags(packet->rhf)) - dd_dev_err(rcd->dd, - "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n", - rcd->ctxt, packet->rhf, - packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "", - packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "", - packet->rhf & RHF_DC_ERR ? "dc " : "", - packet->rhf & RHF_TID_ERR ? "tid " : "", - packet->rhf & RHF_LEN_ERR ? "len " : "", - packet->rhf & RHF_ECC_ERR ? "ecc " : "", - packet->rhf & RHF_VCRC_ERR ? "vcrc " : "", - packet->rhf & RHF_ICRC_ERR ? "icrc " : "", - rte); + show_eflags_errs(packet); } /* @@ -1699,11 +1706,14 @@ static int kdeth_process_expected(struct hfi1_packet *packet) if (unlikely(hfi1_dbg_should_fault_rx(packet))) return RHF_RCV_CONTINUE; - if (unlikely(rhf_err_flags(packet->rhf))) - handle_eflags(packet); + if (unlikely(rhf_err_flags(packet->rhf))) { + struct hfi1_ctxtdata *rcd = packet->rcd; - dd_dev_err(packet->rcd->dd, - "Unhandled expected packet received. Dropping.\n"); + if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet)) + return RHF_RCV_CONTINUE; + } + + hfi1_kdeth_expected_rcv(packet); return RHF_RCV_CONTINUE; } @@ -1712,11 +1722,17 @@ static int kdeth_process_eager(struct hfi1_packet *packet) hfi1_setup_9B_packet(packet); if (unlikely(hfi1_dbg_should_fault_rx(packet))) return RHF_RCV_CONTINUE; - if (unlikely(rhf_err_flags(packet->rhf))) - handle_eflags(packet); - dd_dev_err(packet->rcd->dd, - "Unhandled eager packet received. Dropping.\n"); + trace_hfi1_rcvhdr(packet); + if (unlikely(rhf_err_flags(packet->rhf))) { + struct hfi1_ctxtdata *rcd = packet->rcd; + + show_eflags_errs(packet); + if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet)) + return RHF_RCV_CONTINUE; + } + + hfi1_kdeth_eager_rcv(packet); return RHF_RCV_CONTINUE; } diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c @@ -250,6 +250,7 @@ void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd) int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd) { struct dentry *parent = ibd->hfi1_ibdev_dbg; + struct dentry *fault_dir; ibd->fault = kzalloc(sizeof(*ibd->fault), GFP_KERNEL); if (!ibd->fault) @@ -269,45 +270,31 @@ int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd) bitmap_zero(ibd->fault->opcodes, sizeof(ibd->fault->opcodes) * BITS_PER_BYTE); - ibd->fault->dir = - fault_create_debugfs_attr("fault", parent, - &ibd->fault->attr); - if (IS_ERR(ibd->fault->dir)) { + fault_dir = + fault_create_debugfs_attr("fault", parent, &ibd->fault->attr); + if (IS_ERR(fault_dir)) { kfree(ibd->fault); ibd->fault = NULL; return -ENOENT; } - - DEBUGFS_SEQ_FILE_CREATE(fault_stats, ibd->fault->dir, ibd); - if (!debugfs_create_bool("enable", 0600, ibd->fault->dir, - &ibd->fault->enable)) - goto fail; - if (!debugfs_create_bool("suppress_err", 0600, - ibd->fault->dir, - &ibd->fault->suppress_err)) - goto fail; - if (!debugfs_create_bool("opcode_mode", 0600, ibd->fault->dir, - &ibd->fault->opcode)) - goto fail; - if (!debugfs_create_file("opcodes", 0600, ibd->fault->dir, - ibd->fault, &__fault_opcodes_fops)) - goto fail; - if (!debugfs_create_u64("skip_pkts", 0600, - ibd->fault->dir, - &ibd->fault->fault_skip)) - goto fail; - if (!debugfs_create_u64("skip_usec", 0600, - ibd->fault->dir, - &ibd->fault->fault_skip_usec)) - goto fail; - if (!debugfs_create_u8("direction", 0600, ibd->fault->dir, - &ibd->fault->direction)) - goto fail; + ibd->fault->dir = fault_dir; + + debugfs_create_file("fault_stats", 0444, fault_dir, ibd, + &_fault_stats_file_ops); + debugfs_create_bool("enable", 0600, fault_dir, &ibd->fault->enable); + debugfs_create_bool("suppress_err", 0600, fault_dir, + &ibd->fault->suppress_err); + debugfs_create_bool("opcode_mode", 0600, fault_dir, + &ibd->fault->opcode); + debugfs_create_file("opcodes", 0600, fault_dir, ibd->fault, + &__fault_opcodes_fops); + debugfs_create_u64("skip_pkts", 0600, fault_dir, + &ibd->fault->fault_skip); + debugfs_create_u64("skip_usec", 0600, fault_dir, + &ibd->fault->fault_skip_usec); + debugfs_create_u8("direction", 0600, fault_dir, &ibd->fault->direction); return 0; -fail: - hfi1_fault_exit_debugfs(ibd); - return -ENOMEM; } bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h @@ -73,6 +73,7 @@ #include "chip_registers.h" #include "common.h" +#include "opfn.h" #include "verbs.h" #include "pio.h" #include "chip.h" @@ -98,6 +99,8 @@ #define NEIGHBOR_TYPE_HFI 0 #define NEIGHBOR_TYPE_SWITCH 1 +#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5 + extern unsigned long hfi1_cap_mask; #define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap) #define HFI1_CAP_UGET_MASK(mask, cap) \ @@ -195,6 +198,14 @@ struct exp_tid_set { }; typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); + +struct tid_queue { + struct list_head queue_head; + /* queue head for QP TID resource waiters */ + u32 enqueue; /* count of tid enqueues */ + u32 dequeue; /* count of tid dequeues */ +}; + struct hfi1_ctxtdata { /* rcvhdrq base, needs mmap before useful */ void *rcvhdrq; @@ -288,6 +299,12 @@ struct hfi1_ctxtdata { /* PSM Specific fields */ /* lock protecting all Expected TID data */ struct mutex exp_mutex; + /* lock protecting all Expected TID data of kernel contexts */ + spinlock_t exp_lock; + /* Queue for QP's waiting for HW TID flows */ + struct tid_queue flow_queue; + /* Queue for QP's waiting for HW receive array entries */ + struct tid_queue rarr_queue; /* when waiting for rcv or pioavail */ wait_queue_head_t wait; /* uuid from PSM */ @@ -320,6 +337,9 @@ struct hfi1_ctxtdata { */ u8 subctxt_cnt; + /* Bit mask to track free TID RDMA HW flows */ + unsigned long flow_mask; + struct tid_flow_state flows[RXE_NUM_TID_FLOWS]; }; /** @@ -1435,7 +1455,7 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, struct hfi1_devdata *dd, u8 hw_pidx, u8 port); void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd); int hfi1_rcd_put(struct hfi1_ctxtdata *rcd); -void hfi1_rcd_get(struct hfi1_ctxtdata *rcd); +int hfi1_rcd_get(struct hfi1_ctxtdata *rcd); struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd, u16 ctxt); struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt); @@ -2100,7 +2120,7 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK | #endif HFI1_PKT_USER_SC_INTEGRITY; - else + else if (ctxt_type != SC_KERNEL) base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY; /* turn on send-side job key checks if !A0 */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c @@ -73,7 +73,6 @@ #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt -#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5 /* * min buffers we want to have per context, after driver */ @@ -216,12 +215,12 @@ static void hfi1_rcd_free(struct kref *kref) struct hfi1_ctxtdata *rcd = container_of(kref, struct hfi1_ctxtdata, kref); - hfi1_free_ctxtdata(rcd->dd, rcd); - spin_lock_irqsave(&rcd->dd->uctxt_lock, flags); rcd->dd->rcd[rcd->ctxt] = NULL; spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags); + hfi1_free_ctxtdata(rcd->dd, rcd); + kfree(rcd); } @@ -244,10 +243,13 @@ int hfi1_rcd_put(struct hfi1_ctxtdata *rcd) * @rcd: pointer to an initialized rcd data structure * * Use this to get a reference after the init. + * + * Return : reflect kref_get_unless_zero(), which returns non-zero on + * increment, otherwise 0. */ -void hfi1_rcd_get(struct hfi1_ctxtdata *rcd) +int hfi1_rcd_get(struct hfi1_ctxtdata *rcd) { - kref_get(&rcd->kref); + return kref_get_unless_zero(&rcd->kref); } /** @@ -327,7 +329,8 @@ struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt) spin_lock_irqsave(&dd->uctxt_lock, flags); if (dd->rcd[ctxt]) { rcd = dd->rcd[ctxt]; - hfi1_rcd_get(rcd); + if (!hfi1_rcd_get(rcd)) + rcd = NULL; } spin_unlock_irqrestore(&dd->uctxt_lock, flags); @@ -372,6 +375,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, rcd->rhf_rcv_function_map = normal_rhf_rcv_functions; mutex_init(&rcd->exp_mutex); + spin_lock_init(&rcd->exp_lock); + INIT_LIST_HEAD(&rcd->flow_queue.queue_head); + INIT_LIST_HEAD(&rcd->rarr_queue.queue_head); hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt); @@ -474,6 +480,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, GFP_KERNEL, numa); if (!rcd->opstats) goto bail; + + /* Initialize TID flow generations for the context */ + hfi1_kern_init_ctxt_generations(rcd); } *context = rcd; @@ -773,6 +782,8 @@ static void enable_chip(struct hfi1_devdata *dd) rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL)) rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_IS_KSET(TID_RDMA)) + rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB; hfi1_rcvctrl(dd, rcvmask, rcd); sc_enable(rcd->sc); hfi1_rcd_put(rcd); @@ -928,6 +939,8 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) lastfail = hfi1_create_rcvhdrq(dd, rcd); if (!lastfail) lastfail = hfi1_setup_eagerbufs(rcd); + if (!lastfail) + lastfail = hfi1_kern_exp_rcv_init(rcd, reinit); if (lastfail) { dd_dev_err(dd, "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); @@ -1498,6 +1511,13 @@ static int __init hfi1_mod_init(void) /* sanitize link CRC options */ link_crc_mask &= SUPPORTED_CRCS; + ret = opfn_init(); + if (ret < 0) { + pr_err("Failed to allocate opfn_wq"); + goto bail_dev; + } + + hfi1_compute_tid_rdma_flow_wt(); /* * These must be called before the driver is registered with * the PCI subsystem. @@ -1528,6 +1548,7 @@ module_init(hfi1_mod_init); static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); + opfn_exit(); node_affinity_destroy_all(); hfi1_dbg_exit(); @@ -1582,7 +1603,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd) struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; if (rcd) { - hfi1_clear_tids(rcd); + hfi1_free_ctxt_rcv_groups(rcd); hfi1_free_ctxt(rcd); } } diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c @@ -6,6 +6,9 @@ #include "iowait.h" #include "trace_iowait.h" +/* 1 priority == 16 starve_cnt */ +#define IOWAIT_PRIORITY_STARVE_SHIFT 4 + void iowait_set_flag(struct iowait *wait, u32 flag) { trace_hfi1_iowait_set(wait, flag); @@ -44,7 +47,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit, uint seq, bool pkts_sent), void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)) + void (*sdma_drained)(struct iowait *wait), + void (*init_priority)(struct iowait *wait)) { int i; @@ -58,6 +62,7 @@ void iowait_init(struct iowait *wait, u32 tx_limit, wait->sleep = sleep; wait->wakeup = wakeup; wait->sdma_drained = sdma_drained; + wait->init_priority = init_priority; wait->flags = 0; for (i = 0; i < IOWAIT_SES; i++) { wait->wait[i].iow = wait; @@ -92,3 +97,30 @@ int iowait_set_work_flag(struct iowait_work *w) iowait_set_flag(w->iow, IOWAIT_PENDING_TID); return IOWAIT_TID_SE; } + +/** + * iowait_priority_update_top - update the top priority entry + * @w: the iowait struct + * @top: a pointer to the top priority entry + * @idx: the index of the current iowait in an array + * @top_idx: the array index for the iowait entry that has the top priority + * + * This function is called to compare the priority of a given + * iowait with the given top priority entry. The top index will + * be returned. + */ +uint iowait_priority_update_top(struct iowait *w, + struct iowait *top, + uint idx, uint top_idx) +{ + u8 cnt, tcnt; + + /* Convert priority into starve_cnt and compare the total.*/ + cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt; + tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + + top->starved_cnt; + if (cnt > tcnt) + return idx; + else + return top_idx; +} diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h @@ -100,6 +100,7 @@ struct iowait_work { * @sleep: no space callback * @wakeup: space callback wakeup * @sdma_drained: sdma count drained + * @init_priority: callback to manipulate priority * @lock: lock protected head of wait queue * @iowork: workqueue overhead * @wait_dma: wait for sdma_busy == 0 @@ -109,7 +110,7 @@ struct iowait_work { * @tx_limit: limit for overflow queuing * @tx_count: number of tx entry's in tx_head'ed list * @flags: wait flags (one per QP) - * @wait: SE array + * @wait: SE array for multiple legs * * This is to be embedded in user's state structure * (QP or PQ). @@ -120,10 +121,13 @@ struct iowait_work { * are callbacks for the ULP to implement * what ever queuing/dequeuing of * the embedded iowait and its containing struct - * when a resource shortage like SDMA ring space is seen. + * when a resource shortage like SDMA ring space + * or PIO credit space is seen. * * Both potentially have locks help - * so sleeping is not allowed. + * so sleeping is not allowed and it is not + * supported to submit txreqs from the wakeup + * call directly because of lock conflicts. * * The wait_dma member along with the iow * @@ -143,6 +147,7 @@ struct iowait { ); void (*wakeup)(struct iowait *wait, int reason); void (*sdma_drained)(struct iowait *wait); + void (*init_priority)(struct iowait *wait); seqlock_t *lock; wait_queue_head_t wait_dma; wait_queue_head_t wait_pio; @@ -152,6 +157,7 @@ struct iowait { u32 tx_limit; u32 tx_count; u8 starved_cnt; + u8 priority; unsigned long flags; struct iowait_work wait[IOWAIT_SES]; }; @@ -171,7 +177,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit, uint seq, bool pkts_sent), void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)); + void (*sdma_drained)(struct iowait *wait), + void (*init_priority)(struct iowait *wait)); /** * iowait_schedule() - schedule the default send engine work @@ -186,6 +193,18 @@ static inline bool iowait_schedule(struct iowait *wait, } /** + * iowait_tid_schedule - schedule the tid SE + * @wait: the iowait structure + * @wq: the work queue + * @cpu: the cpu + */ +static inline bool iowait_tid_schedule(struct iowait *wait, + struct workqueue_struct *wq, int cpu) +{ + return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork); +} + +/** * iowait_sdma_drain() - wait for DMAs to drain * * @wait: iowait structure @@ -327,6 +346,8 @@ static inline u16 iowait_get_desc(struct iowait_work *w) tx = list_first_entry(&w->tx_head, struct sdma_txreq, list); num_desc = tx->num_desc; + if (tx->flags & SDMA_TXREQ_F_VIP) + w->iow->priority++; } return num_desc; } @@ -340,6 +361,37 @@ static inline u32 iowait_get_all_desc(struct iowait *w) return num_desc; } +static inline void iowait_update_priority(struct iowait_work *w) +{ + struct sdma_txreq *tx = NULL; + + if (!list_empty(&w->tx_head)) { + tx = list_first_entry(&w->tx_head, struct sdma_txreq, + list); + if (tx->flags & SDMA_TXREQ_F_VIP) + w->iow->priority++; + } +} + +static inline void iowait_update_all_priority(struct iowait *w) +{ + iowait_update_priority(&w->wait[IOWAIT_IB_SE]); + iowait_update_priority(&w->wait[IOWAIT_TID_SE]); +} + +static inline void iowait_init_priority(struct iowait *w) +{ + w->priority = 0; + if (w->init_priority) + w->init_priority(w); +} + +static inline void iowait_get_priority(struct iowait *w) +{ + iowait_init_priority(w); + iowait_update_all_priority(w); +} + /** * iowait_queue - Put the iowait on a wait queue * @pkts_sent: have some packets been sent before queuing? @@ -356,14 +408,18 @@ static inline void iowait_queue(bool pkts_sent, struct iowait *w, /* * To play fair, insert the iowait at the tail of the wait queue if it * has already sent some packets; Otherwise, put it at the head. + * However, if it has priority packets to send, also put it at the + * head. */ - if (pkts_sent) { - list_add_tail(&w->list, wait_head); + if (pkts_sent) w->starved_cnt = 0; - } else { - list_add(&w->list, wait_head); + else w->starved_cnt++; - } + + if (w->priority > 0 || !pkts_sent) + list_add(&w->list, wait_head); + else + list_add_tail(&w->list, wait_head); } /** @@ -380,27 +436,10 @@ static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w) w->starved_cnt = 0; } -/** - * iowait_starve_find_max - Find the maximum of the starve count - * @w: the iowait struct - * @max: a variable containing the max starve count - * @idx: the index of the current iowait in an array - * @max_idx: a variable containing the array index for the - * iowait entry that has the max starve count - * - * This function is called to compare the starve count of a - * given iowait with the given max starve count. The max starve - * count and the index will be updated if the iowait's start - * count is larger. - */ -static inline void iowait_starve_find_max(struct iowait *w, u8 *max, - uint idx, uint *max_idx) -{ - if (w->starved_cnt > *max) { - *max = w->starved_cnt; - *max_idx = idx; - } -} +/* Update the top priority index */ +uint iowait_priority_update_top(struct iowait *w, + struct iowait *top, + uint idx, uint top_idx); /** * iowait_packet_queued() - determine if a packet is queued diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#include "hfi.h" +#include "trace.h" +#include "qp.h" +#include "opfn.h" + +#define IB_BTHE_E BIT(IB_BTHE_E_SHIFT) + +#define OPFN_CODE(code) BIT((code) - 1) +#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code) + +struct hfi1_opfn_type { + bool (*request)(struct rvt_qp *qp, u64 *data); + bool (*response)(struct rvt_qp *qp, u64 *data); + bool (*reply)(struct rvt_qp *qp, u64 data); + void (*error)(struct rvt_qp *qp); +}; + +static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = { + [STL_VERBS_EXTD_TID_RDMA] = { + .request = tid_rdma_conn_req, + .response = tid_rdma_conn_resp, + .reply = tid_rdma_conn_reply, + .error = tid_rdma_conn_error, + }, +}; + +static struct workqueue_struct *opfn_wq; + +static void opfn_schedule_conn_request(struct rvt_qp *qp); + +static bool hfi1_opfn_extended(u32 bth1) +{ + return !!(bth1 & IB_BTHE_E); +} + +static void opfn_conn_request(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_atomic_wr wr; + u16 mask, capcode; + struct hfi1_opfn_type *extd; + u64 data; + unsigned long flags; + int ret = 0; + + trace_hfi1_opfn_state_conn_request(qp); + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * Exit if the extended bit is not set, or if nothing is requested, or + * if we have completed all requests, or if a previous request is in + * progress + */ + if (!priv->opfn.extended || !priv->opfn.requested || + priv->opfn.requested == priv->opfn.completed || priv->opfn.curr) + goto done; + + mask = priv->opfn.requested & ~priv->opfn.completed; + capcode = ilog2(mask & ~(mask - 1)) + 1; + if (capcode >= STL_VERBS_EXTD_MAX) { + priv->opfn.completed |= OPFN_CODE(capcode); + goto done; + } + + extd = &hfi1_opfn_handlers[capcode]; + if (!extd || !extd->request || !extd->request(qp, &data)) { + /* + * Either there is no handler for this capability or the request + * packet could not be generated. Either way, mark it as done so + * we don't keep attempting to complete it. + */ + priv->opfn.completed |= OPFN_CODE(capcode); + goto done; + } + + trace_hfi1_opfn_data_conn_request(qp, capcode, data); + data = (data & ~0xf) | capcode; + + memset(&wr, 0, sizeof(wr)); + wr.wr.opcode = IB_WR_OPFN; + wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR; + wr.compare_add = data; + + priv->opfn.curr = capcode; /* A new request is now in progress */ + /* Drop opfn.lock before calling ib_post_send() */ + spin_unlock_irqrestore(&priv->opfn.lock, flags); + + ret = ib_post_send(&qp->ibqp, &wr.wr, NULL); + if (ret) + goto err; + trace_hfi1_opfn_state_conn_request(qp); + return; +err: + trace_hfi1_msg_opfn_conn_request(qp, "ib_ost_send failed: ret = ", + (u64)ret); + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * In case of an unexpected error return from ib_post_send + * clear opfn.curr and reschedule to try again + */ + priv->opfn.curr = STL_VERBS_EXTD_NONE; + opfn_schedule_conn_request(qp); +done: + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_send_conn_request(struct work_struct *work) +{ + struct hfi1_opfn_data *od; + struct hfi1_qp_priv *qpriv; + + od = container_of(work, struct hfi1_opfn_data, opfn_work); + qpriv = container_of(od, struct hfi1_qp_priv, opfn); + + opfn_conn_request(qpriv->owner); +} + +/* + * When QP s_lock is held in the caller, the OPFN request must be scheduled + * to a different workqueue to avoid double locking QP s_lock in call to + * ib_post_send in opfn_conn_request + */ +static void opfn_schedule_conn_request(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + trace_hfi1_opfn_state_sched_conn_request(qp); + queue_work(opfn_wq, &priv->opfn.opfn_work); +} + +void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_atomic_eth *ateth) +{ + struct hfi1_qp_priv *priv = qp->priv; + u64 data = be64_to_cpu(ateth->compare_data); + struct hfi1_opfn_type *extd; + u8 capcode; + unsigned long flags; + + trace_hfi1_opfn_state_conn_response(qp); + capcode = data & 0xf; + trace_hfi1_opfn_data_conn_response(qp, capcode, data); + if (!capcode || capcode >= STL_VERBS_EXTD_MAX) + return; + + extd = &hfi1_opfn_handlers[capcode]; + + if (!extd || !extd->response) { + e->atomic_data = capcode; + return; + } + + spin_lock_irqsave(&priv->opfn.lock, flags); + if (priv->opfn.completed & OPFN_CODE(capcode)) { + /* + * We are receiving a request for a feature that has already + * been negotiated. This may mean that the other side has reset + */ + priv->opfn.completed &= ~OPFN_CODE(capcode); + if (extd->error) + extd->error(qp); + } + + if (extd->response(qp, &data)) + priv->opfn.completed |= OPFN_CODE(capcode); + e->atomic_data = (data & ~0xf) | capcode; + trace_hfi1_opfn_state_conn_response(qp); + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_conn_reply(struct rvt_qp *qp, u64 data) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_opfn_type *extd; + u8 capcode; + unsigned long flags; + + trace_hfi1_opfn_state_conn_reply(qp); + capcode = data & 0xf; + trace_hfi1_opfn_data_conn_reply(qp, capcode, data); + if (!capcode || capcode >= STL_VERBS_EXTD_MAX) + return; + + spin_lock_irqsave(&priv->opfn.lock, flags); + /* + * Either there is no previous request or the reply is not for the + * current request + */ + if (!priv->opfn.curr || capcode != priv->opfn.curr) + goto done; + + extd = &hfi1_opfn_handlers[capcode]; + + if (!extd || !extd->reply) + goto clear; + + if (extd->reply(qp, data)) + priv->opfn.completed |= OPFN_CODE(capcode); +clear: + /* + * Clear opfn.curr to indicate that the previous request is no longer in + * progress + */ + priv->opfn.curr = STL_VERBS_EXTD_NONE; + trace_hfi1_opfn_state_conn_reply(qp); +done: + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_conn_error(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_opfn_type *extd = NULL; + unsigned long flags; + u16 capcode; + + trace_hfi1_opfn_state_conn_error(qp); + trace_hfi1_msg_opfn_conn_error(qp, "error. qp state ", (u64)qp->state); + /* + * The QP has gone into the Error state. We have to invalidate all + * negotiated feature, including the one in progress (if any). The RC + * QP handling will clean the WQE for the connection request. + */ + spin_lock_irqsave(&priv->opfn.lock, flags); + while (priv->opfn.completed) { + capcode = priv->opfn.completed & ~(priv->opfn.completed - 1); + extd = &hfi1_opfn_handlers[ilog2(capcode) + 1]; + if (extd->error) + extd->error(qp); + priv->opfn.completed &= ~OPFN_CODE(capcode); + } + priv->opfn.extended = 0; + priv->opfn.requested = 0; + priv->opfn.curr = STL_VERBS_EXTD_NONE; + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct hfi1_qp_priv *priv = qp->priv; + unsigned long flags; + + if (attr_mask & IB_QP_RETRY_CNT) + priv->s_retry = attr->retry_cnt; + + spin_lock_irqsave(&priv->opfn.lock, flags); + if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + struct tid_rdma_params *local = &priv->tid_rdma.local; + + if (attr_mask & IB_QP_TIMEOUT) + priv->tid_retry_timeout_jiffies = qp->timeout_jiffies; + if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) || + qp->pmtu == enum_to_mtu(OPA_MTU_8192)) { + tid_rdma_opfn_init(qp, local); + /* + * We only want to set the OPFN requested bit when the + * QP transitions to RTS. + */ + if (attr_mask & IB_QP_STATE && + attr->qp_state == IB_QPS_RTS) { + priv->opfn.requested |= OPFN_MASK(TID_RDMA); + /* + * If the QP is transitioning to RTS and the + * opfn.completed for TID RDMA has already been + * set, the QP is being moved *back* into RTS. + * We can now renegotiate the TID RDMA + * parameters. + */ + if (priv->opfn.completed & + OPFN_MASK(TID_RDMA)) { + priv->opfn.completed &= + ~OPFN_MASK(TID_RDMA); + /* + * Since the opfn.completed bit was + * already set, it is safe to assume + * that the opfn.extended is also set. + */ + opfn_schedule_conn_request(qp); + } + } + } else { + memset(local, 0, sizeof(*local)); + } + } + spin_unlock_irqrestore(&priv->opfn.lock, flags); +} + +void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (!priv->opfn.extended && hfi1_opfn_extended(bth1) && + HFI1_CAP_IS_KSET(OPFN)) { + priv->opfn.extended = 1; + if (qp->state == IB_QPS_RTS) + opfn_conn_request(qp); + } +} + +int opfn_init(void) +{ + opfn_wq = alloc_workqueue("hfi_opfn", + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE | + WQ_MEM_RECLAIM, + HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES); + if (!opfn_wq) + return -ENOMEM; + + return 0; +} + +void opfn_exit(void) +{ + if (opfn_wq) { + destroy_workqueue(opfn_wq); + opfn_wq = NULL; + } +} diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#ifndef _HFI1_OPFN_H +#define _HFI1_OPFN_H + +/** + * DOC: Omni Path Feature Negotion (OPFN) + * + * OPFN is a discovery protocol for Intel Omni-Path fabric that + * allows two RC QPs to negotiate a common feature that both QPs + * can support. Currently, the only OPA feature that OPFN + * supports is TID RDMA. + * + * Architecture + * + * OPFN involves the communication between two QPs on the HFI + * level on an Omni-Path fabric, and ULPs have no knowledge of + * OPFN at all. + * + * Implementation + * + * OPFN extends the existing IB RC protocol with the following + * changes: + * -- Uses Bit 24 (reserved) of DWORD 1 of Base Transport + * Header (BTH1) to indicate that the RC QP supports OPFN; + * -- Uses a combination of RC COMPARE_SWAP opcode (0x13) and + * the address U64_MAX (0xFFFFFFFFFFFFFFFF) as an OPFN + * request; The 64-bit data carried with the request/response + * contains the parameters for negotiation and will be + * defined in tid_rdma.c file; + * -- Defines IB_WR_RESERVED3 as IB_WR_OPFN. + * + * The OPFN communication will be triggered when an RC QP + * receives a request with Bit 24 of BTH1 set. The responder QP + * will then post send an OPFN request with its local + * parameters, which will be sent to the requester QP once all + * existing requests on the responder QP side have been sent. + * Once the requester QP receives the OPFN request, it will + * keep a copy of the responder QP's parameters, and return a + * response packet with its own local parameters. The responder + * QP receives the response packet and keeps a copy of the requester + * QP's parameters. After this exchange, each side has the parameters + * for both sides and therefore can select the right parameters + * for future transactions + */ + +/* STL Verbs Extended */ +#define IB_BTHE_E_SHIFT 24 +#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX + +struct ib_atomic_eth; + +enum hfi1_opfn_codes { + STL_VERBS_EXTD_NONE = 0, + STL_VERBS_EXTD_TID_RDMA, + STL_VERBS_EXTD_MAX +}; + +struct hfi1_opfn_data { + u8 extended; + u16 requested; + u16 completed; + enum hfi1_opfn_codes curr; + /* serialize opfn function calls */ + spinlock_t lock; + struct work_struct opfn_work; +}; + +/* WR opcode for OPFN */ +#define IB_WR_OPFN IB_WR_RESERVED3 + +void opfn_send_conn_request(struct work_struct *work); +void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_atomic_eth *ateth); +void opfn_conn_reply(struct rvt_qp *qp, u64 data); +void opfn_conn_error(struct rvt_qp *qp); +void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask); +void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1); +int opfn_init(void); +void opfn_exit(void); + +#endif /* _HFI1_OPFN_H */ diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c @@ -1599,8 +1599,7 @@ static void sc_piobufavail(struct send_context *sc) struct rvt_qp *qp; struct hfi1_qp_priv *priv; unsigned long flags; - uint i, n = 0, max_idx = 0; - u8 max_starved_cnt = 0; + uint i, n = 0, top_idx = 0; if (dd->send_contexts[sc->sw_index].type != SC_KERNEL && dd->send_contexts[sc->sw_index].type != SC_VL15) @@ -1619,11 +1618,18 @@ static void sc_piobufavail(struct send_context *sc) if (n == ARRAY_SIZE(qps)) break; wait = list_first_entry(list, struct iowait, list); + iowait_get_priority(wait); qp = iowait_to_qp(wait); priv = qp->priv; list_del_init(&priv->s_iowait.list); priv->s_iowait.lock = NULL; - iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx); + if (n) { + priv = qps[top_idx]->priv; + top_idx = iowait_priority_update_top(wait, + &priv->s_iowait, + n, top_idx); + } + /* refcount held until actual wake up */ qps[n++] = qp; } @@ -1638,12 +1644,12 @@ static void sc_piobufavail(struct send_context *sc) } write_sequnlock_irqrestore(&sc->waitlock, flags); - /* Wake up the most starved one first */ + /* Wake up the top-priority one first */ if (n) - hfi1_qp_wakeup(qps[max_idx], + hfi1_qp_wakeup(qps[top_idx], RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); for (i = 0; i < n; i++) - if (i != max_idx) + if (i != top_idx) hfi1_qp_wakeup(qps[i], RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); } diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c @@ -132,6 +132,18 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { .qpt_support = BIT(IB_QPT_RC), }, +[IB_WR_OPFN] = { + .length = sizeof(struct ib_atomic_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_USE_RESERVE, +}, + +[IB_WR_TID_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_IGN_RNR_CNT, +}, + }; static void flush_list_head(struct list_head *l) @@ -285,6 +297,8 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); qp_set_16b(qp); } + + opfn_qp_init(qp, attr, attr_mask); } /** @@ -311,6 +325,8 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) switch (qp->ibqp.qp_type) { case IB_QPT_RC: + hfi1_setup_tid_rdma_wqe(qp, wqe); + /* fall through */ case IB_QPT_UC: if (wqe->length > 0x80000000U) return -EINVAL; @@ -422,6 +438,11 @@ static void hfi1_qp_schedule(struct rvt_qp *qp) if (ret) iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB); } + if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) { + ret = hfi1_schedule_tid_send(qp); + if (ret) + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + } } void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) @@ -441,8 +462,27 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait) { - if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) + struct hfi1_qp_priv *priv = qp->priv; + + if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) { qp->s_flags &= ~RVT_S_BUSY; + /* + * If we are sending a first-leg packet from the second leg, + * we need to clear the busy flag from priv->s_flags to + * avoid a race condition when the qp wakes up before + * the call to hfi1_verbs_send() returns to the second + * leg. In that case, the second leg will terminate without + * being re-scheduled, resulting in failure to send TID RDMA + * WRITE DATA and TID RDMA ACK packets. + */ + if (priv->s_flags & HFI1_S_TID_BUSY_SET) { + priv->s_flags &= ~(HFI1_S_TID_BUSY_SET | + RVT_S_BUSY); + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID); + } + } else { + priv->s_flags &= ~RVT_S_BUSY; + } } static int iowait_sleep( @@ -479,6 +519,7 @@ static int iowait_sleep( ibp->rvp.n_dmawait++; qp->s_flags |= RVT_S_WAIT_DMA_DESC; + iowait_get_priority(&priv->s_iowait); iowait_queue(pkts_sent, &priv->s_iowait, &sde->dmawait); priv->s_iowait.lock = &sde->waitlock; @@ -528,6 +569,17 @@ static void iowait_sdma_drained(struct iowait *wait) spin_unlock_irqrestore(&qp->s_lock, flags); } +static void hfi1_init_priority(struct iowait *w) +{ + struct rvt_qp *qp = iowait_to_qp(w); + struct hfi1_qp_priv *priv = qp->priv; + + if (qp->s_flags & RVT_S_ACK_PENDING) + w->priority++; + if (priv->s_flags & RVT_S_ACK_PENDING) + w->priority++; +} + /** * qp_to_sdma_engine - map a qp to a send engine * @qp: the QP @@ -685,10 +737,11 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) &priv->s_iowait, 1, _hfi1_do_send, - NULL, + _hfi1_do_tid_send, iowait_sleep, iowait_wakeup, - iowait_sdma_drained); + iowait_sdma_drained, + hfi1_init_priority); return priv; } @@ -696,6 +749,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; + hfi1_qp_priv_tid_free(rdi, qp); kfree(priv->s_ahg); kfree(priv); } @@ -729,6 +783,7 @@ void flush_qp_waiters(struct rvt_qp *qp) { lockdep_assert_held(&qp->s_lock); flush_iowait(qp); + hfi1_tid_rdma_flush_wait(qp); } void stop_send_queue(struct rvt_qp *qp) @@ -736,12 +791,16 @@ void stop_send_queue(struct rvt_qp *qp) struct hfi1_qp_priv *priv = qp->priv; iowait_cancel_work(&priv->s_iowait); + if (cancel_work_sync(&priv->tid_rdma.trigger_work)) + rvt_put_qp(qp); } void quiesce_qp(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; + hfi1_del_tid_reap_timer(qp); + hfi1_del_tid_retry_timer(qp); iowait_sdma_drain(&priv->s_iowait); qp_pio_drain(qp); flush_tx_list(qp); @@ -749,8 +808,13 @@ void quiesce_qp(struct rvt_qp *qp) void notify_qp_reset(struct rvt_qp *qp) { + hfi1_qp_kern_exp_rcv_clear_all(qp); qp->r_adefered = 0; clear_ahg(qp); + + /* Clear any OPFN state */ + if (qp->ibqp.qp_type == IB_QPT_RC) + opfn_conn_error(qp); } /* @@ -832,7 +896,8 @@ void notify_error_qp(struct rvt_qp *qp) if (lock) { write_seqlock(lock); if (!list_empty(&priv->s_iowait.list) && - !(qp->s_flags & RVT_S_BUSY)) { + !(qp->s_flags & RVT_S_BUSY) && + !(priv->s_flags & RVT_S_BUSY)) { qp->s_flags &= ~RVT_S_ANY_WAIT_IO; list_del_init(&priv->s_iowait.list); priv->s_iowait.lock = NULL; @@ -841,7 +906,8 @@ void notify_error_qp(struct rvt_qp *qp) write_sequnlock(lock); } - if (!(qp->s_flags & RVT_S_BUSY)) { + if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) { + qp->s_hdrwords = 0; if (qp->s_rdma_mr) { rvt_put_mr(qp->s_rdma_mr); qp->s_rdma_mr = NULL; diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h @@ -63,11 +63,17 @@ extern const struct rvt_operation_params hfi1_post_parms[]; * HFI1_S_AHG_VALID - ahg header valid on chip * HFI1_S_AHG_CLEAR - have send engine clear ahg state * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain + * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource + * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response + * HFI1_S_WAIT_HALT - halt the first leg send engine * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1 */ #define HFI1_S_AHG_VALID 0x80000000 #define HFI1_S_AHG_CLEAR 0x40000000 #define HFI1_S_WAIT_PIO_DRAIN 0x20000000 +#define HFI1_S_WAIT_TID_SPACE 0x10000000 +#define HFI1_S_WAIT_TID_RESP 0x08000000 +#define HFI1_S_WAIT_HALT 0x04000000 #define HFI1_S_MIN_BIT_MASK 0x01000000 /* @@ -76,6 +82,7 @@ extern const struct rvt_operation_params hfi1_post_parms[]; #define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN) #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) +#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA) /* * Send if not busy or waiting for I/O and either diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c @@ -51,24 +51,48 @@ #include "hfi.h" #include "qp.h" +#include "rc.h" #include "verbs_txreq.h" #include "trace.h" -/* cut down ridiculously long IB macro names */ -#define OP(x) RC_OP(x) - -static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, - u32 psn, u32 pmtu) +struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, + u8 *prev_ack, bool *scheduled) + __must_hold(&qp->s_lock) { - u32 len; - - len = delta_psn(psn, wqe->psn) * pmtu; - ss->sge = wqe->sg_list[0]; - ss->sg_list = wqe->sg_list + 1; - ss->num_sge = wqe->wr.num_sge; - ss->total_len = wqe->length; - rvt_skip_sge(ss, len, false); - return wqe->length - len; + struct rvt_ack_entry *e = NULL; + u8 i, p; + bool s = true; + + for (i = qp->r_head_ack_queue; ; i = p) { + if (i == qp->s_tail_ack_queue) + s = false; + if (i) + p = i - 1; + else + p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); + if (p == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[p]; + if (!e->opcode) { + e = NULL; + break; + } + if (cmp_psn(psn, e->psn) >= 0) { + if (p == qp->s_tail_ack_queue && + cmp_psn(psn, e->lpsn) <= 0) + s = false; + break; + } + } + if (prev) + *prev = p; + if (prev_ack) + *prev_ack = i; + if (scheduled) + *scheduled = s; + return e; } /** @@ -87,20 +111,25 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct rvt_ack_entry *e; - u32 hwords; - u32 len; - u32 bth0; - u32 bth2; + u32 hwords, hdrlen; + u32 len = 0; + u32 bth0 = 0, bth2 = 0; + u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); int middle = 0; u32 pmtu = qp->pmtu; - struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_qp_priv *qpriv = qp->priv; + bool last_pkt; + u32 delta; + u8 next = qp->s_tail_ack_queue; + struct tid_rdma_request *req; + trace_hfi1_rsp_make_rc_ack(qp, 0); lockdep_assert_held(&qp->s_lock); /* Don't send an ACK if we aren't supposed to. */ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; - if (priv->hdr_type == HFI1_PKT_TYPE_9B) + if (qpriv->hdr_type == HFI1_PKT_TYPE_9B) /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; else @@ -122,8 +151,18 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, * response has been sent instead of only being * constructed. */ - if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC) - qp->s_tail_ack_queue = 0; + if (++next > rvt_size_atomic(&dev->rdi)) + next = 0; + /* + * Only advance the s_acked_ack_queue pointer if there + * have been no TID RDMA requests. + */ + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode != TID_OP(WRITE_REQ) && + qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = next; + qp->s_tail_ack_queue = next; + trace_hfi1_rsp_make_rc_ack(qp, e->psn); /* FALLTHROUGH */ case OP(SEND_ONLY): case OP(ACKNOWLEDGE): @@ -135,6 +174,12 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, } e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + /* Check for tid write fence */ + if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) || + hfi1_tid_rdma_ack_interlock(qp, e)) { + iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB); + goto bail; + } if (e->opcode == OP(RDMA_READ_REQUEST)) { /* * If a RDMA read response is being resent and @@ -144,6 +189,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, */ len = e->rdma_sge.sge_length; if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; qp->s_tail_ack_queue = qp->r_head_ack_queue; goto bail; } @@ -165,6 +214,45 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, hwords++; qp->s_ack_rdma_psn = e->psn; bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else if (e->opcode == TID_OP(WRITE_REQ)) { + /* + * If a TID RDMA WRITE RESP is being resent, we have to + * wait for the actual request. All requests that are to + * be resent will have their state set to + * TID_REQUEST_RESEND. When the new request arrives, the + * state will be changed to TID_REQUEST_RESEND_ACTIVE. + */ + req = ack_to_tid_req(e); + if (req->state == TID_REQUEST_RESEND || + req->state == TID_REQUEST_INIT_RESEND) + goto bail; + qp->s_ack_state = TID_OP(WRITE_RESP); + qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg); + goto write_resp; + } else if (e->opcode == TID_OP(READ_REQ)) { + /* + * If a TID RDMA read response is being resent and + * we haven't seen the duplicate request yet, + * then stop sending the remaining responses the + * responder has seen until the requester re-sends it. + */ + len = e->rdma_sge.sge_length; + if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; + qp->s_tail_ack_queue = qp->r_head_ack_queue; + goto bail; + } + /* Copy SGE state in case we need to resend */ + ps->s_txreq->mr = e->rdma_sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); + qp->s_ack_rdma_sge.sge = e->rdma_sge; + qp->s_ack_rdma_sge.num_sge = 1; + qp->s_ack_state = TID_OP(READ_RESP); + goto read_resp; } else { /* COMPARE_SWAP or FETCH_ADD */ ps->s_txreq->ss = NULL; @@ -176,6 +264,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(e->psn); e->sent = 1; } + trace_hfi1_tid_write_rsp_make_rc_ack(qp); bth0 = qp->s_ack_state << 24; break; @@ -202,6 +291,83 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(qp->s_ack_rdma_psn++); break; + case TID_OP(WRITE_RESP): +write_resp: + /* + * 1. Check if RVT_S_ACK_PENDING is set. If yes, + * goto normal. + * 2. Attempt to allocate TID resources. + * 3. Remove RVT_S_RESP_PENDING flags from s_flags + * 4. If resources not available: + * 4.1 Set RVT_S_WAIT_TID_SPACE + * 4.2 Queue QP on RCD TID queue + * 4.3 Put QP on iowait list. + * 4.4 Build IB RNR NAK with appropriate timeout value + * 4.5 Return indication progress made. + * 5. If resources are available: + * 5.1 Program HW flow CSRs + * 5.2 Build TID RDMA WRITE RESP packet + * 5.3 If more resources needed, do 2.1 - 2.3. + * 5.4 Wake up next QP on RCD TID queue. + * 5.5 Return indication progress made. + */ + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + req = ack_to_tid_req(e); + + /* + * Send scheduled RNR NAK's. RNR NAK's need to be sent at + * segment boundaries, not at request boundaries. Don't change + * s_ack_state because we are still in the middle of a request + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND && + qp->s_tail_ack_queue == qpriv->r_tid_alloc && + req->cur_seg == req->alloc_seg) { + qpriv->rnr_nak_state = TID_RNR_NAK_SENT; + goto normal_no_state; + } + + bth2 = mask_psn(qp->s_ack_rdma_psn); + hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1, + bth2, &len, + &ps->s_txreq->ss); + if (!hdrlen) + return 0; + + hwords += hdrlen; + bth0 = qp->s_ack_state << 24; + qp->s_ack_rdma_psn++; + trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn, + e->lpsn, req); + if (req->cur_seg != req->total_segs) + break; + + e->sent = 1; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case TID_OP(READ_RESP): +read_resp: + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + ps->s_txreq->ss = &qp->s_ack_rdma_sge; + delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0, + &bth1, &bth2, &len, + &last_pkt); + if (delta == 0) + goto error_qp; + hwords += delta; + if (last_pkt) { + e->sent = 1; + /* + * Increment qp->s_tail_ack_queue through s_ack_state + * transition. + */ + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + } + break; + case TID_OP(READ_REQ): + goto bail; + default: normal: /* @@ -211,8 +377,7 @@ normal: * (see above). */ qp->s_ack_state = OP(SEND_ONLY); - qp->s_flags &= ~RVT_S_ACK_PENDING; - ps->s_txreq->ss = NULL; +normal_no_state: if (qp->s_nak_state) ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | @@ -224,14 +389,24 @@ normal: len = 0; bth0 = OP(ACKNOWLEDGE) << 24; bth2 = mask_psn(qp->s_ack_psn); + qp->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; + ps->s_txreq->ss = NULL; } qp->s_rdma_ack_cnt++; - ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->sde = qpriv->s_sde; ps->s_txreq->s_cur_size = len; ps->s_txreq->hdr_dwords = hwords; - hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps); + hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); return 1; - +error_qp: + spin_unlock_irqrestore(&qp->s_lock, ps->flags); + spin_lock_irqsave(&qp->r_lock, ps->flags); + spin_lock(&qp->s_lock); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, ps->flags); + spin_lock_irqsave(&qp->s_lock, ps->flags); bail: qp->s_ack_state = OP(ACKNOWLEDGE); /* @@ -258,17 +433,23 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); struct ib_other_headers *ohdr; - struct rvt_sge_state *ss; + struct rvt_sge_state *ss = NULL; struct rvt_swqe *wqe; - u32 hwords; - u32 len; - u32 bth0 = 0; - u32 bth2; + struct hfi1_swqe_priv *wpriv; + struct tid_rdma_request *req = NULL; + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + u32 hwords = 5; + u32 len = 0; + u32 bth0 = 0, bth2 = 0; + u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); u32 pmtu = qp->pmtu; char newreq; int middle = 0; int delta; + struct tid_rdma_flow *flow = NULL; + struct tid_rdma_params *remote; + trace_hfi1_sender_make_rc_req(qp); lockdep_assert_held(&qp->s_lock); ps->s_txreq = get_txreq(ps->dev, qp); if (!ps->s_txreq) @@ -309,13 +490,13 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } clear_ahg(qp); wqe = rvt_get_swqe_ptr(qp, qp->s_last); - rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ? - IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ goto done_free_tx; } - if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK)) + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT)) goto bail; if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { @@ -329,6 +510,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* Send a request. */ wqe = rvt_get_swqe_ptr(qp, qp->s_cur); +check_s_state: switch (qp->s_state) { default: if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) @@ -350,9 +532,13 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* * If a fence is requested, wait for previous * RDMA read and atomic operations to finish. + * However, there is no need to guard against + * TID RDMA READ after TID RDMA READ. */ if ((wqe->wr.send_flags & IB_SEND_FENCE) && - qp->s_num_rd_atomic) { + qp->s_num_rd_atomic && + (wqe->wr.opcode != IB_WR_TID_RDMA_READ || + priv->pending_tid_r_segs < qp->s_num_rd_atomic)) { qp->s_flags |= RVT_S_WAIT_FENCE; goto bail; } @@ -397,6 +583,15 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) len = wqe->length; ss = &qp->s_sge; bth2 = mask_psn(qp->s_psn); + + /* + * Interlock between various IB requests and TID RDMA + * if necessary. + */ + if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) || + hfi1_tid_rdma_wqe_interlock(qp, wqe)) + goto bail; + switch (wqe->wr.opcode) { case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: @@ -473,21 +668,126 @@ no_flow_control: qp->s_cur = 0; break; + case IB_WR_TID_RDMA_WRITE: + if (newreq) { + /* + * Limit the number of TID RDMA WRITE requests. + */ + if (atomic_read(&priv->n_tid_requests) >= + HFI1_TID_RDMA_WRITE_CNT) + goto bail; + + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, + &bth1, &bth2, + &len); + ss = NULL; + if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) { + priv->s_tid_cur = qp->s_cur; + if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = TID_OP(WRITE_RESP); + } + } else if (priv->s_tid_cur == priv->s_tid_head) { + struct rvt_swqe *__w; + struct tid_rdma_request *__r; + + __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur); + __r = wqe_to_tid_req(__w); + + /* + * The s_tid_cur pointer is advanced to s_cur if + * any of the following conditions about the WQE + * to which s_ti_cur currently points to are + * satisfied: + * 1. The request is not a TID RDMA WRITE + * request, + * 2. The request is in the INACTIVE or + * COMPLETE states (TID RDMA READ requests + * stay at INACTIVE and TID RDMA WRITE + * transition to COMPLETE when done), + * 3. The request is in the ACTIVE or SYNC + * state and the number of completed + * segments is equal to the total segment + * count. + * (If ACTIVE, the request is waiting for + * ACKs. If SYNC, the request has not + * received any responses because it's + * waiting on a sync point.) + */ + if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE || + __r->state == TID_REQUEST_INACTIVE || + __r->state == TID_REQUEST_COMPLETE || + ((__r->state == TID_REQUEST_ACTIVE || + __r->state == TID_REQUEST_SYNC) && + __r->comp_seg == __r->total_segs)) { + if (priv->s_tid_tail == + priv->s_tid_cur && + priv->s_state == + TID_OP(WRITE_DATA_LAST)) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = + TID_OP(WRITE_RESP); + } + priv->s_tid_cur = qp->s_cur; + } + /* + * A corner case: when the last TID RDMA WRITE + * request was completed, s_tid_head, + * s_tid_cur, and s_tid_tail all point to the + * same location. Other requests are posted and + * s_cur wraps around to the same location, + * where a new TID RDMA WRITE is posted. In + * this case, none of the indices need to be + * updated. However, the priv->s_state should. + */ + if (priv->s_tid_tail == qp->s_cur && + priv->s_state == TID_OP(WRITE_DATA_LAST)) + priv->s_state = TID_OP(WRITE_RESP); + } + req = wqe_to_tid_req(wqe); + if (newreq) { + priv->s_tid_head = qp->s_cur; + priv->pending_tid_w_resp += req->total_segs; + atomic_inc(&priv->n_tid_requests); + atomic_dec(&priv->n_requests); + } else { + req->state = TID_REQUEST_RESEND; + req->comp_seg = delta_psn(bth2, wqe->psn); + /* + * Pull back any segments since we are going + * to re-receive them. + */ + req->setup_head = req->clear_tail; + priv->pending_tid_w_resp += + delta_psn(wqe->lpsn, bth2) + 1; + } + + trace_hfi1_tid_write_sender_make_req(qp, newreq); + trace_hfi1_tid_req_make_req_write(qp, newreq, + wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case IB_WR_RDMA_READ: /* * Don't allow more operations to be started * than the QP limits allow. */ - if (newreq) { - if (qp->s_num_rd_atomic >= - qp->s_max_rd_atomic) { - qp->s_flags |= RVT_S_WAIT_RDMAR; - goto bail; - } - qp->s_num_rd_atomic++; - if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) - qp->s_lsn++; + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; } + qp->s_num_rd_atomic++; + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; put_ib_reth_vaddr( wqe->rdma_wr.remote_addr, &ohdr->u.rc.reth); @@ -503,23 +803,99 @@ no_flow_control: qp->s_cur = 0; break; + case IB_WR_TID_RDMA_READ: + trace_hfi1_tid_read_sender_make_req(qp, newreq); + wpriv = wqe->priv; + req = wqe_to_tid_req(wqe); + trace_hfi1_tid_req_make_req_read(qp, newreq, + wqe->wr.opcode, + wqe->psn, wqe->lpsn, + req); + delta = cmp_psn(qp->s_psn, wqe->psn); + + /* + * Don't allow more operations to be started + * than the QP limits allow. We could get here under + * three conditions; (1) It's a new request; (2) We are + * sending the second or later segment of a request, + * but the qp->s_state is set to OP(RDMA_READ_REQUEST) + * when the last segment of a previous request is + * received just before this; (3) We are re-sending a + * request. + */ + if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + if (newreq) { + struct tid_rdma_flow *flow = + &req->flows[req->setup_head]; + + /* + * Set up s_sge as it is needed for TID + * allocation. However, if the pages have been + * walked and mapped, skip it. An earlier try + * has failed to allocate the TID entries. + */ + if (!flow->npagesets) { + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + req->isge = 0; + req->clear_tail = req->setup_head; + req->flow_idx = req->setup_head; + req->state = TID_REQUEST_ACTIVE; + } + } else if (delta == 0) { + /* Re-send a request */ + req->cur_seg = 0; + req->comp_seg = 0; + req->ack_pending = 0; + req->flow_idx = req->clear_tail; + req->state = TID_REQUEST_RESEND; + } + req->s_next_psn = qp->s_psn; + /* Read one segment at a time */ + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, + &bth1, &bth2, + &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: /* * Don't allow more operations to be started * than the QP limits allow. */ - if (newreq) { - if (qp->s_num_rd_atomic >= - qp->s_max_rd_atomic) { - qp->s_flags |= RVT_S_WAIT_RDMAR; - goto bail; - } - qp->s_num_rd_atomic++; - if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) - qp->s_lsn++; + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; } - if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_num_rd_atomic++; + + /* FALLTHROUGH */ + case IB_WR_OPFN: + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_OPFN) { qp->s_state = OP(COMPARE_SWAP); put_ib_ateth_swap(wqe->atomic_wr.swap, &ohdr->u.atomic_eth); @@ -546,18 +922,23 @@ no_flow_control: default: goto bail; } - qp->s_sge.sge = wqe->sg_list[0]; - qp->s_sge.sg_list = wqe->sg_list + 1; - qp->s_sge.num_sge = wqe->wr.num_sge; - qp->s_sge.total_len = wqe->length; - qp->s_len = wqe->length; + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) { + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + } if (newreq) { qp->s_tail++; if (qp->s_tail >= qp->s_size) qp->s_tail = 0; } - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_psn = wqe->lpsn + 1; + else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + qp->s_psn = req->s_next_psn; else qp->s_psn++; break; @@ -674,10 +1055,137 @@ no_flow_control: if (qp->s_cur == qp->s_size) qp->s_cur = 0; break; + + case TID_OP(WRITE_RESP): + /* + * This value for s_state is used for restarting a TID RDMA + * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE + * for more). + */ + req = wqe_to_tid_req(wqe); + req->state = TID_REQUEST_RESEND; + rcu_read_lock(); + remote = rcu_dereference(priv->tid_rdma.remote); + req->comp_seg = delta_psn(qp->s_psn, wqe->psn); + len = wqe->length - (req->comp_seg * remote->max_len); + rcu_read_unlock(); + + bth2 = mask_psn(qp->s_psn); + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1, + &bth2, &len); + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + qp->s_state = TID_OP(WRITE_REQ); + priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1; + priv->s_tid_cur = qp->s_cur; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); + break; + + case TID_OP(READ_RESP): + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) + goto bail; + /* This is used to restart a TID read request */ + req = wqe_to_tid_req(wqe); + wpriv = wqe->priv; + /* + * Back down. The field qp->s_psn has been set to the psn with + * which the request should be restart. It's OK to use division + * as this is on the retry path. + */ + req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps; + + /* + * The following function need to be redefined to return the + * status to make sure that we find the flow. At the same + * time, we can use the req->state change to check if the + * call succeeds or not. + */ + req->state = TID_REQUEST_RESEND; + hfi1_tid_rdma_restart_req(qp, wqe, &bth2); + if (req->state != TID_REQUEST_ACTIVE) { + /* + * Failed to find the flow. Release all allocated tid + * resources. + */ + hfi1_kern_exp_rcv_clear_all(req); + hfi1_kern_clear_hw_flow(priv->rcd, qp); + + hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR); + goto bail; + } + req->state = TID_REQUEST_RESEND; + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + flow = &req->flows[req->flow_idx]; + len -= flow->sent; + req->s_next_psn = flow->flow_state.ib_lpsn + 1; + delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1, + &bth2, &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + qp->s_psn = req->s_next_psn; + trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); + break; + case TID_OP(READ_REQ): + req = wqe_to_tid_req(wqe); + delta = cmp_psn(qp->s_psn, wqe->psn); + /* + * If the current WR is not TID RDMA READ, or this is the start + * of a new request, we need to change the qp->s_state so that + * the request can be set up properly. + */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 || + qp->s_cur == qp->s_tail) { + qp->s_state = OP(RDMA_READ_REQUEST); + if (delta == 0 || qp->s_cur == qp->s_tail) + goto check_s_state; + else + goto bail; + } + + /* Rate limiting */ + if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + + wpriv = wqe->priv; + /* Read one segment at a time */ + len = min_t(u32, req->seg_len, + wqe->length - req->seg_len * req->cur_seg); + delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1, + &bth2, &len); + if (delta <= 0) { + /* Wait for TID space */ + goto bail; + } + hwords += delta; + ss = &wpriv->ss; + /* Check if this is the last segment */ + if (req->cur_seg >= req->total_segs && + ++qp->s_cur == qp->s_size) + qp->s_cur = 0; + qp->s_psn = req->s_next_psn; + trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, + wqe->psn, wqe->lpsn, req); + break; } qp->s_sending_hpsn = bth2; delta = delta_psn(bth2, wqe->psn); - if (delta && delta % HFI1_PSN_CREDIT == 0) + if (delta && delta % HFI1_PSN_CREDIT == 0 && + wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) bth2 |= IB_BTH_REQ_ACK; if (qp->s_flags & RVT_S_SEND_ONE) { qp->s_flags &= ~RVT_S_SEND_ONE; @@ -693,6 +1201,7 @@ no_flow_control: qp, ohdr, bth0 | (qp->s_state << 24), + bth1, bth2, middle, ps); @@ -709,6 +1218,12 @@ bail: bail_no_tx: ps->s_txreq = NULL; qp->s_flags &= ~RVT_S_BUSY; + /* + * If we didn't get a txreq, the QP will be woken up later to try + * again. Set the flags to indicate which work item to wake + * up. + */ + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); return 0; } @@ -796,6 +1311,11 @@ static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet, if (qp->s_mig_state == IB_MIG_MIGRATED) bth0 |= IB_BTH_MIG_REQ; bth1 = (!!is_fecn) << IB_BECN_SHIFT; + /* + * Inline ACKs go out without the use of the Verbs send engine, so + * we need to set the STL Verbs Extended bit here + */ + bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT; hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); } @@ -936,6 +1456,43 @@ void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn) } /** + * update_num_rd_atomic - update the qp->s_num_rd_atomic + * @qp: the QP + * @psn: the packet sequence number to restart at + * @wqe: the wqe + * + * This is called from reset_psn() to update qp->s_num_rd_atomic + * for the current wqe. + * Called at interrupt level with the QP s_lock held. + */ +static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn, + struct rvt_swqe *wqe) +{ + u32 opcode = wqe->wr.opcode; + + if (opcode == IB_WR_RDMA_READ || + opcode == IB_WR_ATOMIC_CMP_AND_SWP || + opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + qp->s_num_rd_atomic++; + } else if (opcode == IB_WR_TID_RDMA_READ) { + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct hfi1_qp_priv *priv = qp->priv; + + if (cmp_psn(psn, wqe->lpsn) <= 0) { + u32 cur_seg; + + cur_seg = (psn - wqe->psn) / priv->pkts_ps; + req->ack_pending = cur_seg - req->comp_seg; + priv->pending_tid_r_segs += req->ack_pending; + qp->s_num_rd_atomic += req->ack_pending; + } else { + priv->pending_tid_r_segs += req->total_segs; + qp->s_num_rd_atomic += req->total_segs; + } + } +} + +/** * reset_psn - reset the QP state to send starting from PSN * @qp: the QP * @psn: the packet sequence number to restart at @@ -949,9 +1506,13 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) u32 n = qp->s_acked; struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); u32 opcode; + struct hfi1_qp_priv *priv = qp->priv; lockdep_assert_held(&qp->s_lock); qp->s_cur = n; + priv->pending_tid_r_segs = 0; + priv->pending_tid_w_resp = 0; + qp->s_num_rd_atomic = 0; /* * If we are starting the request from the beginning, @@ -961,9 +1522,9 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(SEND_LAST); goto done; } + update_num_rd_atomic(qp, psn, wqe); /* Find the work request opcode corresponding to the given PSN. */ - opcode = wqe->wr.opcode; for (;;) { int diff; @@ -973,8 +1534,11 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) break; wqe = rvt_get_swqe_ptr(qp, n); diff = cmp_psn(psn, wqe->psn); - if (diff < 0) + if (diff < 0) { + /* Point wqe back to the previous one*/ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); break; + } qp->s_cur = n; /* * If we are starting the request from the beginning, @@ -984,8 +1548,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(SEND_LAST); goto done; } - opcode = wqe->wr.opcode; + + update_num_rd_atomic(qp, psn, wqe); } + opcode = wqe->wr.opcode; /* * Set the state to restart in the middle of a request. @@ -1003,10 +1569,18 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(RDMA_READ_RESPONSE_LAST); break; + case IB_WR_TID_RDMA_WRITE: + qp->s_state = TID_OP(WRITE_RESP); + break; + case IB_WR_RDMA_READ: qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); break; + case IB_WR_TID_RDMA_READ: + qp->s_state = TID_OP(READ_RESP); + break; + default: /* * This case shouldn't happen since its only @@ -1015,6 +1589,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(SEND_LAST); } done: + priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; qp->s_psn = psn; /* * Set RVT_S_WAIT_PSN as rc_complete() may start the timer @@ -1025,6 +1600,7 @@ done: (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) qp->s_flags |= RVT_S_WAIT_PSN; qp->s_flags &= ~HFI1_S_AHG_VALID; + trace_hfi1_sender_reset_psn(qp); } /* @@ -1033,18 +1609,47 @@ done: */ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) { + struct hfi1_qp_priv *priv = qp->priv; struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); struct hfi1_ibport *ibp; lockdep_assert_held(&qp->r_lock); lockdep_assert_held(&qp->s_lock); + trace_hfi1_sender_restart_rc(qp); if (qp->s_retry == 0) { if (qp->s_mig_state == IB_MIG_ARMED) { hfi1_migrate_qp(qp); qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { - rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); - rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + /* + * We need special handling for the OPFN request WQEs as + * they are not allowed to generate real user errors + */ + if (wqe->wr.opcode == IB_WR_OPFN) { + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + /* + * Call opfn_conn_reply() with capcode and + * remaining data as 0 to close out the + * current request + */ + opfn_conn_reply(qp, priv->opfn.curr); + wqe = do_rc_completion(qp, wqe, ibp); + qp->s_flags &= ~RVT_S_WAIT_ACK; + } else { + trace_hfi1_tid_write_sender_restart_rc(qp, 0); + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + struct tid_rdma_request *req; + + req = wqe_to_tid_req(wqe); + hfi1_kern_exp_rcv_clear_all(req); + hfi1_kern_clear_hw_flow(priv->rcd, qp); + } + + hfi1_trdma_send_complete(qp, wqe, + IB_WC_RETRY_EXC_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } return; } else { /* need to handle delayed completion */ return; @@ -1054,14 +1659,15 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) } ibp = to_iport(qp->ibqp.device, qp->port_num); - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ) ibp->rvp.n_rc_resends++; else ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | - RVT_S_WAIT_ACK); + RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP); if (wait) qp->s_flags |= RVT_S_SEND_ONE; reset_psn(qp, psn); @@ -1069,7 +1675,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) /* * Set qp->s_sending_psn to the next PSN after the given one. - * This would be psn+1 except when RDMA reads are present. + * This would be psn+1 except when RDMA reads or TID RDMA ops + * are present. */ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) { @@ -1081,7 +1688,9 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) for (;;) { wqe = rvt_get_swqe_ptr(qp, n); if (cmp_psn(psn, wqe->lpsn) <= 0) { - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_sending_psn = wqe->lpsn + 1; else qp->s_sending_psn = psn + 1; @@ -1104,8 +1713,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) struct rvt_swqe *wqe; struct ib_header *hdr = NULL; struct hfi1_16b_header *hdr_16b = NULL; - u32 opcode; + u32 opcode, head, tail; u32 psn; + struct tid_rdma_request *req; lockdep_assert_held(&qp->s_lock); if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) @@ -1130,25 +1740,85 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) } opcode = ib_bth_get_opcode(ohdr); - if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && - opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) || + opcode == TID_OP(READ_RESP) || + opcode == TID_OP(WRITE_RESP)) { WARN_ON(!qp->s_rdma_ack_cnt); qp->s_rdma_ack_cnt--; return; } psn = ib_bth_get_psn(ohdr); - reset_sending_psn(qp, psn); + /* + * Don't attempt to reset the sending PSN for packets in the + * KDETH PSN space since the PSN does not match anything. + */ + if (opcode != TID_OP(WRITE_DATA) && + opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC)) + reset_sending_psn(qp, psn); + + /* Handle TID RDMA WRITE packets differently */ + if (opcode >= TID_OP(WRITE_REQ) && + opcode <= TID_OP(WRITE_DATA_LAST)) { + head = priv->s_tid_head; + tail = priv->s_tid_cur; + /* + * s_tid_cur is set to s_tid_head in the case, where + * a new TID RDMA request is being started and all + * previous ones have been completed. + * Therefore, we need to do a secondary check in order + * to properly determine whether we should start the + * RC timer. + */ + wqe = rvt_get_swqe_ptr(qp, tail); + req = wqe_to_tid_req(wqe); + if (head == tail && req->comp_seg < req->total_segs) { + if (tail == 0) + tail = qp->s_size - 1; + else + tail -= 1; + } + } else { + head = qp->s_tail; + tail = qp->s_acked; + } /* * Start timer after a packet requesting an ACK has been sent and * there are still requests that haven't been acked. */ - if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + if ((psn & IB_BTH_REQ_ACK) && tail != head && + opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(RESYNC) && !(qp->s_flags & - (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && - (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) - rvt_add_retry_timer(qp); + (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + if (opcode == TID_OP(READ_REQ)) + rvt_add_retry_timer_ext(qp, priv->timeout_shift); + else + rvt_add_retry_timer(qp); + } + + /* Start TID RDMA ACK timer */ + if ((opcode == TID_OP(WRITE_DATA) || + opcode == TID_OP(WRITE_DATA_LAST) || + opcode == TID_OP(RESYNC)) && + (psn & IB_BTH_REQ_ACK) && + !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + /* + * The TID RDMA ACK packet could be received before this + * function is called. Therefore, add the timer only if TID + * RDMA ACK packets are actually pending. + */ + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + req->ack_seg < req->cur_seg) + hfi1_add_tid_retry_timer(qp); + } while (qp->s_last != qp->s_acked) { u32 s_last; @@ -1157,6 +1827,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) break; + trdma_clean_swqe(qp, wqe); rvt_qp_wqe_unreserve(qp, wqe); s_last = qp->s_last; trace_hfi1_qp_send_completion(qp, wqe, s_last); @@ -1195,20 +1866,24 @@ static inline void update_last_psn(struct rvt_qp *qp, u32 psn) * This is similar to hfi1_send_complete but has to check to be sure * that the SGEs are not being referenced if the SWQE is being resent. */ -static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, - struct rvt_swqe *wqe, - struct hfi1_ibport *ibp) +struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct hfi1_ibport *ibp) { + struct hfi1_qp_priv *priv = qp->priv; + lockdep_assert_held(&qp->s_lock); /* * Don't decrement refcount and don't generate a * completion if the SWQE is being resent until the send * is finished. */ + trace_hfi1_rc_completion(qp, wqe->lpsn); if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { u32 s_last; + trdma_clean_swqe(qp, wqe); rvt_put_swqe(wqe); rvt_qp_wqe_unreserve(qp, wqe); s_last = qp->s_last; @@ -1243,7 +1918,16 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, } qp->s_retry = qp->s_retry_cnt; - update_last_psn(qp, wqe->lpsn); + /* + * Don't update the last PSN if the request being completed is + * a TID RDMA WRITE request. + * Completion of the TID RDMA WRITE requests are done by the + * TID RDMA ACKs and as such could be for a request that has + * already been ACKed as far as the IB state machine is + * concerned. + */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + update_last_psn(qp, wqe->lpsn); /* * If we are completing a request which is in the process of @@ -1266,9 +1950,61 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, qp->s_draining = 0; wqe = rvt_get_swqe_ptr(qp, qp->s_acked); } + if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) { + priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; + hfi1_schedule_send(qp); + } return wqe; } +static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd) +{ + /* Retry this request. */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + } +} + +/** + * update_qp_retry_state - Update qp retry state. + * @qp: the QP + * @psn: the packet sequence number of the TID RDMA WRITE RESP. + * @spsn: The start psn for the given TID RDMA WRITE swqe. + * @lpsn: The last psn for the given TID RDMA WRITE swqe. + * + * This function is called to update the qp retry state upon + * receiving a TID WRITE RESP after the qp is scheduled to retry + * a request. + */ +static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn, + u32 lpsn) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + qp->s_psn = psn + 1; + /* + * If this is the first TID RDMA WRITE RESP packet for the current + * request, change the s_state so that the retry will be processed + * correctly. Similarly, if this is the last TID RDMA WRITE RESP + * packet, change the s_state and advance the s_cur. + */ + if (cmp_psn(psn, lpsn) >= 0) { + qp->s_cur = qpriv->s_tid_cur + 1; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_state = TID_OP(WRITE_REQ); + } else if (!cmp_psn(psn, spsn)) { + qp->s_cur = qpriv->s_tid_cur; + qp->s_state = TID_OP(WRITE_RESP); + } +} + /** * do_rc_ack - process an incoming RC ACK * @qp: the QP the ACK came in on @@ -1280,15 +2016,17 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, * May be called at interrupt level, with the QP s_lock held. * Returns 1 if OK, 0 if current operation should be aborted (NAK). */ -static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, - u64 val, struct hfi1_ctxtdata *rcd) +int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val, struct hfi1_ctxtdata *rcd) { struct hfi1_ibport *ibp; enum ib_wc_status status; + struct hfi1_qp_priv *qpriv = qp->priv; struct rvt_swqe *wqe; int ret = 0; u32 ack_psn; int diff; + struct rvt_dev_info *rdi; lockdep_assert_held(&qp->s_lock); /* @@ -1331,20 +2069,14 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, */ if ((wqe->wr.opcode == IB_WR_RDMA_READ && (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + (wqe->wr.opcode == IB_WR_TID_RDMA_READ && + (opcode != TID_OP(READ_RESP) || diff != 0)) || ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && - (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { - /* Retry this request. */ - if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { - qp->r_flags |= RVT_R_RDMAR_SEQ; - hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); - if (list_empty(&qp->rspwait)) { - qp->r_flags |= RVT_R_RSP_SEND; - rvt_get_qp(qp); - list_add_tail(&qp->rspwait, - &rcd->qp_wait_list); - } - } + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) || + (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + (delta_psn(psn, qp->s_last_psn) != 1))) { + set_restart_qp(qp, rcd); /* * No need to process the ACK/NAK since we are * restarting an earlier request. @@ -1356,6 +2088,9 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 *vaddr = wqe->sg_list[0].vaddr; *vaddr = val; } + if (wqe->wr.opcode == IB_WR_OPFN) + opfn_conn_reply(qp, val); + if (qp->s_num_rd_atomic && (wqe->wr.opcode == IB_WR_RDMA_READ || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || @@ -1373,26 +2108,85 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, hfi1_schedule_send(qp); } } + + /* + * TID RDMA WRITE requests will be completed by the TID RDMA + * ACK packet handler (see tid_rdma.c). + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + wqe = do_rc_completion(qp, wqe, ibp); if (qp->s_acked == qp->s_tail) break; } + trace_hfi1_rc_ack_do(qp, aeth, psn, wqe); + trace_hfi1_sender_do_rc_ack(qp); switch (aeth >> IB_AETH_NAK_SHIFT) { case 0: /* ACK */ this_cpu_inc(*ibp->rvp.rc_acks); - if (qp->s_acked != qp->s_tail) { - /* - * We are expecting more ACKs so - * mod the retry timer. - */ - rvt_mod_retry_timer(qp); + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { + if (wqe_to_tid_req(wqe)->ack_pending) + rvt_mod_retry_timer_ext(qp, + qpriv->timeout_shift); + else + rvt_stop_rc_timers(qp); + } else if (qp->s_acked != qp->s_tail) { + struct rvt_swqe *__w = NULL; + + if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID) + __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); + /* - * We can stop re-sending the earlier packets and - * continue with the next packet the receiver wants. + * Stop timers if we've received all of the TID RDMA + * WRITE * responses. */ - if (cmp_psn(qp->s_psn, psn) <= 0) - reset_psn(qp, psn + 1); + if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode == TID_OP(WRITE_RESP)) { + /* + * Normally, the loop above would correctly + * process all WQEs from s_acked onward and + * either complete them or check for correct + * PSN sequencing. + * However, for TID RDMA, due to pipelining, + * the response may not be for the request at + * s_acked so the above look would just be + * skipped. This does not allow for checking + * the PSN sequencing. It has to be done + * separately. + */ + if (cmp_psn(psn, qp->s_last_psn + 1)) { + set_restart_qp(qp, rcd); + goto bail_stop; + } + /* + * If the psn is being resent, stop the + * resending. + */ + if (qp->s_cur != qp->s_tail && + cmp_psn(qp->s_psn, psn) <= 0) + update_qp_retry_state(qp, psn, + __w->psn, + __w->lpsn); + else if (--qpriv->pending_tid_w_resp) + rvt_mod_retry_timer(qp); + else + rvt_stop_rc_timers(qp); + } else { + /* + * We are expecting more ACKs so + * mod the retry timer. + */ + rvt_mod_retry_timer(qp); + /* + * We can stop re-sending the earlier packets + * and continue with the next packet the + * receiver wants. + */ + if (cmp_psn(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } } else { /* No more acks - kill all timers */ rvt_stop_rc_timers(qp); @@ -1408,6 +2202,15 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, rvt_get_credit(qp, aeth); qp->s_rnr_retry = qp->s_rnr_retry_cnt; qp->s_retry = qp->s_retry_cnt; + /* + * If the current request is a TID RDMA WRITE request and the + * response is not a TID RDMA WRITE RESP packet, s_last_psn + * can't be advanced. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode != TID_OP(WRITE_RESP) && + cmp_psn(psn, wqe->psn) >= 0) + return 1; update_last_psn(qp, psn); return 1; @@ -1417,20 +2220,31 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, goto bail_stop; if (qp->s_flags & RVT_S_WAIT_RNR) goto bail_stop; - if (qp->s_rnr_retry == 0) { + rdi = ib_to_rvt(qp->ibqp.device); + if (qp->s_rnr_retry == 0 && + !((rdi->post_parms[wqe->wr.opcode].flags & + RVT_OPERATION_IGN_RNR_CNT) && + qp->s_rnr_retry_cnt == 0)) { status = IB_WC_RNR_RETRY_EXC_ERR; goto class_b; } - if (qp->s_rnr_retry_cnt < 7) + if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) qp->s_rnr_retry--; - /* The last valid PSN is the previous PSN. */ - update_last_psn(qp, psn - 1); + /* + * The last valid PSN is the previous PSN. For TID RDMA WRITE + * request, s_last_psn should be incremented only when a TID + * RDMA WRITE RESP is received to avoid skipping lost TID RDMA + * WRITE RESP packets. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + reset_psn(qp, qp->s_last_psn + 1); + } else { + update_last_psn(qp, psn - 1); + reset_psn(qp, psn); + } ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); - - reset_psn(qp, psn); - qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); rvt_stop_rc_timers(qp); rvt_add_rnr_timer(qp, aeth); @@ -1470,7 +2284,10 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { - rvt_send_complete(qp, wqe, status); + if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) + hfi1_kern_read_tid_flow_free(qp); + + hfi1_trdma_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1511,6 +2328,8 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, while (cmp_psn(psn, wqe->lpsn) > 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) break; @@ -1717,16 +2536,6 @@ bail: return; } -static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, - struct rvt_qp *qp) -{ - if (list_empty(&qp->rspwait)) { - qp->r_flags |= RVT_R_RSP_NAK; - rvt_get_qp(qp); - list_add_tail(&qp->rspwait, &rcd->qp_wait_list); - } -} - static inline void rc_cancel_ack(struct rvt_qp *qp) { qp->r_adefered = 0; @@ -1759,8 +2568,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, struct hfi1_ibport *ibp = rcd_to_iport(rcd); struct rvt_ack_entry *e; unsigned long flags; - u8 i, prev; - int old_req; + u8 prev; + u8 mra; /* most recent ACK */ + bool old_req; trace_hfi1_rcv_error(qp, psn); if (diff > 0) { @@ -1806,29 +2616,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, spin_lock_irqsave(&qp->s_lock, flags); - for (i = qp->r_head_ack_queue; ; i = prev) { - if (i == qp->s_tail_ack_queue) - old_req = 0; - if (i) - prev = i - 1; - else - prev = HFI1_MAX_RDMA_ATOMIC; - if (prev == qp->r_head_ack_queue) { - e = NULL; - break; - } - e = &qp->s_ack_queue[prev]; - if (!e->opcode) { - e = NULL; - break; - } - if (cmp_psn(psn, e->psn) >= 0) { - if (prev == qp->s_tail_ack_queue && - cmp_psn(psn, e->lpsn) <= 0) - old_req = 0; - break; - } - } + e = find_prev_entry(qp, psn, &prev, &mra, &old_req); + switch (opcode) { case OP(RDMA_READ_REQUEST): { struct ib_reth *reth; @@ -1875,6 +2664,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, e->psn = psn; if (old_req) goto unlock_done; + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; break; } @@ -1888,6 +2679,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, */ if (!e || e->opcode != (u8)opcode || old_req) goto unlock_done; + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; break; } @@ -1903,7 +2696,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the most recent ACK if this request is * after all the previous RDMA reads and atomics. */ - if (i == qp->r_head_ack_queue) { + if (mra == qp->r_head_ack_queue) { spin_unlock_irqrestore(&qp->s_lock, flags); qp->r_nak_state = 0; qp->r_ack_psn = qp->r_psn - 1; @@ -1914,7 +2707,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the RDMA read or atomic op which * ACKs this duplicate request. */ - qp->s_tail_ack_queue = i; + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = mra; + qp->s_tail_ack_queue = mra; break; } qp->s_ack_state = OP(ACKNOWLEDGE); @@ -1931,17 +2726,6 @@ send_ack: return 0; } -static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) -{ - unsigned next; - - next = n + 1; - if (next > HFI1_MAX_RDMA_ATOMIC) - next = 0; - qp->s_tail_ack_queue = next; - qp->s_ack_state = OP(ACKNOWLEDGE); -} - static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, u32 rqpn, u8 svc_type) { @@ -2039,6 +2823,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *qpriv = qp->priv; struct hfi1_ibport *ibp = rcd_to_iport(rcd); struct ib_other_headers *ohdr = packet->ohdr; u32 opcode = packet->opcode; @@ -2061,6 +2846,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) return; fecn = process_ecn(qp, packet); + opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1])); /* * Process responses (ACKs) before anything else. Note that the @@ -2292,11 +3078,11 @@ send_last: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) goto nack_inv; next = qp->r_head_ack_queue + 1; - /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */ - if (next > HFI1_MAX_RDMA_ATOMIC) + /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */ + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (unlikely(next == qp->s_tail_ack_queue)) { + if (unlikely(next == qp->s_acked_ack_queue)) { if (!qp->s_ack_queue[next].sent) goto nack_inv_unlck; update_ack_queue(qp, next); @@ -2343,6 +3129,7 @@ send_last: qp->r_state = opcode; qp->r_nak_state = 0; qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; /* Schedule the send engine. */ qp->s_flags |= RVT_S_RESP_PENDING; @@ -2356,21 +3143,24 @@ send_last: case OP(COMPARE_SWAP): case OP(FETCH_ADD): { - struct ib_atomic_eth *ateth; + struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth; + u64 vaddr = get_ib_ateth_vaddr(ateth); + bool opfn = opcode == OP(COMPARE_SWAP) && + vaddr == HFI1_VERBS_E_ATOMIC_VADDR; struct rvt_ack_entry *e; - u64 vaddr; atomic64_t *maddr; u64 sdata; u32 rkey; u8 next; - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !opfn)) goto nack_inv; next = qp->r_head_ack_queue + 1; - if (next > HFI1_MAX_RDMA_ATOMIC) + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (unlikely(next == qp->s_tail_ack_queue)) { + if (unlikely(next == qp->s_acked_ack_queue)) { if (!qp->s_ack_queue[next].sent) goto nack_inv_unlck; update_ack_queue(qp, next); @@ -2380,8 +3170,11 @@ send_last: rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } - ateth = &ohdr->u.atomic_eth; - vaddr = get_ib_ateth_vaddr(ateth); + /* Process OPFN special virtual address */ + if (opfn) { + opfn_conn_response(qp, e, ateth); + goto ack; + } if (unlikely(vaddr & (sizeof(u64) - 1))) goto nack_inv_unlck; rkey = be32_to_cpu(ateth->rkey); @@ -2400,6 +3193,7 @@ send_last: sdata); rvt_put_mr(qp->r_sge.sge.mr); qp->r_sge.num_sge = 0; +ack: e->opcode = opcode; e->sent = 0; e->psn = psn; @@ -2409,6 +3203,7 @@ send_last: qp->r_state = opcode; qp->r_nak_state = 0; qp->r_head_ack_queue = next; + qpriv->r_tid_alloc = qp->r_head_ack_queue; /* Schedule the send engine. */ qp->s_flags |= RVT_S_RESP_PENDING; diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ + +#ifndef HFI1_RC_H +#define HFI1_RC_H + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n) +{ + unsigned int next; + + next = n + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + qp->s_tail_ack_queue = next; + qp->s_acked_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); +} + +static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, + struct rvt_qp *qp) +{ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_NAK; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = delta_psn(psn, wqe->psn) * pmtu; + return rvt_restart_sge(ss, wqe, len); +} + +struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, + u8 *prev_ack, bool *scheduled); +int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val, + struct hfi1_ctxtdata *rcd); +struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct hfi1_ibport *ibp); + +#endif /* HFI1_RC_H */ diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c @@ -250,7 +250,6 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp, struct ib_other_headers *ohdr, u32 bth0, u32 bth1, u32 bth2) { - bth1 |= qp->remote_qpn; ohdr->bth[0] = cpu_to_be32(bth0); ohdr->bth[1] = cpu_to_be32(bth1); ohdr->bth[2] = cpu_to_be32(bth2); @@ -272,13 +271,13 @@ static inline void hfi1_make_ruc_bth(struct rvt_qp *qp, */ static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, + int middle, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibport *ibp = ps->ibp; struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u32 bth1 = 0; u32 slid; u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); u8 l4 = OPA_16B_L4_IB_LOCAL; @@ -360,12 +359,12 @@ static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp, */ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, + int middle, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibport *ibp = ps->ibp; - u32 bth1 = 0; u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); u16 lrh0 = HFI1_LRH_BTH; u8 extra_bytes = -ps->s_txreq->s_cur_size & 3; @@ -415,7 +414,7 @@ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp, typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, int middle, struct hfi1_pkt_state *ps); /* We support only two types - 9B and 16B for now */ @@ -425,7 +424,7 @@ static const hfi1_make_ruc_hdr hfi1_ruc_header_tbl[2] = { }; void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, - u32 bth0, u32 bth2, int middle, + u32 bth0, u32 bth1, u32 bth2, int middle, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; @@ -446,18 +445,21 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, priv->s_ahg->ahgidx = 0; /* Make the appropriate header */ - hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps); + hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth1, bth2, middle, + ps); } /* when sending, force a reschedule every one of these periods */ #define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */ /** - * schedule_send_yield - test for a yield required for QP send engine + * hfi1_schedule_send_yield - test for a yield required for QP + * send engine * @timeout: Final time for timeout slice for jiffies * @qp: a pointer to QP * @ps: a pointer to a structure with commonly lookup values for * the the send engine progress + * @tid - true if it is the tid leg * * This routine checks if the time slice for the QP has expired * for RC QPs, if so an additional work entry is queued. At this @@ -465,8 +467,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr, * returns true if a yield is required, otherwise, false * is returned. */ -static bool schedule_send_yield(struct rvt_qp *qp, - struct hfi1_pkt_state *ps) +bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + bool tid) { ps->pkts_sent = true; @@ -474,8 +476,24 @@ static bool schedule_send_yield(struct rvt_qp *qp, if (!ps->in_thread || workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) { spin_lock_irqsave(&qp->s_lock, ps->flags); - qp->s_flags &= ~RVT_S_BUSY; - hfi1_schedule_send(qp); + if (!tid) { + qp->s_flags &= ~RVT_S_BUSY; + hfi1_schedule_send(qp); + } else { + struct hfi1_qp_priv *priv = qp->priv; + + if (priv->s_flags & + HFI1_S_TID_BUSY_SET) { + qp->s_flags &= ~RVT_S_BUSY; + priv->s_flags &= + ~(HFI1_S_TID_BUSY_SET | + RVT_S_BUSY); + } else { + priv->s_flags &= ~RVT_S_BUSY; + } + hfi1_schedule_tid_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, ps->flags); this_cpu_inc(*ps->ppd->dd->send_schedule); trace_hfi1_rc_expired_time_slice(qp, true); @@ -576,6 +594,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) do { /* Check for a constructed packet to be sent. */ if (ps.s_txreq) { + if (priv->s_flags & HFI1_S_TID_BUSY_SET) + qp->s_flags |= RVT_S_BUSY; spin_unlock_irqrestore(&qp->s_lock, ps.flags); /* * If the packet cannot be sent now, return and @@ -585,7 +605,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) return; /* allow other tasks to run */ - if (schedule_send_yield(qp, &ps)) + if (hfi1_schedule_send_yield(qp, &ps, false)) return; spin_lock_irqsave(&qp->s_lock, ps.flags); diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c @@ -1747,10 +1747,9 @@ retry: */ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) { - struct iowait *wait, *nw; + struct iowait *wait, *nw, *twait; struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; - uint i, n = 0, seq, max_idx = 0; - u8 max_starved_cnt = 0; + uint i, n = 0, seq, tidx = 0; #ifdef CONFIG_SDMA_VERBOSITY dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, @@ -1775,13 +1774,20 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) continue; if (n == ARRAY_SIZE(waits)) break; + iowait_init_priority(wait); num_desc = iowait_get_all_desc(wait); if (num_desc > avail) break; avail -= num_desc; - /* Find the most starved wait memeber */ - iowait_starve_find_max(wait, &max_starved_cnt, - n, &max_idx); + /* Find the top-priority wait memeber */ + if (n) { + twait = waits[tidx]; + tidx = + iowait_priority_update_top(wait, + twait, + n, + tidx); + } list_del_init(&wait->list); waits[n++] = wait; } @@ -1790,12 +1796,12 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) } } while (read_seqretry(&sde->waitlock, seq)); - /* Schedule the most starved one first */ + /* Schedule the top-priority entry first */ if (n) - waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON); + waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON); for (i = 0; i < n; i++) - if (i != max_idx) + if (i != tidx) waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON); } diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h @@ -91,6 +91,7 @@ struct sdma_desc { #define SDMA_TXREQ_F_URGENT 0x0001 #define SDMA_TXREQ_F_AHG_COPY 0x0002 #define SDMA_TXREQ_F_USE_AHG 0x0004 +#define SDMA_TXREQ_F_VIP 0x0010 struct sdma_txreq; typedef void (*callback_t)(struct sdma_txreq *, int); diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c @@ -498,7 +498,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } @@ -508,7 +508,7 @@ static ssize_t board_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); int ret; @@ -524,7 +524,7 @@ static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); /* The string printed here is already newline-terminated. */ @@ -536,7 +536,7 @@ static ssize_t nctxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); /* @@ -555,7 +555,7 @@ static ssize_t nfreectxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); /* Return the number of free user ports (contexts) available. */ @@ -567,7 +567,7 @@ static ssize_t serial_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); return scnprintf(buf, PAGE_SIZE, "%s", dd->serial); @@ -579,7 +579,7 @@ static ssize_t chip_reset_store(struct device *device, size_t count) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); int ret; @@ -609,7 +609,7 @@ static ssize_t tempsense_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); struct hfi1_temp temp; int ret; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -5,8 +5,282 @@ */ #include "hfi.h" +#include "qp.h" +#include "rc.h" #include "verbs.h" #include "tid_rdma.h" +#include "exp_rcv.h" +#include "trace.h" + +/** + * DOC: TID RDMA READ protocol + * + * This is an end-to-end protocol at the hfi1 level between two nodes that + * improves performance by avoiding data copy on the requester side. It + * converts a qualified RDMA READ request into a TID RDMA READ request on + * the requester side and thereafter handles the request and response + * differently. To be qualified, the RDMA READ request should meet the + * following: + * -- The total data length should be greater than 256K; + * -- The total data length should be a multiple of 4K page size; + * -- Each local scatter-gather entry should be 4K page aligned; + * -- Each local scatter-gather entry should be a multiple of 4K page size; + */ + +#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) +#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33) +#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34) +#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35) +#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37) +#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38) + +/* Maximum number of packets within a flow generation. */ +#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT) + +#define GENERATION_MASK 0xFFFFF + +static u32 mask_generation(u32 a) +{ + return a & GENERATION_MASK; +} + +/* Reserved generation value to set to unused flows for kernel contexts */ +#define KERN_GENERATION_RESERVED mask_generation(U32_MAX) + +/* + * J_KEY for kernel contexts when TID RDMA is used. + * See generate_jkey() in hfi.h for more information. + */ +#define TID_RDMA_JKEY 32 +#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE +#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1) + +/* Maximum number of segments in flight per QP request. */ +#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6 +#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4 +#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \ + TID_RDMA_MAX_WRITE_SEGS_PER_REQ) +#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1) + +#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) + +#define TID_RDMA_DESTQP_FLOW_SHIFT 11 +#define TID_RDMA_DESTQP_FLOW_MASK 0x1f + +#define TID_FLOW_SW_PSN BIT(0) + +#define TID_OPFN_QP_CTXT_MASK 0xff +#define TID_OPFN_QP_CTXT_SHIFT 56 +#define TID_OPFN_QP_KDETH_MASK 0xff +#define TID_OPFN_QP_KDETH_SHIFT 48 +#define TID_OPFN_MAX_LEN_MASK 0x7ff +#define TID_OPFN_MAX_LEN_SHIFT 37 +#define TID_OPFN_TIMEOUT_MASK 0x1f +#define TID_OPFN_TIMEOUT_SHIFT 32 +#define TID_OPFN_RESERVED_MASK 0x3f +#define TID_OPFN_RESERVED_SHIFT 26 +#define TID_OPFN_URG_MASK 0x1 +#define TID_OPFN_URG_SHIFT 25 +#define TID_OPFN_VER_MASK 0x7 +#define TID_OPFN_VER_SHIFT 22 +#define TID_OPFN_JKEY_MASK 0x3f +#define TID_OPFN_JKEY_SHIFT 16 +#define TID_OPFN_MAX_READ_MASK 0x3f +#define TID_OPFN_MAX_READ_SHIFT 10 +#define TID_OPFN_MAX_WRITE_MASK 0x3f +#define TID_OPFN_MAX_WRITE_SHIFT 4 + +/* + * OPFN TID layout + * + * 63 47 31 15 + * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC + * 3210987654321098 7654321098765432 1098765432109876 5432109876543210 + * N - the context Number + * K - the Kdeth_qp + * M - Max_len + * T - Timeout + * D - reserveD + * V - version + * U - Urg capable + * J - Jkey + * R - max_Read + * W - max_Write + * C - Capcode + */ + +static u32 tid_rdma_flow_wt; + +static void tid_rdma_trigger_resume(struct work_struct *work); +static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, + gfp_t gfp); +static void hfi1_init_trdma_req(struct rvt_qp *qp, + struct tid_rdma_request *req); +static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx); +static void hfi1_tid_timeout(struct timer_list *t); +static void hfi1_add_tid_reap_timer(struct rvt_qp *qp); +static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp); +static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp); +static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp); +static void hfi1_tid_retry_timeout(struct timer_list *t); +static int make_tid_rdma_ack(struct rvt_qp *qp, + struct ib_other_headers *ohdr, + struct hfi1_pkt_state *ps); +static void hfi1_do_tid_send(struct rvt_qp *qp); + +static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) +{ + return + (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) << + TID_OPFN_QP_CTXT_SHIFT) | + ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) << + TID_OPFN_QP_KDETH_SHIFT) | + (((u64)((p->max_len >> PAGE_SHIFT) - 1) & + TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) | + (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) << + TID_OPFN_TIMEOUT_SHIFT) | + (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) | + (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) | + (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) << + TID_OPFN_MAX_READ_SHIFT) | + (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) << + TID_OPFN_MAX_WRITE_SHIFT); +} + +static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data) +{ + p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) & + TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT; + p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK; + p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) & + TID_OPFN_MAX_WRITE_MASK; + p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) & + TID_OPFN_MAX_READ_MASK; + p->qp = + ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK) + << 16) | + ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK)); + p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK; + p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK; +} + +void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p) +{ + struct hfi1_qp_priv *priv = qp->priv; + + p->qp = (kdeth_qp << 16) | priv->rcd->ctxt; + p->max_len = TID_RDMA_MAX_SEGMENT_SIZE; + p->jkey = priv->rcd->jkey; + p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ; + p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ; + p->timeout = qp->timeout; + p->urg = is_urg_masked(priv->rcd); +} + +bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data) +{ + struct hfi1_qp_priv *priv = qp->priv; + + *data = tid_rdma_opfn_encode(&priv->tid_rdma.local); + return true; +} + +bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct tid_rdma_params *remote, *old; + bool ret = true; + + old = rcu_dereference_protected(priv->tid_rdma.remote, + lockdep_is_held(&priv->opfn.lock)); + data &= ~0xfULL; + /* + * If data passed in is zero, return true so as not to continue the + * negotiation process + */ + if (!data || !HFI1_CAP_IS_KSET(TID_RDMA)) + goto null; + /* + * If kzalloc fails, return false. This will result in: + * * at the requester a new OPFN request being generated to retry + * the negotiation + * * at the responder, 0 being returned to the requester so as to + * disable TID RDMA at both the requester and the responder + */ + remote = kzalloc(sizeof(*remote), GFP_ATOMIC); + if (!remote) { + ret = false; + goto null; + } + + tid_rdma_opfn_decode(remote, data); + priv->tid_timer_timeout_jiffies = + usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) / + 1000UL) << 3) * 7); + trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local); + trace_hfi1_opfn_param(qp, 1, remote); + rcu_assign_pointer(priv->tid_rdma.remote, remote); + /* + * A TID RDMA READ request's segment size is not equal to + * remote->max_len only when the request's data length is smaller + * than remote->max_len. In that case, there will be only one segment. + * Therefore, when priv->pkts_ps is used to calculate req->cur_seg + * during retry, it will lead to req->cur_seg = 0, which is exactly + * what is expected. + */ + priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len); + priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1; + goto free; +null: + RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); + priv->timeout_shift = 0; +free: + if (old) + kfree_rcu(old, rcu_head); + return ret; +} + +bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data) +{ + bool ret; + + ret = tid_rdma_conn_reply(qp, *data); + *data = 0; + /* + * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate + * TID RDMA could not be enabled. This will result in TID RDMA being + * disabled at the requester too. + */ + if (ret) + (void)tid_rdma_conn_req(qp, data); + return ret; +} + +void tid_rdma_conn_error(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct tid_rdma_params *old; + + old = rcu_dereference_protected(priv->tid_rdma.remote, + lockdep_is_held(&priv->opfn.lock)); + RCU_INIT_POINTER(priv->tid_rdma.remote, NULL); + if (old) + kfree_rcu(old, rcu_head); +} + +/* This is called at context initialization time */ +int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) +{ + if (reinit) + return 0; + + BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY); + BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY); + rcd->jkey = TID_RDMA_JKEY; + hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey); + return hfi1_alloc_ctxt_rcv_groups(rcd); +} /** * qp_to_rcd - determine the receive context used by a qp @@ -41,8 +315,5152 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr) { struct hfi1_qp_priv *qpriv = qp->priv; + int i, ret; qpriv->rcd = qp_to_rcd(rdi, qp); + spin_lock_init(&qpriv->opfn.lock); + INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request); + INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume); + qpriv->flow_state.psn = 0; + qpriv->flow_state.index = RXE_NUM_TID_FLOWS; + qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS; + qpriv->flow_state.generation = KERN_GENERATION_RESERVED; + qpriv->s_state = TID_OP(WRITE_RESP); + qpriv->s_tid_cur = HFI1_QP_WQE_INVALID; + qpriv->s_tid_head = HFI1_QP_WQE_INVALID; + qpriv->s_tid_tail = HFI1_QP_WQE_INVALID; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qpriv->r_tid_head = HFI1_QP_WQE_INVALID; + qpriv->r_tid_tail = HFI1_QP_WQE_INVALID; + qpriv->r_tid_ack = HFI1_QP_WQE_INVALID; + qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID; + atomic_set(&qpriv->n_requests, 0); + atomic_set(&qpriv->n_tid_requests, 0); + timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0); + timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0); + INIT_LIST_HEAD(&qpriv->tid_wait); + + if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + struct hfi1_devdata *dd = qpriv->rcd->dd; + + qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES * + sizeof(*qpriv->pages), + GFP_KERNEL, dd->node); + if (!qpriv->pages) + return -ENOMEM; + for (i = 0; i < qp->s_size; i++) { + struct hfi1_swqe_priv *priv; + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, + dd->node); + if (!priv) + return -ENOMEM; + + hfi1_init_trdma_req(qp, &priv->tid_req); + priv->tid_req.e.swqe = wqe; + wqe->priv = priv; + } + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct hfi1_ack_priv *priv; + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, + dd->node); + if (!priv) + return -ENOMEM; + + hfi1_init_trdma_req(qp, &priv->tid_req); + priv->tid_req.e.ack = &qp->s_ack_queue[i]; + + ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, + GFP_KERNEL); + if (ret) { + kfree(priv); + return ret; + } + qp->s_ack_queue[i].priv = priv; + } + } + + return 0; +} + +void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct rvt_swqe *wqe; + u32 i; + + if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + for (i = 0; i < qp->s_size; i++) { + wqe = rvt_get_swqe_ptr(qp, i); + kfree(wqe->priv); + wqe->priv = NULL; + } + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv; + + if (priv) + hfi1_kern_exp_rcv_free_flows(&priv->tid_req); + kfree(priv); + qp->s_ack_queue[i].priv = NULL; + } + cancel_work_sync(&qpriv->opfn.opfn_work); + kfree(qpriv->pages); + qpriv->pages = NULL; + } +} + +/* Flow and tid waiter functions */ +/** + * DOC: lock ordering + * + * There are two locks involved with the queuing + * routines: the qp s_lock and the exp_lock. + * + * Since the tid space allocation is called from + * the send engine, the qp s_lock is already held. + * + * The allocation routines will get the exp_lock. + * + * The first_qp() call is provided to allow the head of + * the rcd wait queue to be fetched under the exp_lock and + * followed by a drop of the exp_lock. + * + * Any qp in the wait list will have the qp reference count held + * to hold the qp in memory. + */ + +/* + * return head of rcd wait list + * + * Must hold the exp_lock. + * + * Get a reference to the QP to hold the QP in memory. + * + * The caller must release the reference when the local + * is no longer being used. + */ +static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue) + __must_hold(&rcd->exp_lock) +{ + struct hfi1_qp_priv *priv; + + lockdep_assert_held(&rcd->exp_lock); + priv = list_first_entry_or_null(&queue->queue_head, + struct hfi1_qp_priv, + tid_wait); + if (!priv) + return NULL; + rvt_get_qp(priv->owner); + return priv->owner; +} + +/** + * kernel_tid_waiters - determine rcd wait + * @rcd: the receive context + * @qp: the head of the qp being processed + * + * This routine will return false IFF + * the list is NULL or the head of the + * list is the indicated qp. + * + * Must hold the qp s_lock and the exp_lock. + * + * Return: + * false if either of the conditions below are statisfied: + * 1. The list is empty or + * 2. The indicated qp is at the head of the list and the + * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags. + * true is returned otherwise. + */ +static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue, struct rvt_qp *qp) + __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) +{ + struct rvt_qp *fqp; + bool ret = true; + + lockdep_assert_held(&qp->s_lock); + lockdep_assert_held(&rcd->exp_lock); + fqp = first_qp(rcd, queue); + if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE))) + ret = false; + rvt_put_qp(fqp); + return ret; +} + +/** + * dequeue_tid_waiter - dequeue the qp from the list + * @qp - the qp to remove the wait list + * + * This routine removes the indicated qp from the + * wait list if it is there. + * + * This should be done after the hardware flow and + * tid array resources have been allocated. + * + * Must hold the qp s_lock and the rcd exp_lock. + * + * It assumes the s_lock to protect the s_flags + * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag. + */ +static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue, struct rvt_qp *qp) + __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + lockdep_assert_held(&rcd->exp_lock); + if (list_empty(&priv->tid_wait)) + return; + list_del_init(&priv->tid_wait); + qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; + queue->dequeue++; + rvt_put_qp(qp); +} + +/** + * queue_qp_for_tid_wait - suspend QP on tid space + * @rcd: the receive context + * @qp: the qp + * + * The qp is inserted at the tail of the rcd + * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set. + * + * Must hold the qp s_lock and the exp_lock. + */ +static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd, + struct tid_queue *queue, struct rvt_qp *qp) + __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + + lockdep_assert_held(&qp->s_lock); + lockdep_assert_held(&rcd->exp_lock); + if (list_empty(&priv->tid_wait)) { + qp->s_flags |= HFI1_S_WAIT_TID_SPACE; + list_add_tail(&priv->tid_wait, &queue->queue_head); + priv->tid_enqueue = ++queue->enqueue; + rcd->dd->verbs_dev.n_tidwait++; + trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE); + rvt_get_qp(qp); + } +} + +/** + * __trigger_tid_waiter - trigger tid waiter + * @qp: the qp + * + * This is a private entrance to schedule the qp + * assuming the caller is holding the qp->s_lock. + */ +static void __trigger_tid_waiter(struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + lockdep_assert_held(&qp->s_lock); + if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE)) + return; + trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE); + hfi1_schedule_send(qp); +} + +/** + * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp + * @qp - the qp + * + * trigger a schedule or a waiting qp in a deadlock + * safe manner. The qp reference is held prior + * to this call via first_qp(). + * + * If the qp trigger was already scheduled (!rval) + * the the reference is dropped, otherwise the resume + * or the destroy cancel will dispatch the reference. + */ +static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + bool rval; + + if (!qp) + return; + + priv = qp->priv; + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + dd = dd_from_ibdev(qp->ibqp.device); + + rval = queue_work_on(priv->s_sde ? + priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node)), + ppd->hfi1_wq, + &priv->tid_rdma.trigger_work); + if (!rval) + rvt_put_qp(qp); +} + +/** + * tid_rdma_trigger_resume - field a trigger work request + * @work - the work item + * + * Complete the off qp trigger processing by directly + * calling the progress routine. + */ +static void tid_rdma_trigger_resume(struct work_struct *work) +{ + struct tid_rdma_qp_params *tr; + struct hfi1_qp_priv *priv; + struct rvt_qp *qp; + + tr = container_of(work, struct tid_rdma_qp_params, trigger_work); + priv = container_of(tr, struct hfi1_qp_priv, tid_rdma); + qp = priv->owner; + spin_lock_irq(&qp->s_lock); + if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) { + spin_unlock_irq(&qp->s_lock); + hfi1_do_send(priv->owner, true); + } else { + spin_unlock_irq(&qp->s_lock); + } + rvt_put_qp(qp); +} + +/** + * tid_rdma_flush_wait - unwind any tid space wait + * + * This is called when resetting a qp to + * allow a destroy or reset to get rid + * of any tid space linkage and reference counts. + */ +static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv; + + if (!qp) + return; + lockdep_assert_held(&qp->s_lock); + priv = qp->priv; + qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; + spin_lock(&priv->rcd->exp_lock); + if (!list_empty(&priv->tid_wait)) { + list_del_init(&priv->tid_wait); + qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE; + queue->dequeue++; + rvt_put_qp(qp); + } + spin_unlock(&priv->rcd->exp_lock); +} + +void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *priv = qp->priv; + + _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue); + _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue); +} + +/* Flow functions */ +/** + * kern_reserve_flow - allocate a hardware flow + * @rcd - the context to use for allocation + * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to + * signify "don't care". + * + * Use a bit mask based allocation to reserve a hardware + * flow for use in receiving KDETH data packets. If a preferred flow is + * specified the function will attempt to reserve that flow again, if + * available. + * + * The exp_lock must be held. + * + * Return: + * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1 + * On failure: -EAGAIN + */ +static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last) + __must_hold(&rcd->exp_lock) +{ + int nr; + + /* Attempt to reserve the preferred flow index */ + if (last >= 0 && last < RXE_NUM_TID_FLOWS && + !test_and_set_bit(last, &rcd->flow_mask)) + return last; + + nr = ffz(rcd->flow_mask); + BUILD_BUG_ON(RXE_NUM_TID_FLOWS >= + (sizeof(rcd->flow_mask) * BITS_PER_BYTE)); + if (nr > (RXE_NUM_TID_FLOWS - 1)) + return -EAGAIN; + set_bit(nr, &rcd->flow_mask); + return nr; +} + +static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation, + u32 flow_idx) +{ + u64 reg; + + reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) | + RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK | + RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK | + RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK | + RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK | + RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK; + + if (generation != KERN_GENERATION_RESERVED) + reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK; + + write_uctxt_csr(rcd->dd, rcd->ctxt, + RCV_TID_FLOW_TABLE + 8 * flow_idx, reg); +} + +static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) + __must_hold(&rcd->exp_lock) +{ + u32 generation = rcd->flows[flow_idx].generation; + + kern_set_hw_flow(rcd, generation, flow_idx); + return generation; +} + +static u32 kern_flow_generation_next(u32 gen) +{ + u32 generation = mask_generation(gen + 1); + + if (generation == KERN_GENERATION_RESERVED) + generation = mask_generation(generation + 1); + return generation; +} + +static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx) + __must_hold(&rcd->exp_lock) +{ + rcd->flows[flow_idx].generation = + kern_flow_generation_next(rcd->flows[flow_idx].generation); + kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx); +} + +int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + struct rvt_qp *fqp; + unsigned long flags; + int ret = 0; + + /* The QP already has an allocated flow */ + if (fs->index != RXE_NUM_TID_FLOWS) + return ret; + + spin_lock_irqsave(&rcd->exp_lock, flags); + if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp)) + goto queue; + + ret = kern_reserve_flow(rcd, fs->last_index); + if (ret < 0) + goto queue; + fs->index = ret; + fs->last_index = fs->index; + + /* Generation received in a RESYNC overrides default flow generation */ + if (fs->generation != KERN_GENERATION_RESERVED) + rcd->flows[fs->index].generation = fs->generation; + fs->generation = kern_setup_hw_flow(rcd, fs->index); + fs->psn = 0; + fs->flags = 0; + dequeue_tid_waiter(rcd, &rcd->flow_queue, qp); + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->flow_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + + tid_rdma_schedule_tid_wakeup(fqp); + return 0; +queue: + queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + return -EAGAIN; +} + +void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv; + struct tid_flow_state *fs = &qpriv->flow_state; + struct rvt_qp *fqp; + unsigned long flags; + + if (fs->index >= RXE_NUM_TID_FLOWS) + return; + spin_lock_irqsave(&rcd->exp_lock, flags); + kern_clear_hw_flow(rcd, fs->index); + clear_bit(fs->index, &rcd->flow_mask); + fs->index = RXE_NUM_TID_FLOWS; + fs->psn = 0; + fs->generation = KERN_GENERATION_RESERVED; + + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->flow_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + + if (fqp == qp) { + __trigger_tid_waiter(fqp); + rvt_put_qp(fqp); + } else { + tid_rdma_schedule_tid_wakeup(fqp); + } +} + +void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd) +{ + int i; + + for (i = 0; i < RXE_NUM_TID_FLOWS; i++) { + rcd->flows[i].generation = mask_generation(prandom_u32()); + kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i); + } +} + +/* TID allocation functions */ +static u8 trdma_pset_order(struct tid_rdma_pageset *s) +{ + u8 count = s->count; + + return ilog2(count) + 1; +} + +/** + * tid_rdma_find_phys_blocks_4k - get groups base on mr info + * @npages - number of pages + * @pages - pointer to an array of page structs + * @list - page set array to return + * + * This routine returns the number of groups associated with + * the current sge information. This implementation is based + * on the expected receive find_phys_blocks() adjusted to + * use the MR information vs. the pfn. + * + * Return: + * the number of RcvArray entries + */ +static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow, + struct page **pages, + u32 npages, + struct tid_rdma_pageset *list) +{ + u32 pagecount, pageidx, setcount = 0, i; + void *vaddr, *this_vaddr; + + if (!npages) + return 0; + + /* + * Look for sets of physically contiguous pages in the user buffer. + * This will allow us to optimize Expected RcvArray entry usage by + * using the bigger supported sizes. + */ + vaddr = page_address(pages[0]); + trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr); + for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { + this_vaddr = i < npages ? page_address(pages[i]) : NULL; + trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0, + this_vaddr); + /* + * If the vaddr's are not sequential, pages are not physically + * contiguous. + */ + if (this_vaddr != (vaddr + PAGE_SIZE)) { + /* + * At this point we have to loop over the set of + * physically contiguous pages and break them down it + * sizes supported by the HW. + * There are two main constraints: + * 1. The max buffer size is MAX_EXPECTED_BUFFER. + * If the total set size is bigger than that + * program only a MAX_EXPECTED_BUFFER chunk. + * 2. The buffer size has to be a power of two. If + * it is not, round down to the closes power of + * 2 and program that size. + */ + while (pagecount) { + int maxpages = pagecount; + u32 bufsize = pagecount * PAGE_SIZE; + + if (bufsize > MAX_EXPECTED_BUFFER) + maxpages = + MAX_EXPECTED_BUFFER >> + PAGE_SHIFT; + else if (!is_power_of_2(bufsize)) + maxpages = + rounddown_pow_of_two(bufsize) >> + PAGE_SHIFT; + + list[setcount].idx = pageidx; + list[setcount].count = maxpages; + trace_hfi1_tid_pageset(flow->req->qp, setcount, + list[setcount].idx, + list[setcount].count); + pagecount -= maxpages; + pageidx += maxpages; + setcount++; + } + pageidx = i; + pagecount = 1; + vaddr = this_vaddr; + } else { + vaddr += PAGE_SIZE; + pagecount++; + } + } + /* insure we always return an even number of sets */ + if (setcount & 1) + list[setcount++].count = 0; + return setcount; +} + +/** + * tid_flush_pages - dump out pages into pagesets + * @list - list of pagesets + * @idx - pointer to current page index + * @pages - number of pages to dump + * @sets - current number of pagesset + * + * This routine flushes out accumuated pages. + * + * To insure an even number of sets the + * code may add a filler. + * + * This can happen with when pages is not + * a power of 2 or pages is a power of 2 + * less than the maximum pages. + * + * Return: + * The new number of sets + */ + +static u32 tid_flush_pages(struct tid_rdma_pageset *list, + u32 *idx, u32 pages, u32 sets) +{ + while (pages) { + u32 maxpages = pages; + + if (maxpages > MAX_EXPECTED_PAGES) + maxpages = MAX_EXPECTED_PAGES; + else if (!is_power_of_2(maxpages)) + maxpages = rounddown_pow_of_two(maxpages); + list[sets].idx = *idx; + list[sets++].count = maxpages; + *idx += maxpages; + pages -= maxpages; + } + /* might need a filler */ + if (sets & 1) + list[sets++].count = 0; + return sets; +} + +/** + * tid_rdma_find_phys_blocks_8k - get groups base on mr info + * @pages - pointer to an array of page structs + * @npages - number of pages + * @list - page set array to return + * + * This routine parses an array of pages to compute pagesets + * in an 8k compatible way. + * + * pages are tested two at a time, i, i + 1 for contiguous + * pages and i - 1 and i contiguous pages. + * + * If any condition is false, any accumlated pages are flushed and + * v0,v1 are emitted as separate PAGE_SIZE pagesets + * + * Otherwise, the current 8k is totaled for a future flush. + * + * Return: + * The number of pagesets + * list set with the returned number of pagesets + * + */ +static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow, + struct page **pages, + u32 npages, + struct tid_rdma_pageset *list) +{ + u32 idx, sets = 0, i; + u32 pagecnt = 0; + void *v0, *v1, *vm1; + + if (!npages) + return 0; + for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) { + /* get a new v0 */ + v0 = page_address(pages[i]); + trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0); + v1 = i + 1 < npages ? + page_address(pages[i + 1]) : NULL; + trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1); + /* compare i, i + 1 vaddr */ + if (v1 != (v0 + PAGE_SIZE)) { + /* flush out pages */ + sets = tid_flush_pages(list, &idx, pagecnt, sets); + /* output v0,v1 as two pagesets */ + list[sets].idx = idx++; + list[sets++].count = 1; + if (v1) { + list[sets].count = 1; + list[sets++].idx = idx++; + } else { + list[sets++].count = 0; + } + vm1 = NULL; + pagecnt = 0; + continue; + } + /* i,i+1 consecutive, look at i-1,i */ + if (vm1 && v0 != (vm1 + PAGE_SIZE)) { + /* flush out pages */ + sets = tid_flush_pages(list, &idx, pagecnt, sets); + pagecnt = 0; + } + /* pages will always be a multiple of 8k */ + pagecnt += 2; + /* save i-1 */ + vm1 = v1; + /* move to next pair */ + } + /* dump residual pages at end */ + sets = tid_flush_pages(list, &idx, npages - idx, sets); + /* by design cannot be odd sets */ + WARN_ON(sets & 1); + return sets; +} + +/** + * Find pages for one segment of a sge array represented by @ss. The function + * does not check the sge, the sge must have been checked for alignment with a + * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of + * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge + * copy maintained in @ss->sge, the original sge is not modified. + * + * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not + * releasing the MR reference count at the same time. Otherwise, we'll "leak" + * references to the MR. This difference requires that we keep track of progress + * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request + * structure. + */ +static u32 kern_find_pages(struct tid_rdma_flow *flow, + struct page **pages, + struct rvt_sge_state *ss, bool *last) +{ + struct tid_rdma_request *req = flow->req; + struct rvt_sge *sge = &ss->sge; + u32 length = flow->req->seg_len; + u32 len = PAGE_SIZE; + u32 i = 0; + + while (length && req->isge < ss->num_sge) { + pages[i++] = virt_to_page(sge->vaddr); + + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (!sge->sge_length) { + if (++req->isge < ss->num_sge) + *sge = ss->sg_list[req->isge - 1]; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + ++sge->m; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + + flow->length = flow->req->seg_len - length; + *last = req->isge == ss->num_sge ? false : true; + return i; +} + +static void dma_unmap_flow(struct tid_rdma_flow *flow) +{ + struct hfi1_devdata *dd; + int i; + struct tid_rdma_pageset *pset; + + dd = flow->req->rcd->dd; + for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; + i++, pset++) { + if (pset->count && pset->addr) { + dma_unmap_page(&dd->pcidev->dev, + pset->addr, + PAGE_SIZE * pset->count, + DMA_FROM_DEVICE); + pset->mapped = 0; + } + } +} + +static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages) +{ + int i; + struct hfi1_devdata *dd = flow->req->rcd->dd; + struct tid_rdma_pageset *pset; + + for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; + i++, pset++) { + if (pset->count) { + pset->addr = dma_map_page(&dd->pcidev->dev, + pages[pset->idx], + 0, + PAGE_SIZE * pset->count, + DMA_FROM_DEVICE); + + if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) { + dma_unmap_flow(flow); + return -ENOMEM; + } + pset->mapped = 1; + } + } return 0; } + +static inline bool dma_mapped(struct tid_rdma_flow *flow) +{ + return !!flow->pagesets[0].mapped; +} + +/* + * Get pages pointers and identify contiguous physical memory chunks for a + * segment. All segments are of length flow->req->seg_len. + */ +static int kern_get_phys_blocks(struct tid_rdma_flow *flow, + struct page **pages, + struct rvt_sge_state *ss, bool *last) +{ + u8 npages; + + /* Reuse previously computed pagesets, if any */ + if (flow->npagesets) { + trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, + flow); + if (!dma_mapped(flow)) + return dma_map_flow(flow, pages); + return 0; + } + + npages = kern_find_pages(flow, pages, ss, last); + + if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096)) + flow->npagesets = + tid_rdma_find_phys_blocks_4k(flow, pages, npages, + flow->pagesets); + else + flow->npagesets = + tid_rdma_find_phys_blocks_8k(flow, pages, npages, + flow->pagesets); + + return dma_map_flow(flow, pages); +} + +static inline void kern_add_tid_node(struct tid_rdma_flow *flow, + struct hfi1_ctxtdata *rcd, char *s, + struct tid_group *grp, u8 cnt) +{ + struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++]; + + WARN_ON_ONCE(flow->tnode_cnt >= + (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT)); + if (WARN_ON_ONCE(cnt & 1)) + dd_dev_err(rcd->dd, + "unexpected odd allocation cnt %u map 0x%x used %u", + cnt, grp->map, grp->used); + + node->grp = grp; + node->map = grp->map; + node->cnt = cnt; + trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1, + grp->base, grp->map, grp->used, cnt); +} + +/* + * Try to allocate pageset_count TID's from TID groups for a context + * + * This function allocates TID's without moving groups between lists or + * modifying grp->map. This is done as follows, being cogizant of the lists + * between which the TID groups will move: + * 1. First allocate complete groups of 8 TID's since this is more efficient, + * these groups will move from group->full without affecting used + * 2. If more TID's are needed allocate from used (will move from used->full or + * stay in used) + * 3. If we still don't have the required number of TID's go back and look again + * at a complete group (will move from group->used) + */ +static int kern_alloc_tids(struct tid_rdma_flow *flow) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + u32 ngroups, pageidx = 0; + struct tid_group *group = NULL, *used; + u8 use; + + flow->tnode_cnt = 0; + ngroups = flow->npagesets / dd->rcv_entries.group_size; + if (!ngroups) + goto used_list; + + /* First look at complete groups */ + list_for_each_entry(group, &rcd->tid_group_list.list, list) { + kern_add_tid_node(flow, rcd, "complete groups", group, + group->size); + + pageidx += group->size; + if (!--ngroups) + break; + } + + if (pageidx >= flow->npagesets) + goto ok; + +used_list: + /* Now look at partially used groups */ + list_for_each_entry(used, &rcd->tid_used_list.list, list) { + use = min_t(u32, flow->npagesets - pageidx, + used->size - used->used); + kern_add_tid_node(flow, rcd, "used groups", used, use); + + pageidx += use; + if (pageidx >= flow->npagesets) + goto ok; + } + + /* + * Look again at a complete group, continuing from where we left. + * However, if we are at the head, we have reached the end of the + * complete groups list from the first loop above + */ + if (group && &group->list == &rcd->tid_group_list.list) + goto bail_eagain; + group = list_prepare_entry(group, &rcd->tid_group_list.list, + list); + if (list_is_last(&group->list, &rcd->tid_group_list.list)) + goto bail_eagain; + group = list_next_entry(group, list); + use = min_t(u32, flow->npagesets - pageidx, group->size); + kern_add_tid_node(flow, rcd, "complete continue", group, use); + pageidx += use; + if (pageidx >= flow->npagesets) + goto ok; +bail_eagain: + trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ", + (u64)flow->npagesets); + return -EAGAIN; +ok: + return 0; +} + +static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num, + u32 *pset_idx) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + struct kern_tid_node *node = &flow->tnode[grp_num]; + struct tid_group *grp = node->grp; + struct tid_rdma_pageset *pset; + u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT; + u32 rcventry, npages = 0, pair = 0, tidctrl; + u8 i, cnt = 0; + + for (i = 0; i < grp->size; i++) { + rcventry = grp->base + i; + + if (node->map & BIT(i) || cnt >= node->cnt) { + rcv_array_wc_fill(dd, rcventry); + continue; + } + pset = &flow->pagesets[(*pset_idx)++]; + if (pset->count) { + hfi1_put_tid(dd, rcventry, PT_EXPECTED, + pset->addr, trdma_pset_order(pset)); + } else { + hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); + } + npages += pset->count; + + rcventry -= rcd->expected_base; + tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1; + /* + * A single TID entry will be used to use a rcvarr pair (with + * tidctrl 0x3), if ALL these are true (a) the bit pos is even + * (b) the group map shows current and the next bits as free + * indicating two consecutive rcvarry entries are available (c) + * we actually need 2 more entries + */ + pair = !(i & 0x1) && !((node->map >> i) & 0x3) && + node->cnt >= cnt + 2; + if (!pair) { + if (!pset->count) + tidctrl = 0x1; + flow->tid_entry[flow->tidcnt++] = + EXP_TID_SET(IDX, rcventry >> 1) | + EXP_TID_SET(CTRL, tidctrl) | + EXP_TID_SET(LEN, npages); + trace_hfi1_tid_entry_alloc(/* entry */ + flow->req->qp, flow->tidcnt - 1, + flow->tid_entry[flow->tidcnt - 1]); + + /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */ + flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg); + npages = 0; + } + + if (grp->used == grp->size - 1) + tid_group_move(grp, &rcd->tid_used_list, + &rcd->tid_full_list); + else if (!grp->used) + tid_group_move(grp, &rcd->tid_group_list, + &rcd->tid_used_list); + + grp->used++; + grp->map |= BIT(i); + cnt++; + } +} + +static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + struct kern_tid_node *node = &flow->tnode[grp_num]; + struct tid_group *grp = node->grp; + u32 rcventry; + u8 i, cnt = 0; + + for (i = 0; i < grp->size; i++) { + rcventry = grp->base + i; + + if (node->map & BIT(i) || cnt >= node->cnt) { + rcv_array_wc_fill(dd, rcventry); + continue; + } + + hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); + + grp->used--; + grp->map &= ~BIT(i); + cnt++; + + if (grp->used == grp->size - 1) + tid_group_move(grp, &rcd->tid_full_list, + &rcd->tid_used_list); + else if (!grp->used) + tid_group_move(grp, &rcd->tid_used_list, + &rcd->tid_group_list); + } + if (WARN_ON_ONCE(cnt & 1)) { + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + + dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u", + cnt, grp->map, grp->used); + } +} + +static void kern_program_rcvarray(struct tid_rdma_flow *flow) +{ + u32 pset_idx = 0; + int i; + + flow->npkts = 0; + flow->tidcnt = 0; + for (i = 0; i < flow->tnode_cnt; i++) + kern_program_rcv_group(flow, i, &pset_idx); + trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow); +} + +/** + * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a + * TID RDMA request + * + * @req: TID RDMA request for which the segment/flow is being set up + * @ss: sge state, maintains state across successive segments of a sge + * @last: set to true after the last sge segment has been processed + * + * This function + * (1) finds a free flow entry in the flow circular buffer + * (2) finds pages and continuous physical chunks constituing one segment + * of an sge + * (3) allocates TID group entries for those chunks + * (4) programs rcvarray entries in the hardware corresponding to those + * TID's + * (5) computes a tidarray with formatted TID entries which can be sent + * to the sender + * (6) Reserves and programs HW flows. + * (7) It also manages queing the QP when TID/flow resources are not + * available. + * + * @req points to struct tid_rdma_request of which the segments are a part. The + * function uses qp, rcd and seg_len members of @req. In the absence of errors, + * req->flow_idx is the index of the flow which has been prepared in this + * invocation of function call. With flow = &req->flows[req->flow_idx], + * flow->tid_entry contains the TID array which the sender can use for TID RDMA + * sends and flow->npkts contains number of packets required to send the + * segment. + * + * hfi1_check_sge_align should be called prior to calling this function and if + * it signals error TID RDMA cannot be used for this sge and this function + * should not be called. + * + * For the queuing, caller must hold the flow->req->qp s_lock from the send + * engine and the function will procure the exp_lock. + * + * Return: + * The function returns -EAGAIN if sufficient number of TID/flow resources to + * map the segment could not be allocated. In this case the function should be + * called again with previous arguments to retry the TID allocation. There are + * no other error returns. The function returns 0 on success. + */ +int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, + struct rvt_sge_state *ss, bool *last) + __must_hold(&req->qp->s_lock) +{ + struct tid_rdma_flow *flow = &req->flows[req->setup_head]; + struct hfi1_ctxtdata *rcd = req->rcd; + struct hfi1_qp_priv *qpriv = req->qp->priv; + unsigned long flags; + struct rvt_qp *fqp; + u16 clear_tail = req->clear_tail; + + lockdep_assert_held(&req->qp->s_lock); + /* + * We return error if either (a) we don't have space in the flow + * circular buffer, or (b) we already have max entries in the buffer. + * Max entries depend on the type of request we are processing and the + * negotiated TID RDMA parameters. + */ + if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) || + CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >= + req->n_flows) + return -EINVAL; + + /* + * Get pages, identify contiguous physical memory chunks for the segment + * If we can not determine a DMA address mapping we will treat it just + * like if we ran out of space above. + */ + if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) { + hfi1_wait_kmem(flow->req->qp); + return -ENOMEM; + } + + spin_lock_irqsave(&rcd->exp_lock, flags); + if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp)) + goto queue; + + /* + * At this point we know the number of pagesets and hence the number of + * TID's to map the segment. Allocate the TID's from the TID groups. If + * we cannot allocate the required number we exit and try again later + */ + if (kern_alloc_tids(flow)) + goto queue; + /* + * Finally program the TID entries with the pagesets, compute the + * tidarray and enable the HW flow + */ + kern_program_rcvarray(flow); + + /* + * Setup the flow state with relevant information. + * This information is used for tracking the sequence of data packets + * for the segment. + * The flow is setup here as this is the most accurate time and place + * to do so. Doing at a later time runs the risk of the flow data in + * qpriv getting out of sync. + */ + memset(&flow->flow_state, 0x0, sizeof(flow->flow_state)); + flow->idx = qpriv->flow_state.index; + flow->flow_state.generation = qpriv->flow_state.generation; + flow->flow_state.spsn = qpriv->flow_state.psn; + flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1; + flow->flow_state.r_next_psn = + full_flow_psn(flow, flow->flow_state.spsn); + qpriv->flow_state.psn += flow->npkts; + + dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp); + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->rarr_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + tid_rdma_schedule_tid_wakeup(fqp); + + req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); + return 0; +queue: + queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + return -EAGAIN; +} + +static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow) +{ + flow->npagesets = 0; +} + +/* + * This function is called after one segment has been successfully sent to + * release the flow and TID HW/SW resources for that segment. The segments for a + * TID RDMA request are setup and cleared in FIFO order which is managed using a + * circular buffer. + */ +int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req) + __must_hold(&req->qp->s_lock) +{ + struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; + struct hfi1_ctxtdata *rcd = req->rcd; + unsigned long flags; + int i; + struct rvt_qp *fqp; + + lockdep_assert_held(&req->qp->s_lock); + /* Exit if we have nothing in the flow circular buffer */ + if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) + return -EINVAL; + + spin_lock_irqsave(&rcd->exp_lock, flags); + + for (i = 0; i < flow->tnode_cnt; i++) + kern_unprogram_rcv_group(flow, i); + /* To prevent double unprogramming */ + flow->tnode_cnt = 0; + /* get head before dropping lock */ + fqp = first_qp(rcd, &rcd->rarr_queue); + spin_unlock_irqrestore(&rcd->exp_lock, flags); + + dma_unmap_flow(flow); + + hfi1_tid_rdma_reset_flow(flow); + req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1); + + if (fqp == req->qp) { + __trigger_tid_waiter(fqp); + rvt_put_qp(fqp); + } else { + tid_rdma_schedule_tid_wakeup(fqp); + } + + return 0; +} + +/* + * This function is called to release all the tid entries for + * a request. + */ +void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req) + __must_hold(&req->qp->s_lock) +{ + /* Use memory barrier for proper ordering */ + while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) { + if (hfi1_kern_exp_rcv_clear(req)) + break; + } +} + +/** + * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information + * @req - the tid rdma request to be cleaned + */ +static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req) +{ + kfree(req->flows); + req->flows = NULL; +} + +/** + * __trdma_clean_swqe - clean up for large sized QPs + * @qp: the queue patch + * @wqe: the send wqe + */ +void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + struct hfi1_swqe_priv *p = wqe->priv; + + hfi1_kern_exp_rcv_free_flows(&p->tid_req); +} + +/* + * This can be called at QP create time or in the data path. + */ +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, + gfp_t gfp) +{ + struct tid_rdma_flow *flows; + int i; + + if (likely(req->flows)) + return 0; + flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp, + req->rcd->numa_id); + if (!flows) + return -ENOMEM; + /* mini init */ + for (i = 0; i < MAX_FLOWS; i++) { + flows[i].req = req; + flows[i].npagesets = 0; + flows[i].pagesets[0].mapped = 0; + } + req->flows = flows; + return 0; +} + +static void hfi1_init_trdma_req(struct rvt_qp *qp, + struct tid_rdma_request *req) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + /* + * Initialize various TID RDMA request variables. + * These variables are "static", which is why they + * can be pre-initialized here before the WRs has + * even been submitted. + * However, non-NULL values for these variables do not + * imply that this WQE has been enabled for TID RDMA. + * Drivers should check the WQE's opcode to determine + * if a request is a TID RDMA one or not. + */ + req->qp = qp; + req->rcd = qpriv->rcd; +} + +u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_tidwait; +} + +static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req, + u32 psn, u16 *fidx) +{ + u16 head, tail; + struct tid_rdma_flow *flow; + + head = req->setup_head; + tail = req->clear_tail; + for ( ; CIRC_CNT(head, tail, MAX_FLOWS); + tail = CIRC_NEXT(tail, MAX_FLOWS)) { + flow = &req->flows[tail]; + if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 && + cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) { + if (fidx) + *fidx = tail; + return flow; + } + } + return NULL; +} + +static struct tid_rdma_flow * +__find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail, + u32 psn, u16 *fidx) +{ + for ( ; CIRC_CNT(head, tail, MAX_FLOWS); + tail = CIRC_NEXT(tail, MAX_FLOWS)) { + struct tid_rdma_flow *flow = &req->flows[tail]; + u32 spsn, lpsn; + + spsn = full_flow_psn(flow, flow->flow_state.spsn); + lpsn = full_flow_psn(flow, flow->flow_state.lpsn); + + if (cmp_psn(psn, spsn) >= 0 && cmp_psn(psn, lpsn) <= 0) { + if (fidx) + *fidx = tail; + return flow; + } + } + return NULL; +} + +static struct tid_rdma_flow *find_flow(struct tid_rdma_request *req, + u32 psn, u16 *fidx) +{ + return __find_flow_ranged(req, req->setup_head, req->clear_tail, psn, + fidx); +} + +/* TID RDMA READ functions */ +u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len) +{ + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = &req->flows[req->flow_idx]; + struct rvt_qp *qp = req->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_swqe_priv *wpriv = wqe->priv; + struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req; + struct tid_rdma_params *remote; + u32 req_len = 0; + void *req_addr = NULL; + + /* This is the IB psn used to send the request */ + *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt); + trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow); + + /* TID Entries for TID RDMA READ payload */ + req_addr = &flow->tid_entry[flow->tid_idx]; + req_len = sizeof(*flow->tid_entry) * + (flow->tidcnt - flow->tid_idx); + + memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req)); + wpriv->ss.sge.vaddr = req_addr; + wpriv->ss.sge.sge_length = req_len; + wpriv->ss.sge.length = wpriv->ss.sge.sge_length; + /* + * We can safely zero these out. Since the first SGE covers the + * entire packet, nothing else should even look at the MR. + */ + wpriv->ss.sge.mr = NULL; + wpriv->ss.sge.m = 0; + wpriv->ss.sge.n = 0; + + wpriv->ss.sg_list = NULL; + wpriv->ss.total_len = wpriv->ss.sge.sge_length; + wpriv->ss.num_sge = 1; + + /* Construct the TID RDMA READ REQ packet header */ + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + + KDETH_RESET(rreq->kdeth0, KVER, 0x1); + KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey); + rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr + + req->cur_seg * req->seg_len + flow->sent); + rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey); + rreq->reth.length = cpu_to_be32(*len); + rreq->tid_flow_psn = + cpu_to_be32((flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT) | + ((flow->flow_state.spsn + flow->pkt) & + HFI1_KDETH_BTH_SEQ_MASK)); + rreq->tid_flow_qp = + cpu_to_be32(qpriv->tid_rdma.local.qp | + ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << + TID_RDMA_DESTQP_FLOW_SHIFT) | + qpriv->rcd->ctxt); + rreq->verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 &= ~RVT_QPN_MASK; + *bth1 |= remote->qp; + *bth2 |= IB_BTH_REQ_ACK; + rcu_read_unlock(); + + /* We are done with this segment */ + flow->sent += *len; + req->cur_seg++; + qp->s_state = TID_OP(READ_REQ); + req->ack_pending++; + req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1); + qpriv->pending_tid_r_segs++; + qp->s_num_rd_atomic++; + + /* Set the TID RDMA READ request payload size */ + *len = req_len; + + return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32); +} + +/* + * @len: contains the data length to read upon entry and the read request + * payload length upon exit. + */ +u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = NULL; + u32 hdwords = 0; + bool last; + bool retry = true; + u32 npkts = rvt_div_round_up_mtu(qp, *len); + + trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn, + wqe->lpsn, req); + /* + * Check sync conditions. Make sure that there are no pending + * segments before freeing the flow. + */ +sync_check: + if (req->state == TID_REQUEST_SYNC) { + if (qpriv->pending_tid_r_segs) + goto done; + + hfi1_kern_clear_hw_flow(req->rcd, qp); + req->state = TID_REQUEST_ACTIVE; + } + + /* + * If the request for this segment is resent, the tid resources should + * have been allocated before. In this case, req->flow_idx should + * fall behind req->setup_head. + */ + if (req->flow_idx == req->setup_head) { + retry = false; + if (req->state == TID_REQUEST_RESEND) { + /* + * This is the first new segment for a request whose + * earlier segments have been re-sent. We need to + * set up the sge pointer correctly. + */ + restart_sge(&qp->s_sge, wqe, req->s_next_psn, + qp->pmtu); + req->isge = 0; + req->state = TID_REQUEST_ACTIVE; + } + + /* + * Check sync. The last PSN of each generation is reserved for + * RESYNC. + */ + if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) { + req->state = TID_REQUEST_SYNC; + goto sync_check; + } + + /* Allocate the flow if not yet */ + if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp)) + goto done; + + /* + * The following call will advance req->setup_head after + * allocating the tid entries. + */ + if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) { + req->state = TID_REQUEST_QUEUED; + + /* + * We don't have resources for this segment. The QP has + * already been queued. + */ + goto done; + } + } + + /* req->flow_idx should only be one slot behind req->setup_head */ + flow = &req->flows[req->flow_idx]; + flow->pkt = 0; + flow->tid_idx = 0; + flow->sent = 0; + if (!retry) { + /* Set the first and last IB PSN for the flow in use.*/ + flow->flow_state.ib_spsn = req->s_next_psn; + flow->flow_state.ib_lpsn = + flow->flow_state.ib_spsn + flow->npkts - 1; + } + + /* Calculate the next segment start psn.*/ + req->s_next_psn += flow->npkts; + + /* Build the packet header */ + hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len); +done: + return hdwords; +} + +/* + * Validate and accept the TID RDMA READ request parameters. + * Return 0 if the request is accepted successfully; + * Return 1 otherwise. + */ +static int tid_rdma_rcv_read_request(struct rvt_qp *qp, + struct rvt_ack_entry *e, + struct hfi1_packet *packet, + struct ib_other_headers *ohdr, + u32 bth0, u32 psn, u64 vaddr, u32 len) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + u32 flow_psn, i, tidlen = 0, pktlen, tlen; + + req = ack_to_tid_req(e); + + /* Validate the payload first */ + flow = &req->flows[req->setup_head]; + + /* payload length = packet length - (header length + ICRC length) */ + pktlen = packet->tlen - (packet->hlen + 4); + if (pktlen > sizeof(flow->tid_entry)) + return 1; + memcpy(flow->tid_entry, packet->ebuf, pktlen); + flow->tidcnt = pktlen / sizeof(*flow->tid_entry); + + /* + * Walk the TID_ENTRY list to make sure we have enough space for a + * complete segment. Also calculate the number of required packets. + */ + flow->npkts = rvt_div_round_up_mtu(qp, len); + for (i = 0; i < flow->tidcnt; i++) { + trace_hfi1_tid_entry_rcv_read_req(qp, i, + flow->tid_entry[i]); + tlen = EXP_TID_GET(flow->tid_entry[i], LEN); + if (!tlen) + return 1; + + /* + * For tid pair (tidctr == 3), the buffer size of the pair + * should be the sum of the buffer size described by each + * tid entry. However, only the first entry needs to be + * specified in the request (see WFR HAS Section 8.5.7.1). + */ + tidlen += tlen; + } + if (tidlen * PAGE_SIZE < len) + return 1; + + /* Empty the flow array */ + req->clear_tail = req->setup_head; + flow->pkt = 0; + flow->tid_idx = 0; + flow->tid_offset = 0; + flow->sent = 0; + flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp); + flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) & + TID_RDMA_DESTQP_FLOW_MASK; + flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn)); + flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT; + flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK; + flow->length = len; + + flow->flow_state.lpsn = flow->flow_state.spsn + + flow->npkts - 1; + flow->flow_state.ib_spsn = psn; + flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1; + + trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow); + /* Set the initial flow index to the current flow. */ + req->flow_idx = req->setup_head; + + /* advance circular buffer head */ + req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1); + + /* + * Compute last PSN for request. + */ + e->opcode = (bth0 >> 24) & 0xff; + e->psn = psn; + e->lpsn = psn + flow->npkts - 1; + e->sent = 0; + + req->n_flows = qpriv->tid_rdma.local.max_read; + req->state = TID_REQUEST_ACTIVE; + req->cur_seg = 0; + req->comp_seg = 0; + req->ack_seg = 0; + req->isge = 0; + req->seg_len = qpriv->tid_rdma.local.max_len; + req->total_len = len; + req->total_segs = 1; + req->r_flow_psn = e->psn; + + trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn, + req); + return 0; +} + +static int tid_rdma_rcv_error(struct hfi1_packet *packet, + struct ib_other_headers *ohdr, + struct rvt_qp *qp, u32 psn, int diff) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *qpriv = qp->priv; + struct rvt_ack_entry *e; + struct tid_rdma_request *req; + unsigned long flags; + u8 prev; + bool old_req; + + trace_hfi1_rsp_tid_rcv_error(qp, psn); + trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff); + if (diff > 0) { + /* sequence error */ + if (!qp->r_nak_state) { + ibp->rvp.n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + qp->r_ack_psn = qp->r_psn; + rc_defered_ack(rcd, qp); + } + goto done; + } + + ibp->rvp.n_rc_dupreq++; + + spin_lock_irqsave(&qp->s_lock, flags); + e = find_prev_entry(qp, psn, &prev, NULL, &old_req); + if (!e || (e->opcode != TID_OP(READ_REQ) && + e->opcode != TID_OP(WRITE_REQ))) + goto unlock; + + req = ack_to_tid_req(e); + req->r_flow_psn = psn; + trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req); + if (e->opcode == TID_OP(READ_REQ)) { + struct ib_reth *reth; + u32 offset; + u32 len; + u32 rkey; + u64 vaddr; + int ok; + u32 bth0; + + reth = &ohdr->u.tid_rdma.r_req.reth; + /* + * The requester always restarts from the start of the original + * request. + */ + offset = delta_psn(psn, e->psn) * qp->pmtu; + len = be32_to_cpu(reth->length); + if (psn != e->psn || len != req->total_len) + goto unlock; + + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + + rkey = be32_to_cpu(reth->rkey); + vaddr = get_ib_reth_vaddr(reth); + + qp->r_len = len; + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock; + + /* + * If all the response packets for the current request have + * been sent out and this request is complete (old_request + * == false) and the TID flow may be unusable (the + * req->clear_tail is advanced). However, when an earlier + * request is received, this request will not be complete any + * more (qp->s_tail_ack_queue is moved back, see below). + * Consequently, we need to update the TID flow info everytime + * a duplicate request is received. + */ + bth0 = be32_to_cpu(ohdr->bth[0]); + if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, + vaddr, len)) + goto unlock; + + /* + * True if the request is already scheduled (between + * qp->s_tail_ack_queue and qp->r_head_ack_queue); + */ + if (old_req) + goto unlock; + } else { + struct flow_state *fstate; + bool schedule = false; + u8 i; + + if (req->state == TID_REQUEST_RESEND) { + req->state = TID_REQUEST_RESEND_ACTIVE; + } else if (req->state == TID_REQUEST_INIT_RESEND) { + req->state = TID_REQUEST_INIT; + schedule = true; + } + + /* + * True if the request is already scheduled (between + * qp->s_tail_ack_queue and qp->r_head_ack_queue). + * Also, don't change requests, which are at the SYNC + * point and haven't generated any responses yet. + * There is nothing to retransmit for them yet. + */ + if (old_req || req->state == TID_REQUEST_INIT || + (req->state == TID_REQUEST_SYNC && !req->cur_seg)) { + for (i = prev + 1; ; i++) { + if (i > rvt_size_atomic(&dev->rdi)) + i = 0; + if (i == qp->r_head_ack_queue) + break; + e = &qp->s_ack_queue[i]; + req = ack_to_tid_req(e); + if (e->opcode == TID_OP(WRITE_REQ) && + req->state == TID_REQUEST_INIT) + req->state = TID_REQUEST_INIT_RESEND; + } + /* + * If the state of the request has been changed, + * the first leg needs to get scheduled in order to + * pick up the change. Otherwise, normal response + * processing should take care of it. + */ + if (!schedule) + goto unlock; + } + + /* + * If there is no more allocated segment, just schedule the qp + * without changing any state. + */ + if (req->clear_tail == req->setup_head) + goto schedule; + /* + * If this request has sent responses for segments, which have + * not received data yet (flow_idx != clear_tail), the flow_idx + * pointer needs to be adjusted so the same responses can be + * re-sent. + */ + if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) { + fstate = &req->flows[req->clear_tail].flow_state; + qpriv->pending_tid_w_segs -= + CIRC_CNT(req->flow_idx, req->clear_tail, + MAX_FLOWS); + req->flow_idx = + CIRC_ADD(req->clear_tail, + delta_psn(psn, fstate->resp_ib_psn), + MAX_FLOWS); + qpriv->pending_tid_w_segs += + delta_psn(psn, fstate->resp_ib_psn); + /* + * When flow_idx == setup_head, we've gotten a duplicate + * request for a segment, which has not been allocated + * yet. In that case, don't adjust this request. + * However, we still want to go through the loop below + * to adjust all subsequent requests. + */ + if (CIRC_CNT(req->setup_head, req->flow_idx, + MAX_FLOWS)) { + req->cur_seg = delta_psn(psn, e->psn); + req->state = TID_REQUEST_RESEND_ACTIVE; + } + } + + for (i = prev + 1; ; i++) { + /* + * Look at everything up to and including + * s_tail_ack_queue + */ + if (i > rvt_size_atomic(&dev->rdi)) + i = 0; + if (i == qp->r_head_ack_queue) + break; + e = &qp->s_ack_queue[i]; + req = ack_to_tid_req(e); + trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, + e->lpsn, req); + if (e->opcode != TID_OP(WRITE_REQ) || + req->cur_seg == req->comp_seg || + req->state == TID_REQUEST_INIT || + req->state == TID_REQUEST_INIT_RESEND) { + if (req->state == TID_REQUEST_INIT) + req->state = TID_REQUEST_INIT_RESEND; + continue; + } + qpriv->pending_tid_w_segs -= + CIRC_CNT(req->flow_idx, + req->clear_tail, + MAX_FLOWS); + req->flow_idx = req->clear_tail; + req->state = TID_REQUEST_RESEND; + req->cur_seg = req->comp_seg; + } + qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK; + } + /* Re-process old requests.*/ + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; + qp->s_tail_ack_queue = prev; + /* + * Since the qp->s_tail_ack_queue is modified, the + * qp->s_ack_state must be changed to re-initialize + * qp->s_ack_rdma_sge; Otherwise, we will end up in + * wrong memory region. + */ + qp->s_ack_state = OP(ACKNOWLEDGE); +schedule: + /* + * It's possible to receive a retry psn that is earlier than an RNRNAK + * psn. In this case, the rnrnak state should be cleared. + */ + if (qpriv->rnr_nak_state) { + qp->s_nak_state = 0; + qpriv->rnr_nak_state = TID_RNR_NAK_INIT; + qp->r_psn = e->lpsn + 1; + hfi1_tid_write_alloc_resources(qp, true); + } + + qp->r_state = e->opcode; + qp->r_nak_state = 0; + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; +} + +void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/ + + /* + * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ + * (see hfi1_rc_rcv()) + * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue) + * - Setup struct tid_rdma_req with request info + * - Initialize struct tid_rdma_flow info; + * - Copy TID entries; + * 3. Set the qp->s_ack_state. + * 4. Set RVT_S_RESP_PENDING in s_flags. + * 5. Kick the send engine (hfi1_schedule_send()) + */ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_ack_entry *e; + unsigned long flags; + struct ib_reth *reth; + struct hfi1_qp_priv *qpriv = qp->priv; + u32 bth0, psn, len, rkey; + bool is_fecn; + u8 next; + u64 vaddr; + int diff; + u8 nack_state = IB_NAK_INVALID_REQUEST; + + bth0 = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, packet)) + return; + + is_fecn = process_ecn(qp, packet); + psn = mask_psn(be32_to_cpu(ohdr->bth[2])); + trace_hfi1_rsp_rcv_tid_read_req(qp, psn); + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + rvt_comm_est(qp); + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto nack_inv; + + reth = &ohdr->u.tid_rdma.r_req.reth; + vaddr = be64_to_cpu(reth->vaddr); + len = be32_to_cpu(reth->length); + /* The length needs to be in multiples of PAGE_SIZE */ + if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len) + goto nack_inv; + + diff = delta_psn(psn, qp->r_psn); + if (unlikely(diff)) { + if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff)) + return; + goto send_ack; + } + + /* We've verified the request, insert it into the ack queue. */ + next = qp->r_head_ack_queue + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) { + nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; + goto nack_inv_unlock; + } + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + + rkey = be32_to_cpu(reth->rkey); + qp->r_len = len; + + if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_READ))) + goto nack_acc; + + /* Accept the request parameters */ + if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr, + len)) + goto nack_inv_unlock; + + qp->r_state = e->opcode; + qp->r_nak_state = 0; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn += e->lpsn - e->psn + 1; + + qp->r_head_ack_queue = next; + + /* + * For all requests other than TID WRITE which are added to the ack + * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to + * do this because of interlocks between these and TID WRITE + * requests. The same change has also been made in hfi1_rc_rcv(). + */ + qpriv->r_tid_alloc = qp->r_head_ack_queue; + + /* Schedule the send tasklet. */ + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + goto send_ack; + return; + +nack_inv_unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = nack_state; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; +nack_acc: + spin_unlock_irqrestore(&qp->s_lock, flags); + rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + hfi1_send_rc_ack(packet, is_fecn); +} + +u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e, + struct ib_other_headers *ohdr, u32 *bth0, + u32 *bth1, u32 *bth2, u32 *len, bool *last) +{ + struct hfi1_ack_priv *epriv = e->priv; + struct tid_rdma_request *req = &epriv->tid_req; + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_flow *flow = &req->flows[req->clear_tail]; + u32 tidentry = flow->tid_entry[flow->tid_idx]; + u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT; + struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp; + u32 next_offset, om = KDETH_OM_LARGE; + bool last_pkt; + u32 hdwords = 0; + struct tid_rdma_params *remote; + + *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset); + flow->sent += *len; + next_offset = flow->tid_offset + *len; + last_pkt = (flow->sent >= flow->length); + + trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry); + trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow); + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + if (!remote) { + rcu_read_unlock(); + goto done; + } + KDETH_RESET(resp->kdeth0, KVER, 0x1); + KDETH_SET(resp->kdeth0, SH, !last_pkt); + KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg)); + KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL)); + KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX)); + KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE); + KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om); + KDETH_RESET(resp->kdeth1, JKEY, remote->jkey); + resp->verbs_qp = cpu_to_be32(qp->remote_qpn); + rcu_read_unlock(); + + resp->aeth = rvt_compute_aeth(qp); + resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn + + flow->pkt)); + + *bth0 = TID_OP(READ_RESP) << 24; + *bth1 = flow->tid_qpn; + *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) & + HFI1_KDETH_BTH_SEQ_MASK) | + (flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT)); + *last = last_pkt; + if (last_pkt) + /* Advance to next flow */ + req->clear_tail = (req->clear_tail + 1) & + (MAX_FLOWS - 1); + + if (next_offset >= tidlen) { + flow->tid_offset = 0; + flow->tid_idx++; + } else { + flow->tid_offset = next_offset; + } + + hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32); + +done: + return hdwords; +} + +static inline struct tid_rdma_request * +find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode) + __must_hold(&qp->s_lock) +{ + struct rvt_swqe *wqe; + struct tid_rdma_request *req = NULL; + u32 i, end; + + end = qp->s_cur + 1; + if (end == qp->s_size) + end = 0; + for (i = qp->s_acked; i != end;) { + wqe = rvt_get_swqe_ptr(qp, i); + if (cmp_psn(psn, wqe->psn) >= 0 && + cmp_psn(psn, wqe->lpsn) <= 0) { + if (wqe->wr.opcode == opcode) + req = wqe_to_tid_req(wqe); + break; + } + if (++i == qp->s_size) + i = 0; + } + + return req; +} + +void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet) +{ + /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */ + + /* + * 1. Find matching SWQE + * 2. Check that the entire segment has been read. + * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags. + * 4. Free the TID flow resources. + * 5. Kick the send engine (hfi1_schedule_send()) + */ + struct ib_other_headers *ohdr = packet->ohdr; + struct rvt_qp *qp = packet->qp; + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ctxtdata *rcd = packet->rcd; + struct tid_rdma_request *req; + struct tid_rdma_flow *flow; + u32 opcode, aeth; + bool is_fecn; + unsigned long flags; + u32 kpsn, ipsn; + + trace_hfi1_sender_rcv_tid_read_resp(qp); + is_fecn = process_ecn(qp, packet); + kpsn = mask_psn(be32_to_cpu(ohdr->bth[2])); + aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth); + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + + spin_lock_irqsave(&qp->s_lock, flags); + ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn)); + req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ); + if (unlikely(!req)) + goto ack_op_err; + + flow = &req->flows[req->clear_tail]; + /* When header suppression is disabled */ + if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) + goto ack_done; + req->ack_pending--; + priv->pending_tid_r_segs--; + qp->s_num_rd_atomic--; + if ((qp->s_flags & RVT_S_WAIT_FENCE) && + !qp->s_num_rd_atomic) { + qp->s_flags &= ~(RVT_S_WAIT_FENCE | + RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } + if (qp->s_flags & RVT_S_WAIT_RDMAR) { + qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } + + trace_hfi1_ack(qp, ipsn); + trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode, + req->e.swqe->psn, req->e.swqe->lpsn, + req); + trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow); + + /* Release the tid resources */ + hfi1_kern_exp_rcv_clear(req); + + if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd)) + goto ack_done; + + /* If not done yet, build next read request */ + if (++req->comp_seg >= req->total_segs) { + priv->tid_r_comp++; + req->state = TID_REQUEST_COMPLETE; + } + + /* + * Clear the hw flow under two conditions: + * 1. This request is a sync point and it is complete;