Merge tag 'for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford...
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 5 Aug 2016 00:26:31 +0000 (20:26 -0400)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 5 Aug 2016 00:26:31 +0000 (20:26 -0400)
Pull second round of rdma updates from Doug Ledford:
 "This can be split out into just two categories:

   - fixes to the RDMA R/W API with regard to SG list length limits
     (about 5 patches)

   - fixes/features for the Intel hfi1 driver (everything else)

  The hfi1 driver is still being brought to full feature support by
  Intel, and they have a lot of people working on it, so that amounts to
  almost the entirety of this pull request"

* tag 'for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (84 commits)
  IB/hfi1: Add cache evict LRU list
  IB/hfi1: Fix memory leak during unexpected shutdown
  IB/hfi1: Remove unneeded mm argument in remove function
  IB/hfi1: Consistently call ops->remove outside spinlock
  IB/hfi1: Use evict mmu rb operation
  IB/hfi1: Add evict operation to the mmu rb handler
  IB/hfi1: Fix TID caching actions
  IB/hfi1: Make the cache handler own its rb tree root
  IB/hfi1: Make use of mm consistent
  IB/hfi1: Fix user SDMA racy user request claim
  IB/hfi1: Fix error condition that needs to clean up
  IB/hfi1: Release node on insert failure
  IB/hfi1: Validate SDMA user iovector count
  IB/hfi1: Validate SDMA user request index
  IB/hfi1: Use the same capability state for all shared contexts
  IB/hfi1: Prevent null pointer dereference
  IB/hfi1: Rename TID mmu_rb_* functions
  IB/hfi1: Remove unneeded empty check in hfi1_mmu_rb_unregister()
  IB/hfi1: Restructure hfi1_file_open
  IB/hfi1: Make iovec loop index easy to understand
  ...

66 files changed:
drivers/infiniband/core/rw.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/hfi1/Kconfig
drivers/infiniband/hw/hfi1/Makefile
drivers/infiniband/hw/hfi1/affinity.c
drivers/infiniband/hw/hfi1/affinity.h
drivers/infiniband/hw/hfi1/chip.c
drivers/infiniband/hw/hfi1/chip.h
drivers/infiniband/hw/hfi1/chip_registers.h
drivers/infiniband/hw/hfi1/driver.c
drivers/infiniband/hw/hfi1/file_ops.c
drivers/infiniband/hw/hfi1/firmware.c
drivers/infiniband/hw/hfi1/hfi.h
drivers/infiniband/hw/hfi1/init.c
drivers/infiniband/hw/hfi1/mad.c
drivers/infiniband/hw/hfi1/mad.h
drivers/infiniband/hw/hfi1/mmu_rb.c
drivers/infiniband/hw/hfi1/mmu_rb.h
drivers/infiniband/hw/hfi1/pcie.c
drivers/infiniband/hw/hfi1/pio.c
drivers/infiniband/hw/hfi1/platform.c
drivers/infiniband/hw/hfi1/qp.c
drivers/infiniband/hw/hfi1/qp.h
drivers/infiniband/hw/hfi1/qsfp.c
drivers/infiniband/hw/hfi1/qsfp.h
drivers/infiniband/hw/hfi1/rc.c
drivers/infiniband/hw/hfi1/ruc.c
drivers/infiniband/hw/hfi1/sysfs.c
drivers/infiniband/hw/hfi1/trace.h
drivers/infiniband/hw/hfi1/trace_ctxts.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/trace_dbg.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/trace_ibhdrs.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/trace_misc.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/trace_rc.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/trace_rx.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/trace_tx.h [new file with mode: 0644]
drivers/infiniband/hw/hfi1/twsi.c [deleted file]
drivers/infiniband/hw/hfi1/twsi.h [deleted file]
drivers/infiniband/hw/hfi1/uc.c
drivers/infiniband/hw/hfi1/ud.c
drivers/infiniband/hw/hfi1/user_exp_rcv.c
drivers/infiniband/hw/hfi1/user_pages.c
drivers/infiniband/hw/hfi1/user_sdma.c
drivers/infiniband/hw/hfi1/user_sdma.h
drivers/infiniband/hw/hfi1/verbs.c
drivers/infiniband/hw/hfi1/verbs.h
drivers/infiniband/hw/hfi1/verbs_txreq.h
drivers/infiniband/hw/qib/qib_qp.c
drivers/infiniband/hw/qib/qib_ud.c
drivers/infiniband/hw/qib/qib_verbs.c
drivers/infiniband/hw/qib/qib_verbs.h
drivers/infiniband/sw/rdmavt/cq.c
drivers/infiniband/sw/rdmavt/mr.c
drivers/infiniband/sw/rdmavt/mr.h
drivers/infiniband/sw/rdmavt/qp.c
drivers/infiniband/sw/rdmavt/vt.c
drivers/infiniband/ulp/isert/ib_isert.c
drivers/infiniband/ulp/isert/ib_isert.h
drivers/infiniband/ulp/srpt/ib_srpt.c
drivers/infiniband/ulp/srpt/ib_srpt.h
include/rdma/ib_verbs.h
include/rdma/opa_port_info.h
include/rdma/rdma_vt.h
include/rdma/rdmavt_mr.h
include/rdma/rdmavt_qp.h
include/uapi/rdma/hfi/hfi1_user.h

drivers/infiniband/core/rw.c
index 1eb9b1294a6383c689363bb02a713ee1cd660037..dbfd854c32c936b8e934bcc1cb27ce4325b61d8d 100644 (file)
@@ -58,19 +58,13 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num,
        return false;
 }
 
-static inline u32 rdma_rw_max_sge(struct ib_device *dev,
-               enum dma_data_direction dir)
-{
-       return dir == DMA_TO_DEVICE ?
-               dev->attrs.max_sge : dev->attrs.max_sge_rd;
-}
-
 static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev)
 {
        /* arbitrary limit to avoid allocating gigantic resources */
        return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256);
 }
 
+/* Caller must have zero-initialized *reg. */
 static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num,
                struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
                u32 sg_cnt, u32 offset)
@@ -114,6 +108,7 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
+       struct rdma_rw_reg_ctx *prev = NULL;
        u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device);
        int i, j, ret = 0, count = 0;
 
@@ -125,7 +120,6 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
        }
 
        for (i = 0; i < ctx->nr_ops; i++) {
-               struct rdma_rw_reg_ctx *prev = i ? &ctx->reg[i - 1] : NULL;
                struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
                u32 nents = min(sg_cnt, pages_per_mr);
 
@@ -162,9 +156,13 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                sg_cnt -= nents;
                for (j = 0; j < nents; j++)
                        sg = sg_next(sg);
+               prev = reg;
                offset = 0;
        }
 
+       if (prev)
+               prev->wr.wr.next = NULL;
+
        ctx->type = RDMA_RW_MR;
        return count;
 
@@ -181,7 +179,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                u64 remote_addr, u32 rkey, enum dma_data_direction dir)
 {
        struct ib_device *dev = qp->pd->device;
-       u32 max_sge = rdma_rw_max_sge(dev, dir);
+       u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
+                     qp->max_read_sge;
        struct ib_sge *sge;
        u32 total_len = 0, i, j;
 
@@ -205,11 +204,10 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                        rdma_wr->wr.opcode = IB_WR_RDMA_READ;
                rdma_wr->remote_addr = remote_addr + total_len;
                rdma_wr->rkey = rkey;
+               rdma_wr->wr.num_sge = nr_sge;
                rdma_wr->wr.sg_list = sge;
 
                for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
-                       rdma_wr->wr.num_sge++;
-
                        sge->addr = ib_sg_dma_address(dev, sg) + offset;
                        sge->length = ib_sg_dma_len(dev, sg) - offset;
                        sge->lkey = qp->pd->local_dma_lkey;
@@ -220,8 +218,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
                        offset = 0;
                }
 
-               if (i + 1 < ctx->nr_ops)
-                       rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr;
+               rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
+                       &ctx->map.wrs[i + 1].wr : NULL;
        }
 
        ctx->type = RDMA_RW_MULTI_WR;
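
Two behavioral points in the rw.c hunks above are worth spelling out: the
per-WR SGE budget now comes from the QP (qp->max_write_sge / qp->max_read_sge,
set in ib_create_qp() below) instead of from raw device attributes, and the
work request chains are now explicitly NULL-terminated so the final WR never
points at stale memory. A condensed sketch of the chunking arithmetic the map
path relies on (the helper name is illustrative, not from the patch):

#include <linux/types.h>

/* How many RDMA WRs does an S/G list of sg_cnt entries need when each
 * WR may carry at most max_sge entries?  One per full chunk, plus one
 * for any partial tail, i.e. DIV_ROUND_UP(sg_cnt, max_sge). */
static u32 rdma_rw_num_wrs(u32 sg_cnt, u32 max_sge)
{
	return (sg_cnt + max_sge - 1) / max_sge;
}
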
drivers/infiniband/core/verbs.c
index 2e813edcddabd9919849f881c6a4da61bc6382e6..f2b776efab3a3ee1ffdc306ef7af01d3093e9ed7 100644 (file)
@@ -825,6 +825,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
                }
        }
 
+       /*
+        * Note: all hw drivers guarantee that max_send_sge is lower than
+        * the device RDMA WRITE SGE limit but not all hw drivers ensure that
+        * max_send_sge <= max_sge_rd.
+        */
+       qp->max_write_sge = qp_init_attr->cap.max_send_sge;
+       qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
+                                device->attrs.max_sge_rd);
+
        return qp;
 }
 EXPORT_SYMBOL(ib_create_qp);
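
The asymmetry encoded here is worth noting: anything the QP transmits (RDMA
WRITE, SEND) is bounded by the max_send_sge it was created with, but RDMA READ
responses are scattered through the device's read path, which may have the
tighter max_sge_rd limit, hence the min_t(). A consumer is then expected to
pick its bound by DMA direction, mirroring the rw.c hunk above; a minimal
sketch:

#include <linux/dma-direction.h>
#include <rdma/ib_verbs.h>

static u32 qp_sge_limit(struct ib_qp *qp, enum dma_data_direction dir)
{
	/* writes go out the send queue; reads are capped by max_sge_rd */
	return dir == DMA_TO_DEVICE ? qp->max_write_sge : qp->max_read_sge;
}
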
drivers/infiniband/hw/hfi1/Kconfig
index f846fd51b85b925b268679f6d28e1e65db56dac7..f6ea0881765a1e789d2107479e7245d4cc0640c5 100644 (file)
@@ -1,8 +1,9 @@
 config INFINIBAND_HFI1
        tristate "Intel OPA Gen1 support"
-       depends on X86_64 && INFINIBAND_RDMAVT
+       depends on X86_64 && INFINIBAND_RDMAVT && I2C
        select MMU_NOTIFIER
        select CRC32
+       select I2C_ALGOBIT
        ---help---
        This is a low-level driver for Intel OPA Gen1 adapter.
 config HFI1_DEBUG_SDMA_ORDER
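
The new I2C dependency and I2C_ALGOBIT select replace the driver's private
twsi bit-bang implementation (twsi.c and twsi.h are deleted in this series)
with the kernel's generic bit-banging I2C algorithm. Roughly, the QSFP bus is
then registered as a standard adapter; a sketch of that wiring, with the
hypothetical hfi1_setsda()/hfi1_setscl()/hfi1_getsda()/hfi1_getscl() callbacks
standing in for the real CSR accessors in qsfp.c:

#include <linux/i2c.h>
#include <linux/i2c-algo-bit.h>
#include <linux/jiffies.h>
#include "hfi.h"

static int hfi1_register_i2c_bus(struct hfi1_devdata *dd,
				 struct i2c_adapter *adap,
				 struct i2c_algo_bit_data *bit)
{
	/* placeholder callbacks: toggle/sample the ASIC_QSFP* CSR bits */
	bit->setsda = hfi1_setsda;
	bit->setscl = hfi1_setscl;
	bit->getsda = hfi1_getsda;
	bit->getscl = hfi1_getscl;
	bit->udelay = 5;			/* roughly 100 kHz timing */
	bit->timeout = usecs_to_jiffies(100000);
	bit->data = dd;

	adap->owner = THIS_MODULE;
	adap->algo_data = bit;
	adap->dev.parent = &dd->pcidev->dev;
	snprintf(adap->name, sizeof(adap->name), "hfi1 i2c bus");
	return i2c_bit_add_bus(adap);	/* registers the bit-banged adapter */
}
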
drivers/infiniband/hw/hfi1/Makefile
index 9b5382c94b0c86d1a2fa96f8c60e9069bd832b0c..0cf97a09b64b6e780dece48ffda7a02a1126753d 100644 (file)
@@ -10,7 +10,7 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
 hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
        eprom.o file_ops.o firmware.o \
        init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
-       qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \
+       qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \
        uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
        verbs_txreq.o
 hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
drivers/infiniband/hw/hfi1/affinity.c
index 14d7eeb09be6545f5f21144f0f11ea41c53203a4..79575ee873f21af3361264b711419c267b6a23d0 100644 (file)
 #include <linux/topology.h>
 #include <linux/cpumask.h>
 #include <linux/module.h>
+#include <linux/cpumask.h>
 
 #include "hfi.h"
 #include "affinity.h"
 #include "sdma.h"
 #include "trace.h"
 
+struct hfi1_affinity_node_list node_affinity = {
+       .list = LIST_HEAD_INIT(node_affinity.list),
+       .lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock),
+};
+
 /* Name of IRQ types, indexed by enum irq_type */
 static const char * const irq_type_names[] = {
        "SDMA",
@@ -61,6 +67,9 @@ static const char * const irq_type_names[] = {
        "OTHER",
 };
 
+/* Per NUMA node count of HFI devices */
+static unsigned int *hfi1_per_node_cntr;
+
 static inline void init_cpu_mask_set(struct cpu_mask_set *set)
 {
        cpumask_clear(&set->mask);
@@ -69,47 +78,136 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set)
 }
 
 /* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *dd)
+void init_real_cpu_mask(void)
 {
-       struct hfi1_affinity *info;
        int possible, curr_cpu, i, ht;
 
-       info = kzalloc(sizeof(*info), GFP_KERNEL);
-       if (!info)
-               return -ENOMEM;
-
-       cpumask_clear(&info->real_cpu_mask);
+       cpumask_clear(&node_affinity.real_cpu_mask);
 
        /* Start with cpu online mask as the real cpu mask */
-       cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
+       cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
 
        /*
         * Remove HT cores from the real cpu mask.  Do this in two steps below.
         */
-       possible = cpumask_weight(&info->real_cpu_mask);
+       possible = cpumask_weight(&node_affinity.real_cpu_mask);
        ht = cpumask_weight(topology_sibling_cpumask(
-                                       cpumask_first(&info->real_cpu_mask)));
+                               cpumask_first(&node_affinity.real_cpu_mask)));
        /*
         * Step 1.  Skip over the first N HT siblings and use them as the
         * "real" cores.  Assumes that HT cores are not enumerated in
         * succession (except in the single core case).
         */
-       curr_cpu = cpumask_first(&info->real_cpu_mask);
+       curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
        for (i = 0; i < possible / ht; i++)
-               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+               curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
        /*
         * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
         * skip any gaps.
         */
        for (; i < possible; i++) {
-               cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
-               curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+               cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
+               curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
+       }
+}
+
+int node_affinity_init(void)
+{
+       int node;
+       struct pci_dev *dev = NULL;
+       const struct pci_device_id *ids = hfi1_pci_tbl;
+
+       cpumask_clear(&node_affinity.proc.used);
+       cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
+
+       node_affinity.proc.gen = 0;
+       node_affinity.num_core_siblings =
+                               cpumask_weight(topology_sibling_cpumask(
+                                       cpumask_first(&node_affinity.proc.mask)
+                                       ));
+       node_affinity.num_online_nodes = num_online_nodes();
+       node_affinity.num_online_cpus = num_online_cpus();
+
+       /*
+        * The real cpu mask is part of the affinity struct but it has to be
+        * initialized early. It is needed to calculate the number of user
+        * contexts in set_up_context_variables().
+        */
+       init_real_cpu_mask();
+
+       hfi1_per_node_cntr = kcalloc(num_possible_nodes(),
+                                    sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
+       if (!hfi1_per_node_cntr)
+               return -ENOMEM;
+
+       while (ids->vendor) {
+               dev = NULL;
+               while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
+                       node = pcibus_to_node(dev->bus);
+                       if (node < 0)
+                               node = numa_node_id();
+
+                       hfi1_per_node_cntr[node]++;
+               }
+               ids++;
        }
 
-       dd->affinity = info;
        return 0;
 }
 
+void node_affinity_destroy(void)
+{
+       struct list_head *pos, *q;
+       struct hfi1_affinity_node *entry;
+
+       spin_lock(&node_affinity.lock);
+       list_for_each_safe(pos, q, &node_affinity.list) {
+               entry = list_entry(pos, struct hfi1_affinity_node,
+                                  list);
+               list_del(pos);
+               kfree(entry);
+       }
+       spin_unlock(&node_affinity.lock);
+       kfree(hfi1_per_node_cntr);
+}
+
+static struct hfi1_affinity_node *node_affinity_allocate(int node)
+{
+       struct hfi1_affinity_node *entry;
+
+       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry)
+               return NULL;
+       entry->node = node;
+       INIT_LIST_HEAD(&entry->list);
+
+       return entry;
+}
+
+/*
+ * Append an entry to the list.
+ * It *must* be called with node_affinity.lock held.
+ */
+static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
+{
+       list_add_tail(&entry->list, &node_affinity.list);
+}
+
+/* It must be called with node_affinity.lock held */
+static struct hfi1_affinity_node *node_affinity_lookup(int node)
+{
+       struct list_head *pos;
+       struct hfi1_affinity_node *entry;
+
+       list_for_each(pos, &node_affinity.list) {
+               entry = list_entry(pos, struct hfi1_affinity_node, list);
+               if (entry->node == node)
+                       return entry;
+       }
+
+       return NULL;
+}
+
 /*
  * Interrupt affinity.
  *
@@ -121,10 +219,10 @@ int init_real_cpu_mask(struct hfi1_devdata *dd)
  * to the node relative 1 as necessary.
  *
  */
-void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
+int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 {
        int node = pcibus_to_node(dd->pcidev->bus);
-       struct hfi1_affinity *info = dd->affinity;
+       struct hfi1_affinity_node *entry;
        const struct cpumask *local_mask;
        int curr_cpu, possible, i;
 
@@ -132,56 +230,93 @@ void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
                node = numa_node_id();
        dd->node = node;
 
-       spin_lock_init(&info->lock);
-
-       init_cpu_mask_set(&info->def_intr);
-       init_cpu_mask_set(&info->rcv_intr);
-       init_cpu_mask_set(&info->proc);
-
        local_mask = cpumask_of_node(dd->node);
        if (cpumask_first(local_mask) >= nr_cpu_ids)
                local_mask = topology_core_cpumask(0);
-       /* Use the "real" cpu mask of this node as the default */
-       cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
-
-       /*  fill in the receive list */
-       possible = cpumask_weight(&info->def_intr.mask);
-       curr_cpu = cpumask_first(&info->def_intr.mask);
-       if (possible == 1) {
-               /*  only one CPU, everyone will use it */
-               cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-       } else {
-               /*
-                * Retain the first CPU in the default list for the control
-                * context.
-                */
-               curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-               /*
-                * Remove the remaining kernel receive queues from
-                * the default list and add them to the receive list.
-                */
-               for (i = 0; i < dd->n_krcv_queues - 1; i++) {
-                       cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
-                       cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-                       curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-                       if (curr_cpu >= nr_cpu_ids)
-                               break;
+
+       spin_lock(&node_affinity.lock);
+       entry = node_affinity_lookup(dd->node);
+       spin_unlock(&node_affinity.lock);
+
+       /*
+        * If this is the first time this NUMA node's affinity is used,
+        * create an entry in the global affinity structure and initialize it.
+        */
+       if (!entry) {
+               entry = node_affinity_allocate(node);
+               if (!entry) {
+                       dd_dev_err(dd,
+                                  "Unable to allocate global affinity node\n");
+                       return -ENOMEM;
                }
-       }
+               init_cpu_mask_set(&entry->def_intr);
+               init_cpu_mask_set(&entry->rcv_intr);
+               cpumask_clear(&entry->general_intr_mask);
+               /* Use the "real" cpu mask of this node as the default */
+               cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
+                           local_mask);
+
+               /* fill in the receive list */
+               possible = cpumask_weight(&entry->def_intr.mask);
+               curr_cpu = cpumask_first(&entry->def_intr.mask);
+
+               if (possible == 1) {
+                       /* only one CPU, everyone will use it */
+                       cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
+                       cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
+               } else {
+                       /*
+                        * The general/control context will be the first CPU in
+                        * the default list, so it is removed from the default
+                        * list and added to the general interrupt list.
+                        */
+                       cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
+                       cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
+                       curr_cpu = cpumask_next(curr_cpu,
+                                               &entry->def_intr.mask);
 
-       cpumask_copy(&info->proc.mask, cpu_online_mask);
-}
+                       /*
+                        * Remove the remaining kernel receive queues from
+                        * the default list and add them to the receive list.
+                        */
+                       for (i = 0;
+                            i < (dd->n_krcv_queues - 1) *
+                                 hfi1_per_node_cntr[dd->node];
+                            i++) {
+                               cpumask_clear_cpu(curr_cpu,
+                                                 &entry->def_intr.mask);
+                               cpumask_set_cpu(curr_cpu,
+                                               &entry->rcv_intr.mask);
+                               curr_cpu = cpumask_next(curr_cpu,
+                                                       &entry->def_intr.mask);
+                               if (curr_cpu >= nr_cpu_ids)
+                                       break;
+                       }
 
-void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
-{
-       kfree(dd->affinity);
+                       /*
+                        * If no CPU cores are left over for SDMA
+                        * engines, use the same CPU cores as the
+                        * general/control context.
+                        */
+                       if (cpumask_weight(&entry->def_intr.mask) == 0)
+                               cpumask_copy(&entry->def_intr.mask,
+                                            &entry->general_intr_mask);
+               }
+
+               spin_lock(&node_affinity.lock);
+               node_affinity_add_tail(entry);
+               spin_unlock(&node_affinity.lock);
+       }
+
+       return 0;
 }
 
 int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 {
        int ret;
        cpumask_var_t diff;
-       struct cpu_mask_set *set;
+       struct hfi1_affinity_node *entry;
+       struct cpu_mask_set *set = NULL;
        struct sdma_engine *sde = NULL;
        struct hfi1_ctxtdata *rcd = NULL;
        char extra[64];
@@ -194,22 +329,25 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
        if (!ret)
                return -ENOMEM;
 
+       spin_lock(&node_affinity.lock);
+       entry = node_affinity_lookup(dd->node);
+       spin_unlock(&node_affinity.lock);
+
        switch (msix->type) {
        case IRQ_SDMA:
                sde = (struct sdma_engine *)msix->arg;
                scnprintf(extra, 64, "engine %u", sde->this_idx);
-               /* fall through */
+               set = &entry->def_intr;
+               break;
        case IRQ_GENERAL:
-               set = &dd->affinity->def_intr;
+               cpu = cpumask_first(&entry->general_intr_mask);
                break;
        case IRQ_RCVCTXT:
                rcd = (struct hfi1_ctxtdata *)msix->arg;
-               if (rcd->ctxt == HFI1_CTRL_CTXT) {
-                       set = &dd->affinity->def_intr;
-                       cpu = cpumask_first(&set->mask);
-               } else {
-                       set = &dd->affinity->rcv_intr;
-               }
+               if (rcd->ctxt == HFI1_CTRL_CTXT)
+                       cpu = cpumask_first(&entry->general_intr_mask);
+               else
+                       set = &entry->rcv_intr;
                scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
                break;
        default:
@@ -218,12 +356,12 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
        }
 
        /*
-        * The control receive context is placed on a particular CPU, which
-        * is set above.  Skip accounting for it.  Everything else finds its
-        * CPU here.
+        * The general and control contexts are placed on a particular
+        * CPU, which is set above. Skip accounting for it. Everything else
+        * finds its CPU here.
         */
-       if (cpu == -1) {
-               spin_lock(&dd->affinity->lock);
+       if (cpu == -1 && set) {
+               spin_lock(&node_affinity.lock);
                if (cpumask_equal(&set->mask, &set->used)) {
                        /*
                         * We've used up all the CPUs, bump up the generation
@@ -235,7 +373,7 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
                cpumask_andnot(diff, &set->mask, &set->used);
                cpu = cpumask_first(diff);
                cpumask_set_cpu(cpu, &set->used);
-               spin_unlock(&dd->affinity->lock);
+               spin_unlock(&node_affinity.lock);
        }
 
        switch (msix->type) {
@@ -263,43 +401,84 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
 {
        struct cpu_mask_set *set = NULL;
        struct hfi1_ctxtdata *rcd;
+       struct hfi1_affinity_node *entry;
+
+       spin_lock(&node_affinity.lock);
+       entry = node_affinity_lookup(dd->node);
+       spin_unlock(&node_affinity.lock);
 
        switch (msix->type) {
        case IRQ_SDMA:
+               set = &entry->def_intr;
+               break;
        case IRQ_GENERAL:
-               set = &dd->affinity->def_intr;
+               /* Don't do accounting for general contexts */
                break;
        case IRQ_RCVCTXT:
                rcd = (struct hfi1_ctxtdata *)msix->arg;
-               /* only do accounting for non control contexts */
+               /* Don't do accounting for control contexts */
                if (rcd->ctxt != HFI1_CTRL_CTXT)
-                       set = &dd->affinity->rcv_intr;
+                       set = &entry->rcv_intr;
                break;
        default:
                return;
        }
 
        if (set) {
-               spin_lock(&dd->affinity->lock);
+               spin_lock(&node_affinity.lock);
                cpumask_andnot(&set->used, &set->used, &msix->mask);
                if (cpumask_empty(&set->used) && set->gen) {
                        set->gen--;
                        cpumask_copy(&set->used, &set->mask);
                }
-               spin_unlock(&dd->affinity->lock);
+               spin_unlock(&node_affinity.lock);
        }
 
        irq_set_affinity_hint(msix->msix.vector, NULL);
        cpumask_clear(&msix->mask);
 }
 
-int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
+/* This should be called with node_affinity.lock held */
+static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
+                               struct hfi1_affinity_node_list *affinity)
 {
-       int cpu = -1, ret;
-       cpumask_var_t diff, mask, intrs;
+       int possible, curr_cpu, i;
+       uint num_cores_per_socket = node_affinity.num_online_cpus /
+                                       affinity->num_core_siblings /
+                                               node_affinity.num_online_nodes;
+
+       cpumask_copy(hw_thread_mask, &affinity->proc.mask);
+       if (affinity->num_core_siblings > 0) {
+               /* Removing other siblings not needed for now */
+               possible = cpumask_weight(hw_thread_mask);
+               curr_cpu = cpumask_first(hw_thread_mask);
+               for (i = 0;
+                    i < num_cores_per_socket * node_affinity.num_online_nodes;
+                    i++)
+                       curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
+
+               for (; i < possible; i++) {
+                       cpumask_clear_cpu(curr_cpu, hw_thread_mask);
+                       curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
+               }
+
+               /* Identifying correct HW threads within physical cores */
+               cpumask_shift_left(hw_thread_mask, hw_thread_mask,
+                                  num_cores_per_socket *
+                                  node_affinity.num_online_nodes *
+                                  hw_thread_no);
+       }
+}
+
+int hfi1_get_proc_affinity(int node)
+{
+       int cpu = -1, ret, i;
+       struct hfi1_affinity_node *entry;
+       cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
        const struct cpumask *node_mask,
                *proc_mask = tsk_cpus_allowed(current);
-       struct cpu_mask_set *set = &dd->affinity->proc;
+       struct hfi1_affinity_node_list *affinity = &node_affinity;
+       struct cpu_mask_set *set = &affinity->proc;
 
        /*
         * check whether process/context affinity has already
@@ -325,22 +504,41 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
 
        /*
         * The process does not have a preset CPU affinity so find one to
-        * recommend. We prefer CPUs on the same NUMA as the device.
+        * recommend using the following algorithm:
+        *
+        * For each user process that is opening a context on HFI Y:
+        *  a) If all cores are filled, reinitialize the bitmask
+        *  b) Fill real cores first, then HT cores (First set of HT
+        *     cores on all physical cores, then second set of HT core,
+        *     and, so on) in the following order:
+        *
+        *     1. Same NUMA node as HFI Y and not running an IRQ
+        *        handler
+        *     2. Same NUMA node as HFI Y and running an IRQ handler
+        *     3. Different NUMA node to HFI Y and not running an IRQ
+        *        handler
+        *     4. Different NUMA node to HFI Y and running an IRQ
+        *        handler
+        *  c) Mark core as filled in the bitmask. As user processes are
+        *     done, clear cores from the bitmask.
         */
 
        ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
        if (!ret)
                goto done;
-       ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
+       ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
        if (!ret)
                goto free_diff;
-       ret = zalloc_cpumask_var(&intrs, GFP_KERNEL);
+       ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
+       if (!ret)
+               goto free_hw_thread_mask;
+       ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
        if (!ret)
-               goto free_mask;
+               goto free_available_mask;
 
-       spin_lock(&dd->affinity->lock);
+       spin_lock(&affinity->lock);
        /*
-        * If we've used all available CPUs, clear the mask and start
+        * If we've used all available HW threads, clear the mask and start
         * overloading.
         */
        if (cpumask_equal(&set->mask, &set->used)) {
@@ -348,81 +546,198 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
                cpumask_clear(&set->used);
        }
 
-       /* CPUs used by interrupt handlers */
-       cpumask_copy(intrs, (dd->affinity->def_intr.gen ?
-                            &dd->affinity->def_intr.mask :
-                            &dd->affinity->def_intr.used));
-       cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ?
-                                 &dd->affinity->rcv_intr.mask :
-                                 &dd->affinity->rcv_intr.used));
+       /*
+        * If NUMA node has CPUs used by interrupt handlers, include them in the
+        * interrupt handler mask.
+        */
+       entry = node_affinity_lookup(node);
+       if (entry) {
+               cpumask_copy(intrs_mask, (entry->def_intr.gen ?
+                                         &entry->def_intr.mask :
+                                         &entry->def_intr.used));
+               cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
+                                                   &entry->rcv_intr.mask :
+                                                   &entry->rcv_intr.used));
+               cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
+       }
        hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
-                 cpumask_pr_args(intrs));
+                 cpumask_pr_args(intrs_mask));
+
+       cpumask_copy(hw_thread_mask, &set->mask);
 
        /*
-        * If we don't have a NUMA node requested, preference is towards
-        * device NUMA node
+        * If HT cores are enabled, identify which HW threads within the
+        * physical cores should be used.
         */
-       if (node == -1)
-               node = dd->node;
+       if (affinity->num_core_siblings > 0) {
+               for (i = 0; i < affinity->num_core_siblings; i++) {
+                       find_hw_thread_mask(i, hw_thread_mask, affinity);
+
+                       /*
+                        * If there's at least one available core for this HW
+                        * thread number, stop looking for a core.
+                        *
+                        * diff will always be not empty at least once in this
+                        * loop as the used mask gets reset when
+                        * (set->mask == set->used) before this loop.
+                        */
+                       cpumask_andnot(diff, hw_thread_mask, &set->used);
+                       if (!cpumask_empty(diff))
+                               break;
+               }
+       }
+       hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
+                 cpumask_pr_args(hw_thread_mask));
+
        node_mask = cpumask_of_node(node);
-       hfi1_cdbg(PROC, "device on NUMA %u, CPUs %*pbl", node,
+       hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
                  cpumask_pr_args(node_mask));
 
-       /* diff will hold all unused cpus */
-       cpumask_andnot(diff, &set->mask, &set->used);
-       hfi1_cdbg(PROC, "unused CPUs (all) %*pbl", cpumask_pr_args(diff));
-
-       /* get cpumask of available CPUs on preferred NUMA */
-       cpumask_and(mask, diff, node_mask);
-       hfi1_cdbg(PROC, "available cpus on NUMA %*pbl", cpumask_pr_args(mask));
+       /* Get cpumask of available CPUs on preferred NUMA */
+       cpumask_and(available_mask, hw_thread_mask, node_mask);
+       cpumask_andnot(available_mask, available_mask, &set->used);
+       hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
+                 cpumask_pr_args(available_mask));
 
        /*
         * At first, we don't want to place processes on the same
-        * CPUs as interrupt handlers.
+        * CPUs as interrupt handlers. Then, CPUs running interrupt
+        * handlers are used.
+        *
+        * 1) If diff is not empty, then there are CPUs not running
+        *    non-interrupt handlers available, so diff gets copied
+        *    over to available_mask.
+        * 2) If diff is empty, then all CPUs not running interrupt
+        *    handlers are taken, so available_mask contains all
+        *    available CPUs running interrupt handlers.
+        * 3) If available_mask is empty, then all CPUs on the
+        *    preferred NUMA node are taken, so other NUMA nodes are
+        *    used for process assignments using the same method as
+        *    the preferred NUMA node.
         */
-       cpumask_andnot(diff, mask, intrs);
+       cpumask_andnot(diff, available_mask, intrs_mask);
        if (!cpumask_empty(diff))
-               cpumask_copy(mask, diff);
+               cpumask_copy(available_mask, diff);
 
-       /*
-        * if we don't have a cpu on the preferred NUMA, get
-        * the list of the remaining available CPUs
-        */
-       if (cpumask_empty(mask)) {
-               cpumask_andnot(diff, &set->mask, &set->used);
-               cpumask_andnot(mask, diff, node_mask);
+       /* If we don't have CPUs on the preferred node, use other NUMA nodes */
+       if (cpumask_empty(available_mask)) {
+               cpumask_andnot(available_mask, hw_thread_mask, &set->used);
+               /* Excluding preferred NUMA cores */
+               cpumask_andnot(available_mask, available_mask, node_mask);
+               hfi1_cdbg(PROC,
+                         "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
+                         cpumask_pr_args(available_mask));
+
+               /*
+                * At first, we don't want to place processes on the same
+                * CPUs as interrupt handlers.
+                */
+               cpumask_andnot(diff, available_mask, intrs_mask);
+               if (!cpumask_empty(diff))
+                       cpumask_copy(available_mask, diff);
        }
-       hfi1_cdbg(PROC, "possible CPUs for process %*pbl",
-                 cpumask_pr_args(mask));
+       hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
+                 cpumask_pr_args(available_mask));
 
-       cpu = cpumask_first(mask);
+       cpu = cpumask_first(available_mask);
        if (cpu >= nr_cpu_ids) /* empty */
                cpu = -1;
        else
                cpumask_set_cpu(cpu, &set->used);
-       spin_unlock(&dd->affinity->lock);
-
-       free_cpumask_var(intrs);
-free_mask:
-       free_cpumask_var(mask);
+       spin_unlock(&affinity->lock);
+       hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);
+
+       free_cpumask_var(intrs_mask);
+free_available_mask:
+       free_cpumask_var(available_mask);
+free_hw_thread_mask:
+       free_cpumask_var(hw_thread_mask);
 free_diff:
        free_cpumask_var(diff);
 done:
        return cpu;
 }
 
-void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu)
+void hfi1_put_proc_affinity(int cpu)
 {
-       struct cpu_mask_set *set = &dd->affinity->proc;
+       struct hfi1_affinity_node_list *affinity = &node_affinity;
+       struct cpu_mask_set *set = &affinity->proc;
 
        if (cpu < 0)
                return;
-       spin_lock(&dd->affinity->lock);
+       spin_lock(&affinity->lock);
        cpumask_clear_cpu(cpu, &set->used);
+       hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
        if (cpumask_empty(&set->used) && set->gen) {
                set->gen--;
                cpumask_copy(&set->used, &set->mask);
        }
-       spin_unlock(&dd->affinity->lock);
+       spin_unlock(&affinity->lock);
 }
 
+/* Prevents concurrent reads and writes of the sdma_affinity attrib */
+static DEFINE_MUTEX(sdma_affinity_mutex);
+
+int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf,
+                          size_t count)
+{
+       struct hfi1_affinity_node *entry;
+       struct cpumask mask;
+       int ret, i;
+
+       spin_lock(&node_affinity.lock);
+       entry = node_affinity_lookup(dd->node);
+       spin_unlock(&node_affinity.lock);
+
+       if (!entry)
+               return -EINVAL;
+
+       ret = cpulist_parse(buf, &mask);
+       if (ret)
+               return ret;
+
+       if (!cpumask_subset(&mask, cpu_online_mask) || cpumask_empty(&mask)) {
+               dd_dev_warn(dd, "Invalid CPU mask\n");
+               return -EINVAL;
+       }
+
+       mutex_lock(&sdma_affinity_mutex);
+       /* reset the SDMA interrupt affinity details */
+       init_cpu_mask_set(&entry->def_intr);
+       cpumask_copy(&entry->def_intr.mask, &mask);
+       /*
+        * Reassign the affinity for each SDMA interrupt.
+        */
+       for (i = 0; i < dd->num_msix_entries; i++) {
+               struct hfi1_msix_entry *msix;
+
+               msix = &dd->msix_entries[i];
+               if (msix->type != IRQ_SDMA)
+                       continue;
+
+               ret = hfi1_get_irq_affinity(dd, msix);
+
+               if (ret)
+                       break;
+       }
+
+       mutex_unlock(&sdma_affinity_mutex);
+       return ret ? ret : strnlen(buf, PAGE_SIZE);
+}
+
+int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf)
+{
+       struct hfi1_affinity_node *entry;
+
+       spin_lock(&node_affinity.lock);
+       entry = node_affinity_lookup(dd->node);
+       spin_unlock(&node_affinity.lock);
+
+       if (!entry)
+               return -EINVAL;
+
+       mutex_lock(&sdma_affinity_mutex);
+       cpumap_print_to_pagebuf(true, buf, &entry->def_intr.mask);
+       mutex_unlock(&sdma_affinity_mutex);
+       return strnlen(buf, PAGE_SIZE);
+}
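
Both the IRQ path (hfi1_get_irq_affinity()) and the process path
(hfi1_get_proc_affinity()) above share one allocation pattern built on struct
cpu_mask_set: hand out the first CPU in the mask that is not yet "used", and
once the whole mask has been consumed, bump the generation counter and start
overloading CPUs from an empty used mask. Condensed into a sketch (the helper
name is invented; the caller is assumed to hold node_affinity.lock):

#include <linux/cpumask.h>

static int cpu_mask_set_pick(struct cpu_mask_set *set, cpumask_var_t scratch)
{
	int cpu;

	if (cpumask_equal(&set->mask, &set->used)) {
		set->gen++;			/* every CPU handed out: new pass */
		cpumask_clear(&set->used);
	}
	cpumask_andnot(scratch, &set->mask, &set->used);
	cpu = cpumask_first(scratch);
	if (cpu >= nr_cpu_ids)
		return -1;			/* the set itself is empty */
	cpumask_set_cpu(cpu, &set->used);
	return cpu;
}

The put paths run the inverse: clear the CPU from the used mask, and drop the
generation once the used mask empties out again.
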
drivers/infiniband/hw/hfi1/affinity.h
index 20f52fe7409161cb772b90aae96f8e4c7a012e05..8879cf7a8cac54fede679918f835c2a180a5e962 100644 (file)
@@ -73,7 +73,6 @@ struct cpu_mask_set {
 struct hfi1_affinity {
        struct cpu_mask_set def_intr;
        struct cpu_mask_set rcv_intr;
-       struct cpu_mask_set proc;
        struct cpumask real_cpu_mask;
        /* spin lock to protect affinity struct */
        spinlock_t lock;
@@ -82,11 +81,9 @@ struct hfi1_affinity {
 struct hfi1_msix_entry;
 
 /* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *);
+void init_real_cpu_mask(void);
 /* Initialize driver affinity data */
-void hfi1_dev_affinity_init(struct hfi1_devdata *);
-/* Free driver affinity data */
-void hfi1_dev_affinity_free(struct hfi1_devdata *);
+int hfi1_dev_affinity_init(struct hfi1_devdata *);
 /*
  * Set IRQ affinity to a CPU. The function will determine the
  * CPU and set the affinity to it.
@@ -101,8 +98,35 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *);
  * Determine a CPU affinity for a user process, if the process does not
  * have an affinity set yet.
  */
-int hfi1_get_proc_affinity(struct hfi1_devdata *, int);
+int hfi1_get_proc_affinity(int);
 /* Release a CPU used by a user process. */
-void hfi1_put_proc_affinity(struct hfi1_devdata *, int);
+void hfi1_put_proc_affinity(int);
+
+int hfi1_get_sdma_affinity(struct hfi1_devdata *dd, char *buf);
+int hfi1_set_sdma_affinity(struct hfi1_devdata *dd, const char *buf,
+                          size_t count);
+
+struct hfi1_affinity_node {
+       int node;
+       struct cpu_mask_set def_intr;
+       struct cpu_mask_set rcv_intr;
+       struct cpumask general_intr_mask;
+       struct list_head list;
+};
+
+struct hfi1_affinity_node_list {
+       struct list_head list;
+       struct cpumask real_cpu_mask;
+       struct cpu_mask_set proc;
+       int num_core_siblings;
+       int num_online_nodes;
+       int num_online_cpus;
+       /* protect affinity node list */
+       spinlock_t lock;
+};
+
+int node_affinity_init(void);
+void node_affinity_destroy(void);
+extern struct hfi1_affinity_node_list node_affinity;
 
 #endif /* _HFI1_AFFINITY_H */
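
Note that the process-affinity entry points are now device-independent; they
take only a NUMA node and consult the global node_affinity state. A
hypothetical open/close pairing showing the intended lifecycle
(hfi1_put_proc_affinity() is a no-op for cpu < 0, per the implementation
above):

#include <linux/sched.h>
#include "affinity.h"

static int example_ctxt_open(struct hfi1_devdata *dd, int *assigned_cpu)
{
	int cpu = hfi1_get_proc_affinity(dd->node); /* -1 if no recommendation */

	if (cpu != -1)
		set_cpus_allowed_ptr(current, cpumask_of(cpu));
	*assigned_cpu = cpu;
	return 0;
}

static void example_ctxt_close(int assigned_cpu)
{
	hfi1_put_proc_affinity(assigned_cpu);	/* releases the used-mask slot */
}
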
drivers/infiniband/hw/hfi1/chip.c
index dad4d0ebbdffb45e2c667cc95b86aabf39b53a87..b32638d58ae82c73f920afee4a7fd0a4386581a8 100644 (file)
@@ -63,6 +63,7 @@
 #include "efivar.h"
 #include "platform.h"
 #include "aspm.h"
+#include "affinity.h"
 
 #define NUM_IB_PORTS 1
 
@@ -121,6 +122,7 @@ struct flag_table {
 #define SEC_SC_HALTED          0x4     /* per-context only */
 #define SEC_SPC_FREEZE         0x8     /* per-HFI only */
 
+#define DEFAULT_KRCVQS           2
 #define MIN_KERNEL_KCTXTS         2
 #define FIRST_KERNEL_KCTXT        1
 /* sizes for both the QP and RSM map tables */
@@ -238,6 +240,9 @@ struct flag_table {
 /* all CceStatus sub-block RXE pause bits */
 #define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
 
+#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
+#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
+
 /*
  * CCE Error flags.
  */
@@ -3947,6 +3952,28 @@ static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry,
        return dd->sw_send_dma_eng_err_status_cnt[0];
 }
 
+static u64 access_dc_rcv_err_cnt(const struct cntr_entry *entry,
+                                void *context, int vl, int mode,
+                                u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       u64 val = 0;
+       u64 csr = entry->csr;
+
+       val = read_write_csr(dd, csr, mode, data);
+       if (mode == CNTR_MODE_R) {
+               val = val > CNTR_MAX - dd->sw_rcv_bypass_packet_errors ?
+                       CNTR_MAX : val + dd->sw_rcv_bypass_packet_errors;
+       } else if (mode == CNTR_MODE_W) {
+               dd->sw_rcv_bypass_packet_errors = 0;
+       } else {
+               dd_dev_err(dd, "Invalid cntr register access mode");
+               return 0;
+       }
+       return val;
+}
+
 #define def_access_sw_cpu(cntr) \
 static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry,                      \
                              void *context, int vl, int mode, u64 data)      \
@@ -4020,7 +4047,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
                        CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
 [C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
                              CNTR_SYNTH),
-[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH),
+[C_DC_RCV_ERR] = CNTR_ELEM("DcRecvErr", DCC_ERR_PORTRCV_ERR_CNT, 0, CNTR_SYNTH,
+                           access_dc_rcv_err_cnt),
 [C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
                                 CNTR_SYNTH),
 [C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
@@ -8798,30 +8826,6 @@ static int write_tx_settings(struct hfi1_devdata *dd,
        return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
 }
 
-static void check_fabric_firmware_versions(struct hfi1_devdata *dd)
-{
-       u32 frame, version, prod_id;
-       int ret, lane;
-
-       /* 4 lanes */
-       for (lane = 0; lane < 4; lane++) {
-               ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame);
-               if (ret) {
-                       dd_dev_err(dd,
-                                  "Unable to read lane %d firmware details\n",
-                                  lane);
-                       continue;
-               }
-               version = (frame >> SPICO_ROM_VERSION_SHIFT)
-                                       & SPICO_ROM_VERSION_MASK;
-               prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT)
-                                       & SPICO_ROM_PROD_ID_MASK;
-               dd_dev_info(dd,
-                           "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
-                           lane, version, prod_id);
-       }
-}
-
 /*
  * Read an idle LCB message.
  *
@@ -9187,17 +9191,24 @@ static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
        unsigned long timeout;
 
        /*
-        * Check for QSFP interrupt for t_init (SFF 8679)
+        * Some QSFP cables have a quirk that asserts the IntN line as a side
+        * effect of power up on plug-in. We ignore this false positive
+        * interrupt until the module has finished powering up by waiting for
+        * a minimum timeout of the module inrush initialization time of
+        * 500 ms (SFF 8679 Table 5-6) to ensure the voltage rails in the
+        * module have stabilized.
+        */
+       msleep(500);
+
+       /*
+        * Check for QSFP interrupt for t_init (SFF 8679 Table 8-1)
         */
        timeout = jiffies + msecs_to_jiffies(2000);
        while (1) {
                mask = read_csr(dd, dd->hfi1_id ?
                                ASIC_QSFP2_IN : ASIC_QSFP1_IN);
-               if (!(mask & QSFP_HFI0_INT_N)) {
-                       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR :
-                                 ASIC_QSFP1_CLEAR, QSFP_HFI0_INT_N);
+               if (!(mask & QSFP_HFI0_INT_N))
                        break;
-               }
                if (time_after(jiffies, timeout)) {
                        dd_dev_info(dd, "%s: No IntN detected, reset complete\n",
                                    __func__);
@@ -9213,10 +9224,17 @@ static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable)
        u64 mask;
 
        mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK);
-       if (enable)
+       if (enable) {
+               /*
+                * Clear the status register to avoid an immediate interrupt
+                * when we re-enable the IntN pin
+                */
+               write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR,
+                         QSFP_HFI0_INT_N);
                mask |= (u64)QSFP_HFI0_INT_N;
-       else
+       } else {
                mask &= ~(u64)QSFP_HFI0_INT_N;
+       }
        write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask);
 }
 
@@ -9630,14 +9648,6 @@ void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
                hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
 }
 
-int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
-                       struct hfi1_ctxt_info *kinfo)
-{
-       kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) |
-               HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U);
-       return 0;
-}
-
 struct hfi1_message_header *hfi1_get_msgheader(
                                struct hfi1_devdata *dd, __le32 *rhf_addr)
 {
@@ -9890,6 +9900,131 @@ static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
        return 0;
 }
 
+static const char *state_completed_string(u32 completed)
+{
+       static const char * const state_completed[] = {
+               "EstablishComm",
+               "OptimizeEQ",
+               "VerifyCap"
+       };
+
+       if (completed < ARRAY_SIZE(state_completed))
+               return state_completed[completed];
+
+       return "unknown";
+}
+
+static const char all_lanes_dead_timeout_expired[] =
+       "All lanes were inactive â€“ was the interconnect media removed?";
+static const char tx_out_of_policy[] =
+       "Passing lanes on local port do not meet the local link width policy";
+static const char no_state_complete[] =
+       "State timeout occurred before link partner completed the state";
+static const char * const state_complete_reasons[] = {
+       [0x00] = "Reason unknown",
+       [0x01] = "Link was halted by driver, refer to LinkDownReason",
+       [0x02] = "Link partner reported failure",
+       [0x10] = "Unable to achieve frame sync on any lane",
+       [0x11] =
+         "Unable to find a common bit rate with the link partner",
+       [0x12] =
+         "Unable to achieve frame sync on sufficient lanes to meet the local link width policy",
+       [0x13] =
+         "Unable to identify preset equalization on sufficient lanes to meet the local link width policy",
+       [0x14] = no_state_complete,
+       [0x15] =
+         "State timeout occurred before link partner identified equalization presets",
+       [0x16] =
+         "Link partner completed the EstablishComm state, but the passing lanes do not meet the local link width policy",
+       [0x17] = tx_out_of_policy,
+       [0x20] = all_lanes_dead_timeout_expired,
+       [0x21] =
+         "Unable to achieve acceptable BER on sufficient lanes to meet the local link width policy",
+       [0x22] = no_state_complete,
+       [0x23] =
+         "Link partner completed the OptimizeEq state, but the passing lanes do not meet the local link width policy",
+       [0x24] = tx_out_of_policy,
+       [0x30] = all_lanes_dead_timeout_expired,
+       [0x31] =
+         "State timeout occurred waiting for host to process received frames",
+       [0x32] = no_state_complete,
+       [0x33] =
+         "Link partner completed the VerifyCap state, but the passing lanes do not meet the local link width policy",
+       [0x34] = tx_out_of_policy,
+};
+
+static const char *state_complete_reason_code_string(struct hfi1_pportdata *ppd,
+                                                    u32 code)
+{
+       const char *str = NULL;
+
+       if (code < ARRAY_SIZE(state_complete_reasons))
+               str = state_complete_reasons[code];
+
+       if (str)
+               return str;
+       return "Reserved";
+}
+
+/* describe the given last state complete frame */
+static void decode_state_complete(struct hfi1_pportdata *ppd, u32 frame,
+                                 const char *prefix)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 success;
+       u32 state;
+       u32 reason;
+       u32 lanes;
+
+       /*
+        * Decode frame:
+        *  [ 0: 0] - success
+        *  [ 3: 1] - state
+        *  [ 7: 4] - next state timeout
+        *  [15: 8] - reason code
+        *  [31:16] - lanes
+        */
+       success = frame & 0x1;
+       state = (frame >> 1) & 0x7;
+       reason = (frame >> 8) & 0xff;
+       lanes = (frame >> 16) & 0xffff;
+
+       dd_dev_err(dd, "Last %s LNI state complete frame 0x%08x:\n",
+                  prefix, frame);
+       dd_dev_err(dd, "    last reported state: %s (0x%x)\n",
+                  state_completed_string(state), state);
+       dd_dev_err(dd, "    state successfully completed: %s\n",
+                  success ? "yes" : "no");
+       dd_dev_err(dd, "    fail reason 0x%x: %s\n",
+                  reason, state_complete_reason_code_string(ppd, reason));
+       dd_dev_err(dd, "    passing lane mask: 0x%x", lanes);
+}
+
+/*
+ * Read the last state complete frames and explain them.  This routine
+ * expects to be called if the link went down during link negotiation
+ * and initialization (LNI).  That is, anywhere between polling and link up.
+ */
+static void check_lni_states(struct hfi1_pportdata *ppd)
+{
+       u32 last_local_state;
+       u32 last_remote_state;
+
+       read_last_local_state(ppd->dd, &last_local_state);
+       read_last_remote_state(ppd->dd, &last_remote_state);
+
+       /*
+        * Don't report anything if there is nothing to report.  A value of
+        * 0 means the link was taken down while polling and there was no
+        * training in-process.
+        */
+       if (last_local_state == 0 && last_remote_state == 0)
+               return;
+
+       decode_state_complete(ppd, last_local_state, "transmitted");
+       decode_state_complete(ppd, last_remote_state, "received");
+}
+
 /*
  * Helper for set_link_state().  Do not call except from that routine.
  * Expects ppd->hls_mutex to be held.
@@ -9902,8 +10037,6 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
 {
        struct hfi1_devdata *dd = ppd->dd;
        u32 pstate, previous_state;
-       u32 last_local_state;
-       u32 last_remote_state;
        int ret;
        int do_transition;
        int do_wait;
@@ -10003,12 +10136,7 @@ static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
        } else if (previous_state
                        & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
                /* went down while attempting link up */
-               /* byte 1 of last_*_state is the failure reason */
-               read_last_local_state(dd, &last_local_state);
-               read_last_remote_state(dd, &last_remote_state);
-               dd_dev_err(dd,
-                          "LNI failure last states: local 0x%08x, remote 0x%08x\n",
-                          last_local_state, last_remote_state);
+               check_lni_states(ppd);
        }
 
        /* the active link width (downgrade) is 0 on link down */
@@ -11668,9 +11796,6 @@ static void free_cntrs(struct hfi1_devdata *dd)
        dd->cntrnames = NULL;
 }
 
-#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
-#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
-
 static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
                              u64 *psval, void *context, int vl)
 {
@@ -12325,37 +12450,6 @@ u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
        return ib_pstate;
 }
 
-/*
- * Read/modify/write ASIC_QSFP register bits as selected by mask
- * data: 0 or 1 in the positions depending on what needs to be written
- * dir: 0 for read, 1 for write
- * mask: select by setting
- *      I2CCLK  (bit 0)
- *      I2CDATA (bit 1)
- */
-u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
-                 u32 mask)
-{
-       u64 qsfp_oe, target_oe;
-
-       target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
-       if (mask) {
-               /* We are writing register bits, so lock access */
-               dir &= mask;
-               data &= mask;
-
-               qsfp_oe = read_csr(dd, target_oe);
-               qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir;
-               write_csr(dd, target_oe, qsfp_oe);
-       }
-       /* We are exclusively reading bits here, but it is unlikely
-        * we'll get valid data when we set the direction of the pin
-        * in the same call, so read should call this function again
-        * to get valid data
-        */
-       return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
-}
-
 #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
 (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
 
@@ -12780,7 +12874,6 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 
        /*
         * Kernel receive contexts:
-        * - min of 2 or 1 context/numa (excluding control context)
         * - Context 0 - control context (VL15/multicast/error)
         * - Context 1 - first kernel context
         * - Context 2 - second kernel context
@@ -12794,9 +12887,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
                 */
                num_kernel_contexts = n_krcvqs + 1;
        else
-               num_kernel_contexts = num_online_nodes() + 1;
-       num_kernel_contexts =
-               max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
+               num_kernel_contexts = DEFAULT_KRCVQS + 1;
        /*
         * Every kernel receive context needs an ACK send context.
         * one send context is allocated for each VL{0-7} and VL15
@@ -12815,7 +12906,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
         */
        if (num_user_contexts < 0)
                num_user_contexts =
-                       cpumask_weight(&dd->affinity->real_cpu_mask);
+                       cpumask_weight(&node_affinity.real_cpu_mask);
 
        total_contexts = num_kernel_contexts + num_user_contexts;
 
@@ -14141,6 +14232,11 @@ static int init_asic_data(struct hfi1_devdata *dd)
        }
        dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
        spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+       /* first one through - set up i2c devices */
+       if (!peer)
+               ret = set_up_i2c(dd, dd->asic_data);
+
        return ret;
 }
 
@@ -14445,19 +14541,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
                 (dd->revision >> CCE_REVISION_SW_SHIFT)
                    & CCE_REVISION_SW_MASK);
 
-       /*
-        * The real cpu mask is part of the affinity struct but has to be
-        * initialized earlier than the rest of the affinity struct because it
-        * is needed to calculate the number of user contexts in
-        * set_up_context_variables(). However, hfi1_dev_affinity_init(),
-        * which initializes the rest of the affinity struct members,
-        * depends on set_up_context_variables() for the number of kernel
-        * contexts, so it cannot be called before set_up_context_variables().
-        */
-       ret = init_real_cpu_mask(dd);
-       if (ret)
-               goto bail_cleanup;
-
        ret = set_up_context_variables(dd);
        if (ret)
                goto bail_cleanup;
@@ -14471,7 +14554,9 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
        /* set up KDETH QP prefix in both RX and TX CSRs */
        init_kdeth_qp(dd);
 
-       hfi1_dev_affinity_init(dd);
+       ret = hfi1_dev_affinity_init(dd);
+       if (ret)
+               goto bail_cleanup;
 
        /* send contexts must be set up before receive contexts */
        ret = init_send_contexts(dd);
@@ -14508,8 +14593,14 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
        /* set up LCB access - must be after set_up_interrupts() */
        init_lcb_access(dd);
 
+       /*
+        * Serial number is created from the base guid:
+        * [27:24] = base guid [38:35]
+        * [23: 0] = base guid [23: 0]
+        */
        snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
-                dd->base_guid & 0xFFFFFF);
+                (dd->base_guid & 0xFFFFFF) |
+                    ((dd->base_guid >> 11) & 0xF000000));
 
        dd->oui1 = dd->base_guid >> 56 & 0xFF;
        dd->oui2 = dd->base_guid >> 48 & 0xFF;
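
The splice above follows directly from the new comment:
((base_guid >> 11) & 0xF000000) moves GUID bits 38:35 down to serial bits
27:24 (35 - 11 = 24), and the low 24 bits pass through unchanged. A
self-contained check, with a made-up GUID:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* illustrative GUID; only bits 38:35 and 23:0 matter here */
	uint64_t base_guid = 0x001175090a0b0c0dULL;
	uint32_t serial = (base_guid & 0xFFFFFF) |
			  ((base_guid >> 11) & 0xF000000);

	printf("serial 0x%08x\n", serial);
	return 0;
}
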
@@ -14518,7 +14609,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
        ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
        if (ret)
                goto bail_clear_intr;
-       check_fabric_firmware_versions(dd);
 
        thermal_init(dd);
 
index 66a327978739dd0b33e82146b60cc26c069e8703..ed11107c50fe50614efc39b469a31cb7e240291c 100644 (file)
@@ -640,6 +640,7 @@ extern uint platform_config_load;
 /* SBus commands */
 #define RESET_SBUS_RECEIVER 0x20
 #define WRITE_SBUS_RECEIVER 0x21
+#define READ_SBUS_RECEIVER  0x22
 void sbus_request(struct hfi1_devdata *dd,
                  u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
 int sbus_request_slow(struct hfi1_devdata *dd,
@@ -1336,10 +1337,6 @@ void hfi1_start_cleanup(struct hfi1_devdata *dd);
 void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
 struct hfi1_message_header *hfi1_get_msgheader(
                                struct hfi1_devdata *dd, __le32 *rhf_addr);
-int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
-                       struct hfi1_ctxt_info *kinfo);
-u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
-                 u32 mask);
 int hfi1_init_ctxt(struct send_context *sc);
 void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
                  u32 type, unsigned long pa, u16 order);
index 8744de6667c25fbe22c37f84c247540744886c4b..5b999389978946003b84f202c4f715e0a0ade242 100644 (file)
 #define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010)
 #define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull
 #define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull
+#define ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT 2
+#define ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK 0x7ull
+#define ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT 32
+#define ASIC_STS_SBUS_RESULT_DATA_OUT_MASK 0xFFFFFFFFull
 #define ASIC_STS_THERM (ASIC + 0x000000000058)
 #define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull
 #define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18
index c75b0ae688f87713a922be6df18ab19efb67e56b..8246dc7d0573a1be768fb7d65fab2fba01c7c686 100644 (file)
@@ -392,9 +392,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
                        u16 rlid;
                        u8 svc_type, sl, sc5;
 
-                       sc5  = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf;
-                       if (rhf_dc_info(packet->rhf))
-                               sc5 |= 0x10;
+                       sc5 = hdr2sc(rhdr, packet->rhf);
                        sl = ibp->sc_to_sl[sc5];
 
                        lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK;
@@ -450,14 +448,20 @@ static inline void init_packet(struct hfi1_ctxtdata *rcd,
        packet->rcv_flags = 0;
 }
 
-static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr,
-                       struct hfi1_other_headers *ohdr,
-                       u64 rhf, u32 bth1, struct ib_grh *grh)
+void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+                              bool do_cnp)
 {
        struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       u32 rqpn = 0;
-       u16 rlid;
-       u8 sc5, svc_type;
+       struct hfi1_ib_header *hdr = pkt->hdr;
+       struct hfi1_other_headers *ohdr = pkt->ohdr;
+       struct ib_grh *grh = NULL;
+       u32 rqpn = 0, bth1;
+       u16 rlid, dlid = be16_to_cpu(hdr->lrh[1]);
+       u8 sc, svc_type;
+       bool is_mcast = false;
+
+       if (pkt->rcv_flags & HFI1_HAS_GRH)
+               grh = &hdr->u.l.grh;
 
        switch (qp->ibqp.qp_type) {
        case IB_QPT_SMI:
@@ -466,6 +470,8 @@ static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr,
                rlid = be16_to_cpu(hdr->lrh[3]);
                rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
                svc_type = IB_CC_SVCTYPE_UD;
+               is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+                       (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
                break;
        case IB_QPT_UC:
                rlid = qp->remote_ah_attr.dlid;
@@ -481,24 +487,23 @@ static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr,
                return;
        }
 
-       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-       if (rhf_dc_info(rhf))
-               sc5 |= 0x10;
+       sc = hdr2sc((struct hfi1_message_header *)hdr, pkt->rhf);
 
-       if (bth1 & HFI1_FECN_SMASK) {
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (do_cnp && (bth1 & HFI1_FECN_SMASK)) {
                u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-               u16 dlid = be16_to_cpu(hdr->lrh[1]);
 
-               return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc5, grh);
+               return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc, grh);
        }
 
-       if (bth1 & HFI1_BECN_SMASK) {
+       if (!is_mcast && (bth1 & HFI1_BECN_SMASK)) {
                struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
                u32 lqpn = bth1 & RVT_QPN_MASK;
-               u8 sl = ibp->sc_to_sl[sc5];
+               u8 sl = ibp->sc_to_sl[sc];
 
                process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
        }
 }
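
The new is_mcast test keeps BECN processing off multicast traffic: a UD
DLID is treated as multicast when it lies above the multicast LID base but
is not the permissive LID. A host-order sketch (the real code compares
big-endian values; the constants mirror the IBTA ones):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MULTICAST_LID_BASE 0xC000
#define LID_PERMISSIVE     0xFFFF

static bool is_mcast_dlid(uint16_t dlid)
{
	return dlid > MULTICAST_LID_BASE && dlid != LID_PERMISSIVE;
}

int main(void)
{
	printf("0xC001 -> %d\n", is_mcast_dlid(0xC001));	/* 1 */
	printf("0xFFFF -> %d\n", is_mcast_dlid(0xFFFF));	/* 0 */
	return 0;
}
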
 
 struct ps_mdata {
@@ -596,7 +601,6 @@ static void __prescan_rxq(struct hfi1_packet *packet)
                struct rvt_qp *qp;
                struct hfi1_ib_header *hdr;
                struct hfi1_other_headers *ohdr;
-               struct ib_grh *grh = NULL;
                struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
                u64 rhf = rhf_to_cpu(rhf_addr);
                u32 etype = rhf_rcv_type(rhf), qpn, bth1;
@@ -616,14 +620,13 @@ static void __prescan_rxq(struct hfi1_packet *packet)
                        hfi1_get_msgheader(dd, rhf_addr);
                lnh = be16_to_cpu(hdr->lrh[0]) & 3;
 
-               if (lnh == HFI1_LRH_BTH) {
+               if (lnh == HFI1_LRH_BTH)
                        ohdr = &hdr->u.oth;
-               } else if (lnh == HFI1_LRH_GRH) {
+               else if (lnh == HFI1_LRH_GRH)
                        ohdr = &hdr->u.l.oth;
-                       grh = &hdr->u.l.grh;
-               } else {
+               else
                        goto next; /* just in case */
-               }
+
                bth1 = be32_to_cpu(ohdr->bth[1]);
                is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK));
 
@@ -639,7 +642,7 @@ static void __prescan_rxq(struct hfi1_packet *packet)
                        goto next;
                }
 
-               process_ecn(qp, hdr, ohdr, rhf, bth1, grh);
+               process_ecn(qp, packet, true);
                rcu_read_unlock();
 
                /* turn off BECN, FECN */
@@ -1362,6 +1365,7 @@ int process_receive_bypass(struct hfi1_packet *packet)
 
        dd_dev_err(packet->rcd->dd,
                   "Bypass packets are not supported in normal operation. Dropping\n");
+       incr_cntr64(&packet->rcd->dd->sw_rcv_bypass_packet_errors);
        return RHF_RCV_CONTINUE;
 }
 
index 32c19fad12a4c4df5d074ac5540e3a2f8ee4973c..1ecbec1923589c3ec96d0958ec45541aa2b5724b 100644 (file)
@@ -168,6 +168,7 @@ static inline int is_valid_mmap(u64 token)
 
 static int hfi1_file_open(struct inode *inode, struct file *fp)
 {
+       struct hfi1_filedata *fd;
        struct hfi1_devdata *dd = container_of(inode->i_cdev,
                                               struct hfi1_devdata,
                                               user_cdev);
@@ -176,10 +177,17 @@ static int hfi1_file_open(struct inode *inode, struct file *fp)
        kobject_get(&dd->kobj);
 
        /* The real work is performed later in assign_ctxt() */
-       fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL);
-       if (fp->private_data) /* no cpu affinity by default */
-               ((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1;
-       return fp->private_data ? 0 : -ENOMEM;
+
+       fd = kzalloc(sizeof(*fd), GFP_KERNEL);
+       if (fd) {
+               fd->rec_cpu_num = -1; /* no cpu affinity by default */
+               fd->mm = current->mm;
+       }
+
+       fp->private_data = fd;
+
+       return fd ? 0 : -ENOMEM;
 }
 
 static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
@@ -392,41 +400,38 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
        struct hfi1_filedata *fd = kiocb->ki_filp->private_data;
        struct hfi1_user_sdma_pkt_q *pq = fd->pq;
        struct hfi1_user_sdma_comp_q *cq = fd->cq;
-       int ret = 0, done = 0, reqs = 0;
+       int done = 0, reqs = 0;
        unsigned long dim = from->nr_segs;
 
-       if (!cq || !pq) {
-               ret = -EIO;
-               goto done;
-       }
+       if (!cq || !pq)
+               return -EIO;
 
-       if (!iter_is_iovec(from) || !dim) {
-               ret = -EINVAL;
-               goto done;
-       }
+       if (!iter_is_iovec(from) || !dim)
+               return -EINVAL;
 
        hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
                  fd->uctxt->ctxt, fd->subctxt, dim);
 
-       if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) {
-               ret = -ENOSPC;
-               goto done;
-       }
+       if (atomic_read(&pq->n_reqs) == pq->n_max_reqs)
+               return -ENOSPC;
 
        while (dim) {
+               int ret;
                unsigned long count = 0;
 
                ret = hfi1_user_sdma_process_request(
                        kiocb->ki_filp, (struct iovec *)(from->iov + done),
                        dim, &count);
-               if (ret)
-                       goto done;
+               if (ret) {
+                       reqs = ret;
+                       break;
+               }
                dim -= count;
                done += count;
                reqs++;
        }
-done:
-       return ret ? ret : reqs;
+
+       return reqs;
 }
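
The rewritten loop drops the shared 'done' label: each request either
bumps the completed count or, on the first failure, replaces the count
with the error code. The shape of that contract, with submit_one()
standing in for hfi1_user_sdma_process_request():

#include <stdio.h>

static int submit_one(int i)
{
	return i == 3 ? -22 : 0;	/* fail the 4th request (-EINVAL) */
}

static int submit_all(int n)
{
	int i, reqs = 0;

	for (i = 0; i < n; i++) {
		int ret = submit_one(i);

		if (ret) {
			reqs = ret;	/* report the error instead */
			break;
		}
		reqs++;
	}
	return reqs;
}

int main(void)
{
	printf("submit_all(2) = %d\n", submit_all(2));	/* 2 */
	printf("submit_all(5) = %d\n", submit_all(5));	/* -22 */
	return 0;
}
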
 
 static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
@@ -718,7 +723,7 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
        hfi1_user_sdma_free_queues(fdata);
 
        /* release the cpu */
-       hfi1_put_proc_affinity(dd, fdata->rec_cpu_num);
+       hfi1_put_proc_affinity(fdata->rec_cpu_num);
 
        /*
         * Clear any left over, unhandled events so the next process that
@@ -730,7 +735,6 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
 
        if (--uctxt->cnt) {
                uctxt->active_slaves &= ~(1 << fdata->subctxt);
-               uctxt->subpid[fdata->subctxt] = 0;
                mutex_unlock(&hfi1_mutex);
                goto done;
        }
@@ -756,7 +760,6 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
        write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
                        hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
        sc_disable(uctxt->sc);
-       uctxt->pid = 0;
        spin_unlock_irqrestore(&dd->uctxt_lock, flags);
 
        dd->rcd[uctxt->ctxt] = NULL;
@@ -818,9 +821,10 @@ static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
                ret = find_shared_ctxt(fp, uinfo);
                if (ret < 0)
                        goto done_unlock;
-               if (ret)
-                       fd->rec_cpu_num = hfi1_get_proc_affinity(
-                               fd->uctxt->dd, fd->uctxt->numa_id);
+               if (ret) {
+                       fd->rec_cpu_num =
+                               hfi1_get_proc_affinity(fd->uctxt->numa_id);
+               }
        }
 
        /*
@@ -895,7 +899,6 @@ static int find_shared_ctxt(struct file *fp,
                        }
                        fd->uctxt = uctxt;
                        fd->subctxt  = uctxt->cnt++;
-                       uctxt->subpid[fd->subctxt] = current->pid;
                        uctxt->active_slaves |= 1 << fd->subctxt;
                        ret = 1;
                        goto done;
@@ -932,7 +935,11 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
        if (ctxt == dd->num_rcv_contexts)
                return -EBUSY;
 
-       fd->rec_cpu_num = hfi1_get_proc_affinity(dd, -1);
+       /*
+        * If no NUMA node was requested, prefer the device's NUMA node.
+        */
+       fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node);
        if (fd->rec_cpu_num != -1)
                numa = cpu_to_node(fd->rec_cpu_num);
        else
@@ -976,8 +983,7 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
                        return ret;
        }
        uctxt->userversion = uinfo->userversion;
-       uctxt->pid = current->pid;
-       uctxt->flags = HFI1_CAP_UGET(MASK);
+       uctxt->flags = hfi1_cap_mask; /* save current flag state */
        init_waitqueue_head(&uctxt->wait);
        strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
        memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
@@ -1080,18 +1086,18 @@ static int user_init(struct file *fp)
        hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
 
        rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP))
+       if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
                rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
        /*
         * Ignore the bit in the flags for now until proper
         * support for multiple packets per rcv array entry is
         * added.
         */
-       if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
+       if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR))
                rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
+       if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL))
                rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
+       if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
                rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
        /*
         * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
@@ -1099,7 +1105,7 @@ static int user_init(struct file *fp)
         * uses of the chip or ctxt. Therefore, add the rcvctrl op
         * for both cases.
         */
-       if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
+       if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL))
                rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
        else
                rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
@@ -1122,9 +1128,14 @@ static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
        int ret = 0;
 
        memset(&cinfo, 0, sizeof(cinfo));
-       ret = hfi1_get_base_kinfo(uctxt, &cinfo);
-       if (ret < 0)
-               goto done;
+       cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) &
+                               HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) |
+                       HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
+                       HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
+       /* adjust flag if this fd is not able to cache */
+       if (!fd->handler)
+               cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
+
        cinfo.num_active = hfi1_count_active_units();
        cinfo.unit = uctxt->dd->unit;
        cinfo.ctxt = uctxt->ctxt;
@@ -1146,7 +1157,7 @@ static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
        trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
        if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
                ret = -EFAULT;
-done:
+
        return ret;
 }
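
runtime_flags is now built inline from the context's capability word: the
misc field is relocated above the user bits, the user-visible and
kernel-to-user bits are OR'd in, and TID_UNMAP is forced when the fd has
no caching handler. A toy model of the relocate-and-merge step; the
shift/mask values are placeholders, not the real hfi.h definitions, and
the K2U merge is omitted:

#include <stdint.h>
#include <stdio.h>

#define CAP_MISC_SHIFT 24	/* placeholder values */
#define CAP_MISC_MASK  0xff
#define CAP_USER_SHIFT 16
#define CAP_USER_MASK  0xffff

int main(void)
{
	unsigned long flags = 0xab001234;
	uint64_t runtime = (((flags >> CAP_MISC_SHIFT) & CAP_MISC_MASK)
				<< CAP_USER_SHIFT) |
			   (flags & CAP_USER_MASK);

	printf("runtime_flags 0x%llx\n", (unsigned long long)runtime);
	return 0;
}
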
 
index ed680fda611dfdcee292e2613ee6e2d13fdfd891..13db8eb4f4ece1949e699c9e8b509a4fa70598a0 100644 (file)
@@ -206,6 +206,9 @@ static const struct firmware *platform_config;
 /* the number of fabric SerDes on the SBus */
 #define NUM_FABRIC_SERDES 4
 
+/* ASIC_STS_SBUS_RESULT.RESULT_CODE value */
+#define SBUS_READ_COMPLETE 0x4
+
 /* SBus fabric SerDes addresses, one set per HFI */
 static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = {
        { 0x01, 0x02, 0x03, 0x04 },
@@ -240,6 +243,7 @@ static const u8 all_pcie_serdes_broadcast = 0xe0;
 static void dispose_one_firmware(struct firmware_details *fdet);
 static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
                                       struct firmware_details *fdet);
+static void dump_fw_version(struct hfi1_devdata *dd);
 
 /*
  * Read a single 64-bit value from 8051 data memory.
@@ -1078,6 +1082,44 @@ void sbus_request(struct hfi1_devdata *dd,
                   ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT));
 }
 
+/*
+ * Read a value from the SBus.
+ *
+ * Requires the caller to be in fast mode
+ */
+static u32 sbus_read(struct hfi1_devdata *dd, u8 receiver_addr, u8 data_addr,
+                    u32 data_in)
+{
+       u64 reg;
+       int retries;
+       int success = 0;
+       u32 result = 0;
+       u32 result_code = 0;
+
+       sbus_request(dd, receiver_addr, data_addr, READ_SBUS_RECEIVER, data_in);
+
+       for (retries = 0; retries < 100; retries++) {
+               usleep_range(1000, 1200); /* arbitrary */
+               reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+               result_code = (reg >> ASIC_STS_SBUS_RESULT_RESULT_CODE_SHIFT)
+                               & ASIC_STS_SBUS_RESULT_RESULT_CODE_MASK;
+               if (result_code != SBUS_READ_COMPLETE)
+                       continue;
+
+               success = 1;
+               result = (reg >> ASIC_STS_SBUS_RESULT_DATA_OUT_SHIFT)
+                          & ASIC_STS_SBUS_RESULT_DATA_OUT_MASK;
+               break;
+       }
+
+       if (!success) {
+               dd_dev_err(dd, "%s: read failed, result code 0x%x\n", __func__,
+                          result_code);
+       }
+
+       return result;
+}
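
The poll-and-decode in sbus_read() matches the new chip_registers.h
fields: RESULT_CODE at bits 4:2 must read SBUS_READ_COMPLETE (0x4) before
DATA_OUT at bits 63:32 is valid. The decode in isolation, against a faked
register value:

#include <stdint.h>
#include <stdio.h>

#define RESULT_CODE_SHIFT  2
#define RESULT_CODE_MASK   0x7ull
#define DATA_OUT_SHIFT     32
#define DATA_OUT_MASK      0xFFFFFFFFull
#define SBUS_READ_COMPLETE 0x4

int main(void)
{
	/* fabricated CSR value: read complete, data 0xdeadbeef */
	uint64_t reg = (0xdeadbeefull << DATA_OUT_SHIFT) |
		       ((uint64_t)SBUS_READ_COMPLETE << RESULT_CODE_SHIFT);
	uint32_t code = (reg >> RESULT_CODE_SHIFT) & RESULT_CODE_MASK;

	if (code == SBUS_READ_COMPLETE)
		printf("data 0x%llx\n",
		       (unsigned long long)((reg >> DATA_OUT_SHIFT) &
					    DATA_OUT_MASK));
	return 0;
}
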
+
 /*
  * Turn off the SBus and fabric serdes spicos.
  *
@@ -1636,6 +1678,7 @@ int load_firmware(struct hfi1_devdata *dd)
                        return ret;
        }
 
+       dump_fw_version(dd);
        return 0;
 }
 
@@ -2054,3 +2097,85 @@ void read_guid(struct hfi1_devdata *dd)
        dd_dev_info(dd, "GUID %llx",
                    (unsigned long long)dd->base_guid);
 }
+
+/* read and display firmware version info */
+static void dump_fw_version(struct hfi1_devdata *dd)
+{
+       u32 pcie_vers[NUM_PCIE_SERDES];
+       u32 fabric_vers[NUM_FABRIC_SERDES];
+       u32 sbus_vers;
+       int i;
+       int all_same;
+       int ret;
+       u8 rcv_addr;
+
+       ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT);
+       if (ret) {
+               dd_dev_err(dd, "Unable to acquire SBus to read firmware versions\n");
+               return;
+       }
+
+       /* set fast mode */
+       set_sbus_fast_mode(dd);
+
+       /* read version for SBus Master */
+       sbus_request(dd, SBUS_MASTER_BROADCAST, 0x02, WRITE_SBUS_RECEIVER, 0);
+       sbus_request(dd, SBUS_MASTER_BROADCAST, 0x07, WRITE_SBUS_RECEIVER, 0x1);
+       /* wait for interrupt to be processed */
+       usleep_range(10000, 11000);
+       sbus_vers = sbus_read(dd, SBUS_MASTER_BROADCAST, 0x08, 0x1);
+       dd_dev_info(dd, "SBus Master firmware version 0x%08x\n", sbus_vers);
+
+       /* read version for PCIe SerDes */
+       all_same = 1;
+       pcie_vers[0] = 0;
+       for (i = 0; i < NUM_PCIE_SERDES; i++) {
+               rcv_addr = pcie_serdes_addrs[dd->hfi1_id][i];
+               sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0);
+               /* wait for interrupt to be processed */
+               usleep_range(10000, 11000);
+               pcie_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0);
+               if (i > 0 && pcie_vers[0] != pcie_vers[i])
+                       all_same = 0;
+       }
+
+       if (all_same) {
+               dd_dev_info(dd, "PCIe SerDes firmware version 0x%x\n",
+                           pcie_vers[0]);
+       } else {
+               dd_dev_warn(dd, "PCIe SerDes do not have the same firmware version\n");
+               for (i = 0; i < NUM_PCIE_SERDES; i++) {
+                       dd_dev_info(dd,
+                                   "PCIe SerDes lane %d firmware version 0x%x\n",
+                                   i, pcie_vers[i]);
+               }
+       }
+
+       /* read version for fabric SerDes */
+       all_same = 1;
+       fabric_vers[0] = 0;
+       for (i = 0; i < NUM_FABRIC_SERDES; i++) {
+               rcv_addr = fabric_serdes_addrs[dd->hfi1_id][i];
+               sbus_request(dd, rcv_addr, 0x03, WRITE_SBUS_RECEIVER, 0);
+               /* wait for interrupt to be processed */
+               usleep_range(10000, 11000);
+               fabric_vers[i] = sbus_read(dd, rcv_addr, 0x04, 0x0);
+               if (i > 0 && fabric_vers[0] != fabric_vers[i])
+                       all_same = 0;
+       }
+
+       if (all_same) {
+               dd_dev_info(dd, "Fabric SerDes firmware version 0x%x\n",
+                           fabric_vers[0]);
+       } else {
+               dd_dev_warn(dd, "Fabric SerDes do not have the same firmware version\n");
+               for (i = 0; i < NUM_FABRIC_SERDES; i++) {
+                       dd_dev_info(dd,
+                                   "Fabric SerDes lane %d firmware version 0x%x\n",
+                                   i, fabric_vers[i]);
+               }
+       }
+
+       clear_sbus_fast_mode(dd);
+       release_chip_resource(dd, CR_SBUS);
+}
index 49a71e24a8f00922298b5405e246117781ff1fc7..1000e0fd96d9b4972cec3327739f5ca7ff59b7b3 100644 (file)
@@ -62,6 +62,8 @@
 #include <linux/cdev.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <linux/i2c.h>
+#include <linux/i2c-algo-bit.h>
 #include <rdma/rdma_vt.h>
 
 #include "chip_registers.h"
@@ -253,7 +255,7 @@ struct hfi1_ctxtdata {
        /* chip offset of PIO buffers for this ctxt */
        u32 piobufs;
        /* per-context configuration flags */
-       u32 flags;
+       unsigned long flags;
        /* per-context event flags for fileops/intr communication */
        unsigned long event_flags;
        /* WAIT_RCV that timed out, no interrupt */
@@ -268,9 +270,6 @@ struct hfi1_ctxtdata {
        u32 urgent;
        /* saved total number of polled urgent packets for poll edge trigger */
        u32 urgent_poll;
-       /* pid of process using this ctxt */
-       pid_t pid;
-       pid_t subpid[HFI1_MAX_SHARED_CTXTS];
        /* same size as task_struct .comm[], command that opened context */
        char comm[TASK_COMM_LEN];
        /* so file ops can get at unit */
@@ -366,11 +365,6 @@ struct hfi1_packet {
        u8 etype;
 };
 
-static inline bool has_sc4_bit(struct hfi1_packet *p)
-{
-       return !!rhf_dc_info(p->rhf);
-}
-
 /*
  * Private data for snoop/capture support.
  */
@@ -805,10 +799,19 @@ struct hfi1_temp {
        u8 triggers;      /* temperature triggers */
 };
 
+struct hfi1_i2c_bus {
+       struct hfi1_devdata *controlling_dd; /* current controlling device */
+       struct i2c_adapter adapter;     /* bus details */
+       struct i2c_algo_bit_data algo;  /* bus algorithm details */
+       int num;                        /* bus number, 0 or 1 */
+};
+
 /* common data between shared ASIC HFIs */
 struct hfi1_asic_data {
        struct hfi1_devdata *dds[2];    /* back pointers */
        struct mutex asic_resource_mutex;
+       struct hfi1_i2c_bus *i2c_bus0;
+       struct hfi1_i2c_bus *i2c_bus1;
 };
 
 /* device data struct now contains only "general per-device" info.
@@ -1128,7 +1131,8 @@ struct hfi1_devdata {
                NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS];
        /* Software counter that aggregates all cce_err_status errors */
        u64 sw_cce_err_status_aggregate;
-
+       /* Software counter that aggregates all bypass packet rcv errors */
+       u64 sw_rcv_bypass_packet_errors;
        /* receive interrupt functions */
        rhf_rcv_function_ptr *rhf_rcv_function_map;
        rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
@@ -1184,6 +1188,7 @@ struct hfi1_devdata {
 
 struct tid_rb_node;
 struct mmu_rb_node;
+struct mmu_rb_handler;
 
 /* Private data for file operations */
 struct hfi1_filedata {
@@ -1194,7 +1199,7 @@ struct hfi1_filedata {
        /* for cpu affinity; -1 if none */
        int rec_cpu_num;
        u32 tid_n_pinned;
-       struct rb_root tid_rb_root;
+       struct mmu_rb_handler *handler;
        struct tid_rb_node **entry_to_rb;
        spinlock_t tid_lock; /* protect tid_[limit,used] counters */
        u32 tid_limit;
@@ -1203,6 +1208,7 @@ struct hfi1_filedata {
        u32 invalid_tid_idx;
        /* protect invalid_tids array and invalid_tid_idx */
        spinlock_t invalid_lock;
+       struct mm_struct *mm;
 };
 
 extern struct list_head hfi1_dev_list;
@@ -1236,6 +1242,8 @@ int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int);
 int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int);
 void set_all_slowpath(struct hfi1_devdata *dd);
 
+extern const struct pci_device_id hfi1_pci_tbl[];
+
 /* receive packet handler dispositions */
 #define RCV_PKT_OK      0x0 /* keep going */
 #define RCV_PKT_LIMIT   0x1 /* stop, hit limit, start thread */
@@ -1261,7 +1269,7 @@ void receive_interrupt_work(struct work_struct *work);
 static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf)
 {
        return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) |
-              ((!!(rhf & RHF_DC_INFO_SMASK)) << 4);
+              ((!!(rhf_dc_info(rhf))) << 4);
 }
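
hdr2sc() assembles the 5-bit service class from two sources: LRH[0] bits
15:12 give SC[3:0], and the RHF DC_INFO flag gives SC[4]. Stripped of
byte swapping and the RHF accessor, the computation is:

#include <stdint.h>
#include <stdio.h>

/* host-order sketch of hdr2sc(); lrh0 is the already-swapped LRH word */
static int hdr2sc_demo(uint16_t lrh0, int dc_info)
{
	return ((lrh0 >> 12) & 0xf) | ((!!dc_info) << 4);
}

int main(void)
{
	printf("sc = 0x%x\n", hdr2sc_demo(0xf000, 1));	/* 0x1f */
	return 0;
}
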
 
 static inline u16 generate_jkey(kuid_t uid)
@@ -1571,6 +1579,22 @@ static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
        return &dd->pport[pidx].ibport_data;
 }
 
+void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+                              bool do_cnp);
+static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt,
+                              bool do_cnp)
+{
+       struct hfi1_other_headers *ohdr = pkt->ohdr;
+       u32 bth1;
+
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+               hfi1_process_ecn_slowpath(qp, pkt, do_cnp);
+               return bth1 & HFI1_FECN_SMASK;
+       }
+       return false;
+}
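
The inline wrapper keeps the common case out of line-of-fire: only packets
with FECN or BECN set take the slow-path call, and the caller still learns
whether a FECN was present. The control flow in miniature (the SMASK bit
positions are placeholders):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FECN_SMASK (1u << 31)	/* placeholder bit positions */
#define BECN_SMASK (1u << 30)

static bool process_ecn_demo(uint32_t bth1)
{
	if (bth1 & (FECN_SMASK | BECN_SMASK)) {
		/* hfi1_process_ecn_slowpath() would run here */
		return bth1 & FECN_SMASK;
	}
	return false;
}

int main(void)
{
	printf("%d %d\n", process_ecn_demo(FECN_SMASK),
	       process_ecn_demo(0));
	return 0;
}
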
+
 /*
  * Return the indexed PKEY from the port PKEY table.
  */
@@ -1588,14 +1612,23 @@ static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
 }
 
 /*
- * Readers of cc_state must call get_cc_state() under rcu_read_lock().
- * Writers of cc_state must call get_cc_state() under cc_state_lock.
+ * Called by readers of cc_state only; must be called under rcu_read_lock().
  */
 static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd)
 {
        return rcu_dereference(ppd->cc_state);
 }
 
+/*
+ * Called by writers of cc_state only; must be called under cc_state_lock.
+ */
+static inline
+struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd)
+{
+       return rcu_dereference_protected(ppd->cc_state,
+                                        lockdep_is_held(&ppd->cc_state_lock));
+}
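
Splitting the accessor lets lockdep verify each class of caller: readers
must be inside rcu_read_lock(), writers must hold cc_state_lock. A
user-space analogue of the same discipline using C11 atomics, standing in
for RCU only as an illustration:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct cc_state { int ccti_limit; };

static _Atomic(struct cc_state *) cc_state_ptr;
static pthread_mutex_t cc_state_lock = PTHREAD_MUTEX_INITIALIZER;

/* readers: lock-free acquire load (rcu_dereference() analogue) */
static struct cc_state *get_cc_state(void)
{
	return atomic_load_explicit(&cc_state_ptr, memory_order_acquire);
}

/* writers: publish under the lock (cc_state_lock analogue) */
static void set_cc_state(struct cc_state *new_state)
{
	pthread_mutex_lock(&cc_state_lock);
	atomic_store_explicit(&cc_state_ptr, new_state,
			      memory_order_release);
	pthread_mutex_unlock(&cc_state_lock);
}

int main(void)
{
	static struct cc_state s = { .ccti_limit = 3 };

	set_cc_state(&s);
	printf("ccti_limit %d\n", get_cc_state()->ccti_limit);
	return 0;
}
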
+
 /*
  * values for dd->flags (_device_ related flags)
  */
@@ -1671,9 +1704,12 @@ void shutdown_led_override(struct hfi1_pportdata *ppd);
  */
 #define DEFAULT_RCVHDR_ENTSIZE 32
 
-bool hfi1_can_pin_pages(struct hfi1_devdata *, u32, u32);
-int hfi1_acquire_user_pages(unsigned long, size_t, bool, struct page **);
-void hfi1_release_user_pages(struct mm_struct *, struct page **, size_t, bool);
+bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm,
+                       u32 nlocked, u32 npages);
+int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr,
+                           size_t npages, bool writable, struct page **pages);
+void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
+                            size_t npages, bool dirty);
 
 static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
 {
@@ -1949,4 +1985,55 @@ static inline u32 qsfp_resource(struct hfi1_devdata *dd)
 
 int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
 
+#define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
+#define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
+
+#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
+#define show_packettype(etype)                  \
+__print_symbolic(etype,                         \
+       packettype_name(EXPECTED),              \
+       packettype_name(EAGER),                 \
+       packettype_name(IB),                    \
+       packettype_name(ERROR),                 \
+       packettype_name(BYPASS))
+
+#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
+#define show_ib_opcode(opcode)                             \
+__print_symbolic(opcode,                                   \
+       ib_opcode_name(RC_SEND_FIRST),                     \
+       ib_opcode_name(RC_SEND_MIDDLE),                    \
+       ib_opcode_name(RC_SEND_LAST),                      \
+       ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
+       ib_opcode_name(RC_SEND_ONLY),                      \
+       ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
+       ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
+       ib_opcode_name(RC_RDMA_WRITE_LAST),                \
+       ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+       ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
+       ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+       ib_opcode_name(RC_RDMA_READ_REQUEST),              \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
+       ib_opcode_name(RC_ACKNOWLEDGE),                    \
+       ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
+       ib_opcode_name(RC_COMPARE_SWAP),                   \
+       ib_opcode_name(RC_FETCH_ADD),                      \
+       ib_opcode_name(UC_SEND_FIRST),                     \
+       ib_opcode_name(UC_SEND_MIDDLE),                    \
+       ib_opcode_name(UC_SEND_LAST),                      \
+       ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
+       ib_opcode_name(UC_SEND_ONLY),                      \
+       ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
+       ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
+       ib_opcode_name(UC_RDMA_WRITE_LAST),                \
+       ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+       ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
+       ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+       ib_opcode_name(UD_SEND_ONLY),                      \
+       ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(CNP))
 #endif                          /* _HFI1_KERNEL_H */
index eed971ccd2a1e88e7180f2eae5aba92da5448c96..a358d23ecd54d563d05a14e10c0cd885f6546b8f 100644 (file)
@@ -64,6 +64,7 @@
 #include "debugfs.h"
 #include "verbs.h"
 #include "aspm.h"
+#include "affinity.h"
 
 #undef pr_fmt
 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
@@ -474,8 +475,9 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
 void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
                         struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
 {
-       int i, size;
+       int i;
        uint default_pkey_idx;
+       struct cc_state *cc_state;
 
        ppd->dd = dd;
        ppd->hw_pidx = hw_pidx;
@@ -526,9 +528,9 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
 
        spin_lock_init(&ppd->cc_state_lock);
        spin_lock_init(&ppd->cc_log_lock);
-       size = sizeof(struct cc_state);
-       RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL));
-       if (!rcu_dereference(ppd->cc_state))
+       cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL);
+       RCU_INIT_POINTER(ppd->cc_state, cc_state);
+       if (!cc_state)
                goto bail;
        return;
 
@@ -972,39 +974,49 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
 
 /*
  * Release our hold on the shared asic data.  If we are the last one,
- * free the structure.  Must be holding hfi1_devs_lock.
+ * return the structure to be finalized outside the lock.  Must be
+ * holding hfi1_devs_lock.
  */
-static void release_asic_data(struct hfi1_devdata *dd)
+static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
 {
+       struct hfi1_asic_data *ad;
        int other;
 
        if (!dd->asic_data)
-               return;
+               return NULL;
        dd->asic_data->dds[dd->hfi1_id] = NULL;
        other = dd->hfi1_id ? 0 : 1;
-       if (!dd->asic_data->dds[other]) {
-               /* we are the last holder, free it */
-               kfree(dd->asic_data);
-       }
+       ad = dd->asic_data;
        dd->asic_data = NULL;
+       /* return NULL if the other dd still has a link */
+       return ad->dds[other] ? NULL : ad;
+}
+
+static void finalize_asic_data(struct hfi1_devdata *dd,
+                              struct hfi1_asic_data *ad)
+{
+       clean_up_i2c(dd, ad);
+       kfree(ad);
 }
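
release_asic_data() now only detaches under hfi1_devs_lock and hands the
object back to the last holder, so the i2c teardown in
finalize_asic_data() runs after the spinlock is dropped. The ownership
hand-off reduced to its essentials (locking omitted; the two-slot layout
mirrors dds[2]):

#include <stdio.h>
#include <stdlib.h>

struct asic { void *dds[2]; };

static struct asic *release_asic(struct asic *ad, int id)
{
	ad->dds[id] = NULL;
	/* hand the object back only when the peer slot is empty */
	return ad->dds[id ? 0 : 1] ? NULL : ad;
}

int main(void)
{
	struct asic *ad = calloc(1, sizeof(*ad));
	int hfi0, hfi1;

	if (!ad)
		return 1;
	ad->dds[0] = &hfi0;
	ad->dds[1] = &hfi1;

	if (!release_asic(ad, 0))
		printf("hfi1_id 0: peer still attached, no free\n");
	free(release_asic(ad, 1));	/* last holder finalizes and frees */
	return 0;
}
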
 
 static void __hfi1_free_devdata(struct kobject *kobj)
 {
        struct hfi1_devdata *dd =
                container_of(kobj, struct hfi1_devdata, kobj);
+       struct hfi1_asic_data *ad;
        unsigned long flags;
 
        spin_lock_irqsave(&hfi1_devs_lock, flags);
        idr_remove(&hfi1_unit_table, dd->unit);
        list_del(&dd->list);
-       release_asic_data(dd);
+       ad = release_asic_data(dd);
        spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       if (ad)
+               finalize_asic_data(dd, ad);
        free_platform_config(dd);
        rcu_barrier(); /* wait for rcu callbacks to complete */
        free_percpu(dd->int_counter);
        free_percpu(dd->rcv_limit);
-       hfi1_dev_affinity_free(dd);
        free_percpu(dd->send_schedule);
        rvt_dealloc_device(&dd->verbs_dev.rdi);
 }
@@ -1162,7 +1174,7 @@ static int init_one(struct pci_dev *, const struct pci_device_id *);
 #define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
 #define PFX DRIVER_NAME ": "
 
-static const struct pci_device_id hfi1_pci_tbl[] = {
+const struct pci_device_id hfi1_pci_tbl[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
        { 0, }
@@ -1198,6 +1210,10 @@ static int __init hfi1_mod_init(void)
        if (ret)
                goto bail;
 
+       ret = node_affinity_init();
+       if (ret)
+               goto bail;
+
        /* validate max MTU before any devices start */
        if (!valid_opa_max_mtu(hfi1_max_mtu)) {
                pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
@@ -1278,6 +1294,7 @@ module_init(hfi1_mod_init);
 static void __exit hfi1_mod_cleanup(void)
 {
        pci_unregister_driver(&hfi1_pci_driver);
+       node_affinity_destroy();
        hfi1_wss_exit();
        hfi1_dbg_exit();
        hfi1_cpulist_count = 0;
@@ -1311,7 +1328,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
                        hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
 
                spin_lock(&ppd->cc_state_lock);
-               cc_state = get_cc_state(ppd);
+               cc_state = get_cc_state_protected(ppd);
                RCU_INIT_POINTER(ppd->cc_state, NULL);
                spin_unlock(&ppd->cc_state_lock);
 
@@ -1760,8 +1777,8 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
 
        hfi1_cdbg(PROC,
                  "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
-                 rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size,
-                 rcd->egrbufs.size);
+                 rcd->ctxt, rcd->egrbufs.alloced,
+                 rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
 
        /*
         * Set the contexts rcv array head update threshold to the closest
index fca07a1d6c284b90238543777e85462cc2249651..1263abe01999e3e84306b5594921bc4b11019fca 100644 (file)
@@ -588,7 +588,6 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
 
        pi->port_phys_conf = (ppd->port_type & 0xf);
 
-#if PI_LED_ENABLE_SUP
        pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
        pi->port_states.ledenable_offlinereason |=
                ppd->is_sm_config_started << 5;
@@ -602,11 +601,6 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
        pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6;
        pi->port_states.ledenable_offlinereason |=
                ppd->offline_disabled_reason;
-#else
-       pi->port_states.offline_reason = ppd->neighbor_normal << 4;
-       pi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
-       pi->port_states.offline_reason |= ppd->offline_disabled_reason;
-#endif /* PI_LED_ENABLE_SUP */
 
        pi->port_states.portphysstate_portstate =
                (hfi1_ibphys_portstate(ppd) << 4) | state;
@@ -1752,17 +1746,11 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
        if (start_of_sm_config && (lstate == IB_PORT_INIT))
                ppd->is_sm_config_started = 1;
 
-#if PI_LED_ENABLE_SUP
        psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
        psi->port_states.ledenable_offlinereason |=
                ppd->is_sm_config_started << 5;
        psi->port_states.ledenable_offlinereason |=
                ppd->offline_disabled_reason;
-#else
-       psi->port_states.offline_reason = ppd->neighbor_normal << 4;
-       psi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
-       psi->port_states.offline_reason |= ppd->offline_disabled_reason;
-#endif /* PI_LED_ENABLE_SUP */
 
        psi->port_states.portphysstate_portstate =
                (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
@@ -2430,14 +2418,9 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
        rsp->port_rcv_remote_physical_errors =
                cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
                                          CNTR_INVALID_VL));
-       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       if (tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->local_link_integrity_errors = cpu_to_be64(~0);
-       } else {
-               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
-       }
+       rsp->local_link_integrity_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY,
+                                         CNTR_INVALID_VL));
        tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
        tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
                                   CNTR_INVALID_VL);
@@ -2499,6 +2482,9 @@ static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
                        cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
                                                  idx_from_vl(vl)));
 
+               rsp->vls[vfi].port_vl_xmit_discards =
+                       cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL,
+                                                  idx_from_vl(vl)));
                vlinfo++;
                vfi++;
        }
@@ -2529,9 +2515,8 @@ static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port,
        error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
                                               CNTR_INVALID_VL);
        /* local link integrity must be right-shifted by the lli resolution */
-       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       tmp += read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       error_counter_summary += (tmp >> res_lli);
+       error_counter_summary += (read_dev_cntr(dd, C_DC_RX_REPLAY,
+                                               CNTR_INVALID_VL) >> res_lli);
        /* link error recovery must be right-shifted by the ler resolution */
        tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
        tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL);
@@ -2800,14 +2785,9 @@ static void pma_get_opa_port_ectrs(struct ib_device *ibdev,
        rsp->port_rcv_constraint_errors =
                cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
                                           CNTR_INVALID_VL));
-       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
-       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
-       if (tmp2 < tmp) {
-               /* overflow/wrapped */
-               rsp->local_link_integrity_errors = cpu_to_be64(~0);
-       } else {
-               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
-       }
+       rsp->local_link_integrity_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RX_REPLAY,
+                                         CNTR_INVALID_VL));
        rsp->excessive_buffer_overruns =
                cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
 }
@@ -2883,14 +2863,17 @@ static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
        tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
 
        rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
-
+       rsp->port_rcv_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
        vlinfo = &rsp->vls[0];
        vfi = 0;
        vl_select_mask = be32_to_cpu(req->vl_select_mask);
        for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
                         8 * sizeof(req->vl_select_mask)) {
                memset(vlinfo, 0, sizeof(*vlinfo));
-               /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */
+               rsp->vls[vfi].port_vl_xmit_discards =
+                       cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL,
+                                                  idx_from_vl(vl)));
                vlinfo += 1;
                vfi++;
        }
@@ -3162,10 +3145,8 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
        if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS)
                write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0);
 
-       if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) {
-               write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+       if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS)
                write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
-       }
 
        if (counter_select & CS_LINK_ERROR_RECOVERY) {
                write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
@@ -3223,7 +3204,9 @@ static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
                /* if (counter_select & CS_PORT_MARK_FECN)
                 *     write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
                 */
-               /* port_vl_xmit_discards ??? */
+               if (counter_select & CS_PORT_XMIT_DISCARDS)
+                       write_port_cntr(ppd, C_SW_XMIT_DSCD_VL,
+                                       idx_from_vl(vl), 0);
        }
 
        if (resp_len)
@@ -3392,7 +3375,7 @@ static void apply_cc_state(struct hfi1_pportdata *ppd)
         */
        spin_lock(&ppd->cc_state_lock);
 
-       old_cc_state = get_cc_state(ppd);
+       old_cc_state = get_cc_state_protected(ppd);
        if (!old_cc_state) {
                /* never active, or shutting down */
                spin_unlock(&ppd->cc_state_lock);
@@ -3960,7 +3943,6 @@ void clear_linkup_counters(struct hfi1_devdata *dd)
        write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
        write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0);
        /* LocalLinkIntegrityErrors */
-       write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
        write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
        /* ExcessiveBufferOverruns */
        write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
index 8b734aaae88adf48be52a1a2e3d17ca4f800b7ce..5aa3fd1be6538e6b37842c3cd734dedaf18f8dc0 100644 (file)
 #define _HFI1_MAD_H
 
 #include <rdma/ib_pma.h>
-#define USE_PI_LED_ENABLE      1 /*
-                                  * use led enabled bit in struct
-                                  * opa_port_states, if available
-                                  */
 #include <rdma/opa_smi.h>
 #include <rdma/opa_port_info.h>
-#ifndef PI_LED_ENABLE_SUP
-#define PI_LED_ENABLE_SUP 0
-#endif
 #include "opa_compat.h"
 
 /*
index b7a80aa1ae30601b3fdb67501ca7f5a4ba1d812e..7ad30898fc19de4cef022fb4eec70418ae6ee5ad 100644 (file)
 #include "trace.h"
 
 struct mmu_rb_handler {
-       struct list_head list;
        struct mmu_notifier mn;
-       struct rb_root *root;
+       struct rb_root root;
+       void *ops_arg;
        spinlock_t lock;        /* protect the RB tree */
        struct mmu_rb_ops *ops;
+       struct mm_struct *mm;
+       struct list_head lru_list;
+       struct work_struct del_work;
+       struct list_head del_list;
+       struct workqueue_struct *wq;
 };
 
-static LIST_HEAD(mmu_rb_handlers);
-static DEFINE_SPINLOCK(mmu_rb_lock); /* protect mmu_rb_handlers list */
-
 static unsigned long mmu_node_start(struct mmu_rb_node *);
 static unsigned long mmu_node_last(struct mmu_rb_node *);
-static struct mmu_rb_handler *find_mmu_handler(struct rb_root *);
 static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
                                     unsigned long);
 static inline void mmu_notifier_range_start(struct mmu_notifier *,
@@ -76,6 +77,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *,
                                        unsigned long, unsigned long);
 static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
                                           unsigned long, unsigned long);
+static void do_remove(struct mmu_rb_handler *handler,
+                     struct list_head *del_list);
+static void handle_remove(struct work_struct *work);
 
 static struct mmu_notifier_ops mn_opts = {
        .invalidate_page = mmu_notifier_page,
@@ -95,73 +99,79 @@ static unsigned long mmu_node_last(struct mmu_rb_node *node)
        return PAGE_ALIGN(node->addr + node->len) - 1;
 }
 
-int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops)
+int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
+                        struct mmu_rb_ops *ops,
+                        struct workqueue_struct *wq,
+                        struct mmu_rb_handler **handler)
 {
        struct mmu_rb_handler *handlr;
-
-       if (!ops->invalidate)
-               return -EINVAL;
+       int ret;
 
        handlr = kmalloc(sizeof(*handlr), GFP_KERNEL);
        if (!handlr)
                return -ENOMEM;
 
-       handlr->root = root;
+       handlr->root = RB_ROOT;
        handlr->ops = ops;
+       handlr->ops_arg = ops_arg;
        INIT_HLIST_NODE(&handlr->mn.hlist);
        spin_lock_init(&handlr->lock);
        handlr->mn.ops = &mn_opts;
-       spin_lock(&mmu_rb_lock);
-       list_add_tail_rcu(&handlr->list, &mmu_rb_handlers);
-       spin_unlock(&mmu_rb_lock);
+       handlr->mm = mm;
+       INIT_WORK(&handlr->del_work, handle_remove);
+       INIT_LIST_HEAD(&handlr->del_list);
+       INIT_LIST_HEAD(&handlr->lru_list);
+       handlr->wq = wq;
+
+       ret = mmu_notifier_register(&handlr->mn, handlr->mm);
+       if (ret) {
+               kfree(handlr);
+               return ret;
+       }
 
-       return mmu_notifier_register(&handlr->mn, current->mm);
+       *handler = handlr;
+       return 0;
 }
 
-void hfi1_mmu_rb_unregister(struct rb_root *root)
+void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler)
 {
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
+       struct mmu_rb_node *rbnode;
+       struct rb_node *node;
        unsigned long flags;
-
-       if (!handler)
-               return;
+       struct list_head del_list;
 
        /* Unregister first so we don't get any more notifications. */
-       if (current->mm)
-               mmu_notifier_unregister(&handler->mn, current->mm);
+       mmu_notifier_unregister(&handler->mn, handler->mm);
 
-       spin_lock(&mmu_rb_lock);
-       list_del_rcu(&handler->list);
-       spin_unlock(&mmu_rb_lock);
-       synchronize_rcu();
+       /*
+        * Make sure the wq delete handler is finished running.  It will not
+        * be triggered once the mmu notifiers are unregistered above.
+        */
+       flush_work(&handler->del_work);
+
+       INIT_LIST_HEAD(&del_list);
 
        spin_lock_irqsave(&handler->lock, flags);
-       if (!RB_EMPTY_ROOT(root)) {
-               struct rb_node *node;
-               struct mmu_rb_node *rbnode;
-
-               while ((node = rb_first(root))) {
-                       rbnode = rb_entry(node, struct mmu_rb_node, node);
-                       rb_erase(node, root);
-                       if (handler->ops->remove)
-                               handler->ops->remove(root, rbnode, NULL);
-               }
+       while ((node = rb_first(&handler->root))) {
+               rbnode = rb_entry(node, struct mmu_rb_node, node);
+               rb_erase(node, &handler->root);
+               /* move from LRU list to delete list */
+               list_move(&rbnode->list, &del_list);
        }
        spin_unlock_irqrestore(&handler->lock, flags);
 
+       do_remove(handler, &del_list);
+
        kfree(handler);
 }
 
-int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
+int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
+                      struct mmu_rb_node *mnode)
 {
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
        struct mmu_rb_node *node;
        unsigned long flags;
        int ret = 0;
 
-       if (!handler)
-               return -EINVAL;
-
        spin_lock_irqsave(&handler->lock, flags);
        hfi1_cdbg(MMU, "Inserting node addr 0x%llx, len %u", mnode->addr,
                  mnode->len);
@@ -170,12 +180,13 @@ int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
                ret = -EINVAL;
                goto unlock;
        }
-       __mmu_int_rb_insert(mnode, root);
+       __mmu_int_rb_insert(mnode, &handler->root);
+       list_add(&mnode->list, &handler->lru_list);
 
-       if (handler->ops->insert) {
-               ret = handler->ops->insert(root, mnode);
-               if (ret)
-                       __mmu_int_rb_remove(mnode, root);
+       ret = handler->ops->insert(handler->ops_arg, mnode);
+       if (ret) {
+               __mmu_int_rb_remove(mnode, &handler->root);
+               list_del(&mnode->list); /* remove from LRU list */
        }
 unlock:
        spin_unlock_irqrestore(&handler->lock, flags);
@@ -191,10 +202,10 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
 
        hfi1_cdbg(MMU, "Searching for addr 0x%llx, len %u", addr, len);
        if (!handler->ops->filter) {
-               node = __mmu_int_rb_iter_first(handler->root, addr,
+               node = __mmu_int_rb_iter_first(&handler->root, addr,
                                               (addr + len) - 1);
        } else {
-               for (node = __mmu_int_rb_iter_first(handler->root, addr,
+               for (node = __mmu_int_rb_iter_first(&handler->root, addr,
                                                    (addr + len) - 1);
                     node;
                     node = __mmu_int_rb_iter_next(node, addr,
@@ -206,82 +217,72 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
        return node;
 }
 
-/* Caller must *not* hold handler lock. */
-static void __mmu_rb_remove(struct mmu_rb_handler *handler,
-                           struct mmu_rb_node *node, struct mm_struct *mm)
-{
-       unsigned long flags;
-
-       /* Validity of handler and node pointers has been checked by caller. */
-       hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr,
-                 node->len);
-       spin_lock_irqsave(&handler->lock, flags);
-       __mmu_int_rb_remove(node, handler->root);
-       spin_unlock_irqrestore(&handler->lock, flags);
-
-       if (handler->ops->remove)
-               handler->ops->remove(handler->root, node, mm);
-}
-
-struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr,
-                                      unsigned long len)
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler,
+                                       unsigned long addr, unsigned long len)
 {
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
        struct mmu_rb_node *node;
        unsigned long flags;
 
-       if (!handler)
-               return ERR_PTR(-EINVAL);
-
        spin_lock_irqsave(&handler->lock, flags);
        node = __mmu_rb_search(handler, addr, len);
+       if (node) {
+               __mmu_int_rb_remove(node, &handler->root);
+               list_del(&node->list); /* remove from LRU list */
+       }
        spin_unlock_irqrestore(&handler->lock, flags);
 
        return node;
 }
 
-struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root,
-                                       unsigned long addr, unsigned long len)
+void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)
 {
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-       struct mmu_rb_node *node;
+       struct mmu_rb_node *rbnode, *ptr;
+       struct list_head del_list;
        unsigned long flags;
+       bool stop = false;
 
-       if (!handler)
-               return ERR_PTR(-EINVAL);
+       INIT_LIST_HEAD(&del_list);
 
        spin_lock_irqsave(&handler->lock, flags);
-       node = __mmu_rb_search(handler, addr, len);
-       if (node)
-               __mmu_int_rb_remove(node, handler->root);
+       list_for_each_entry_safe_reverse(rbnode, ptr, &handler->lru_list,
+                                        list) {
+               if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg,
+                                       &stop)) {
+                       __mmu_int_rb_remove(rbnode, &handler->root);
+                       /* move from LRU list to delete list */
+                       list_move(&rbnode->list, &del_list);
+               }
+               if (stop)
+                       break;
+       }
        spin_unlock_irqrestore(&handler->lock, flags);
 
-       return node;
+       while (!list_empty(&del_list)) {
+               rbnode = list_first_entry(&del_list, struct mmu_rb_node, list);
+               list_del(&rbnode->list);
+               handler->ops->remove(handler->ops_arg, rbnode);
+       }
 }
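The evict contract leaves the reclaim policy entirely to the ops owner: the callback returns nonzero for a node that should be evicted and sets *stop once enough has been reclaimed, with LRU ordering supplied by the walk above. A minimal sketch of such a callback, assuming a hypothetical evict_data argument that counts pages cleared against a target (it must not sleep, since it runs under handler->lock):

        /* Hypothetical evict callback: evict in LRU order until the
         * requested number of pages has been reclaimed. */
        struct evict_data {
                u32 cleared;    /* pages cleared so far */
                u32 target;     /* pages requested to be cleared */
        };

        static int example_evict(void *ops_arg, struct mmu_rb_node *mnode,
                                 void *evict_arg, bool *stop)
        {
                struct evict_data *evict_data = evict_arg;

                /* account for this node and stop once the target is met */
                evict_data->cleared += mnode->len >> PAGE_SHIFT;
                if (evict_data->cleared >= evict_data->target)
                        *stop = true;

                return 1;       /* evict this node */
        }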
 
-void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node)
+/*
+ * It is up to the caller to ensure that this function does not race with the
+ * mmu invalidate notifier, which may be calling the user's remove callback on
+ * 'node'.
+ */
+void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
+                       struct mmu_rb_node *node)
 {
-       struct mmu_rb_handler *handler = find_mmu_handler(root);
-
-       if (!handler || !node)
-               return;
-
-       __mmu_rb_remove(handler, node, NULL);
-}
+       unsigned long flags;
 
-static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root)
-{
-       struct mmu_rb_handler *handler;
+       /* Validity of handler and node pointers has been checked by caller. */
+       hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr,
+                 node->len);
+       spin_lock_irqsave(&handler->lock, flags);
+       __mmu_int_rb_remove(node, &handler->root);
+       list_del(&node->list); /* remove from LRU list */
+       spin_unlock_irqrestore(&handler->lock, flags);
 
-       rcu_read_lock();
-       list_for_each_entry_rcu(handler, &mmu_rb_handlers, list) {
-               if (handler->root == root)
-                       goto unlock;
-       }
-       handler = NULL;
-unlock:
-       rcu_read_unlock();
-       return handler;
+       handler->ops->remove(handler->ops_arg, node);
 }
 
 static inline void mmu_notifier_page(struct mmu_notifier *mn,
@@ -304,9 +305,10 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
 {
        struct mmu_rb_handler *handler =
                container_of(mn, struct mmu_rb_handler, mn);
-       struct rb_root *root = handler->root;
+       struct rb_root *root = &handler->root;
        struct mmu_rb_node *node, *ptr = NULL;
        unsigned long flags;
+       bool added = false;
 
        spin_lock_irqsave(&handler->lock, flags);
        for (node = __mmu_int_rb_iter_first(root, start, end - 1);
@@ -315,11 +317,53 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
                ptr = __mmu_int_rb_iter_next(node, start, end - 1);
                hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u",
                          node->addr, node->len);
-               if (handler->ops->invalidate(root, node)) {
+               if (handler->ops->invalidate(handler->ops_arg, node)) {
                        __mmu_int_rb_remove(node, root);
-                       if (handler->ops->remove)
-                               handler->ops->remove(root, node, mm);
+                       /* move from LRU list to delete list */
+                       list_move(&node->list, &handler->del_list);
+                       added = true;
                }
        }
        spin_unlock_irqrestore(&handler->lock, flags);
+
+       if (added)
+               queue_work(handler->wq, &handler->del_work);
+}
+
+/*
+ * Call the handler's remove function for each node on the given list.
+ * This is expected to be called with a delete list extracted from the
+ * handler.  The caller must not be holding the handler lock.
+ */
+static void do_remove(struct mmu_rb_handler *handler,
+                     struct list_head *del_list)
+{
+       struct mmu_rb_node *node;
+
+       while (!list_empty(del_list)) {
+               node = list_first_entry(del_list, struct mmu_rb_node, list);
+               list_del(&node->list);
+               handler->ops->remove(handler->ops_arg, node);
+       }
+}
+
+/*
+ * Work queue function to remove all nodes that have been queued up for
+ * removal.  The key point is that mm->mmap_sem is not held here, so the
+ * remove callback is free to sleep and to take it, if needed.
+ */
+static void handle_remove(struct work_struct *work)
+{
+       struct mmu_rb_handler *handler = container_of(work,
+                                               struct mmu_rb_handler,
+                                               del_work);
+       struct list_head del_list;
+       unsigned long flags;
+
+       /* remove anything that is queued to get removed */
+       spin_lock_irqsave(&handler->lock, flags);
+       list_replace_init(&handler->del_list, &del_list);
+       spin_unlock_irqrestore(&handler->lock, flags);
+
+       do_remove(handler, &del_list);
 }
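Note that handle_remove relies on the delete list and work item being wired up at registration time; the hfi1_mmu_rb_register path earlier in this patch does essentially the following (field names as used above):

        /* sketch of the registration-time wiring for the deferred-removal
         * machinery used by mmu_notifier_mem_invalidate()/handle_remove() */
        spin_lock_init(&handler->lock);
        INIT_LIST_HEAD(&handler->lru_list);
        INIT_LIST_HEAD(&handler->del_list);
        INIT_WORK(&handler->del_work, handle_remove);
        handler->wq = wq;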
index 7a57b9c49d271fdce65f5fa46a7f03c7f96fd89d..754f6ebf13fb1ac61d42dee6f2a31c726d98d42e 100644 (file)
@@ -54,23 +54,34 @@ struct mmu_rb_node {
        unsigned long len;
        unsigned long __last;
        struct rb_node node;
+       struct list_head list;
 };
 
+/*
+ * NOTE: filter, insert, invalidate, and evict must not sleep.  Only remove is
+ * allowed to sleep.
+ */
 struct mmu_rb_ops {
-       bool (*filter)(struct mmu_rb_node *, unsigned long, unsigned long);
-       int (*insert)(struct rb_root *, struct mmu_rb_node *);
-       void (*remove)(struct rb_root *, struct mmu_rb_node *,
-                      struct mm_struct *);
-       int (*invalidate)(struct rb_root *, struct mmu_rb_node *);
+       bool (*filter)(struct mmu_rb_node *node, unsigned long addr,
+                      unsigned long len);
+       int (*insert)(void *ops_arg, struct mmu_rb_node *mnode);
+       void (*remove)(void *ops_arg, struct mmu_rb_node *mnode);
+       int (*invalidate)(void *ops_arg, struct mmu_rb_node *node);
+       int (*evict)(void *ops_arg, struct mmu_rb_node *mnode,
+                    void *evict_arg, bool *stop);
 };
 
-int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops);
-void hfi1_mmu_rb_unregister(struct rb_root *);
-int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
-void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *);
-struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long,
-                                      unsigned long);
-struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long,
-                                       unsigned long);
+int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
+                        struct mmu_rb_ops *ops,
+                        struct workqueue_struct *wq,
+                        struct mmu_rb_handler **handler);
+void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler);
+int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
+                      struct mmu_rb_node *mnode);
+void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg);
+void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
+                       struct mmu_rb_node *mnode);
+struct mmu_rb_node *hfi1_mmu_rb_extract(struct mmu_rb_handler *handler,
+                                       unsigned long addr, unsigned long len);
 
 #endif /* _HFI1_MMU_RB_H */
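A minimal usage sketch of the reworked interface: callers now hold an opaque handler rather than owning an rb_root, and supply the ops table and workqueue up front. The function and parameter names here are hypothetical:

        static int example_setup(void *priv, struct mm_struct *mm,
                                 struct mmu_rb_ops *example_ops,
                                 struct workqueue_struct *example_wq)
        {
                struct mmu_rb_handler *handler;
                int ret;

                ret = hfi1_mmu_rb_register(priv, mm, example_ops,
                                           example_wq, &handler);
                if (ret)
                        return ret;

                /* ... hfi1_mmu_rb_insert()/hfi1_mmu_rb_extract() against
                 * "handler" ... */

                hfi1_mmu_rb_unregister(handler);
                return 0;
        }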
index 0bac21e6a658ca242b856910e06be7a91cbf7284..89c68da1c273297c71476fe0acbe37d93f7f97a3 100644 (file)
@@ -679,6 +679,10 @@ static uint pcie_pset = UNSET_PSET;
 module_param(pcie_pset, uint, S_IRUGO);
 MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10");
 
+static uint pcie_ctle = 1; /* discrete on, integrated off */
+module_param(pcie_ctle, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_ctle, "PCIe static CTLE mode, bit 0 - discrete on/off, bit 1 - integrated on/off");
+
 /* equalization columns */
 #define PREC 0
 #define ATTN 1
@@ -716,6 +720,36 @@ static const u8 integrated_preliminary_eq[11][3] = {
        {  0x00,  0x1e,  0x0a },        /* p10 */
 };
 
+static const u8 discrete_ctle_tunings[11][4] = {
+       /* DC     LF     HF     BW */
+       {  0x48,  0x0b,  0x04,  0x04 }, /* p0 */
+       {  0x60,  0x05,  0x0f,  0x0a }, /* p1 */
+       {  0x50,  0x09,  0x06,  0x06 }, /* p2 */
+       {  0x68,  0x05,  0x0f,  0x0a }, /* p3 */
+       {  0x80,  0x05,  0x0f,  0x0a }, /* p4 */
+       {  0x70,  0x05,  0x0f,  0x0a }, /* p5 */
+       {  0x68,  0x05,  0x0f,  0x0a }, /* p6 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p7 */
+       {  0x48,  0x09,  0x06,  0x06 }, /* p8 */
+       {  0x60,  0x05,  0x0f,  0x0a }, /* p9 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p10 */
+};
+
+static const u8 integrated_ctle_tunings[11][4] = {
+       /* DC     LF     HF     BW */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p0 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p1 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p2 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p3 */
+       {  0x58,  0x0a,  0x05,  0x05 }, /* p4 */
+       {  0x48,  0x0a,  0x05,  0x05 }, /* p5 */
+       {  0x40,  0x0a,  0x05,  0x05 }, /* p6 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p7 */
+       {  0x38,  0x0f,  0x00,  0x00 }, /* p8 */
+       {  0x38,  0x09,  0x06,  0x06 }, /* p9 */
+       {  0x38,  0x0e,  0x01,  0x01 }, /* p10 */
+};
+
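Both tables are indexed by the same pcie_pset used for the preliminary equalization values; pcie_ctle only gates whether a static row is applied at all. A condensed sketch of the per-platform decode performed further down in do_pcie_gen3_transition():

        /* sketch: bit 0 of pcie_ctle enables static CTLE on discrete
         * platforms, bit 1 on integrated platforms */
        static bool static_ctle_enabled(bool discrete, uint pcie_ctle)
        {
                return discrete ? (pcie_ctle & 0x1) : ((pcie_ctle >> 1) & 0x1);
        }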
 /* helper to format the value to write to hardware */
 #define eq_value(pre, curr, post) \
        ((((u32)(pre)) << \
@@ -951,11 +985,14 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd)
        u32 status, err;
        int ret;
        int do_retry, retry_count = 0;
+       int intnum = 0;
        uint default_pset;
        u16 target_vector, target_speed;
        u16 lnkctl2, vendor;
        u8 div;
        const u8 (*eq)[3];
+       const u8 (*ctle_tunings)[4];
+       uint static_ctle_mode;
        int return_error = 0;
 
        /* PCIe Gen3 is for the ASIC only */
@@ -1089,6 +1126,9 @@ retry:
                div = 3;
                eq = discrete_preliminary_eq;
                default_pset = DEFAULT_DISCRETE_PSET;
+               ctle_tunings = discrete_ctle_tunings;
+               /* bit 0 - discrete on/off */
+               static_ctle_mode = pcie_ctle & 0x1;
        } else {
                /* 400mV, FS=29, LF = 9 */
                fs = 29;
@@ -1096,6 +1136,9 @@ retry:
                div = 1;
                eq = integrated_preliminary_eq;
                default_pset = DEFAULT_MCP_PSET;
+               ctle_tunings = integrated_ctle_tunings;
+               /* bit 1 - integrated on/off */
+               static_ctle_mode = (pcie_ctle >> 1) & 0x1;
        }
        pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
                               (fs <<
@@ -1135,16 +1178,33 @@ retry:
         * step 5c: Program gasket interrupts
         */
        /* set the Rx Bit Rate to REFCLK ratio */
-       write_gasket_interrupt(dd, 0, 0x0006, 0x0050);
+       write_gasket_interrupt(dd, intnum++, 0x0006, 0x0050);
        /* disable pCal for PCIe Gen3 RX equalization */
-       write_gasket_interrupt(dd, 1, 0x0026, 0x5b01);
+       /* select adaptive or static CTLE */
+       write_gasket_interrupt(dd, intnum++, 0x0026,
+                              0x5b01 | (static_ctle_mode << 3));
        /*
         * Enable iCal for PCIe Gen3 RX equalization, and set which
         * evaluation of RX_EQ_EVAL will launch the iCal procedure.
         */
-       write_gasket_interrupt(dd, 2, 0x0026, 0x5202);
+       write_gasket_interrupt(dd, intnum++, 0x0026, 0x5202);
+
+       if (static_ctle_mode) {
+               /* apply static CTLE tunings */
+               u8 pcie_dc, pcie_lf, pcie_hf, pcie_bw;
+
+               pcie_dc = ctle_tunings[pcie_pset][0];
+               pcie_lf = ctle_tunings[pcie_pset][1];
+               pcie_hf = ctle_tunings[pcie_pset][2];
+               pcie_bw = ctle_tunings[pcie_pset][3];
+               write_gasket_interrupt(dd, intnum++, 0x0026, 0x0200 | pcie_dc);
+               write_gasket_interrupt(dd, intnum++, 0x0026, 0x0100 | pcie_lf);
+               write_gasket_interrupt(dd, intnum++, 0x0026, 0x0000 | pcie_hf);
+               write_gasket_interrupt(dd, intnum++, 0x0026, 0x5500 | pcie_bw);
+       }
+
        /* terminate list */
-       write_gasket_interrupt(dd, 3, 0x0000, 0x0000);
+       write_gasket_interrupt(dd, intnum++, 0x0000, 0x0000);
 
        /*
         * step 5d: program XMT margin
index d4022450b73f7a9c6d8d528fbe44b3dd2fa0e00a..ac1bf4a73571ff72d1612644618c14403611d585 100644 (file)
@@ -1952,13 +1952,17 @@ int init_pervl_scs(struct hfi1_devdata *dd)
        dd->vld[15].sc = sc_alloc(dd, SC_VL15,
                                  dd->rcd[0]->rcvhdrqentsize, dd->node);
        if (!dd->vld[15].sc)
-               goto nomem;
+               return -ENOMEM;
+
        hfi1_init_ctxt(dd->vld[15].sc);
        dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
 
-       dd->kernel_send_context = kmalloc_node(dd->num_send_contexts *
+       dd->kernel_send_context = kzalloc_node(dd->num_send_contexts *
                                        sizeof(struct send_context *),
                                        GFP_KERNEL, dd->node);
+       if (!dd->kernel_send_context)
+               goto freesc15;
+
        dd->kernel_send_context[0] = dd->vld[15].sc;
 
        for (i = 0; i < num_vls; i++) {
@@ -2010,12 +2014,21 @@ int init_pervl_scs(struct hfi1_devdata *dd)
        if (pio_map_init(dd, ppd->port - 1, num_vls, NULL))
                goto nomem;
        return 0;
+
 nomem:
-       sc_free(dd->vld[15].sc);
-       for (i = 0; i < num_vls; i++)
+       for (i = 0; i < num_vls; i++) {
                sc_free(dd->vld[i].sc);
+               dd->vld[i].sc = NULL;
+       }
+
        for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++)
                sc_free(dd->kernel_send_context[i + 1]);
+
+       kfree(dd->kernel_send_context);
+       dd->kernel_send_context = NULL;
+
+freesc15:
+       sc_free(dd->vld[15].sc);
        return -ENOMEM;
 }
 
index 03df9322f862984af68e82fca84295ad143ebed1..965c8aef0c604cce98e473bd0c6b9c11e1ad9a40 100644 (file)
@@ -537,20 +537,6 @@ static void apply_tunings(
        u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0;
        u8 *cache = ppd->qsfp_info.cache;
 
-       /* Enable external device config if channel is limiting active */
-       read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
-                        GENERAL_CONFIG, &config_data);
-       config_data &= ~(0xff << ENABLE_EXT_DEV_CONFIG_SHIFT);
-       config_data |= ((u32)limiting_active << ENABLE_EXT_DEV_CONFIG_SHIFT);
-       ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS,
-                              GENERAL_CONFIG, config_data);
-       if (ret != HCMD_SUCCESS)
-               dd_dev_err(
-                       ppd->dd,
-                       "%s: Failed to set enable external device config\n",
-                       __func__);
-
-       config_data = 0; /* re-init  */
        /* Pass tuning method to 8051 */
        read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG,
                         &config_data);
@@ -638,9 +624,13 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
        if (ret)
                return ret;
 
+       /*
+        * We'll change the QSFP memory contents from here on out, so set a
+        * flag to remind ourselves to reset the QSFP module on the next pass
+        * through this function.  This prevents reuse of stale settings
+        * established in a previous pass.
+        */
        if (ppd->qsfp_info.reset_needed) {
                reset_qsfp(ppd);
-               ppd->qsfp_info.reset_needed = 0;
                refresh_qsfp_cache(ppd, &ppd->qsfp_info);
        } else {
                ppd->qsfp_info.reset_needed = 1;
index 1a942ffba4cb0880d53b40eaae6446ffa2b05024..a5aa3517e7d5c537d36786fd0d512cef6d60544c 100644 (file)
@@ -52,6 +52,7 @@
 #include <linux/seq_file.h>
 #include <rdma/rdma_vt.h>
 #include <rdma/rdmavt_qp.h>
+#include <rdma/ib_verbs.h>
 
 #include "hfi.h"
 #include "qp.h"
@@ -115,6 +116,66 @@ static const u16 credit_table[31] = {
        32768                   /* 1E */
 };
 
+const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
+[IB_WR_RDMA_WRITE] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_RDMA_READ] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC,
+},
+
+[IB_WR_ATOMIC_CMP_AND_SWP] = {
+       .length = sizeof(struct ib_atomic_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_ATOMIC_FETCH_AND_ADD] = {
+       .length = sizeof(struct ib_atomic_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_RDMA_WRITE_WITH_IMM] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+                      BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND_WITH_IMM] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+                      BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_REG_MR] = {
+       .length = sizeof(struct ib_reg_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_LOCAL,
+},
+
+[IB_WR_LOCAL_INV] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_LOCAL,
+},
+
+[IB_WR_SEND_WITH_INV] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+},
+
+};
+
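rdmavt consults this table when work requests are posted; the shape of the check it enables looks roughly like the following (illustrative only, the actual rdmavt validation differs in detail):

        /* reject a WR whose opcode is unknown or unsupported on this QP type */
        static int example_check_wr(struct rvt_qp *qp, struct ib_send_wr *wr)
        {
                const struct rvt_operation_params *p;

                if (wr->opcode >= RVT_OPERATION_MAX)
                        return -EINVAL;
                p = &hfi1_post_parms[wr->opcode];
                if (!p->length)         /* slot not populated above */
                        return -EINVAL;
                if (!(p->qpt_support & BIT(qp->ibqp.qp_type)))
                        return -EINVAL;
                return 0;
        }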
 static void flush_tx_list(struct rvt_qp *qp)
 {
        struct hfi1_qp_priv *priv = qp->priv;
@@ -745,8 +806,9 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 
        priv->owner = qp;
 
-       priv->s_hdr = kzalloc_node(sizeof(*priv->s_hdr), gfp, rdi->dparms.node);
-       if (!priv->s_hdr) {
+       priv->s_ahg = kzalloc_node(sizeof(*priv->s_ahg), gfp,
+                                  rdi->dparms.node);
+       if (!priv->s_ahg) {
                kfree(priv);
                return ERR_PTR(-ENOMEM);
        }
@@ -759,7 +821,7 @@ void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
 {
        struct hfi1_qp_priv *priv = qp->priv;
 
-       kfree(priv->s_hdr);
+       kfree(priv->s_ahg);
        kfree(priv);
 }
 
index e7bc8d6cf681cd4faca8896481b030b6a48c3aec..587d84d65bb8124656e0ad2b56686f6ac964978a 100644 (file)
@@ -54,6 +54,8 @@
 
 extern unsigned int hfi1_qp_table_size;
 
+extern const struct rvt_operation_params hfi1_post_parms[];
+
 /*
  * free_ahg - clear ahg from QP
  */
@@ -61,7 +63,7 @@ static inline void clear_ahg(struct rvt_qp *qp)
 {
        struct hfi1_qp_priv *priv = qp->priv;
 
-       priv->s_hdr->ahgcount = 0;
+       priv->s_ahg->ahgcount = 0;
        qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR);
        if (priv->s_sde && qp->s_ahgidx >= 0)
                sdma_ahg_free(priv->s_sde, qp->s_ahgidx);
index 9fb561682c661b92e2a09beca2b53daf9f8c5b5e..a207717ade2aac0da5fbf34e8adb70a53efc2a42 100644 (file)
 #include <linux/vmalloc.h>
 
 #include "hfi.h"
-#include "twsi.h"
+
+/* for the given bus number, return the CSR for reading an i2c line */
+static inline u32 i2c_in_csr(u32 bus_num)
+{
+       return bus_num ? ASIC_QSFP2_IN : ASIC_QSFP1_IN;
+}
+
+/* for the given bus number, return the CSR for writing an i2c line */
+static inline u32 i2c_oe_csr(u32 bus_num)
+{
+       return bus_num ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
+}
+
+static void hfi1_setsda(void *data, int state)
+{
+       struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+       struct hfi1_devdata *dd = bus->controlling_dd;
+       u64 reg;
+       u32 target_oe;
+
+       target_oe = i2c_oe_csr(bus->num);
+       reg = read_csr(dd, target_oe);
+       /*
+        * The OE bit value is inverted and connected to the pin.  When
+        * OE is 0 the pin is left to be pulled up, when the OE is 1
+        * the pin is driven low.  This matches the "open drain" or "open
+        * collector" convention.
+        */
+       if (state)
+               reg &= ~QSFP_HFI0_I2CDAT;
+       else
+               reg |= QSFP_HFI0_I2CDAT;
+       write_csr(dd, target_oe, reg);
+       /* do a read to force the write into the chip */
+       (void)read_csr(dd, target_oe);
+}
+
+static void hfi1_setscl(void *data, int state)
+{
+       struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+       struct hfi1_devdata *dd = bus->controlling_dd;
+       u64 reg;
+       u32 target_oe;
+
+       target_oe = i2c_oe_csr(bus->num);
+       reg = read_csr(dd, target_oe);
+       /*
+        * The OE bit value is inverted and connected to the pin.  When
+        * OE is 0 the pin is left to be pulled up, when the OE is 1
+        * the pin is driven low.  This matches the "open drain" or "open
+        * collector" convention.
+        */
+       if (state)
+               reg &= ~QSFP_HFI0_I2CCLK;
+       else
+               reg |= QSFP_HFI0_I2CCLK;
+       write_csr(dd, target_oe, reg);
+       /* do a read to force the write into the chip */
+       (void)read_csr(dd, target_oe);
+}
+
+static int hfi1_getsda(void *data)
+{
+       struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+       u64 reg;
+       u32 target_in;
+
+       hfi1_setsda(data, 1);   /* clear OE so we do not pull line down */
+       udelay(2);              /* 1us pull up + 250ns hold */
+
+       target_in = i2c_in_csr(bus->num);
+       reg = read_csr(bus->controlling_dd, target_in);
+       return !!(reg & QSFP_HFI0_I2CDAT);
+}
+
+static int hfi1_getscl(void *data)
+{
+       struct hfi1_i2c_bus *bus = (struct hfi1_i2c_bus *)data;
+       u64 reg;
+       u32 target_in;
+
+       hfi1_setscl(data, 1);   /* clear OE so we do not pull line down */
+       udelay(2);              /* 1us pull up + 250ns hold */
+
+       target_in = i2c_in_csr(bus->num);
+       reg = read_csr(bus->controlling_dd, target_in);
+       return !!(reg & QSFP_HFI0_I2CCLK);
+}
 
 /*
- * QSFP support for hfi driver, using "Two Wire Serial Interface" driver
- * in twsi.c
+ * Allocate and initialize an i2c bus structure for the given bus number.
+ * Returns NULL on failure.
  */
-#define I2C_MAX_RETRY 4
+static struct hfi1_i2c_bus *init_i2c_bus(struct hfi1_devdata *dd,
+                                        struct hfi1_asic_data *ad, int num)
+{
+       struct hfi1_i2c_bus *bus;
+       int ret;
+
+       bus = kzalloc(sizeof(*bus), GFP_KERNEL);
+       if (!bus)
+               return NULL;
+
+       bus->controlling_dd = dd;
+       bus->num = num; /* our bus number */
+
+       bus->algo.setsda = hfi1_setsda;
+       bus->algo.setscl = hfi1_setscl;
+       bus->algo.getsda = hfi1_getsda;
+       bus->algo.getscl = hfi1_getscl;
+       bus->algo.udelay = 5;
+       bus->algo.timeout = usecs_to_jiffies(50);
+       bus->algo.data = bus;
+
+       bus->adapter.owner = THIS_MODULE;
+       bus->adapter.algo_data = &bus->algo;
+       bus->adapter.dev.parent = &dd->pcidev->dev;
+       snprintf(bus->adapter.name, sizeof(bus->adapter.name),
+                "hfi1_i2c%d", num);
+
+       ret = i2c_bit_add_bus(&bus->adapter);
+       if (ret) {
+               dd_dev_info(dd, "%s: unable to add i2c bus %d, err %d\n",
+                           __func__, num, ret);
+               kfree(bus);
+               return NULL;
+       }
+
+       return bus;
+}
 
 /*
- * Raw i2c write.  No set-up or lock checking.
+ * Initialize i2c buses.
+ * Return 0 on success, -errno on error.
  */
-static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
-                      int offset, void *bp, int len)
+int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad)
 {
-       struct hfi1_devdata *dd = ppd->dd;
-       int ret, cnt;
-       u8 *buff = bp;
+       ad->i2c_bus0 = init_i2c_bus(dd, ad, 0);
+       ad->i2c_bus1 = init_i2c_bus(dd, ad, 1);
+       if (!ad->i2c_bus0 || !ad->i2c_bus1)
+               return -ENOMEM;
+       return 0;
+}
 
-       cnt = 0;
-       while (cnt < len) {
-               int wlen = len - cnt;
+static void clean_i2c_bus(struct hfi1_i2c_bus *bus)
+{
+       if (bus) {
+               i2c_del_adapter(&bus->adapter);
+               kfree(bus);
+       }
+}
 
-               ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset,
-                                      buff + cnt, wlen);
-               if (ret) {
-                       /* hfi1_twsi_blk_wr() 1 for error, else 0 */
-                       return -EIO;
-               }
-               offset += wlen;
-               cnt += wlen;
+void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad)
+{
+       clean_i2c_bus(ad->i2c_bus0);
+       ad->i2c_bus0 = NULL;
+       clean_i2c_bus(ad->i2c_bus1);
+       ad->i2c_bus1 = NULL;
+}
+
+static int i2c_bus_write(struct hfi1_devdata *dd, struct hfi1_i2c_bus *i2c,
+                        u8 slave_addr, int offset, int offset_size,
+                        u8 *data, u16 len)
+{
+       int ret;
+       int num_msgs;
+       u8 offset_bytes[2];
+       struct i2c_msg msgs[2];
+
+       switch (offset_size) {
+       case 0:
+               num_msgs = 1;
+               msgs[0].addr = slave_addr;
+               msgs[0].flags = 0;
+               msgs[0].len = len;
+               msgs[0].buf = data;
+               break;
+       case 2:
+               offset_bytes[1] = (offset >> 8) & 0xff;
+               /* fall through */
+       case 1:
+               num_msgs = 2;
+               offset_bytes[0] = offset & 0xff;
+
+               msgs[0].addr = slave_addr;
+               msgs[0].flags = 0;
+               msgs[0].len = offset_size;
+               msgs[0].buf = offset_bytes;
+
+               msgs[1].addr = slave_addr;
+               msgs[1].flags = I2C_M_NOSTART;
+               msgs[1].len = len;
+               msgs[1].buf = data;
+               break;
+       default:
+               return -EINVAL;
        }
 
-       /* Must wait min 20us between qsfp i2c transactions */
-       udelay(20);
+       i2c->controlling_dd = dd;
+       ret = i2c_transfer(&i2c->adapter, msgs, num_msgs);
+       if (ret != num_msgs) {
+               dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; write failed, ret %d\n",
+                          __func__, i2c->num, slave_addr, offset, len, ret);
+               return ret < 0 ? ret : -EIO;
+       }
+       return 0;
+}
+
+static int i2c_bus_read(struct hfi1_devdata *dd, struct hfi1_i2c_bus *bus,
+                       u8 slave_addr, int offset, int offset_size,
+                       u8 *data, u16 len)
+{
+       int ret;
+       int num_msgs;
+       u8 offset_bytes[2];
+       struct i2c_msg msgs[2];
+
+       switch (offset_size) {
+       case 0:
+               num_msgs = 1;
+               msgs[0].addr = slave_addr;
+               msgs[0].flags = I2C_M_RD;
+               msgs[0].len = len;
+               msgs[0].buf = data;
+               break;
+       case 2:
+               offset_bytes[1] = (offset >> 8) & 0xff;
+               /* fall through */
+       case 1:
+               num_msgs = 2;
+               offset_bytes[0] = offset & 0xff;
+
+               msgs[0].addr = slave_addr;
+               msgs[0].flags = 0;
+               msgs[0].len = offset_size;
+               msgs[0].buf = offset_bytes;
+
+               msgs[1].addr = slave_addr;
+               msgs[1].flags = I2C_M_RD;
+               msgs[1].len = len;
+               msgs[1].buf = data;
+               break;
+       default:
+               return -EINVAL;
+       }
 
-       return cnt;
+       bus->controlling_dd = dd;
+       ret = i2c_transfer(&bus->adapter, msgs, num_msgs);
+       if (ret != num_msgs) {
+               dd_dev_err(dd, "%s: bus %d, i2c slave 0x%x, offset 0x%x, len 0x%x; read failed, ret %d\n",
+                          __func__, bus->num, slave_addr, offset, len, ret);
+               return ret < 0 ? ret : -EIO;
+       }
+       return 0;
+}
+
+/*
+ * Raw i2c write.  No set-up or lock checking.
+ *
+ * Return 0 on success, -errno on error.
+ */
+static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+                      int offset, void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct hfi1_i2c_bus *bus;
+       u8 slave_addr;
+       int offset_size;
+
+       bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0;
+       slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */
+       offset_size = (i2c_addr >> 8) & 0x3;
+       return i2c_bus_write(dd, bus, slave_addr, offset, offset_size, bp, len);
 }
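The i2c_addr argument packs two fields that the helpers above unpack: the low byte carries the 8-bit (write-form) device address, and bits 9:8 carry the offset width in bytes. A sketch of the matching pack helper (illustrative; callers in this file build the value from constants such as QSFP_DEV | QSFP_OFFSET_SIZE):

        /* sketch: build an i2c_addr as decoded by __i2c_write()/__i2c_read() */
        static u32 example_pack_i2c_addr(u8 dev_addr, u32 offset_size)
        {
                return ((offset_size & 0x3) << 8) | dev_addr;
        }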
 
 /*
  * Caller must hold the i2c chain resource.
+ *
+ * Return number of bytes written, or -errno.
  */
 int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
              void *bp, int len)
@@ -99,63 +338,36 @@ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
        if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                return -EACCES;
 
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "I2C chain %d write interface reset failed\n",
-                                target);
+       ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len);
+       if (ret)
                return ret;
-       }
 
-       return __i2c_write(ppd, target, i2c_addr, offset, bp, len);
+       return len;
 }
 
 /*
  * Raw i2c read.  No set-up or lock checking.
+ *
+ * Return 0 on success, -errno on error.
  */
 static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
                      int offset, void *bp, int len)
 {
        struct hfi1_devdata *dd = ppd->dd;
-       int ret, cnt, pass = 0;
-       int orig_offset = offset;
-
-       cnt = 0;
-       while (cnt < len) {
-               int rlen = len - cnt;
-
-               ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset,
-                                      bp + cnt, rlen);
-               /* Some QSFP's fail first try. Retry as experiment */
-               if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY)
-                       continue;
-               if (ret) {
-                       /* hfi1_twsi_blk_rd() 1 for error, else 0 */
-                       ret = -EIO;
-                       goto exit;
-               }
-               offset += rlen;
-               cnt += rlen;
-       }
-
-       ret = cnt;
-
-exit:
-       if (ret < 0) {
-               hfi1_dev_porterr(dd, ppd->port,
-                                "I2C chain %d read failed, addr 0x%x, offset 0x%x, len %d\n",
-                                target, i2c_addr, orig_offset, len);
-       }
-
-       /* Must wait min 20us between qsfp i2c transactions */
-       udelay(20);
-
-       return ret;
+       struct hfi1_i2c_bus *bus;
+       u8 slave_addr;
+       int offset_size;
+
+       bus = target ? dd->asic_data->i2c_bus1 : dd->asic_data->i2c_bus0;
+       slave_addr = (i2c_addr & 0xff) >> 1; /* convert to 7-bit addr */
+       offset_size = (i2c_addr >> 8) & 0x3;
+       return i2c_bus_read(dd, bus, slave_addr, offset, offset_size, bp, len);
 }
 
 /*
  * Caller must hold the i2c chain resource.
+ *
+ * Return number of bytes read, or -errno.
  */
 int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
             void *bp, int len)
@@ -165,16 +377,11 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
        if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                return -EACCES;
 
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "I2C chain %d read interface reset failed\n",
-                                target);
+       ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len);
+       if (ret)
                return ret;
-       }
 
-       return __i2c_read(ppd, target, i2c_addr, offset, bp, len);
+       return len;
 }
 
 /*
@@ -182,6 +389,8 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
  * by writing @addr = ((256 * n) + m)
  *
  * Caller must hold the i2c chain resource.
+ *
+ * Return number of bytes written or -errno.
  */
 int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
               int len)
@@ -189,21 +398,12 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
        int count = 0;
        int offset;
        int nwrite;
-       int ret;
+       int ret = 0;
        u8 page;
 
        if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                return -EACCES;
 
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "QSFP chain %d write interface reset failed\n",
-                                target);
-               return ret;
-       }
-
        while (count < len) {
                /*
                 * Set the qsfp page based on a zero-based address
@@ -213,11 +413,12 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
 
                ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
                                  QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
-               if (ret != 1) {
+               /* QSFPs require a 5-10msec delay after write operations */
+               mdelay(5);
+               if (ret) {
                        hfi1_dev_porterr(ppd->dd, ppd->port,
                                         "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
                                         target, ret);
-                       ret = -EIO;
                        break;
                }
 
@@ -229,11 +430,13 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
 
                ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
                                  offset, bp + count, nwrite);
-               if (ret <= 0)   /* stop on error or nothing written */
+               /* QSFPs require a 5-10msec delay after write operations */
+               mdelay(5);
+               if (ret)        /* stop on error */
                        break;
 
-               count += ret;
-               addr += ret;
+               count += nwrite;
+               addr += nwrite;
        }
 
        if (ret < 0)
@@ -243,7 +446,7 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
 
 /*
  * Perform a stand-alone single QSFP write.  Acquire the resource, do the
- * read, then release the resource.
+ * write, then release the resource.
  */
 int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
                   int len)
@@ -266,6 +469,8 @@ int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
  * by reading @addr = ((256 * n) + m)
  *
  * Caller must hold the i2c chain resource.
+ *
+ * Return the number of bytes read or -errno.
  */
 int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
              int len)
@@ -273,21 +478,12 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
        int count = 0;
        int offset;
        int nread;
-       int ret;
+       int ret = 0;
        u8 page;
 
        if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
                return -EACCES;
 
-       /* make sure the TWSI bus is in a sane state */
-       ret = hfi1_twsi_reset(ppd->dd, target);
-       if (ret) {
-               hfi1_dev_porterr(ppd->dd, ppd->port,
-                                "QSFP chain %d read interface reset failed\n",
-                                target);
-               return ret;
-       }
-
        while (count < len) {
                /*
                 * Set the qsfp page based on a zero-based address
@@ -296,11 +492,12 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
                page = (u8)(addr / QSFP_PAGESIZE);
                ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
                                  QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
-               if (ret != 1) {
+               /* QSFPs require a 5-10msec delay after write operations */
+               mdelay(5);
+               if (ret) {
                        hfi1_dev_porterr(ppd->dd, ppd->port,
                                         "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n",
                                         target, ret);
-                       ret = -EIO;
                        break;
                }
 
@@ -310,15 +507,13 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
                if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY)
                        nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY);
 
-               /* QSFPs require a 5-10msec delay after write operations */
-               mdelay(5);
                ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE,
                                 offset, bp + count, nread);
-               if (ret <= 0)   /* stop on error or nothing read */
+               if (ret)        /* stop on error */
                        break;
 
-               count += ret;
-               addr += ret;
+               count += nread;
+               addr += nread;
        }
 
        if (ret < 0)
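Both qsfp_read() and qsfp_write() walk the flat QSFP address space one page at a time: each pass writes the upper-page number to QSFP_PAGE_SELECT_BYTE_OFFS, then transfers within that page without crossing QSFP_RW_BOUNDARY. A sketch of the page/offset split, assuming the 256-byte QSFP_PAGESIZE described in the comments above:

        /* sketch: map a flat QSFP address to (upper page, in-page offset) */
        static void example_qsfp_split(int addr, u8 *page, int *offset)
        {
                *page = (u8)(addr / QSFP_PAGESIZE);
                *offset = addr % QSFP_PAGESIZE;
        }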
index dadc66c442b982130da735bcb7bcf1e81307590f..69275ebd9597322b599e97fce5804d17efa526f1 100644 (file)
@@ -238,3 +238,6 @@ int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
                   int len);
 int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
                  int len);
+struct hfi1_asic_data;
+int set_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad);
+void clean_up_i2c(struct hfi1_devdata *dd, struct hfi1_asic_data *ad);
index 792f15eb8efeceeff89cd40e05e4180dace441b2..5da190e6011b1390f7861b2bd89a5a09e7976c14 100644 (file)
@@ -477,6 +477,37 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                                qp->s_flags |= RVT_S_WAIT_FENCE;
                                goto bail;
                        }
+                       /*
+                        * Local operations are processed immediately
+                        * after all prior requests have completed.
+                        */
+                       if (wqe->wr.opcode == IB_WR_REG_MR ||
+                           wqe->wr.opcode == IB_WR_LOCAL_INV) {
+                               int local_ops = 0;
+                               int err = 0;
+
+                               if (qp->s_last != qp->s_cur)
+                                       goto bail;
+                               if (++qp->s_cur == qp->s_size)
+                                       qp->s_cur = 0;
+                               if (++qp->s_tail == qp->s_size)
+                                       qp->s_tail = 0;
+                               if (!(wqe->wr.send_flags &
+                                     RVT_SEND_COMPLETION_ONLY)) {
+                                       err = rvt_invalidate_rkey(
+                                               qp,
+                                               wqe->wr.ex.invalidate_rkey);
+                                       local_ops = 1;
+                               }
+                               hfi1_send_complete(qp, wqe,
+                                                  err ? IB_WC_LOC_PROT_ERR
+                                                      : IB_WC_SUCCESS);
+                               if (local_ops)
+                                       atomic_dec(&qp->local_ops_pending);
+                               qp->s_hdrwords = 0;
+                               goto done_free_tx;
+                       }
+
                        newreq = 1;
                        qp->s_psn = wqe->psn;
                }
@@ -491,6 +522,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                switch (wqe->wr.opcode) {
                case IB_WR_SEND:
                case IB_WR_SEND_WITH_IMM:
+               case IB_WR_SEND_WITH_INV:
                        /* If no credit, return. */
                        if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
                            cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
@@ -504,11 +536,17 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                        }
                        if (wqe->wr.opcode == IB_WR_SEND) {
                                qp->s_state = OP(SEND_ONLY);
-                       } else {
+                       } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
                                qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
                                /* Immediate data comes after the BTH */
                                ohdr->u.imm_data = wqe->wr.ex.imm_data;
                                hwords += 1;
+                       } else {
+                               qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
+                               /* Invalidate rkey comes after the BTH */
+                               ohdr->u.ieth = cpu_to_be32(
+                                               wqe->wr.ex.invalidate_rkey);
+                               hwords += 1;
                        }
                        if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                                bth0 |= IB_BTH_SOLICITED;
@@ -671,11 +709,16 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                }
                if (wqe->wr.opcode == IB_WR_SEND) {
                        qp->s_state = OP(SEND_LAST);
-               } else {
+               } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
                        qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
                        /* Immediate data comes after the BTH */
                        ohdr->u.imm_data = wqe->wr.ex.imm_data;
                        hwords += 1;
+               } else {
+                       qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
+                       /* Invalidate rkey comes after the BTH */
+                       ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
+                       hwords += 1;
                }
                if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                        bth0 |= IB_BTH_SOLICITED;
@@ -1047,7 +1090,7 @@ void hfi1_rc_timeout(unsigned long arg)
                ibp->rvp.n_rc_timeouts++;
                qp->s_flags &= ~RVT_S_TIMER;
                del_timer(&qp->s_timer);
-               trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1);
+               trace_hfi1_timeout(qp, qp->s_last_psn + 1);
                restart_rc(qp, qp->s_last_psn + 1, 1);
                hfi1_schedule_send(qp);
        }
@@ -1171,7 +1214,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr)
         * If we were waiting for sends to complete before re-sending,
         * and they are now complete, restart sending.
         */
-       trace_hfi1_rc_sendcomplete(qp, psn);
+       trace_hfi1_sendcomplete(qp, psn);
        if (qp->s_flags & RVT_S_WAIT_PSN &&
            cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
                qp->s_flags &= ~RVT_S_WAIT_PSN;
@@ -1567,7 +1610,7 @@ static void rc_rcv_resp(struct hfi1_ibport *ibp,
 
        spin_lock_irqsave(&qp->s_lock, flags);
 
-       trace_hfi1_rc_ack(qp, psn);
+       trace_hfi1_ack(qp, psn);
 
        /* Ignore invalid responses. */
        smp_read_barrier_depends(); /* see post_one_send */
@@ -1782,7 +1825,7 @@ static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
        u8 i, prev;
        int old_req;
 
-       trace_hfi1_rc_rcv_error(qp, psn);
+       trace_hfi1_rcv_error(qp, psn);
        if (diff > 0) {
                /*
                 * Packet sequence error.
@@ -2086,7 +2129,6 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
        u32 tlen = packet->tlen;
        struct rvt_qp *qp = packet->qp;
        struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
-       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
        struct hfi1_other_headers *ohdr = packet->ohdr;
        u32 bth0, opcode;
        u32 hdrsize = packet->hlen;
@@ -2097,30 +2139,15 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
        int diff;
        struct ib_reth *reth;
        unsigned long flags;
-       u32 bth1;
        int ret, is_fecn = 0;
        int copy_last = 0;
+       u32 rkey;
 
        bth0 = be32_to_cpu(ohdr->bth[0]);
        if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
                return;
 
-       bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
-               if (bth1 & HFI1_BECN_SMASK) {
-                       u16 rlid = qp->remote_ah_attr.dlid;
-                       u32 lqpn, rqpn;
-
-                       lqpn = qp->ibqp.qp_num;
-                       rqpn = qp->remote_qpn;
-                       process_becn(
-                               ppd,
-                               qp->remote_ah_attr.sl,
-                               rlid, lqpn, rqpn,
-                               IB_CC_SVCTYPE_RC);
-               }
-               is_fecn = bth1 & HFI1_FECN_SMASK;
-       }
+       is_fecn = process_ecn(qp, packet, false);
 
        psn = be32_to_cpu(ohdr->bth[2]);
        opcode = (bth0 >> 24) & 0xff;
@@ -2154,7 +2181,8 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
        case OP(SEND_MIDDLE):
                if (opcode == OP(SEND_MIDDLE) ||
                    opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(SEND_LAST_WITH_INVALIDATE))
                        break;
                goto nack_inv;
 
@@ -2170,6 +2198,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
                if (opcode == OP(SEND_MIDDLE) ||
                    opcode == OP(SEND_LAST) ||
                    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(SEND_LAST_WITH_INVALIDATE) ||
                    opcode == OP(RDMA_WRITE_MIDDLE) ||
                    opcode == OP(RDMA_WRITE_LAST) ||
                    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
@@ -2218,6 +2247,7 @@ send_middle:
 
        case OP(SEND_ONLY):
        case OP(SEND_ONLY_WITH_IMMEDIATE):
+       case OP(SEND_ONLY_WITH_INVALIDATE):
                ret = hfi1_rvt_get_rwqe(qp, 0);
                if (ret < 0)
                        goto nack_op_err;
@@ -2226,12 +2256,22 @@ send_middle:
                qp->r_rcv_len = 0;
                if (opcode == OP(SEND_ONLY))
                        goto no_immediate_data;
+               if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
+                       goto send_last_inv;
                /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
        case OP(SEND_LAST_WITH_IMMEDIATE):
 send_last_imm:
                wc.ex.imm_data = ohdr->u.imm_data;
                wc.wc_flags = IB_WC_WITH_IMM;
                goto send_last;
+       case OP(SEND_LAST_WITH_INVALIDATE):
+send_last_inv:
+               rkey = be32_to_cpu(ohdr->u.ieth);
+               if (rvt_invalidate_rkey(qp, rkey))
+                       goto no_immediate_data;
+               wc.ex.invalidate_rkey = rkey;
+               wc.wc_flags = IB_WC_WITH_INVALIDATE;
+               goto send_last;
        case OP(RDMA_WRITE_LAST):
                copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
                /* fall through */
index a659aec3c3c6b95823650c7b6bb3ac800649967a..48d5094f98e259b92fade72c37799f43ecfcec4e 100644 (file)
@@ -372,6 +372,7 @@ static void ruc_loopback(struct rvt_qp *sqp)
        int ret;
        int copy_last = 0;
        u32 to;
+       int local_ops = 0;
 
        rcu_read_lock();
 
@@ -440,11 +441,31 @@ again:
        sqp->s_sge.num_sge = wqe->wr.num_sge;
        sqp->s_len = wqe->length;
        switch (wqe->wr.opcode) {
+       case IB_WR_REG_MR:
+               goto send_comp;
+
+       case IB_WR_LOCAL_INV:
+               if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
+                       if (rvt_invalidate_rkey(sqp,
+                                               wqe->wr.ex.invalidate_rkey))
+                               send_status = IB_WC_LOC_PROT_ERR;
+                       local_ops = 1;
+               }
+               goto send_comp;
+
+       case IB_WR_SEND_WITH_INV:
+               if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) {
+                       wc.wc_flags = IB_WC_WITH_INVALIDATE;
+                       wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey;
+               }
+               goto send;
+
        case IB_WR_SEND_WITH_IMM:
                wc.wc_flags = IB_WC_WITH_IMM;
                wc.ex.imm_data = wqe->wr.ex.imm_data;
                /* FALLTHROUGH */
        case IB_WR_SEND:
+send:
                ret = hfi1_rvt_get_rwqe(qp, 0);
                if (ret < 0)
                        goto op_err;
@@ -583,6 +604,10 @@ send_comp:
 flush_send:
        sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
        hfi1_send_complete(sqp, wqe, send_status);
+       if (local_ops) {
+               atomic_dec(&sqp->local_ops_pending);
+               local_ops = 0;
+       }
        goto again;
 
 rnr_nak:
@@ -683,10 +708,10 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
        return sizeof(struct ib_grh) / sizeof(u32);
 }
 
-#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4)
+#define BTH2_OFFSET (offsetof(struct hfi1_sdma_header, hdr.u.oth.bth[2]) / 4)
 
 /**
- * build_ahg - create ahg in s_hdr
+ * build_ahg - create ahg in s_ahg
  * @qp: a pointer to QP
  * @npsn: the next PSN for the request/response
  *
@@ -708,19 +733,18 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn)
                        qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde);
                if (qp->s_ahgidx >= 0) {
                        qp->s_ahgpsn = npsn;
-                       priv->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
+                       priv->s_ahg->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
                        /* save to protect a change in another thread */
-                       priv->s_hdr->sde = priv->s_sde;
-                       priv->s_hdr->ahgidx = qp->s_ahgidx;
+                       priv->s_ahg->ahgidx = qp->s_ahgidx;
                        qp->s_flags |= RVT_S_AHG_VALID;
                }
        } else {
                /* subsequent middle after valid */
                if (qp->s_ahgidx >= 0) {
-                       priv->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG;
-                       priv->s_hdr->ahgidx = qp->s_ahgidx;
-                       priv->s_hdr->ahgcount++;
-                       priv->s_hdr->ahgdesc[0] =
+                       priv->s_ahg->tx_flags |= SDMA_TXREQ_F_USE_AHG;
+                       priv->s_ahg->ahgidx = qp->s_ahgidx;
+                       priv->s_ahg->ahgcount++;
+                       priv->s_ahg->ahgdesc[0] =
                                sdma_build_ahg_descriptor(
                                        (__force u16)cpu_to_be16((u16)npsn),
                                        BTH2_OFFSET,
@@ -728,8 +752,8 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn)
                                        16);
                        if ((npsn & 0xffff0000) !=
                                        (qp->s_ahgpsn & 0xffff0000)) {
-                               priv->s_hdr->ahgcount++;
-                               priv->s_hdr->ahgdesc[1] =
+                               priv->s_ahg->ahgcount++;
+                               priv->s_ahg->ahgdesc[1] =
                                        sdma_build_ahg_descriptor(
                                                (__force u16)cpu_to_be16(
                                                        (u16)(npsn >> 16)),
@@ -766,7 +790,7 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
        }
        lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
        /*
-        * reset s_hdr/AHG fields
+        * reset s_ahg/AHG fields
         *
         * This insures that the ahgentry/ahgcount
         * are at a non-AHG default to protect
@@ -776,10 +800,9 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr,
         * build_ahg() will modify as appropriate
         * to use the AHG feature.
         */
-       priv->s_hdr->tx_flags = 0;
-       priv->s_hdr->ahgcount = 0;
-       priv->s_hdr->ahgidx = 0;
-       priv->s_hdr->sde = NULL;
+       priv->s_ahg->tx_flags = 0;
+       priv->s_ahg->ahgcount = 0;
+       priv->s_ahg->ahgidx = 0;
        if (qp->s_mig_state == IB_MIG_MIGRATED)
                bth0 |= IB_BTH_MIG_REQ;
        else
@@ -890,7 +913,7 @@ void hfi1_do_send(struct rvt_qp *qp)
                         */
                        if (hfi1_verbs_send(qp, &ps))
                                return;
-                       /* Record that s_hdr is empty. */
+                       /* Record that s_ahg is empty. */
                        qp->s_hdrwords = 0;
                        /* allow other tasks to run */
                        if (unlikely(time_after(jiffies, timeout))) {
index 91fc2aed6aed93dbfa3b5d2b8a197ce4537a67a9..74c84c655f7e5c18f2737adfd582429cbd99c2ca 100644 (file)
@@ -49,6 +49,7 @@
 #include "hfi.h"
 #include "mad.h"
 #include "trace.h"
+#include "affinity.h"
 
 /*
  * Start of per-port congestion control structures and support code
@@ -622,6 +623,27 @@ static ssize_t show_tempsense(struct device *device,
        return ret;
 }
 
+static ssize_t show_sdma_affinity(struct device *device,
+                                 struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       return hfi1_get_sdma_affinity(dd, buf);
+}
+
+static ssize_t store_sdma_affinity(struct device *device,
+                                  struct device_attribute *attr,
+                                  const char *buf, size_t count)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       return hfi1_set_sdma_affinity(dd, buf, count);
+}
+
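With the attribute added here, the SDMA engine CPU mask becomes readable and writable at runtime through the IB device's sysfs directory (for a device named hfi1_0, a path of the form /sys/class/infiniband/hfi1_0/sdma_affinity; the exact path is illustrative and depends on the device name).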
 /*
  * end of per-unit (or driver, in some cases, but replicated
  * per unit) functions
@@ -636,6 +658,8 @@ static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
 static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
 static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
 static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+static DEVICE_ATTR(sdma_affinity, S_IWUSR | S_IRUGO, show_sdma_affinity,
+                  store_sdma_affinity);
 
 static struct device_attribute *hfi1_attributes[] = {
        &dev_attr_hw_rev,
@@ -646,6 +670,7 @@ static struct device_attribute *hfi1_attributes[] = {
        &dev_attr_boardversion,
        &dev_attr_tempsense,
        &dev_attr_chip_reset,
+       &dev_attr_sdma_affinity,
 };
 
 int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
index 28c1d083288632b9f90042d752732a8a157e3130..92dc88f013c9588b0c0da8d43eb189163c5aad8c 100644 (file)
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
-#undef TRACE_SYSTEM_VAR
-#define TRACE_SYSTEM_VAR hfi1
-
-#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
-#define __HFI1_TRACE_H
-
-#include <linux/tracepoint.h>
-#include <linux/trace_seq.h>
-
-#include "hfi.h"
-#include "mad.h"
-#include "sdma.h"
-
-#define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
-#define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
-
-#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
-#define show_packettype(etype)                  \
-__print_symbolic(etype,                         \
-       packettype_name(EXPECTED),              \
-       packettype_name(EAGER),                 \
-       packettype_name(IB),                    \
-       packettype_name(ERROR),                 \
-       packettype_name(BYPASS))
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_rx
-
-TRACE_EVENT(hfi1_rcvhdr,
-           TP_PROTO(struct hfi1_devdata *dd,
-                    u32 ctxt,
-                    u64 eflags,
-                    u32 etype,
-                    u32 hlen,
-                    u32 tlen,
-                    u32 updegr,
-                    u32 etail
-                    ),
-           TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u64, eflags)
-                            __field(u32, ctxt)
-                            __field(u32, etype)
-                            __field(u32, hlen)
-                            __field(u32, tlen)
-                            __field(u32, updegr)
-                            __field(u32, etail)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->eflags = eflags;
-                          __entry->ctxt = ctxt;
-                          __entry->etype = etype;
-                          __entry->hlen = hlen;
-                          __entry->tlen = tlen;
-                          __entry->updegr = updegr;
-                          __entry->etail = etail;
-                          ),
-           TP_printk(
-                     "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->eflags,
-                     __entry->etype, show_packettype(__entry->etype),
-                     __entry->hlen,
-                     __entry->tlen,
-                     __entry->updegr,
-                     __entry->etail
-                     )
-);
-
-TRACE_EVENT(hfi1_receive_interrupt,
-           TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
-           TP_ARGS(dd, ctxt),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u32, ctxt)
-                            __field(u8, slow_path)
-                            __field(u8, dma_rtail)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->ctxt = ctxt;
-                          if (dd->rcd[ctxt]->do_interrupt ==
-                              &handle_receive_interrupt) {
-                               __entry->slow_path = 1;
-                               __entry->dma_rtail = 0xFF;
-                          } else if (dd->rcd[ctxt]->do_interrupt ==
-                                     &handle_receive_interrupt_dma_rtail){
-                               __entry->dma_rtail = 1;
-                               __entry->slow_path = 0;
-                          } else if (dd->rcd[ctxt]->do_interrupt ==
-                                     &handle_receive_interrupt_nodma_rtail) {
-                               __entry->dma_rtail = 0;
-                               __entry->slow_path = 0;
-                          }
-                          ),
-           TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->slow_path,
-                     __entry->dma_rtail
-                     )
-);
-
-TRACE_EVENT(hfi1_exp_tid_reg,
-           TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr,
-                    u32 npages, unsigned long va, unsigned long pa,
-                    dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(unsigned long, va)
-                   __field(unsigned long, pa)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->va = va;
-                   __entry->pa = pa;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->pa,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_exp_tid_unreg,
-           TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, u32 npages,
-                    unsigned long va, unsigned long pa, dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(unsigned long, va)
-                   __field(unsigned long, pa)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->va = va;
-                   __entry->pa = pa;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->pa,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_exp_tid_inval,
-           TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr,
-                    u32 npages, dma_addr_t dma),
-           TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __field(unsigned long, va)
-                   __field(u32, rarr)
-                   __field(u32, npages)
-                   __field(dma_addr_t, dma)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->va = va;
-                   __entry->rarr = rarr;
-                   __entry->npages = npages;
-                   __entry->dma = dma;
-                   ),
-           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->rarr,
-                     __entry->npages,
-                     __entry->va,
-                     __entry->dma
-                   )
-       );
-
-TRACE_EVENT(hfi1_mmu_invalidate,
-           TP_PROTO(unsigned ctxt, u16 subctxt, const char *type,
-                    unsigned long start, unsigned long end),
-           TP_ARGS(ctxt, subctxt, type, start, end),
-           TP_STRUCT__entry(
-                   __field(unsigned, ctxt)
-                   __field(u16, subctxt)
-                   __string(type, type)
-                   __field(unsigned long, start)
-                   __field(unsigned long, end)
-                   ),
-           TP_fast_assign(
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __assign_str(type, type);
-                   __entry->start = start;
-                   __entry->end = end;
-                   ),
-           TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __get_str(type),
-                     __entry->start,
-                     __entry->end
-                   )
-       );
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_tx
-
-TRACE_EVENT(hfi1_piofree,
-           TP_PROTO(struct send_context *sc, int extra),
-           TP_ARGS(sc, extra),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
-                            __field(u32, sw_index)
-                            __field(u32, hw_context)
-                            __field(int, extra)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
-                          __entry->sw_index = sc->sw_index;
-                          __entry->hw_context = sc->hw_context;
-                          __entry->extra = extra;
-                          ),
-           TP_printk("[%s] ctxt %u(%u) extra %d",
-                     __get_str(dev),
-                     __entry->sw_index,
-                     __entry->hw_context,
-                     __entry->extra
-                     )
-);
-
-TRACE_EVENT(hfi1_wantpiointr,
-           TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
-           TP_ARGS(sc, needint, credit_ctrl),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
-                            __field(u32, sw_index)
-                            __field(u32, hw_context)
-                            __field(u32, needint)
-                            __field(u64, credit_ctrl)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
-                          __entry->sw_index = sc->sw_index;
-                          __entry->hw_context = sc->hw_context;
-                          __entry->needint = needint;
-                          __entry->credit_ctrl = credit_ctrl;
-                          ),
-           TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
-                     __get_str(dev),
-                     __entry->sw_index,
-                     __entry->hw_context,
-                     __entry->needint,
-                     (unsigned long long)__entry->credit_ctrl
-                      )
-);
-
-DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
-                   TP_PROTO(struct rvt_qp *qp, u32 flags),
-                   TP_ARGS(qp, flags),
-                   TP_STRUCT__entry(
-                           DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
-                           __field(u32, qpn)
-                           __field(u32, flags)
-                           __field(u32, s_flags)
-                           ),
-                   TP_fast_assign(
-                           DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
-                           __entry->flags = flags;
-                           __entry->qpn = qp->ibqp.qp_num;
-                           __entry->s_flags = qp->s_flags;
-                           ),
-                   TP_printk(
-                           "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
-                           __get_str(dev),
-                           __entry->qpn,
-                           __entry->flags,
-                           __entry->s_flags
-                           )
-);
-
-DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
-            TP_PROTO(struct rvt_qp *qp, u32 flags),
-            TP_ARGS(qp, flags));
-
-DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
-            TP_PROTO(struct rvt_qp *qp, u32 flags),
-            TP_ARGS(qp, flags));
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_ibhdrs
-
-u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
-const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs);
-
-#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
-
-const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1);
-
-#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
-
-#define lrh_name(lrh) { HFI1_##lrh, #lrh }
-#define show_lnh(lrh)                    \
-__print_symbolic(lrh,                    \
-       lrh_name(LRH_BTH),               \
-       lrh_name(LRH_GRH))
-
-#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
-#define show_ib_opcode(opcode)                             \
-__print_symbolic(opcode,                                   \
-       ib_opcode_name(RC_SEND_FIRST),                     \
-       ib_opcode_name(RC_SEND_MIDDLE),                    \
-       ib_opcode_name(RC_SEND_LAST),                      \
-       ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
-       ib_opcode_name(RC_SEND_ONLY),                      \
-       ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
-       ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
-       ib_opcode_name(RC_RDMA_WRITE_LAST),                \
-       ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
-       ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
-       ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
-       ib_opcode_name(RC_RDMA_READ_REQUEST),              \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
-       ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
-       ib_opcode_name(RC_ACKNOWLEDGE),                    \
-       ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
-       ib_opcode_name(RC_COMPARE_SWAP),                   \
-       ib_opcode_name(RC_FETCH_ADD),                      \
-       ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE),      \
-       ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE),      \
-       ib_opcode_name(UC_SEND_FIRST),                     \
-       ib_opcode_name(UC_SEND_MIDDLE),                    \
-       ib_opcode_name(UC_SEND_LAST),                      \
-       ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
-       ib_opcode_name(UC_SEND_ONLY),                      \
-       ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
-       ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
-       ib_opcode_name(UC_RDMA_WRITE_LAST),                \
-       ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
-       ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
-       ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
-       ib_opcode_name(UD_SEND_ONLY),                      \
-       ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE),       \
-       ib_opcode_name(CNP))
-
-#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
-#define BTH_PRN \
-       "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
-       "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
-#define EHDR_PRN "%s"
-
-DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
-                   TP_PROTO(struct hfi1_devdata *dd,
-                            struct hfi1_ib_header *hdr),
-                   TP_ARGS(dd, hdr),
-                   TP_STRUCT__entry(
-                           DD_DEV_ENTRY(dd)
-                           /* LRH */
-                           __field(u8, vl)
-                           __field(u8, lver)
-                           __field(u8, sl)
-                           __field(u8, lnh)
-                           __field(u16, dlid)
-                           __field(u16, len)
-                           __field(u16, slid)
-                           /* BTH */
-                           __field(u8, opcode)
-                           __field(u8, se)
-                           __field(u8, m)
-                           __field(u8, pad)
-                           __field(u8, tver)
-                           __field(u16, pkey)
-                           __field(u8, f)
-                           __field(u8, b)
-                           __field(u32, qpn)
-                           __field(u8, a)
-                           __field(u32, psn)
-                           /* extended headers */
-                           __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
-                           ),
-                   TP_fast_assign(
-                          struct hfi1_other_headers *ohdr;
-
-                          DD_DEV_ASSIGN(dd);
-                          /* LRH */
-                          __entry->vl =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
-                          __entry->lver =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
-                          __entry->sl =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
-                          __entry->lnh =
-                          (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
-                          __entry->dlid =
-                          be16_to_cpu(hdr->lrh[1]);
-                          /* allow for larger len */
-                          __entry->len =
-                          be16_to_cpu(hdr->lrh[2]);
-                          __entry->slid =
-                          be16_to_cpu(hdr->lrh[3]);
-                          /* BTH */
-                          if (__entry->lnh == HFI1_LRH_BTH)
-                               ohdr = &hdr->u.oth;
-                          else
-                               ohdr = &hdr->u.l.oth;
-                         __entry->opcode =
-                         (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
-                         __entry->se =
-                         (be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
-                         __entry->m =
-                         (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
-                         __entry->pad =
-                         (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-                         __entry->tver =
-                         (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
-                         __entry->pkey =
-                         be32_to_cpu(ohdr->bth[0]) & 0xffff;
-                         __entry->f =
-                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
-                         HFI1_FECN_MASK;
-                         __entry->b =
-                         (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
-                         HFI1_BECN_MASK;
-                         __entry->qpn =
-                         be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-                         __entry->a =
-                         (be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
-                         /* allow for larger PSN */
-                         __entry->psn =
-                         be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
-                         /* extended headers */
-                         memcpy(__get_dynamic_array(ehdrs), &ohdr->u,
-                                ibhdr_exhdr_len(hdr));
-                        ),
-                   TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
-                             __get_str(dev),
-                             /* LRH */
-                             __entry->vl,
-                             __entry->lver,
-                             __entry->sl,
-                             __entry->lnh, show_lnh(__entry->lnh),
-                             __entry->dlid,
-                             __entry->len,
-                             __entry->slid,
-                             /* BTH */
-                             __entry->opcode, show_ib_opcode(__entry->opcode),
-                             __entry->se,
-                             __entry->m,
-                             __entry->pad,
-                             __entry->tver,
-                             __entry->pkey,
-                             __entry->f,
-                             __entry->b,
-                             __entry->qpn,
-                             __entry->a,
-                             __entry->psn,
-                             /* extended headers */
-                             __parse_ib_ehdrs(
-                                       __entry->opcode,
-                                       (void *)__get_dynamic_array(ehdrs))
-                            )
-);
-
-DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr,
-            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
-            TP_ARGS(dd, hdr));
-
-#define SNOOP_PRN \
-       "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
-       "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_snoop
-
-TRACE_EVENT(snoop_capture,
-           TP_PROTO(struct hfi1_devdata *dd,
-                    int hdr_len,
-                    struct hfi1_ib_header *hdr,
-                    int data_len,
-                    void *data),
-           TP_ARGS(dd, hdr_len, hdr, data_len, data),
-           TP_STRUCT__entry(
-               DD_DEV_ENTRY(dd)
-               __field(u16, slid)
-               __field(u16, dlid)
-               __field(u32, qpn)
-               __field(u8, opcode)
-               __field(u8, sl)
-               __field(u16, pkey)
-               __field(u32, hdr_len)
-               __field(u32, data_len)
-               __field(u8, lnh)
-               __dynamic_array(u8, raw_hdr, hdr_len)
-               __dynamic_array(u8, raw_pkt, data_len)
-               ),
-           TP_fast_assign(
-               struct hfi1_other_headers *ohdr;
-
-               __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
-               if (__entry->lnh == HFI1_LRH_BTH)
-                       ohdr = &hdr->u.oth;
-               else
-                       ohdr = &hdr->u.l.oth;
-               DD_DEV_ASSIGN(dd);
-               __entry->slid = be16_to_cpu(hdr->lrh[3]);
-               __entry->dlid = be16_to_cpu(hdr->lrh[1]);
-               __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-               __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
-               __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
-               __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
-               __entry->hdr_len = hdr_len;
-               __entry->data_len = data_len;
-               memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
-               memcpy(__get_dynamic_array(raw_pkt), data, data_len);
-               ),
-           TP_printk(
-               "[%s] " SNOOP_PRN,
-               __get_str(dev),
-               __entry->slid,
-               __entry->dlid,
-               __entry->qpn,
-               __entry->opcode,
-               show_ib_opcode(__entry->opcode),
-               __entry->sl,
-               __entry->pkey,
-               __entry->hdr_len,
-               __entry->data_len
-               )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_ctxts
-
-#define UCTXT_FMT \
-       "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, "     \
-       "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
-TRACE_EVENT(hfi1_uctxtdata,
-           TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
-           TP_ARGS(dd, uctxt),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(unsigned, ctxt)
-                            __field(u32, credits)
-                            __field(u64, hw_free)
-                            __field(u64, piobase)
-                            __field(u16, rcvhdrq_cnt)
-                            __field(u64, rcvhdrq_phys)
-                            __field(u32, eager_cnt)
-                            __field(u64, rcvegr_phys)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->ctxt = uctxt->ctxt;
-                          __entry->credits = uctxt->sc->credits;
-                          __entry->hw_free = (u64)uctxt->sc->hw_free;
-                          __entry->piobase = (u64)uctxt->sc->base_addr;
-                          __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
-                          __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
-                          __entry->eager_cnt = uctxt->egrbufs.alloced;
-                          __entry->rcvegr_phys =
-                          uctxt->egrbufs.rcvtids[0].phys;
-                          ),
-           TP_printk("[%s] ctxt %u " UCTXT_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->credits,
-                     __entry->hw_free,
-                     __entry->piobase,
-                     __entry->rcvhdrq_cnt,
-                     __entry->rcvhdrq_phys,
-                     __entry->eager_cnt,
-                     __entry->rcvegr_phys
-                     )
-);
-
-#define CINFO_FMT \
-       "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
-TRACE_EVENT(hfi1_ctxt_info,
-           TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt,
-                    struct hfi1_ctxt_info cinfo),
-           TP_ARGS(dd, ctxt, subctxt, cinfo),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(unsigned, ctxt)
-                            __field(unsigned, subctxt)
-                            __field(u16, egrtids)
-                            __field(u16, rcvhdrq_cnt)
-                            __field(u16, rcvhdrq_size)
-                            __field(u16, sdma_ring_size)
-                            __field(u32, rcvegr_size)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                           __entry->ctxt = ctxt;
-                           __entry->subctxt = subctxt;
-                           __entry->egrtids = cinfo.egrtids;
-                           __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
-                           __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
-                           __entry->sdma_ring_size = cinfo.sdma_ring_size;
-                           __entry->rcvegr_size = cinfo.rcvegr_size;
-                           ),
-           TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->egrtids,
-                     __entry->rcvegr_size,
-                     __entry->rcvhdrq_cnt,
-                     __entry->rcvhdrq_size,
-                     __entry->sdma_ring_size
-                     )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_sma
-
-#define BCT_FORMAT \
-       "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
-
-#define BCT(field) \
-       be16_to_cpu( \
-               ((struct buffer_control *)__get_dynamic_array(bct))->field \
-       )
-
-DECLARE_EVENT_CLASS(hfi1_bct_template,
-                   TP_PROTO(struct hfi1_devdata *dd,
-                            struct buffer_control *bc),
-                   TP_ARGS(dd, bc),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                                    __dynamic_array(u8, bct, sizeof(*bc))
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(dd);
-                                  memcpy(__get_dynamic_array(bct), bc,
-                                         sizeof(*bc));
-                                  ),
-                   TP_printk(BCT_FORMAT,
-                             BCT(overall_shared_limit),
-
-                             BCT(vl[0].dedicated),
-                             BCT(vl[0].shared),
-
-                             BCT(vl[1].dedicated),
-                             BCT(vl[1].shared),
-
-                             BCT(vl[2].dedicated),
-                             BCT(vl[2].shared),
-
-                             BCT(vl[3].dedicated),
-                             BCT(vl[3].shared),
-
-                             BCT(vl[4].dedicated),
-                             BCT(vl[4].shared),
-
-                             BCT(vl[5].dedicated),
-                             BCT(vl[5].shared),
-
-                             BCT(vl[6].dedicated),
-                             BCT(vl[6].shared),
-
-                             BCT(vl[7].dedicated),
-                             BCT(vl[7].shared),
-
-                             BCT(vl[15].dedicated),
-                             BCT(vl[15].shared)
-                             )
-);
-
-DEFINE_EVENT(hfi1_bct_template, bct_set,
-            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
-            TP_ARGS(dd, bc));
-
-DEFINE_EVENT(hfi1_bct_template, bct_get,
-            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
-            TP_ARGS(dd, bc));
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_sdma
-
-TRACE_EVENT(hfi1_sdma_descriptor,
-           TP_PROTO(struct sdma_engine *sde,
-                    u64 desc0,
-                    u64 desc1,
-                    u16 e,
-                    void *descp),
-       TP_ARGS(sde, desc0, desc1, e, descp),
-       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                        __field(void *, descp)
-                        __field(u64, desc0)
-                        __field(u64, desc1)
-                        __field(u16, e)
-                        __field(u8, idx)
-                        ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __entry->desc0 = desc0;
-                      __entry->desc1 = desc1;
-                      __entry->idx = sde->this_idx;
-                      __entry->descp = descp;
-                      __entry->e = e;
-                      ),
-       TP_printk(
-                 "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
-                 __get_str(dev),
-                 __entry->idx,
-                 __parse_sdma_flags(__entry->desc0, __entry->desc1),
-                 (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) &
-                 SDMA_DESC0_PHY_ADDR_MASK,
-                 (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) &
-                      SDMA_DESC1_GENERATION_MASK),
-                 (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) &
-                       SDMA_DESC0_BYTE_COUNT_MASK),
-                 __entry->desc0,
-                 __entry->desc1,
-                 __entry->descp,
-                 __entry->e
-                 )
-);
-
-TRACE_EVENT(hfi1_sdma_engine_select,
-           TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
-           TP_ARGS(dd, sel, vl, idx),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __field(u32, sel)
-                            __field(u8, vl)
-                            __field(u8, idx)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd);
-                          __entry->sel = sel;
-                          __entry->vl = vl;
-                          __entry->idx = idx;
-                          ),
-           TP_printk("[%s] selecting SDE %u sel 0x%x vl %u",
-                     __get_str(dev),
-                     __entry->idx,
-                     __entry->sel,
-                     __entry->vl
-                     )
-);
-
-DECLARE_EVENT_CLASS(hfi1_sdma_engine_class,
-                   TP_PROTO(struct sdma_engine *sde, u64 status),
-                   TP_ARGS(sde, status),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(u64, status)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->status = status;
-                                  __entry->idx = sde->this_idx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) status %llx",
-                             __get_str(dev),
-                             __entry->idx,
-                             (unsigned long long)__entry->status
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt,
-            TP_PROTO(struct sdma_engine *sde, u64 status),
-            TP_ARGS(sde, status)
-);
-
-DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress,
-            TP_PROTO(struct sdma_engine *sde, u64 status),
-            TP_ARGS(sde, status)
-);
-
-DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad,
-                   TP_PROTO(struct sdma_engine *sde, int aidx),
-                   TP_ARGS(sde, aidx),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(int, aidx)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->idx = sde->this_idx;
-                                  __entry->aidx = aidx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) aidx %d",
-                             __get_str(dev),
-                             __entry->idx,
-                             __entry->aidx
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate,
-            TP_PROTO(struct sdma_engine *sde, int aidx),
-            TP_ARGS(sde, aidx));
-
-DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate,
-            TP_PROTO(struct sdma_engine *sde, int aidx),
-            TP_ARGS(sde, aidx));
-
-#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
-TRACE_EVENT(hfi1_sdma_progress,
-           TP_PROTO(struct sdma_engine *sde,
-                    u16 hwhead,
-                    u16 swhead,
-                    struct sdma_txreq *txp
-                    ),
-           TP_ARGS(sde, hwhead, swhead, txp),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                            __field(u64, sn)
-                            __field(u16, hwhead)
-                            __field(u16, swhead)
-                            __field(u16, txnext)
-                            __field(u16, tx_tail)
-                            __field(u16, tx_head)
-                            __field(u8, idx)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                          __entry->hwhead = hwhead;
-                          __entry->swhead = swhead;
-                          __entry->tx_tail = sde->tx_tail;
-                          __entry->tx_head = sde->tx_head;
-                          __entry->txnext = txp ? txp->next_descq_idx : ~0;
-                          __entry->idx = sde->this_idx;
-                          __entry->sn = txp ? txp->sn : ~0;
-                          ),
-           TP_printk(
-                     "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
-                     __get_str(dev),
-                     __entry->idx,
-                     __entry->sn,
-                     __entry->hwhead,
-                     __entry->swhead,
-                     __entry->txnext,
-                     __entry->tx_head,
-                     __entry->tx_tail
-                     )
-);
-#else
-TRACE_EVENT(hfi1_sdma_progress,
-           TP_PROTO(struct sdma_engine *sde,
-                    u16 hwhead, u16 swhead,
-                    struct sdma_txreq *txp
-           ),
-       TP_ARGS(sde, hwhead, swhead, txp),
-       TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                        __field(u16, hwhead)
-                        __field(u16, swhead)
-                        __field(u16, txnext)
-                        __field(u16, tx_tail)
-                        __field(u16, tx_head)
-                        __field(u8, idx)
-                        ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __entry->hwhead = hwhead;
-                      __entry->swhead = swhead;
-                      __entry->tx_tail = sde->tx_tail;
-                      __entry->tx_head = sde->tx_head;
-                      __entry->txnext = txp ? txp->next_descq_idx : ~0;
-                      __entry->idx = sde->this_idx;
-                      ),
-       TP_printk(
-                 "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
-                 __get_str(dev),
-                 __entry->idx,
-                 __entry->hwhead,
-                 __entry->swhead,
-                 __entry->txnext,
-                 __entry->tx_head,
-                 __entry->tx_tail
-                 )
-);
-#endif
-
-DECLARE_EVENT_CLASS(hfi1_sdma_sn,
-                   TP_PROTO(struct sdma_engine *sde, u64 sn),
-                   TP_ARGS(sde, sn),
-                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                                    __field(u64, sn)
-                                    __field(u8, idx)
-                                    ),
-                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                                  __entry->sn = sn;
-                                  __entry->idx = sde->this_idx;
-                                  ),
-                   TP_printk("[%s] SDE(%u) sn %llu",
-                             __get_str(dev),
-                             __entry->idx,
-                             __entry->sn
-                             )
-);
-
-DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn,
-            TP_PROTO(
-               struct sdma_engine *sde,
-               u64 sn
-            ),
-            TP_ARGS(sde, sn)
-);
-
-DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn,
-            TP_PROTO(struct sdma_engine *sde, u64 sn),
-            TP_ARGS(sde, sn)
-);
-
-#define USDMA_HDR_FORMAT \
-       "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
-
-TRACE_EVENT(hfi1_sdma_user_header,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
-                    struct hfi1_pkt_header *hdr, u32 tidval),
-           TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, req)
-                   __field(__le32, pbc0)
-                   __field(__le32, pbc1)
-                   __field(__be32, lrh0)
-                   __field(__be32, lrh1)
-                   __field(__be32, bth0)
-                   __field(__be32, bth1)
-                   __field(__be32, bth2)
-                   __field(__le32, kdeth0)
-                   __field(__le32, kdeth1)
-                   __field(__le32, kdeth2)
-                   __field(__le32, kdeth3)
-                   __field(__le32, kdeth4)
-                   __field(__le32, kdeth5)
-                   __field(__le32, kdeth6)
-                   __field(__le32, kdeth7)
-                   __field(__le32, kdeth8)
-                   __field(u32, tidval)
-                   ),
-           TP_fast_assign(
-                   __le32 *pbc = (__le32 *)hdr->pbc;
-                   __be32 *lrh = (__be32 *)hdr->lrh;
-                   __be32 *bth = (__be32 *)hdr->bth;
-                   __le32 *kdeth = (__le32 *)&hdr->kdeth;
-
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->req = req;
-                   __entry->pbc0 = pbc[0];
-                   __entry->pbc1 = pbc[1];
-                   __entry->lrh0 = be32_to_cpu(lrh[0]);
-                   __entry->lrh1 = be32_to_cpu(lrh[1]);
-                   __entry->bth0 = be32_to_cpu(bth[0]);
-                   __entry->bth1 = be32_to_cpu(bth[1]);
-                   __entry->bth2 = be32_to_cpu(bth[2]);
-                   __entry->kdeth0 = kdeth[0];
-                   __entry->kdeth1 = kdeth[1];
-                   __entry->kdeth2 = kdeth[2];
-                   __entry->kdeth3 = kdeth[3];
-                   __entry->kdeth4 = kdeth[4];
-                   __entry->kdeth5 = kdeth[5];
-                   __entry->kdeth6 = kdeth[6];
-                   __entry->kdeth7 = kdeth[7];
-                   __entry->kdeth8 = kdeth[8];
-                   __entry->tidval = tidval;
-                   ),
-           TP_printk(USDMA_HDR_FORMAT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->req,
-                     __entry->pbc1,
-                     __entry->pbc0,
-                     __entry->lrh0,
-                     __entry->lrh1,
-                     __entry->bth0,
-                     __entry->bth1,
-                     __entry->bth2,
-                     __entry->kdeth0,
-                     __entry->kdeth1,
-                     __entry->kdeth2,
-                     __entry->kdeth3,
-                     __entry->kdeth4,
-                     __entry->kdeth5,
-                     __entry->kdeth6,
-                     __entry->kdeth7,
-                     __entry->kdeth8,
-                     __entry->tidval
-                   )
-       );
-
-#define SDMA_UREQ_FMT \
-       "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
-TRACE_EVENT(hfi1_sdma_user_reqinfo,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
-           TP_ARGS(dd, ctxt, subctxt, i),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd);
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u8, ver_opcode)
-                   __field(u8, iovcnt)
-                   __field(u16, npkts)
-                   __field(u16, fragsize)
-                   __field(u16, comp_idx)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->ver_opcode = i[0] & 0xff;
-                   __entry->iovcnt = (i[0] >> 8) & 0xff;
-                   __entry->npkts = i[1];
-                   __entry->fragsize = i[2];
-                   __entry->comp_idx = i[3];
-                   ),
-           TP_printk(SDMA_UREQ_FMT,
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->ver_opcode,
-                     __entry->iovcnt,
-                     __entry->npkts,
-                     __entry->fragsize,
-                     __entry->comp_idx
-                   )
-       );
-
-#define usdma_complete_name(st) { st, #st }
-#define show_usdma_complete_state(st)                  \
-       __print_symbolic(st,                            \
-                        usdma_complete_name(FREE),     \
-                        usdma_complete_name(QUEUED),   \
-                        usdma_complete_name(COMPLETE), \
-                        usdma_complete_name(ERROR))
-
-TRACE_EVENT(hfi1_sdma_user_completion,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
-                    u8 state, int code),
-           TP_ARGS(dd, ctxt, subctxt, idx, state, code),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, idx)
-                   __field(u8, state)
-                   __field(int, code)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->idx = idx;
-                   __entry->state = state;
-                   __entry->code = code;
-                   ),
-           TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
-                     __get_str(dev), __entry->ctxt, __entry->subctxt,
-                     __entry->idx, show_usdma_complete_state(__entry->state),
-                     __entry->code)
-       );
-
-const char *print_u32_array(struct trace_seq *, u32 *, int);
-#define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
-
-TRACE_EVENT(hfi1_sdma_user_header_ahg,
-           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
-                    u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
-           TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
-           TP_STRUCT__entry(
-                   DD_DEV_ENTRY(dd)
-                   __field(u16, ctxt)
-                   __field(u8, subctxt)
-                   __field(u16, req)
-                   __field(u8, sde)
-                   __field(u8, idx)
-                   __field(int, len)
-                   __field(u32, tidval)
-                   __array(u32, ahg, 10)
-                   ),
-           TP_fast_assign(
-                   DD_DEV_ASSIGN(dd);
-                   __entry->ctxt = ctxt;
-                   __entry->subctxt = subctxt;
-                   __entry->req = req;
-                   __entry->sde = sde;
-                   __entry->idx = ahgidx;
-                   __entry->len = len;
-                   __entry->tidval = tidval;
-                   memcpy(__entry->ahg, ahg, len * sizeof(u32));
-                   ),
-           TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
-                     __get_str(dev),
-                     __entry->ctxt,
-                     __entry->subctxt,
-                     __entry->req,
-                     __entry->sde,
-                     __entry->idx,
-                     __entry->len - 1,
-                     __print_u32_hex(__entry->ahg, __entry->len),
-                     __entry->tidval
-                   )
-       );
-
-TRACE_EVENT(hfi1_sdma_state,
-           TP_PROTO(struct sdma_engine *sde,
-                    const char *cstate,
-                    const char *nstate
-                    ),
-           TP_ARGS(sde, cstate, nstate),
-           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
-                            __string(curstate, cstate)
-                            __string(newstate, nstate)
-                            ),
-       TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-                      __assign_str(curstate, cstate);
-                      __assign_str(newstate, nstate);
-                      ),
-       TP_printk("[%s] current state %s new state %s",
-                 __get_str(dev),
-                 __get_str(curstate),
-                 __get_str(newstate)
-                 )
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_rc
-
-DECLARE_EVENT_CLASS(hfi1_rc_template,
-                   TP_PROTO(struct rvt_qp *qp, u32 psn),
-                   TP_ARGS(qp, psn),
-                   TP_STRUCT__entry(
-                       DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
-                       __field(u32, qpn)
-                       __field(u32, s_flags)
-                       __field(u32, psn)
-                       __field(u32, s_psn)
-                       __field(u32, s_next_psn)
-                       __field(u32, s_sending_psn)
-                       __field(u32, s_sending_hpsn)
-                       __field(u32, r_psn)
-                       ),
-                   TP_fast_assign(
-                       DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
-                       __entry->qpn = qp->ibqp.qp_num;
-                       __entry->s_flags = qp->s_flags;
-                       __entry->psn = psn;
-                       __entry->s_psn = qp->s_psn;
-                       __entry->s_next_psn = qp->s_next_psn;
-                       __entry->s_sending_psn = qp->s_sending_psn;
-                       __entry->s_sending_hpsn = qp->s_sending_hpsn;
-                       __entry->r_psn = qp->r_psn;
-                       ),
-                   TP_printk(
-                       "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x",
-                       __get_str(dev),
-                       __entry->qpn,
-                       __entry->s_flags,
-                       __entry->psn,
-                       __entry->s_psn,
-                       __entry->s_next_psn,
-                       __entry->s_sending_psn,
-                       __entry->s_sending_hpsn,
-                       __entry->r_psn
-                       )
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_sendcomplete,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_ack,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_timeout,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-DEFINE_EVENT(hfi1_rc_template, hfi1_rc_rcv_error,
-            TP_PROTO(struct rvt_qp *qp, u32 psn),
-            TP_ARGS(qp, psn)
-);
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_misc
-
-TRACE_EVENT(hfi1_interrupt,
-           TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
-                    int src),
-           TP_ARGS(dd, is_entry, src),
-           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
-                            __array(char, buf, 64)
-                            __field(int, src)
-                            ),
-           TP_fast_assign(DD_DEV_ASSIGN(dd)
-                          is_entry->is_name(__entry->buf, 64,
-                                            src - is_entry->start);
-                          __entry->src = src;
-                          ),
-           TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
-                     __entry->src)
-);
-
-/*
- * Note:
- * This produces a REALLY ugly trace in the console output when the string is
- * too long.
- */
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM hfi1_trace
-
-#define MAX_MSG_LEN 512
-
-DECLARE_EVENT_CLASS(hfi1_trace_template,
-                   TP_PROTO(const char *function, struct va_format *vaf),
-                   TP_ARGS(function, vaf),
-                   TP_STRUCT__entry(__string(function, function)
-                                    __dynamic_array(char, msg, MAX_MSG_LEN)
-                                    ),
-                   TP_fast_assign(__assign_str(function, function);
-                                  WARN_ON_ONCE(vsnprintf
-                                               (__get_dynamic_array(msg),
-                                                MAX_MSG_LEN, vaf->fmt,
-                                                *vaf->va) >=
-                                               MAX_MSG_LEN);
-                                  ),
-                   TP_printk("(%s) %s",
-                             __get_str(function),
-                             __get_str(msg))
-);
-
-/*
- * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an
- * actual function to work and can not be in a macro.
- */
-#define __hfi1_trace_def(lvl) \
-void __hfi1_trace_##lvl(const char *funct, char *fmt, ...);            \
-                                                                       \
-DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,                         \
-       TP_PROTO(const char *function, struct va_format *vaf),          \
-       TP_ARGS(function, vaf))
-
-#define __hfi1_trace_fn(lvl) \
-void __hfi1_trace_##lvl(const char *func, char *fmt, ...)              \
-{                                                                      \
-       struct va_format vaf = {                                        \
-               .fmt = fmt,                                             \
-       };                                                              \
-       va_list args;                                                   \
-                                                                       \
-       va_start(args, fmt);                                            \
-       vaf.va = &args;                                                 \
-       trace_hfi1_ ##lvl(func, &vaf);                                  \
-       va_end(args);                                                   \
-       return;                                                         \
-}
-
-/*
- * To create a new trace level simply define it below and as a __hfi1_trace_fn
- * in trace.c. This will create all the hooks for calling
- * hfi1_cdbg(LVL, fmt, ...); as well as take care of all
- * the debugfs stuff.
- */
-__hfi1_trace_def(PKT);
-__hfi1_trace_def(PROC);
-__hfi1_trace_def(SDMA);
-__hfi1_trace_def(LINKVERB);
-__hfi1_trace_def(DEBUG);
-__hfi1_trace_def(SNOOP);
-__hfi1_trace_def(CNTR);
-__hfi1_trace_def(PIO);
-__hfi1_trace_def(DC8051);
-__hfi1_trace_def(FIRMWARE);
-__hfi1_trace_def(RCVCTRL);
-__hfi1_trace_def(TID);
-__hfi1_trace_def(MMU);
-__hfi1_trace_def(IOCTL);
-
-#define hfi1_cdbg(which, fmt, ...) \
-       __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
-
-#define hfi1_dbg(fmt, ...) \
-       hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
-
-/*
- * Define HFI1_EARLY_DBG at compile time or here to enable early trace
- * messages. Do not check in an enablement for this.
- */
-
-#ifdef HFI1_EARLY_DBG
-#define hfi1_dbg_early(fmt, ...) \
-       trace_printk(fmt, ##__VA_ARGS__)
-#else
-#define hfi1_dbg_early(fmt, ...)
-#endif
-
-#endif /* __HFI1_TRACE_H */
-
-#undef TRACE_INCLUDE_PATH
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE trace
-#include <trace/define_trace.h>
+#include "trace_dbg.h"
+#include "trace_misc.h"
+#include "trace_ctxts.h"
+#include "trace_ibhdrs.h"
+#include "trace_rc.h"
+#include "trace_rx.h"
+#include "trace_tx.h"
diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h
new file mode 100644 (file)
index 0000000..31654bb
--- /dev/null
@@ -0,0 +1,141 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license.  When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+*  - Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+*  - Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in
+*    the documentation and/or other materials provided with the
+*    distribution.
+*  - Neither the name of Intel Corporation nor the names of its
+*    contributors may be used to endorse or promote products derived
+*    from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_CTXTS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_CTXTS_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ctxts
+
+#define UCTXT_FMT \
+       "cred:%u, credaddr:0x%llx, piobase:0x%p, rcvhdr_cnt:%u, "       \
+       "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
+TRACE_EVENT(hfi1_uctxtdata,
+           TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
+           TP_ARGS(dd, uctxt),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(unsigned int, ctxt)
+                            __field(u32, credits)
+                            __field(u64, hw_free)
+                            __field(void __iomem *, piobase)
+                            __field(u16, rcvhdrq_cnt)
+                            __field(u64, rcvhdrq_phys)
+                            __field(u32, eager_cnt)
+                            __field(u64, rcvegr_phys)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                          __entry->ctxt = uctxt->ctxt;
+                          __entry->credits = uctxt->sc->credits;
+                          __entry->hw_free = le64_to_cpu(*uctxt->sc->hw_free);
+                          __entry->piobase = uctxt->sc->base_addr;
+                          __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+                          __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
+                          __entry->eager_cnt = uctxt->egrbufs.alloced;
+                          __entry->rcvegr_phys =
+                          uctxt->egrbufs.rcvtids[0].phys;
+                          ),
+           TP_printk("[%s] ctxt %u " UCTXT_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->credits,
+                     __entry->hw_free,
+                     __entry->piobase,
+                     __entry->rcvhdrq_cnt,
+                     __entry->rcvhdrq_phys,
+                     __entry->eager_cnt,
+                     __entry->rcvegr_phys
+                     )
+);
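+
+/*
+ * Usage sketch (editorial, not from this patch): TRACE_EVENT() above
+ * generates a trace_hfi1_uctxtdata() inline that a context-setup path
+ * would call directly, e.g.
+ *
+ *	trace_hfi1_uctxtdata(dd, uctxt);
+ *
+ * The call compiles to a no-op branch unless the event is enabled via
+ * tracefs under events/hfi1_ctxts/hfi1_uctxtdata/.
+ */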
+
+#define CINFO_FMT \
+       "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
+TRACE_EVENT(hfi1_ctxt_info,
+           TP_PROTO(struct hfi1_devdata *dd, unsigned int ctxt,
+                    unsigned int subctxt,
+                    struct hfi1_ctxt_info cinfo),
+           TP_ARGS(dd, ctxt, subctxt, cinfo),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(unsigned int, ctxt)
+                            __field(unsigned int, subctxt)
+                            __field(u16, egrtids)
+                            __field(u16, rcvhdrq_cnt)
+                            __field(u16, rcvhdrq_size)
+                            __field(u16, sdma_ring_size)
+                            __field(u32, rcvegr_size)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                           __entry->ctxt = ctxt;
+                           __entry->subctxt = subctxt;
+                           __entry->egrtids = cinfo.egrtids;
+                           __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
+                           __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
+                           __entry->sdma_ring_size = cinfo.sdma_ring_size;
+                           __entry->rcvegr_size = cinfo.rcvegr_size;
+                           ),
+           TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->egrtids,
+                     __entry->rcvegr_size,
+                     __entry->rcvhdrq_cnt,
+                     __entry->rcvhdrq_size,
+                     __entry->sdma_ring_size
+                     )
+);
+
+#endif /* __HFI1_TRACE_CTXTS_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_ctxts
+#include <trace/define_trace.h>
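+
+/*
+ * Build note (editorial sketch, assuming the standard ftrace convention):
+ * TRACE_INCLUDE_PATH/TRACE_INCLUDE_FILE make <trace/define_trace.h>
+ * re-include this header, and exactly one .c file defines
+ * CREATE_TRACE_POINTS before including the umbrella trace.h so the event
+ * bodies are emitted exactly once:
+ *
+ *	#define CREATE_TRACE_POINTS
+ *	#include "trace.h"
+ *
+ * All other includers see only the declarations.
+ */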
diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h
new file mode 100644 (file)
index 0000000..0e7d929
--- /dev/null
@@ -0,0 +1,155 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license.  When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+*  - Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+*  - Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in
+*    the documentation and/or other materials provided with the
+*    distribution.
+*  - Neither the name of Intel Corporation nor the names of its
+*    contributors may be used to endorse or promote products derived
+*    from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_EXTRA_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_EXTRA_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+/*
+ * Note:
+ * The console rendering of this trace gets very ugly when the formatted
+ * string is too long.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_dbg
+
+#define MAX_MSG_LEN 512
+
+DECLARE_EVENT_CLASS(hfi1_trace_template,
+                   TP_PROTO(const char *function, struct va_format *vaf),
+                   TP_ARGS(function, vaf),
+                   TP_STRUCT__entry(__string(function, function)
+                                    __dynamic_array(char, msg, MAX_MSG_LEN)
+                                    ),
+                   TP_fast_assign(__assign_str(function, function);
+                                  WARN_ON_ONCE(vsnprintf
+                                               (__get_dynamic_array(msg),
+                                                MAX_MSG_LEN, vaf->fmt,
+                                                *vaf->va) >=
+                                               MAX_MSG_LEN);
+                                  ),
+                   TP_printk("(%s) %s",
+                             __get_str(function),
+                             __get_str(msg))
+);
+
+/*
+ * It would be nice to macroize __hfi1_trace, but the va_* handling
+ * requires an actual function and cannot live in a macro.
+ */
+#define __hfi1_trace_def(lvl) \
+void __hfi1_trace_##lvl(const char *funct, char *fmt, ...);            \
+                                                                       \
+DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,                         \
+       TP_PROTO(const char *function, struct va_format *vaf),          \
+       TP_ARGS(function, vaf))
+
+#define __hfi1_trace_fn(lvl) \
+void __hfi1_trace_##lvl(const char *func, char *fmt, ...)              \
+{                                                                      \
+       struct va_format vaf = {                                        \
+               .fmt = fmt,                                             \
+       };                                                              \
+       va_list args;                                                   \
+                                                                       \
+       va_start(args, fmt);                                            \
+       vaf.va = &args;                                                 \
+       trace_hfi1_ ##lvl(func, &vaf);                                  \
+       va_end(args);                                                   \
+       return;                                                         \
+}
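+
+/*
+ * Illustrative expansion (editorial sketch, not generated code verbatim):
+ * __hfi1_trace_def(SDMA) declares
+ *
+ *	void __hfi1_trace_SDMA(const char *funct, char *fmt, ...);
+ *
+ * and instantiates the hfi1_SDMA event from the template above, while
+ * __hfi1_trace_fn(SDMA) in trace.c supplies the body that wraps the
+ * varargs in a struct va_format and fires trace_hfi1_SDMA().
+ */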
+
+/*
+ * To create a new trace level, define it below with __hfi1_trace_def() and
+ * add a matching __hfi1_trace_fn() in trace.c. This creates all the hooks
+ * needed to call hfi1_cdbg(LVL, fmt, ...) and takes care of the debugfs
+ * plumbing.
+ */
+__hfi1_trace_def(PKT);
+__hfi1_trace_def(PROC);
+__hfi1_trace_def(SDMA);
+__hfi1_trace_def(LINKVERB);
+__hfi1_trace_def(DEBUG);
+__hfi1_trace_def(SNOOP);
+__hfi1_trace_def(CNTR);
+__hfi1_trace_def(PIO);
+__hfi1_trace_def(DC8051);
+__hfi1_trace_def(FIRMWARE);
+__hfi1_trace_def(RCVCTRL);
+__hfi1_trace_def(TID);
+__hfi1_trace_def(MMU);
+__hfi1_trace_def(IOCTL);
+
+#define hfi1_cdbg(which, fmt, ...) \
+       __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
+
+#define hfi1_dbg(fmt, ...) \
+       hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
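+
+/*
+ * Example call site (editorial; idx and head are placeholder variables):
+ *
+ *	hfi1_cdbg(SDMA, "engine %u stalled at head %u", idx, head);
+ *
+ * expands to __hfi1_trace_SDMA(__func__, ...), so the calling function's
+ * name is recorded along with the formatted message.
+ */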
+
+/*
+ * Define HFI1_EARLY_DBG at compile time or here to enable early trace
+ * messages. Do not check in a change that enables this.
+ */
+
+#ifdef HFI1_EARLY_DBG
+#define hfi1_dbg_early(fmt, ...) \
+       trace_printk(fmt, ##__VA_ARGS__)
+#else
+#define hfi1_dbg_early(fmt, ...)
+#endif
+
+#endif /* __HFI1_TRACE_EXTRA_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_dbg
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
new file mode 100644 (file)
index 0000000..c3e41ae
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__HFI1_TRACE_IBHDRS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_IBHDRS_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ibhdrs
+
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
+const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs);
+
+#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
+
+#define lrh_name(lrh) { HFI1_##lrh, #lrh }
+#define show_lnh(lrh)                    \
+__print_symbolic(lrh,                    \
+       lrh_name(LRH_BTH),               \
+       lrh_name(LRH_GRH))
+
+#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
+#define BTH_PRN \
+       "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
+       "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
+#define EHDR_PRN "%s"
+
+DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
+                   TP_PROTO(struct hfi1_devdata *dd,
+                            struct hfi1_ib_header *hdr),
+                   TP_ARGS(dd, hdr),
+                   TP_STRUCT__entry(
+                       DD_DEV_ENTRY(dd)
+                       /* LRH */
+                       __field(u8, vl)
+                       __field(u8, lver)
+                       __field(u8, sl)
+                       __field(u8, lnh)
+                       __field(u16, dlid)
+                       __field(u16, len)
+                       __field(u16, slid)
+                       /* BTH */
+                       __field(u8, opcode)
+                       __field(u8, se)
+                       __field(u8, m)
+                       __field(u8, pad)
+                       __field(u8, tver)
+                       __field(u16, pkey)
+                       __field(u8, f)
+                       __field(u8, b)
+                       __field(u32, qpn)
+                       __field(u8, a)
+                       __field(u32, psn)
+                       /* extended headers */
+                       __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
+                       ),
+                     TP_fast_assign(
+                       struct hfi1_other_headers *ohdr;
+
+                       DD_DEV_ASSIGN(dd);
+                       /* LRH */
+                       __entry->vl =
+                       (u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
+                       __entry->lver =
+                       (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
+                       __entry->sl =
+                       (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+                       __entry->lnh =
+                       (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+                       __entry->dlid =
+                       be16_to_cpu(hdr->lrh[1]);
+                       /* allow for larger len */
+                       __entry->len =
+                       be16_to_cpu(hdr->lrh[2]);
+                       __entry->slid =
+                       be16_to_cpu(hdr->lrh[3]);
+                       /* BTH */
+                       if (__entry->lnh == HFI1_LRH_BTH)
+                               ohdr = &hdr->u.oth;
+                       else
+                               ohdr = &hdr->u.l.oth;
+                       __entry->opcode =
+                       (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+                       __entry->se =
+                       (be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
+                       __entry->m =
+                       (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
+                       __entry->pad =
+                       (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+                       __entry->tver =
+                       (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
+                       __entry->pkey =
+                       be32_to_cpu(ohdr->bth[0]) & 0xffff;
+                       __entry->f =
+                       (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
+                       HFI1_FECN_MASK;
+                       __entry->b =
+                       (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
+                       HFI1_BECN_MASK;
+                       __entry->qpn =
+                       be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+                       __entry->a =
+                       (be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
+                       /* allow for larger PSN */
+                       __entry->psn =
+                       be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
+                       /* extended headers */
+                       memcpy(__get_dynamic_array(ehdrs), &ohdr->u,
+                              ibhdr_exhdr_len(hdr));
+                       ),
+               TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
+                         __get_str(dev),
+                         /* LRH */
+                         __entry->vl,
+                         __entry->lver,
+                         __entry->sl,
+                         __entry->lnh, show_lnh(__entry->lnh),
+                         __entry->dlid,
+                         __entry->len,
+                         __entry->slid,
+                         /* BTH */
+                         __entry->opcode, show_ib_opcode(__entry->opcode),
+                         __entry->se,
+                         __entry->m,
+                         __entry->pad,
+                         __entry->tver,
+                         __entry->pkey,
+                         __entry->f,
+                         __entry->b,
+                         __entry->qpn,
+                         __entry->a,
+                         __entry->psn,
+                         /* extended headers */
+                         __parse_ib_ehdrs(
+                               __entry->opcode,
+                               (void *)__get_dynamic_array(ehdrs))
+                       )
+);
+
+DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
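+
+/*
+ * Editorial note: the four events above share the hfi1_ibhdr_template
+ * decode and format logic; DEFINE_EVENT() adds only a name and a new
+ * trigger point.  A receive-path call would look like (sketch):
+ *
+ *	trace_input_ibhdr(dd, hdr);
+ */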
+
+#endif /* __HFI1_TRACE_IBHDRS_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_ibhdrs
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_misc.h b/drivers/infiniband/hw/hfi1/trace_misc.h
new file mode 100644 (file)
index 0000000..d308454
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license.  When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+*  - Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+*  - Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in
+*    the documentation and/or other materials provided with the
+*    distribution.
+*  - Neither the name of Intel Corporation nor the names of its
+*    contributors may be used to endorse or promote products derived
+*    from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_MISC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_MISC_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_misc
+
+TRACE_EVENT(hfi1_interrupt,
+           TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
+                    int src),
+           TP_ARGS(dd, is_entry, src),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __array(char, buf, 64)
+                            __field(int, src)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd)
+                          is_entry->is_name(__entry->buf, 64,
+                                            src - is_entry->start);
+                          __entry->src = src;
+                          ),
+           TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
+                     __entry->src)
+);
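+
+/*
+ * Editorial note: is_entry->is_name() formats the interrupt source name
+ * into the fixed 64-byte buffer at trace time, so the string survives in
+ * the ring buffer instead of being resolved when the log is read.
+ */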
+
+#endif /* __HFI1_TRACE_MISC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_misc
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h
new file mode 100644 (file)
index 0000000..5ea5005
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+* Copyright(c) 2015, 2016 Intel Corporation.
+*
+* This file is provided under a dual BSD/GPLv2 license.  When using or
+* redistributing this file, you may do so under either license.
+*
+* GPL LICENSE SUMMARY
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of version 2 of the GNU General Public License as
+* published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* General Public License for more details.
+*
+* BSD LICENSE
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+*  - Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+*  - Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in
+*    the documentation and/or other materials provided with the
+*    distribution.
+*  - Neither the name of Intel Corporation nor the names of its
+*    contributors may be used to endorse or promote products derived
+*    from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+#if !defined(__HFI1_TRACE_RC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_RC_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rc
+
+DECLARE_EVENT_CLASS(hfi1_rc_template,
+                   TP_PROTO(struct rvt_qp *qp, u32 psn),
+                   TP_ARGS(qp, psn),
+                   TP_STRUCT__entry(
+                       DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+                       __field(u32, qpn)
+                       __field(u32, s_flags)
+                       __field(u32, psn)
+                       __field(u32, s_psn)
+                       __field(u32, s_next_psn)
+                       __field(u32, s_sending_psn)
+                       __field(u32, s_sending_hpsn)
+                       __field(u32, r_psn)
+                       ),
+                   TP_fast_assign(
+                       DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+                       __entry->qpn = qp->ibqp.qp_num;
+                       __entry->s_flags = qp->s_flags;
+                       __entry->psn = psn;
+                       __entry->s_psn = qp->s_psn;
+                       __entry->s_next_psn = qp->s_next_psn;
+                       __entry->s_sending_psn = qp->s_sending_psn;
+                       __entry->s_sending_hpsn = qp->s_sending_hpsn;
+                       __entry->r_psn = qp->r_psn;
+                       ),
+                   TP_printk(
+                       "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x",
+                       __get_str(dev),
+                       __entry->qpn,
+                       __entry->s_flags,
+                       __entry->psn,
+                       __entry->s_psn,
+                       __entry->s_next_psn,
+                       __entry->s_sending_psn,
+                       __entry->s_sending_hpsn,
+                       __entry->r_psn
+                       )
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_sendcomplete,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_ack,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_timeout,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error,
+            TP_PROTO(struct rvt_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
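+
+/*
+ * Usage sketch (editorial): the RC protocol code fires these with the PSN
+ * being acted on, e.g.
+ *
+ *	trace_hfi1_ack(qp, psn);
+ *
+ * and the shared template snapshots the QP's send/receive PSN state in one
+ * record.
+ */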
+
+#endif /* __HFI1_TRACE_RC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_rc
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h
new file mode 100644 (file)
index 0000000..9ba1f61
--- /dev/null
@@ -0,0 +1,322 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__HFI1_TRACE_RX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_RX_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rx
+
+TRACE_EVENT(hfi1_rcvhdr,
+           TP_PROTO(struct hfi1_devdata *dd,
+                    u32 ctxt,
+                    u64 eflags,
+                    u32 etype,
+                    u32 hlen,
+                    u32 tlen,
+                    u32 updegr,
+                    u32 etail
+                   ),
+           TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(u64, eflags)
+                            __field(u32, ctxt)
+                            __field(u32, etype)
+                            __field(u32, hlen)
+                            __field(u32, tlen)
+                            __field(u32, updegr)
+                            __field(u32, etail)
+                            ),
+            TP_fast_assign(DD_DEV_ASSIGN(dd);
+                           __entry->eflags = eflags;
+                           __entry->ctxt = ctxt;
+                           __entry->etype = etype;
+                           __entry->hlen = hlen;
+                           __entry->tlen = tlen;
+                           __entry->updegr = updegr;
+                           __entry->etail = etail;
+                           ),
+            TP_printk(
+               "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
+               __get_str(dev),
+               __entry->ctxt,
+               __entry->eflags,
+               __entry->etype, show_packettype(__entry->etype),
+               __entry->hlen,
+               __entry->tlen,
+               __entry->updegr,
+               __entry->etail
+               )
+);
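+
+/*
+ * Usage sketch (editorial): the per-packet receive path fires this once
+ * per header, e.g.
+ *
+ *	trace_hfi1_rcvhdr(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail);
+ *
+ * so etype is logged both numerically and symbolically through
+ * show_packettype().
+ */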
+
+TRACE_EVENT(hfi1_receive_interrupt,
+           TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
+           TP_ARGS(dd, ctxt),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                            __field(u32, ctxt)
+                            __field(u8, slow_path)
+                            __field(u8, dma_rtail)
+                            ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+                       __entry->ctxt = ctxt;
+                       if (dd->rcd[ctxt]->do_interrupt ==
+                           &handle_receive_interrupt) {
+                               __entry->slow_path = 1;
+                               __entry->dma_rtail = 0xFF;
+                       } else if (dd->rcd[ctxt]->do_interrupt ==
+                                       &handle_receive_interrupt_dma_rtail) {
+                               __entry->dma_rtail = 1;
+                               __entry->slow_path = 0;
+                       } else if (dd->rcd[ctxt]->do_interrupt ==
+                                       &handle_receive_interrupt_nodma_rtail) {
+                               __entry->dma_rtail = 0;
+                               __entry->slow_path = 0;
+                       }
+                       ),
+           TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d",
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->slow_path,
+                     __entry->dma_rtail
+                     )
+);
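+
+/*
+ * Decoding note (editorial): the interrupt handler pointer is classified
+ * into two flags; on the slow path DmaRtail is reported as 0xFF to mean
+ * "not applicable" rather than true or false.
+ */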
+
+TRACE_EVENT(hfi1_exp_tid_reg,
+           TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
+                    u32 npages, unsigned long va, unsigned long pa,
+                    dma_addr_t dma),
+           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+           TP_STRUCT__entry(
+                            __field(unsigned int, ctxt)
+                            __field(u16, subctxt)
+                            __field(u32, rarr)
+                            __field(u32, npages)
+                            __field(unsigned long, va)
+                            __field(unsigned long, pa)
+                            __field(dma_addr_t, dma)
+                            ),
+           TP_fast_assign(
+                          __entry->ctxt = ctxt;
+                          __entry->subctxt = subctxt;
+                          __entry->rarr = rarr;
+                          __entry->npages = npages;
+                          __entry->va = va;
+                          __entry->pa = pa;
+                          __entry->dma = dma;
+                          ),
+           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->rarr,
+                     __entry->npages,
+                     __entry->pa,
+                     __entry->va,
+                     __entry->dma
+                     )
+       );
+
+TRACE_EVENT(hfi1_exp_tid_unreg,
+           TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+                    unsigned long va, unsigned long pa, dma_addr_t dma),
+           TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+           TP_STRUCT__entry(
+                            __field(unsigned int, ctxt)
+                            __field(u16, subctxt)
+                            __field(u32, rarr)
+                            __field(u32, npages)
+                            __field(unsigned long, va)
+                            __field(unsigned long, pa)
+                            __field(dma_addr_t, dma)
+                            ),
+           TP_fast_assign(
+                          __entry->ctxt = ctxt;
+                          __entry->subctxt = subctxt;
+                          __entry->rarr = rarr;
+                          __entry->npages = npages;
+                          __entry->va = va;
+                          __entry->pa = pa;
+                          __entry->dma = dma;
+                          ),
+           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->rarr,
+                     __entry->npages,
+                     __entry->pa,
+                     __entry->va,
+                     __entry->dma
+                     )
+       );
+
+TRACE_EVENT(hfi1_exp_tid_inval,
+           TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
+                    u32 npages, dma_addr_t dma),
+           TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
+           TP_STRUCT__entry(
+                            __field(unsigned int, ctxt)
+                            __field(u16, subctxt)
+                            __field(unsigned long, va)
+                            __field(u32, rarr)
+                            __field(u32, npages)
+                            __field(dma_addr_t, dma)
+                            ),
+           TP_fast_assign(
+                          __entry->ctxt = ctxt;
+                          __entry->subctxt = subctxt;
+                          __entry->va = va;
+                          __entry->rarr = rarr;
+                          __entry->npages = npages;
+                          __entry->dma = dma;
+                         ),
+           TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->rarr,
+                     __entry->npages,
+                     __entry->va,
+                     __entry->dma
+                     )
+           );
+
+TRACE_EVENT(hfi1_mmu_invalidate,
+           TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type,
+                    unsigned long start, unsigned long end),
+           TP_ARGS(ctxt, subctxt, type, start, end),
+           TP_STRUCT__entry(
+                            __field(unsigned int, ctxt)
+                            __field(u16, subctxt)
+                            __string(type, type)
+                            __field(unsigned long, start)
+                            __field(unsigned long, end)
+                            ),
+           TP_fast_assign(
+                       __entry->ctxt = ctxt;
+                       __entry->subctxt = subctxt;
+                       __assign_str(type, type);
+                       __entry->start = start;
+                       __entry->end = end;
+           ),
+           TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __get_str(type),
+                     __entry->start,
+                     __entry->end
+                     )
+           );
+
+#define SNOOP_PRN \
+       "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
+       "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
+
+TRACE_EVENT(snoop_capture,
+           TP_PROTO(struct hfi1_devdata *dd,
+                    int hdr_len,
+                    struct hfi1_ib_header *hdr,
+                    int data_len,
+                    void *data),
+           TP_ARGS(dd, hdr_len, hdr, data_len, data),
+           TP_STRUCT__entry(
+                            DD_DEV_ENTRY(dd)
+                            __field(u16, slid)
+                            __field(u16, dlid)
+                            __field(u32, qpn)
+                            __field(u8, opcode)
+                            __field(u8, sl)
+                            __field(u16, pkey)
+                            __field(u32, hdr_len)
+                            __field(u32, data_len)
+                            __field(u8, lnh)
+                            __dynamic_array(u8, raw_hdr, hdr_len)
+                            __dynamic_array(u8, raw_pkt, data_len)
+                            ),
+           TP_fast_assign(
+               struct hfi1_other_headers *ohdr;
+
+               __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+               if (__entry->lnh == HFI1_LRH_BTH)
+                       ohdr = &hdr->u.oth;
+               else
+                       ohdr = &hdr->u.l.oth;
+               DD_DEV_ASSIGN(dd);
+               __entry->slid = be16_to_cpu(hdr->lrh[3]);
+               __entry->dlid = be16_to_cpu(hdr->lrh[1]);
+               __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
+               __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+               __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+               __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
+               __entry->hdr_len = hdr_len;
+               __entry->data_len = data_len;
+               memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
+               memcpy(__get_dynamic_array(raw_pkt), data, data_len);
+               ),
+           TP_printk(
+               "[%s] " SNOOP_PRN,
+               __get_str(dev),
+               __entry->slid,
+               __entry->dlid,
+               __entry->qpn,
+               __entry->opcode,
+               show_ib_opcode(__entry->opcode),
+               __entry->sl,
+               __entry->pkey,
+               __entry->hdr_len,
+               __entry->data_len
+               )
+);
+
+#endif /* __HFI1_TRACE_RX_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_rx
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
new file mode 100644 (file)
index 0000000..415d6be
--- /dev/null
@@ -0,0 +1,642 @@
+/*
+ * Copyright(c) 2015, 2016 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#if !defined(__HFI1_TRACE_TX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_TX_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "sdma.h"
+
+const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1);
+
+#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_tx
+
+TRACE_EVENT(hfi1_piofree,
+           TP_PROTO(struct send_context *sc, int extra),
+           TP_ARGS(sc, extra),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
+           __field(u32, sw_index)
+           __field(u32, hw_context)
+           __field(int, extra)
+           ),
+           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
+           __entry->sw_index = sc->sw_index;
+           __entry->hw_context = sc->hw_context;
+           __entry->extra = extra;
+           ),
+           TP_printk("[%s] ctxt %u(%u) extra %d",
+                     __get_str(dev),
+                     __entry->sw_index,
+                     __entry->hw_context,
+                     __entry->extra
+           )
+);
+
+TRACE_EVENT(hfi1_wantpiointr,
+           TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
+           TP_ARGS(sc, needint, credit_ctrl),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd)
+                       __field(u32, sw_index)
+                       __field(u32, hw_context)
+                       __field(u32, needint)
+                       __field(u64, credit_ctrl)
+                       ),
+           TP_fast_assign(DD_DEV_ASSIGN(sc->dd);
+                       __entry->sw_index = sc->sw_index;
+                       __entry->hw_context = sc->hw_context;
+                       __entry->needint = needint;
+                       __entry->credit_ctrl = credit_ctrl;
+                       ),
+           TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
+                     __get_str(dev),
+                     __entry->sw_index,
+                     __entry->hw_context,
+                     __entry->needint,
+                     (unsigned long long)__entry->credit_ctrl
+                     )
+);
+
+DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
+                   TP_PROTO(struct rvt_qp *qp, u32 flags),
+                   TP_ARGS(qp, flags),
+                   TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+                   __field(u32, qpn)
+                   __field(u32, flags)
+                   __field(u32, s_flags)
+                   ),
+                   TP_fast_assign(
+                   DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+                   __entry->flags = flags;
+                   __entry->qpn = qp->ibqp.qp_num;
+                   __entry->s_flags = qp->s_flags;
+                   ),
+                   TP_printk(
+                   "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+                   __get_str(dev),
+                   __entry->qpn,
+                   __entry->flags,
+                   __entry->s_flags
+                   )
+);
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
+            TP_PROTO(struct rvt_qp *qp, u32 flags),
+            TP_ARGS(qp, flags));
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
+            TP_PROTO(struct rvt_qp *qp, u32 flags),
+            TP_ARGS(qp, flags));
+
+TRACE_EVENT(hfi1_sdma_descriptor,
+           TP_PROTO(struct sdma_engine *sde,
+                    u64 desc0,
+                    u64 desc1,
+                    u16 e,
+                    void *descp),
+                    TP_ARGS(sde, desc0, desc1, e, descp),
+                    TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                    __field(void *, descp)
+                    __field(u64, desc0)
+                    __field(u64, desc1)
+                    __field(u16, e)
+                    __field(u8, idx)
+                    ),
+                    TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                    __entry->desc0 = desc0;
+                    __entry->desc1 = desc1;
+                    __entry->idx = sde->this_idx;
+                    __entry->descp = descp;
+                    __entry->e = e;
+                    ),
+           TP_printk(
+           "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
+           __get_str(dev),
+           __entry->idx,
+           __parse_sdma_flags(__entry->desc0, __entry->desc1),
+           (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) &
+           SDMA_DESC0_PHY_ADDR_MASK,
+           (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) &
+           SDMA_DESC1_GENERATION_MASK),
+           (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) &
+           SDMA_DESC0_BYTE_COUNT_MASK),
+           __entry->desc0,
+           __entry->desc1,
+           __entry->descp,
+           __entry->e
+           )
+);
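+
+/*
+ * Editorial note: the printk side decodes the two 64-bit descriptor words
+ * in place, using the SDMA_DESC0/1 shift and mask constants to pull out
+ * the physical address, generation and byte count, so ring state can be
+ * read without a separate decoder.
+ */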
+
+TRACE_EVENT(hfi1_sdma_engine_select,
+           TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
+           TP_ARGS(dd, sel, vl, idx),
+           TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+           __field(u32, sel)
+           __field(u8, vl)
+           __field(u8, idx)
+           ),
+           TP_fast_assign(DD_DEV_ASSIGN(dd);
+           __entry->sel = sel;
+           __entry->vl = vl;
+           __entry->idx = idx;
+           ),
+           TP_printk("[%s] selecting SDE %u sel 0x%x vl %u",
+                     __get_str(dev),
+                     __entry->idx,
+                     __entry->sel,
+                     __entry->vl
+                     )
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_engine_class,
+                   TP_PROTO(struct sdma_engine *sde, u64 status),
+                   TP_ARGS(sde, status),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                   __field(u64, status)
+                   __field(u8, idx)
+                   ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                   __entry->status = status;
+                   __entry->idx = sde->this_idx;
+                   ),
+                   TP_printk("[%s] SDE(%u) status %llx",
+                             __get_str(dev),
+                             __entry->idx,
+                             (unsigned long long)__entry->status
+                             )
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt,
+            TP_PROTO(struct sdma_engine *sde, u64 status),
+            TP_ARGS(sde, status)
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress,
+            TP_PROTO(struct sdma_engine *sde, u64 status),
+            TP_ARGS(sde, status)
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad,
+                   TP_PROTO(struct sdma_engine *sde, int aidx),
+                   TP_ARGS(sde, aidx),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                   __field(int, aidx)
+                   __field(u8, idx)
+                   ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                   __entry->idx = sde->this_idx;
+                   __entry->aidx = aidx;
+                   ),
+                   TP_printk("[%s] SDE(%u) aidx %d",
+                             __get_str(dev),
+                             __entry->idx,
+                             __entry->aidx
+                             )
+);
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate,
+            TP_PROTO(struct sdma_engine *sde, int aidx),
+            TP_ARGS(sde, aidx));
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate,
+            TP_PROTO(struct sdma_engine *sde, int aidx),
+            TP_ARGS(sde, aidx));
+
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+TRACE_EVENT(hfi1_sdma_progress,
+           TP_PROTO(struct sdma_engine *sde,
+                    u16 hwhead,
+                    u16 swhead,
+                    struct sdma_txreq *txp
+                    ),
+           TP_ARGS(sde, hwhead, swhead, txp),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+           __field(u64, sn)
+           __field(u16, hwhead)
+           __field(u16, swhead)
+           __field(u16, txnext)
+           __field(u16, tx_tail)
+           __field(u16, tx_head)
+           __field(u8, idx)
+           ),
+           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+           __entry->hwhead = hwhead;
+           __entry->swhead = swhead;
+           __entry->tx_tail = sde->tx_tail;
+           __entry->tx_head = sde->tx_head;
+           __entry->txnext = txp ? txp->next_descq_idx : ~0;
+           __entry->idx = sde->this_idx;
+           __entry->sn = txp ? txp->sn : ~0;
+           ),
+           TP_printk(
+           "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+           __get_str(dev),
+           __entry->idx,
+           __entry->sn,
+           __entry->hwhead,
+           __entry->swhead,
+           __entry->txnext,
+           __entry->tx_head,
+           __entry->tx_tail
+           )
+);
+#else
+TRACE_EVENT(hfi1_sdma_progress,
+           TP_PROTO(struct sdma_engine *sde,
+                    u16 hwhead, u16 swhead,
+                    struct sdma_txreq *txp
+                    ),
+           TP_ARGS(sde, hwhead, swhead, txp),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                   __field(u16, hwhead)
+                   __field(u16, swhead)
+                   __field(u16, txnext)
+                   __field(u16, tx_tail)
+                   __field(u16, tx_head)
+                   __field(u8, idx)
+                   ),
+           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                   __entry->hwhead = hwhead;
+                   __entry->swhead = swhead;
+                   __entry->tx_tail = sde->tx_tail;
+                   __entry->tx_head = sde->tx_head;
+                   __entry->txnext = txp ? txp->next_descq_idx : ~0;
+                   __entry->idx = sde->this_idx;
+                   ),
+           TP_printk(
+                   "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+                   __get_str(dev),
+                   __entry->idx,
+                   __entry->hwhead,
+                   __entry->swhead,
+                   __entry->txnext,
+                   __entry->tx_head,
+                   __entry->tx_tail
+           )
+);
+#endif
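+
+/*
+ * Editorial note: the two hfi1_sdma_progress variants differ only in the
+ * extra sn field, captured when CONFIG_HFI1_DEBUG_SDMA_ORDER is set so
+ * descriptor sequence numbers can be checked without paying for the field
+ * in production builds.
+ */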
+
+DECLARE_EVENT_CLASS(hfi1_sdma_sn,
+                   TP_PROTO(struct sdma_engine *sde, u64 sn),
+                   TP_ARGS(sde, sn),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+                   __field(u64, sn)
+                   __field(u8, idx)
+                   ),
+                   TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+                   __entry->sn = sn;
+                   __entry->idx = sde->this_idx;
+                   ),
+                   TP_printk("[%s] SDE(%u) sn %llu",
+                             __get_str(dev),
+                             __entry->idx,
+                             __entry->sn
+                             )
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn,
+            TP_PROTO(
+            struct sdma_engine *sde,
+            u64 sn
+            ),
+            TP_ARGS(sde, sn)
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn,
+            TP_PROTO(struct sdma_engine *sde, u64 sn),
+            TP_ARGS(sde, sn)
+);
+
+#define USDMA_HDR_FORMAT \
+       "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
+
+TRACE_EVENT(hfi1_sdma_user_header,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+                    struct hfi1_pkt_header *hdr, u32 tidval),
+           TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u16, req)
+                   __field(u32, pbc0)
+                   __field(u32, pbc1)
+                   __field(u32, lrh0)
+                   __field(u32, lrh1)
+                   __field(u32, bth0)
+                   __field(u32, bth1)
+                   __field(u32, bth2)
+                   __field(u32, kdeth0)
+                   __field(u32, kdeth1)
+                   __field(u32, kdeth2)
+                   __field(u32, kdeth3)
+                   __field(u32, kdeth4)
+                   __field(u32, kdeth5)
+                   __field(u32, kdeth6)
+                   __field(u32, kdeth7)
+                   __field(u32, kdeth8)
+                   __field(u32, tidval)
+                   ),
+                   TP_fast_assign(
+                   __le32 *pbc = (__le32 *)hdr->pbc;
+                   __be32 *lrh = (__be32 *)hdr->lrh;
+                   __be32 *bth = (__be32 *)hdr->bth;
+                   __le32 *kdeth = (__le32 *)&hdr->kdeth;
+
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->req = req;
+                   __entry->pbc0 = le32_to_cpu(pbc[0]);
+                   __entry->pbc1 = le32_to_cpu(pbc[1]);
+                   __entry->lrh0 = be32_to_cpu(lrh[0]);
+                   __entry->lrh1 = be32_to_cpu(lrh[1]);
+                   __entry->bth0 = be32_to_cpu(bth[0]);
+                   __entry->bth1 = be32_to_cpu(bth[1]);
+                   __entry->bth2 = be32_to_cpu(bth[2]);
+                   __entry->kdeth0 = le32_to_cpu(kdeth[0]);
+                   __entry->kdeth1 = le32_to_cpu(kdeth[1]);
+                   __entry->kdeth2 = le32_to_cpu(kdeth[2]);
+                   __entry->kdeth3 = le32_to_cpu(kdeth[3]);
+                   __entry->kdeth4 = le32_to_cpu(kdeth[4]);
+                   __entry->kdeth5 = le32_to_cpu(kdeth[5]);
+                   __entry->kdeth6 = le32_to_cpu(kdeth[6]);
+                   __entry->kdeth7 = le32_to_cpu(kdeth[7]);
+                   __entry->kdeth8 = le32_to_cpu(kdeth[8]);
+                   __entry->tidval = tidval;
+           ),
+           TP_printk(USDMA_HDR_FORMAT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->req,
+                     __entry->pbc1,
+                     __entry->pbc0,
+                     __entry->lrh0,
+                     __entry->lrh1,
+                     __entry->bth0,
+                     __entry->bth1,
+                     __entry->bth2,
+                     __entry->kdeth0,
+                     __entry->kdeth1,
+                     __entry->kdeth2,
+                     __entry->kdeth3,
+                     __entry->kdeth4,
+                     __entry->kdeth5,
+                     __entry->kdeth6,
+                     __entry->kdeth7,
+                     __entry->kdeth8,
+                     __entry->tidval
+           )
+);
+
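A note on byte order in the event above: the PBC and KDETH words are little-endian on the wire (hence le32_to_cpu()), while the LRH and BTH words are big-endian (be32_to_cpu()). A minimal userspace sketch of the BTH word-0 unpacking that the ud.c/uc.c hunks later in this pull rely on, with ntohl() standing in for be32_to_cpu() and an invented sample value:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>  /* ntohl()/htonl() stand in for the be32 helpers */

int main(void)
{
        uint32_t bth0_wire = htonl(0x6430ffff); /* invented BTH word 0 */
        uint32_t bth0 = ntohl(bth0_wire);

        printf("opcode=0x%x pad=%u pkey=0x%x\n",
               (bth0 >> 24) & 0xff,     /* opcode lives in the top byte */
               (bth0 >> 20) & 3,        /* pad count, bits 21:20 */
               (uint16_t)bth0);         /* P_Key, low 16 bits */
        return 0;
}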
+#define SDMA_UREQ_FMT \
+       "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
+TRACE_EVENT(hfi1_sdma_user_reqinfo,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
+           TP_ARGS(dd, ctxt, subctxt, i),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd);
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u8, ver_opcode)
+                   __field(u8, iovcnt)
+                   __field(u16, npkts)
+                   __field(u16, fragsize)
+                   __field(u16, comp_idx)
+           ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->ver_opcode = i[0] & 0xff;
+                   __entry->iovcnt = (i[0] >> 8) & 0xff;
+                   __entry->npkts = i[1];
+                   __entry->fragsize = i[2];
+                   __entry->comp_idx = i[3];
+           ),
+           TP_printk(SDMA_UREQ_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->ver_opcode,
+                     __entry->iovcnt,
+                     __entry->npkts,
+                     __entry->fragsize,
+                     __entry->comp_idx
+                     )
+);
+
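The reqinfo event treats the user request header as four u16 words, splitting the first into a version/opcode byte and an iovec count. A self-contained sketch of the same unpacking, with made-up values:

#include <stdint.h>
#include <stdio.h>

static void decode_reqinfo(const uint16_t i[4])
{
        printf("ver/op=0x%x iovcnt=%u npkts=%u frag=%u idx=%u\n",
               i[0] & 0xff,             /* version/opcode, low byte */
               (i[0] >> 8) & 0xff,      /* iovec count, high byte */
               i[1],                    /* packet count */
               i[2],                    /* fragment size */
               i[3]);                   /* completion-ring index */
}

int main(void)
{
        const uint16_t req[4] = { 0x0401, 16, 4096, 7 };

        decode_reqinfo(req);    /* ver/op=0x1 iovcnt=4 npkts=16 frag=4096 idx=7 */
        return 0;
}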
+#define usdma_complete_name(st) { st, #st }
+#define show_usdma_complete_state(st)                  \
+       __print_symbolic(st,                            \
+                       usdma_complete_name(FREE),      \
+                       usdma_complete_name(QUEUED),    \
+                       usdma_complete_name(COMPLETE),  \
+                       usdma_complete_name(ERROR))
+
+TRACE_EVENT(hfi1_sdma_user_completion,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
+                    u8 state, int code),
+           TP_ARGS(dd, ctxt, subctxt, idx, state, code),
+           TP_STRUCT__entry(
+           DD_DEV_ENTRY(dd)
+           __field(u16, ctxt)
+           __field(u8, subctxt)
+           __field(u16, idx)
+           __field(u8, state)
+           __field(int, code)
+           ),
+           TP_fast_assign(
+           DD_DEV_ASSIGN(dd);
+           __entry->ctxt = ctxt;
+           __entry->subctxt = subctxt;
+           __entry->idx = idx;
+           __entry->state = state;
+           __entry->code = code;
+           ),
+           TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
+                     __get_str(dev), __entry->ctxt, __entry->subctxt,
+                     __entry->idx, show_usdma_complete_state(__entry->state),
+                     __entry->code)
+);
+
+const char *print_u32_array(struct trace_seq *, u32 *, int);
+#define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
+
+TRACE_EVENT(hfi1_sdma_user_header_ahg,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+                    u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
+           TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
+           TP_STRUCT__entry(
+           DD_DEV_ENTRY(dd)
+           __field(u16, ctxt)
+           __field(u8, subctxt)
+           __field(u16, req)
+           __field(u8, sde)
+           __field(u8, idx)
+           __field(int, len)
+           __field(u32, tidval)
+           __array(u32, ahg, 10)
+           ),
+           TP_fast_assign(
+           DD_DEV_ASSIGN(dd);
+           __entry->ctxt = ctxt;
+           __entry->subctxt = subctxt;
+           __entry->req = req;
+           __entry->sde = sde;
+           __entry->idx = ahgidx;
+           __entry->len = len;
+           __entry->tidval = tidval;
+           memcpy(__entry->ahg, ahg, len * sizeof(u32));
+           ),
+           TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->req,
+                     __entry->sde,
+                     __entry->idx,
+                     __entry->len - 1,
+                     __print_u32_hex(__entry->ahg, __entry->len),
+                     __entry->tidval
+                     )
+);
+
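print_u32_array() is only declared above; its definition lives elsewhere in the driver (trace.c). A hypothetical userspace analogue of the hex dump it produces for the copied AHG words:

#include <stdint.h>
#include <stdio.h>

/* Render a u32 array as space-separated hex words, roughly what the
 * __print_u32_hex() wrapper feeds into the trace output above. */
static void print_u32_hex(const uint32_t *arr, int len)
{
        int i;

        for (i = 0; i < len; i++)
                printf("%s0x%x", i ? " " : "", arr[i]);
        putchar('\n');
}

int main(void)
{
        uint32_t ahg[3] = { 0x11, 0x22, 0x33 };

        print_u32_hex(ahg, 3);  /* prints: 0x11 0x22 0x33 */
        return 0;
}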
+TRACE_EVENT(hfi1_sdma_state,
+           TP_PROTO(struct sdma_engine *sde,
+                    const char *cstate,
+                    const char *nstate
+                    ),
+           TP_ARGS(sde, cstate, nstate),
+           TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd)
+               __string(curstate, cstate)
+               __string(newstate, nstate)
+           ),
+           TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
+               __assign_str(curstate, cstate);
+               __assign_str(newstate, nstate);
+           ),
+           TP_printk("[%s] current state %s new state %s",
+                     __get_str(dev),
+                     __get_str(curstate),
+                     __get_str(newstate)
+           )
+);
+
+#define BCT_FORMAT \
+       "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
+
+#define BCT(field) \
+       be16_to_cpu( \
+       ((struct buffer_control *)__get_dynamic_array(bct))->field \
+       )
+
+DECLARE_EVENT_CLASS(hfi1_bct_template,
+                   TP_PROTO(struct hfi1_devdata *dd,
+                            struct buffer_control *bc),
+                   TP_ARGS(dd, bc),
+                   TP_STRUCT__entry(DD_DEV_ENTRY(dd)
+                   __dynamic_array(u8, bct, sizeof(*bc))
+                   ),
+                   TP_fast_assign(DD_DEV_ASSIGN(dd);
+                                  memcpy(__get_dynamic_array(bct), bc,
+                                         sizeof(*bc));
+                   ),
+                   TP_printk(BCT_FORMAT,
+                             BCT(overall_shared_limit),
+
+                             BCT(vl[0].dedicated),
+                             BCT(vl[0].shared),
+
+                             BCT(vl[1].dedicated),
+                             BCT(vl[1].shared),
+
+                             BCT(vl[2].dedicated),
+                             BCT(vl[2].shared),
+
+                             BCT(vl[3].dedicated),
+                             BCT(vl[3].shared),
+
+                             BCT(vl[4].dedicated),
+                             BCT(vl[4].shared),
+
+                             BCT(vl[5].dedicated),
+                             BCT(vl[5].shared),
+
+                             BCT(vl[6].dedicated),
+                             BCT(vl[6].shared),
+
+                             BCT(vl[7].dedicated),
+                             BCT(vl[7].shared),
+
+                             BCT(vl[15].dedicated),
+                             BCT(vl[15].shared)
+                   )
+);
+
+DEFINE_EVENT(hfi1_bct_template, bct_set,
+            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+            TP_ARGS(dd, bc));
+
+DEFINE_EVENT(hfi1_bct_template, bct_get,
+            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+            TP_ARGS(dd, bc));
+
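The bct template above copies the raw struct buffer_control with one memcpy() in TP_fast_assign and defers all byte swapping to print time through the BCT() macro. A standalone sketch of that record-now/decode-later split; struct vl_limit is a hypothetical stand-in for one VL entry of buffer_control, and ntohs() stands in for be16_to_cpu():

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct vl_limit { uint16_t dedicated, shared; };        /* stored big-endian */

int main(void)
{
        struct vl_limit wire = { htons(0x80), htons(0x20) };
        uint8_t rec[sizeof(wire)];
        struct vl_limit dec;

        memcpy(rec, &wire, sizeof(wire));       /* hot path: raw byte copy only */

        memcpy(&dec, rec, sizeof(dec));         /* print time: decode the fields */
        printf("[%x,%x]\n", ntohs(dec.dedicated), ntohs(dec.shared));
        return 0;
}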
+#endif /* __HFI1_TRACE_TX_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_tx
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/twsi.c b/drivers/infiniband/hw/hfi1/twsi.c
deleted file mode 100644
index e82e52a..0000000
+++ /dev/null
@@ -1,489 +0,0 @@
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-
-#include "hfi.h"
-#include "twsi.h"
-
-/*
- * "Two Wire Serial Interface" support.
- *
- * Originally written for a not-quite-i2c serial eeprom, which is
- * still used on some supported boards. Later boards have added a
- * variety of other uses, most board-specific, so the bit-banging
- * part has been split off to this file, while the other parts
- * have been moved to chip-specific files.
- *
- * We have also dropped all pretense of a fully generic (e.g. pretend
- * we don't know whether '1' is the higher voltage) interface, as
- * the restrictions of the generic i2c interface (e.g. no access from
- * the driver itself) make it unsuitable for this use.
- */
-
-#define READ_CMD 1
-#define WRITE_CMD 0
-
-/**
- * i2c_wait_for_writes - wait for a write
- * @dd: the hfi1_ib device
- *
- * We use this instead of udelay directly, so we can make sure
- * that previous register writes have been flushed all the way
- * to the chip.  Since we are delaying anyway, the extra cost doesn't
- * hurt, and it makes the bit twiddling more regular.
- */
-static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target)
-{
-       /*
-        * implicit read of EXTStatus is as good as explicit
-        * read of scratch, if all we want to do is flush
-        * writes.
-        */
-       hfi1_gpio_mod(dd, target, 0, 0, 0);
-       rmb(); /* inlined, so prevent compiler reordering */
-}
-
-/*
- * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
- * for "almost compliant" modules
- */
-#define SCL_WAIT_USEC 1000
-
-/* TWSI_BUF_WAIT_USEC is the time the bus must be free between a STOP or
- * ACK and the next START. Should be 20 usec, but some chips need more.
- */
-#define TWSI_BUF_WAIT_USEC 60
-
-static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit)
-{
-       u32 mask;
-
-       udelay(1);
-
-       mask = QSFP_HFI0_I2CCLK;
-
-       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
-
-       /*
-        * Allow for slow slaves with a simple delay on the
-        * falling edge, sampling on the rising edge.
-        */
-       if (!bit) {
-               udelay(2);
-       } else {
-               int rise_usec;
-
-               for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) {
-                       if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0))
-                               break;
-                       udelay(2);
-               }
-               if (rise_usec <= 0)
-                       dd_dev_err(dd, "SCL interface stuck low > %d uSec\n",
-                                  SCL_WAIT_USEC);
-       }
-       i2c_wait_for_writes(dd, target);
-}
-
-static u8 scl_in(struct hfi1_devdata *dd, u32 target, int wait)
-{
-       u32 read_val, mask;
-
-       mask = QSFP_HFI0_I2CCLK;
-       /* SCL is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-       read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
-       if (wait)
-               i2c_wait_for_writes(dd, target);
-       return (read_val & mask) >> GPIO_SCL_NUM;
-}
-
-static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit)
-{
-       u32 mask;
-
-       mask = QSFP_HFI0_I2CDAT;
-
-       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
-
-       i2c_wait_for_writes(dd, target);
-       udelay(2);
-}
-
-static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait)
-{
-       u32 read_val, mask;
-
-       mask = QSFP_HFI0_I2CDAT;
-       /* SDA is meant to be bare-drain, so never set "OUT", just DIR */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-       read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
-       if (wait)
-               i2c_wait_for_writes(dd, target);
-       return (read_val & mask) >> GPIO_SDA_NUM;
-}
-
-/**
- * i2c_ackrcv - see if ack following write is true
- * @dd: the hfi1_ib device
- */
-static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target)
-{
-       u8 ack_received;
-
-       /* AT ENTRY SCL = LOW */
-       /* change direction, ignore data */
-       ack_received = sda_in(dd, target, 1);
-       scl_out(dd, target, 1);
-       ack_received = sda_in(dd, target, 1) == 0;
-       scl_out(dd, target, 0);
-       return ack_received;
-}
-
-static void stop_cmd(struct hfi1_devdata *dd, u32 target);
-
-/**
- * rd_byte - read a byte, sending STOP on last, else ACK
- * @dd: the hfi1_ib device
- *
- * Returns byte shifted out of device
- */
-static int rd_byte(struct hfi1_devdata *dd, u32 target, int last)
-{
-       int bit_cntr, data;
-
-       data = 0;
-
-       for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
-               data <<= 1;
-               scl_out(dd, target, 1);
-               data |= sda_in(dd, target, 0);
-               scl_out(dd, target, 0);
-       }
-       if (last) {
-               scl_out(dd, target, 1);
-               stop_cmd(dd, target);
-       } else {
-               sda_out(dd, target, 0);
-               scl_out(dd, target, 1);
-               scl_out(dd, target, 0);
-               sda_out(dd, target, 1);
-       }
-       return data;
-}
-
-/**
- * wr_byte - write a byte, one bit at a time
- * @dd: the hfi1_ib device
- * @data: the byte to write
- *
- * Returns 0 if we got the following ack, otherwise 1
- */
-static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data)
-{
-       int bit_cntr;
-       u8 bit;
-
-       for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
-               bit = (data >> bit_cntr) & 1;
-               sda_out(dd, target, bit);
-               scl_out(dd, target, 1);
-               scl_out(dd, target, 0);
-       }
-       return (!i2c_ackrcv(dd, target)) ? 1 : 0;
-}
-
-/*
- * issue TWSI start sequence:
- * (both clock/data high, clock high, data low while clock is high)
- */
-static void start_seq(struct hfi1_devdata *dd, u32 target)
-{
-       sda_out(dd, target, 1);
-       scl_out(dd, target, 1);
-       sda_out(dd, target, 0);
-       udelay(1);
-       scl_out(dd, target, 0);
-}
-
-/**
- * stop_seq - transmit the stop sequence
- * @dd: the hfi1_ib device
- *
- * (both clock/data low, clock high, data high while clock is high)
- */
-static void stop_seq(struct hfi1_devdata *dd, u32 target)
-{
-       scl_out(dd, target, 0);
-       sda_out(dd, target, 0);
-       scl_out(dd, target, 1);
-       sda_out(dd, target, 1);
-}
-
-/**
- * stop_cmd - transmit the stop condition
- * @dd: the hfi1_ib device
- *
- * (both clock/data low, clock high, data high while clock is high)
- */
-static void stop_cmd(struct hfi1_devdata *dd, u32 target)
-{
-       stop_seq(dd, target);
-       udelay(TWSI_BUF_WAIT_USEC);
-}
-
-/**
- * hfi1_twsi_reset - reset I2C communication
- * @dd: the hfi1_ib device
- * returns 0 if ok, -EIO on error
- */
-int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target)
-{
-       int clock_cycles_left = 9;
-       u32 mask;
-
-       /* Both SCL and SDA should be high. If not, there
-        * is something wrong.
-        */
-       mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT;
-
-       /*
-        * Force pins to desired innocuous state.
-        * This is the default power-on state with out=0 and dir=0,
-        * so it is tri-stated and should float high (barring HW problems).
-        */
-       hfi1_gpio_mod(dd, target, 0, 0, mask);
-
-       /* Check if SCL is low; if so, a slave device is misbehaving
-        * and there is not much we can do.
-        */
-       if (!scl_in(dd, target, 0))
-               return -EIO;
-
-       /* Check if SDA is low; if so, clock SCL up to 9 times
-        * until the device releases the bus.
-        */
-       while (clock_cycles_left--) {
-               if (sda_in(dd, target, 0))
-                       return 0;
-               scl_out(dd, target, 0);
-               scl_out(dd, target, 1);
-       }
-
-       return -EIO;
-}
-
-#define HFI1_TWSI_START 0x100
-#define HFI1_TWSI_STOP 0x200
-
-/* Write byte to TWSI, optionally prefixed with START or suffixed with
- * STOP.
- * returns 0 if OK (ACK received), else != 0
- */
-static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags)
-{
-       int ret = 1;
-
-       if (flags & HFI1_TWSI_START)
-               start_seq(dd, target);
-
-       /* Leaves SCL low (from i2c_ackrcv()) */
-       ret = wr_byte(dd, target, data);
-
-       if (flags & HFI1_TWSI_STOP)
-               stop_cmd(dd, target);
-       return ret;
-}
-
-/* Added functionality for IBA7220-based cards */
-#define HFI1_TEMP_DEV 0x98
-
-/*
- * hfi1_twsi_blk_rd
- * General interface for data transfer from twsi devices.
- * One vestige of its former role is that it recognizes a device
- * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
- * which responded to all TWSI device codes, interpreting them as an
- * address within the device. For all other devices on boards handled
- * by this driver, the device code is followed by an N-byte "address"
- * which selects the "register" or "offset" within the device from
- * which data should be read.
- */
-int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    void *buffer, int len)
-{
-       u8 *bp = buffer;
-       int ret = 1;
-       int i;
-       int offset_size;
-
-       /* obtain the offset size, strip it from the device address */
-       offset_size = (dev >> 8) & 0xff;
-       dev &= 0xff;
-
-       /* allow at most a 2 byte offset */
-       if (offset_size > 2)
-               goto bail;
-
-       if (dev == HFI1_TWSI_NO_DEV) {
-               /* legacy not-really-I2C */
-               addr = (addr << 1) | READ_CMD;
-               ret = twsi_wr(dd, target, addr, HFI1_TWSI_START);
-       } else {
-               /* Actual I2C */
-               if (offset_size) {
-                       ret = twsi_wr(dd, target,
-                                     dev | WRITE_CMD, HFI1_TWSI_START);
-                       if (ret) {
-                               stop_cmd(dd, target);
-                               goto bail;
-                       }
-
-                       for (i = 0; i < offset_size; i++) {
-                               ret = twsi_wr(dd, target,
-                                             (addr >> (i * 8)) & 0xff, 0);
-                               udelay(TWSI_BUF_WAIT_USEC);
-                               if (ret) {
-                                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
-                                                  i, addr);
-                                       goto bail;
-                               }
-                       }
-               }
-               ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START);
-       }
-       if (ret) {
-               stop_cmd(dd, target);
-               goto bail;
-       }
-
-       /*
-        * block devices keep clocking data out as long as we ack,
-        * automatically incrementing the address. Some have "pages"
-        * whose boundaries will not be crossed, but the handling
-        * of these is left to the caller, who is in a better
-        * position to know.
-        */
-       while (len-- > 0) {
-               /*
-                * Get and store data, sending ACK if length remaining,
-                * else STOP
-                */
-               *bp++ = rd_byte(dd, target, !len);
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
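As the top of hfi1_twsi_blk_rd() shows, callers fold the offset width into bits 15:8 of dev and the device address into bits 7:0. A hypothetical call site (dd and target assumed in scope; the device address 0xA0 and offset 0x2 are invented for illustration):

        u8 buf[2];
        int ret;

        /* read 2 bytes from register offset 0x2 of the device at I2C
         * address 0xA0, using a one-byte register offset; the offset
         * width travels in bits 15:8 of the dev argument
         */
        ret = hfi1_twsi_blk_rd(dd, target, (1 << 8) | 0xA0, 0x2,
                               buf, sizeof(buf));
        if (ret)
                dd_dev_err(dd, "TWSI read failed\n");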
-/*
- * hfi1_twsi_blk_wr
- * General interface for data transfer to twsi devices.
- * One vestige of its former role is that it recognizes a device
- * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
- * which responded to all TWSI device codes, interpreting them as an
- * address within the device. For all other devices on boards handled
- * by this driver, the device code is followed by an N-byte "address"
- * which selects the "register" or "offset" within the device to
- * which data should be written.
- */
-int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    const void *buffer, int len)
-{
-       const u8 *bp = buffer;
-       int ret = 1;
-       int i;
-       int offset_size;
-
-       /* obtain the offset size, strip it from the device address */
-       offset_size = (dev >> 8) & 0xff;
-       dev &= 0xff;
-
-       /* allow at most a 2 byte offset */
-       if (offset_size > 2)
-               goto bail;
-
-       if (dev == HFI1_TWSI_NO_DEV) {
-               if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD,
-                           HFI1_TWSI_START)) {
-                       goto failed_write;
-               }
-       } else {
-               /* Real I2C */
-               if (twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START))
-                       goto failed_write;
-       }
-
-       for (i = 0; i < offset_size; i++) {
-               ret = twsi_wr(dd, target, (addr >> (i * 8)) & 0xff, 0);
-               udelay(TWSI_BUF_WAIT_USEC);
-               if (ret) {
-                       dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n",
-                                  i, addr);
-                       goto bail;
-               }
-       }
-
-       for (i = 0; i < len; i++)
-               if (twsi_wr(dd, target, *bp++, 0))
-                       goto failed_write;
-
-       ret = 0;
-
-failed_write:
-       stop_cmd(dd, target);
-
-bail:
-       return ret;
-}
diff --git a/drivers/infiniband/hw/hfi1/twsi.h b/drivers/infiniband/hw/hfi1/twsi.h
deleted file mode 100644
index 5b8a5b5..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef _TWSI_H
-#define _TWSI_H
-/*
- * Copyright(c) 2015, 2016 Intel Corporation.
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * BSD LICENSE
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  - Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  - Neither the name of Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#define HFI1_TWSI_NO_DEV 0xFF
-
-struct hfi1_devdata;
-
-/* Bit position of SDA/SCL pins in ASIC_QSFP* registers  */
-#define  GPIO_SDA_NUM 1
-#define  GPIO_SCL_NUM 0
-
-/* these functions must be called with qsfp_lock held */
-int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target);
-int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    void *buffer, int len);
-int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
-                    const void *buffer, int len);
-
-#endif /* _TWSI_H */
index df773d4332970bd2ed2e9567d59fcf55c397f0cd..a726d96d185f3261ac77f99b73fe6112aa90c914 100644
@@ -118,6 +118,31 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                        clear_ahg(qp);
                        goto bail;
                }
+               /*
+                * Local operations are processed immediately
+                * after all prior requests have completed.
+                */
+               if (wqe->wr.opcode == IB_WR_REG_MR ||
+                   wqe->wr.opcode == IB_WR_LOCAL_INV) {
+                       int local_ops = 0;
+                       int err = 0;
+
+                       if (qp->s_last != qp->s_cur)
+                               goto bail;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
+                               err = rvt_invalidate_rkey(
+                                       qp, wqe->wr.ex.invalidate_rkey);
+                               local_ops = 1;
+                       }
+                       hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR
+                                                       : IB_WC_SUCCESS);
+                       if (local_ops)
+                               atomic_dec(&qp->local_ops_pending);
+                       qp->s_hdrwords = 0;
+                       goto done_free_tx;
+               }
                /*
                 * Start a new request.
                 */
@@ -294,46 +319,12 @@ void hfi1_uc_rcv(struct hfi1_packet *packet)
        struct ib_reth *reth;
        int has_grh = rcv_flags & HFI1_HAS_GRH;
        int ret;
-       u32 bth1;
 
        bth0 = be32_to_cpu(ohdr->bth[0]);
        if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
                return;
 
-       bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
-               if (bth1 & HFI1_BECN_SMASK) {
-                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-                       u32 rqpn, lqpn;
-                       u16 rlid = be16_to_cpu(hdr->lrh[3]);
-                       u8 sl, sc5;
-
-                       lqpn = bth1 & RVT_QPN_MASK;
-                       rqpn = qp->remote_qpn;
-
-                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-                       sl = ibp->sc_to_sl[sc5];
-
-                       process_becn(ppd, sl, rlid, lqpn, rqpn,
-                                    IB_CC_SVCTYPE_UC);
-               }
-
-               if (bth1 & HFI1_FECN_SMASK) {
-                       struct ib_grh *grh = NULL;
-                       u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-                       u16 slid = be16_to_cpu(hdr->lrh[3]);
-                       u16 dlid = be16_to_cpu(hdr->lrh[1]);
-                       u32 src_qp = qp->remote_qpn;
-                       u8 sc5;
-
-                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-                       if (has_grh)
-                               grh = &hdr->u.l.grh;
-
-                       return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5,
-                                  grh);
-               }
-       }
+       process_ecn(qp, packet, true);
 
        psn = be32_to_cpu(ohdr->bth[2]);
        opcode = (bth0 >> 24) & 0xff;
index be91f6fa1c87b16fe4f31affd6c12848b1951792..f01e8e1d62d3d14d6c6d543630da9e9128695738 100644
@@ -184,8 +184,12 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
        }
 
        if (ah_attr->ah_flags & IB_AH_GRH) {
-               hfi1_copy_sge(&qp->r_sge, &ah_attr->grh,
-                             sizeof(struct ib_grh), 1, 0);
+               struct ib_grh grh;
+               struct ib_global_route grd = ah_attr->grh;
+
+               hfi1_make_grh(ibp, &grh, &grd, 0, 0);
+               hfi1_copy_sge(&qp->r_sge, &grh,
+                             sizeof(grh), 1, 0);
                wc.wc_flags |= IB_WC_GRH;
        } else {
                hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
@@ -430,10 +434,9 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                                         qp->qkey : wqe->ud_wr.remote_qkey);
        ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
        /* disarm any ahg */
-       priv->s_hdr->ahgcount = 0;
-       priv->s_hdr->ahgidx = 0;
-       priv->s_hdr->tx_flags = 0;
-       priv->s_hdr->sde = NULL;
+       priv->s_ahg->ahgcount = 0;
+       priv->s_ahg->ahgidx = 0;
+       priv->s_ahg->tx_flags = 0;
        /* pbc */
        ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
 
@@ -665,13 +668,13 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
        struct hfi1_other_headers *ohdr = packet->ohdr;
        int opcode;
        u32 hdrsize = packet->hlen;
-       u32 pad;
        struct ib_wc wc;
        u32 qkey;
        u32 src_qp;
        u16 dlid, pkey;
        int mgmt_pkey_idx = -1;
        struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
        struct hfi1_ib_header *hdr = packet->hdr;
        u32 rcv_flags = packet->rcv_flags;
        void *data = packet->ebuf;
@@ -680,52 +683,33 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
        bool has_grh = rcv_flags & HFI1_HAS_GRH;
        u8 sc5 = hdr2sc((struct hfi1_message_header *)hdr, packet->rhf);
        u32 bth1;
-       int is_mcast;
-       struct ib_grh *grh = NULL;
+       u8 sl_from_sc, sl;
+       u16 slid;
+       u8 extra_bytes;
 
        qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
        src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK;
        dlid = be16_to_cpu(hdr->lrh[1]);
-       is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
-                       (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
        bth1 = be32_to_cpu(ohdr->bth[1]);
-       if (unlikely(bth1 & HFI1_BECN_SMASK)) {
-               /*
-                * In pre-B0 h/w the CNP_OPCODE is handled via an
-                * error path.
-                */
-               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-               u32 lqpn =  be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
-               u8 sl;
-
-               sl = ibp->sc_to_sl[sc5];
-
-               process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD);
-       }
+       slid = be16_to_cpu(hdr->lrh[3]);
+       pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+       sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+       extra_bytes = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+       extra_bytes += (SIZE_OF_CRC << 2);
+       sl_from_sc = ibp->sc_to_sl[sc5];
 
-       /*
-        * The opcode is in the low byte when its in network order
-        * (top byte when in host order).
-        */
        opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
        opcode &= 0xff;
 
-       pkey = (u16)be32_to_cpu(ohdr->bth[0]);
-
-       if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) {
-               u16 slid = be16_to_cpu(hdr->lrh[3]);
-
-               return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
-       }
+       process_ecn(qp, packet, (opcode != IB_OPCODE_CNP));
        /*
         * Get the number of bytes the message was padded by
         * and drop incomplete packets.
         */
-       pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-       if (unlikely(tlen < (hdrsize + pad + 4)))
+       if (unlikely(tlen < (hdrsize + extra_bytes)))
                goto drop;
 
-       tlen -= hdrsize + pad + 4;
+       tlen -= hdrsize + extra_bytes;
 
        /*
         * Check that the permissive LID is only used on QP0
@@ -736,10 +720,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
                             hdr->lrh[3] == IB_LID_PERMISSIVE))
                        goto drop;
                if (qp->ibqp.qp_num > 1) {
-                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-                       u16 slid;
-
-                       slid = be16_to_cpu(hdr->lrh[3]);
                        if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) {
                                /*
                                 * Traps will not be sent for packets dropped
@@ -748,12 +728,9 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
                                 * IB spec (release 1.3, section 10.9.4)
                                 */
                                hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
-                                              pkey,
-                                              (be16_to_cpu(hdr->lrh[0]) >> 4) &
-                                               0xF,
+                                              pkey, sl,
                                               src_qp, qp->ibqp.qp_num,
-                                              be16_to_cpu(hdr->lrh[3]),
-                                              be16_to_cpu(hdr->lrh[1]));
+                                              slid, dlid);
                                return;
                        }
                } else {
@@ -763,22 +740,18 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
                                goto drop;
                }
                if (unlikely(qkey != qp->qkey)) {
-                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey,
-                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+                       hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, sl,
                                       src_qp, qp->ibqp.qp_num,
-                                      be16_to_cpu(hdr->lrh[3]),
-                                      be16_to_cpu(hdr->lrh[1]));
+                                      slid, dlid);
                        return;
                }
                /* Drop invalid MAD packets (see 13.5.3.1). */
                if (unlikely(qp->ibqp.qp_num == 1 &&
-                            (tlen > 2048 ||
-                             (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
+                            (tlen > 2048 || (sc5 == 0xF))))
                        goto drop;
        } else {
                /* Received on QP0, and so by definition, this is an SMP */
                struct opa_smp *smp = (struct opa_smp *)data;
-               u16 slid = be16_to_cpu(hdr->lrh[3]);
 
                if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp))
                        goto drop;
@@ -861,7 +834,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
            qp->ibqp.qp_type == IB_QPT_SMI) {
                if (mgmt_pkey_idx < 0) {
                        if (net_ratelimit()) {
-                               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
                                struct hfi1_devdata *dd = ppd->dd;
 
                                dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n",
@@ -874,8 +846,8 @@ void hfi1_ud_rcv(struct hfi1_packet *packet)
                wc.pkey_index = 0;
        }
 
-       wc.slid = be16_to_cpu(hdr->lrh[3]);
-       wc.sl = ibp->sc_to_sl[sc5];
+       wc.slid = slid;
+       wc.sl = sl_from_sc;
 
        /*
         * Save the LMC lower bits if the destination LID is a unicast LID.
index 1b640a35b3fe82f6f85022599477f6760744f034..64d26525435af43172ec9518b671a2758b0dcf15 100644
@@ -82,24 +82,25 @@ struct tid_pageset {
               ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
 
 static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
-                           struct rb_root *);
+                           struct hfi1_filedata *);
 static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
 static int set_rcvarray_entry(struct file *, unsigned long, u32,
                              struct tid_group *, struct page **, unsigned);
-static int mmu_rb_insert(struct rb_root *, struct mmu_rb_node *);
-static void mmu_rb_remove(struct rb_root *, struct mmu_rb_node *,
-                         struct mm_struct *);
-static int mmu_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
+static int tid_rb_insert(void *, struct mmu_rb_node *);
+static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
+                                   struct tid_rb_node *tnode);
+static void tid_rb_remove(void *, struct mmu_rb_node *);
+static int tid_rb_invalidate(void *, struct mmu_rb_node *);
 static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
                            struct tid_pageset *, unsigned, u16, struct page **,
                            u32 *, unsigned *, unsigned *);
 static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
-static void clear_tid_node(struct hfi1_filedata *, u16, struct tid_rb_node *);
+static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
 
 static struct mmu_rb_ops tid_rb_ops = {
-       .insert = mmu_rb_insert,
-       .remove = mmu_rb_remove,
-       .invalidate = mmu_rb_invalidate
+       .insert = tid_rb_insert,
+       .remove = tid_rb_remove,
+       .invalidate = tid_rb_invalidate
 };
 
 static inline u32 rcventry2tidinfo(u32 rcventry)
@@ -162,7 +163,6 @@ int hfi1_user_exp_rcv_init(struct file *fp)
 
        spin_lock_init(&fd->tid_lock);
        spin_lock_init(&fd->invalid_lock);
-       fd->tid_rb_root = RB_ROOT;
 
        if (!uctxt->subctxt_cnt || !fd->subctxt) {
                exp_tid_group_init(&uctxt->tid_group_list);
@@ -197,7 +197,7 @@ int hfi1_user_exp_rcv_init(struct file *fp)
        if (!fd->entry_to_rb)
                return -ENOMEM;
 
-       if (!HFI1_CAP_IS_USET(TID_UNMAP)) {
+       if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
                fd->invalid_tid_idx = 0;
                fd->invalid_tids = kzalloc(uctxt->expected_count *
                                           sizeof(u32), GFP_KERNEL);
@@ -208,15 +208,15 @@ int hfi1_user_exp_rcv_init(struct file *fp)
 
                /*
                 * Register MMU notifier callbacks. If the registration
-                * fails, continue but turn off the TID caching for
-                * all user contexts.
+                * fails, continue without TID caching for this context.
                 */
-               ret = hfi1_mmu_rb_register(&fd->tid_rb_root, &tid_rb_ops);
+               ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
+                                          dd->pport->hfi1_wq,
+                                          &fd->handler);
                if (ret) {
                        dd_dev_info(dd,
                                    "Failed MMU notifier registration %d\n",
                                    ret);
-                       HFI1_CAP_USET(TID_UNMAP);
                        ret = 0;
                }
        }
@@ -235,7 +235,7 @@ int hfi1_user_exp_rcv_init(struct file *fp)
         * init.
         */
        spin_lock(&fd->tid_lock);
-       if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) {
+       if (uctxt->subctxt_cnt && fd->handler) {
                u16 remainder;
 
                fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
@@ -261,18 +261,16 @@ int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
         * The notifier would have been removed when the process'es mm
         * was freed.
         */
-       if (!HFI1_CAP_IS_USET(TID_UNMAP))
-               hfi1_mmu_rb_unregister(&fd->tid_rb_root);
+       if (fd->handler)
+               hfi1_mmu_rb_unregister(fd->handler);
 
        kfree(fd->invalid_tids);
 
        if (!uctxt->cnt) {
                if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
-                       unlock_exp_tids(uctxt, &uctxt->tid_full_list,
-                                       &fd->tid_rb_root);
+                       unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
                if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
-                       unlock_exp_tids(uctxt, &uctxt->tid_used_list,
-                                       &fd->tid_rb_root);
+                       unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
                list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
                                         list) {
                        list_del_init(&grp->list);
@@ -399,12 +397,12 @@ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
         * pages, accept the amount pinned so far and program only that.
         * User space knows how to deal with partially programmed buffers.
         */
-       if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
+       if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) {
                ret = -ENOMEM;
                goto bail;
        }
 
-       pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
+       pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages);
        if (pinned <= 0) {
                ret = pinned;
                goto bail;
@@ -559,7 +557,7 @@ nomem:
         * for example), unpin all unmapped pages so we can pin them next time.
         */
        if (mapped_pages != pinned) {
-               hfi1_release_user_pages(current->mm, &pages[mapped_pages],
+               hfi1_release_user_pages(fd->mm, &pages[mapped_pages],
                                        pinned - mapped_pages,
                                        false);
                fd->tid_n_pinned -= pinned - mapped_pages;
@@ -829,7 +827,6 @@ static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        struct tid_rb_node *node;
        struct hfi1_devdata *dd = uctxt->dd;
-       struct rb_root *root = &fd->tid_rb_root;
        dma_addr_t phys;
 
        /*
@@ -861,10 +858,10 @@ static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
        node->freed = false;
        memcpy(node->pages, pages, sizeof(struct page *) * npages);
 
-       if (HFI1_CAP_IS_USET(TID_UNMAP))
-               ret = mmu_rb_insert(root, &node->mmu);
+       if (!fd->handler)
+               ret = tid_rb_insert(fd, &node->mmu);
        else
-               ret = hfi1_mmu_rb_insert(root, &node->mmu);
+               ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
 
        if (ret) {
                hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
@@ -904,19 +901,19 @@ static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
        node = fd->entry_to_rb[rcventry];
        if (!node || node->rcventry != (uctxt->expected_base + rcventry))
                return -EBADF;
-       if (HFI1_CAP_IS_USET(TID_UNMAP))
-               mmu_rb_remove(&fd->tid_rb_root, &node->mmu, NULL);
-       else
-               hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu);
 
        if (grp)
                *grp = node->grp;
-       clear_tid_node(fd, fd->subctxt, node);
+
+       if (!fd->handler)
+               cacheless_tid_rb_remove(fd, node);
+       else
+               hfi1_mmu_rb_remove(fd->handler, &node->mmu);
+
        return 0;
 }
 
-static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
-                          struct tid_rb_node *node)
+static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
 {
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        struct hfi1_devdata *dd = uctxt->dd;
@@ -934,7 +931,7 @@ static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
 
        pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
                         PCI_DMA_FROMDEVICE);
-       hfi1_release_user_pages(current->mm, node->pages, node->npages, true);
+       hfi1_release_user_pages(fd->mm, node->pages, node->npages, true);
        fd->tid_n_pinned -= node->npages;
 
        node->grp->used--;
@@ -949,12 +946,15 @@ static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
        kfree(node);
 }
 
+/*
+ * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
+ * clearing nodes in the non-cached case.
+ */
 static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
-                           struct exp_tid_set *set, struct rb_root *root)
+                           struct exp_tid_set *set,
+                           struct hfi1_filedata *fd)
 {
        struct tid_group *grp, *ptr;
-       struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata,
-                                               tid_rb_root);
        int i;
 
        list_for_each_entry_safe(grp, ptr, &set->list, list) {
@@ -969,22 +969,23 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
                                                          uctxt->expected_base];
                                if (!node || node->rcventry != rcventry)
                                        continue;
-                               if (HFI1_CAP_IS_USET(TID_UNMAP))
-                                       mmu_rb_remove(&fd->tid_rb_root,
-                                                     &node->mmu, NULL);
-                               else
-                                       hfi1_mmu_rb_remove(&fd->tid_rb_root,
-                                                          &node->mmu);
-                               clear_tid_node(fd, -1, node);
+
+                               cacheless_tid_rb_remove(fd, node);
                        }
                }
        }
 }
 
-static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
+/*
+ * Always return 0 from this function.  A non-zero return indicates that the
+ * remove operation will be called and that memory should be unpinned.
+ * However, the driver cannot unpin out from under PSM.  Instead, retain the
+ * memory (by returning 0) and inform PSM that the memory is going away.  PSM
+ * will call back later when it has removed the memory from its list.
+ */
+static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
 {
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
+       struct hfi1_filedata *fdata = arg;
        struct hfi1_ctxtdata *uctxt = fdata->uctxt;
        struct tid_rb_node *node =
                container_of(mnode, struct tid_rb_node, mmu);
@@ -1025,10 +1026,9 @@ static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
        return 0;
 }
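The body elided between these hunks implements the comment above: it marks the node freed and records its RcvArray entry for PSM to collect, under fd->invalid_lock. A rough sketch of that contract, where record_invalid_tid() is a hypothetical stand-in for the real bookkeeping:

static int example_tid_invalidate(void *arg, struct mmu_rb_node *mnode)
{
        struct hfi1_filedata *fdata = arg;
        struct tid_rb_node *node =
                container_of(mnode, struct tid_rb_node, mmu);

        spin_lock(&fdata->invalid_lock);
        record_invalid_tid(fdata, node);        /* stash node->rcventry for
                                                 * user space to poll later */
        spin_unlock(&fdata->invalid_lock);

        return 0;       /* keep the pages pinned; PSM unpins later */
}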
 
-static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node)
+static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
 {
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
+       struct hfi1_filedata *fdata = arg;
        struct tid_rb_node *tnode =
                container_of(node, struct tid_rb_node, mmu);
        u32 base = fdata->uctxt->expected_base;
@@ -1037,14 +1037,20 @@ static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node)
        return 0;
 }
 
-static void mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node,
-                         struct mm_struct *mm)
+static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
+                                   struct tid_rb_node *tnode)
 {
-       struct hfi1_filedata *fdata =
-               container_of(root, struct hfi1_filedata, tid_rb_root);
-       struct tid_rb_node *tnode =
-               container_of(node, struct tid_rb_node, mmu);
        u32 base = fdata->uctxt->expected_base;
 
        fdata->entry_to_rb[tnode->rcventry - base] = NULL;
+       clear_tid_node(fdata, tnode);
+}
+
+static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
+{
+       struct hfi1_filedata *fdata = arg;
+       struct tid_rb_node *tnode =
+               container_of(node, struct tid_rb_node, mmu);
+
+       cacheless_tid_rb_remove(fdata, tnode);
 }
index 88e10b5f55f154be7b97399de635f2fa24a162ff..20f4ddcac3b0f1c78d26c4ede74a5645f71bb641 100644
@@ -68,7 +68,8 @@ MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)");
  * could keep caching buffers.
  *
  */
-bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages)
+bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm,
+                       u32 nlocked, u32 npages)
 {
        unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit,
                size = (cache_size * (1UL << 20)); /* convert to bytes */
@@ -89,9 +90,9 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages)
        /* Convert to number of pages */
        size = DIV_ROUND_UP(size, PAGE_SIZE);
 
-       down_read(&current->mm->mmap_sem);
-       pinned = current->mm->pinned_vm;
-       up_read(&current->mm->mmap_sem);
+       down_read(&mm->mmap_sem);
+       pinned = mm->pinned_vm;
+       up_read(&mm->mmap_sem);
 
        /* First, check the absolute limit against all pinned pages. */
        if (pinned + npages >= ulimit && !can_lock)
@@ -100,8 +101,8 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages)
        return ((nlocked + npages) <= size) || can_lock;
 }
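Restated as a standalone check: an absolute RLIMIT_MEMLOCK bound over everything the mm has pinned, then a driver cache-size bound over this caller's own pages, with can_lock (CAP_IPC_LOCK in the driver) bypassing both. The failing branch of the first test sits in the lines elided between these hunks.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool can_pin(uint64_t ulimit, uint64_t cache_pages,
                    uint64_t mm_pinned, uint64_t nlocked,
                    uint64_t npages, bool can_lock)
{
        /* absolute limit: all pages this mm has pinned, from any source */
        if (mm_pinned + npages >= ulimit && !can_lock)
                return false;
        /* cache limit: only the pages this caller has pinned */
        return (nlocked + npages) <= cache_pages || can_lock;
}

int main(void)
{
        /* 1024-page rlimit, 256-page cache limit, 200 already cached */
        printf("%d\n", can_pin(1024, 256, 300, 200, 40, false));  /* 1 */
        printf("%d\n", can_pin(1024, 256, 300, 200, 100, false)); /* 0 */
        return 0;
}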
 
-int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable,
-                           struct page **pages)
+int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages,
+                           bool writable, struct page **pages)
 {
        int ret;
 
@@ -109,9 +110,9 @@ int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable,
        if (ret < 0)
                return ret;
 
-       down_write(&current->mm->mmap_sem);
-       current->mm->pinned_vm += ret;
-       up_write(&current->mm->mmap_sem);
+       down_write(&mm->mmap_sem);
+       mm->pinned_vm += ret;
+       up_write(&mm->mmap_sem);
 
        return ret;
 }
index 47ffd273ecbd7745d25067f4f8268301d89dde25..0ecf27903dc20f62ab2bd26c307d8be7afcfab5e 100644
@@ -145,7 +145,7 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12
 /* Last packet in the request */
 #define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)
 
-#define SDMA_REQ_IN_USE     0
+/* SDMA request flag bits */
 #define SDMA_REQ_FOR_THREAD 1
 #define SDMA_REQ_SEND_DONE  2
 #define SDMA_REQ_HAVE_AHG   3
@@ -183,16 +183,18 @@ struct user_sdma_iovec {
        struct sdma_mmu_node *node;
 };
 
-#define SDMA_CACHE_NODE_EVICT 0
-
 struct sdma_mmu_node {
        struct mmu_rb_node rb;
-       struct list_head list;
        struct hfi1_user_sdma_pkt_q *pq;
        atomic_t refcount;
        struct page **pages;
        unsigned npages;
-       unsigned long flags;
+};
+
+/* evict operation argument */
+struct evict_data {
+       u32 cleared;    /* count evicted so far */
+       u32 target;     /* target count to evict */
 };
 
 struct user_sdma_request {
@@ -305,14 +307,16 @@ static int defer_packet_queue(
        unsigned seq);
 static void activate_packet_queue(struct iowait *, int);
 static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long);
-static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *);
-static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *,
-                          struct mm_struct *);
-static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
+static int sdma_rb_insert(void *, struct mmu_rb_node *);
+static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
+                        void *arg2, bool *stop);
+static void sdma_rb_remove(void *, struct mmu_rb_node *);
+static int sdma_rb_invalidate(void *, struct mmu_rb_node *);
 
 static struct mmu_rb_ops sdma_rb_ops = {
        .filter = sdma_rb_filter,
        .insert = sdma_rb_insert,
+       .evict = sdma_rb_evict,
        .remove = sdma_rb_remove,
        .invalidate = sdma_rb_invalidate
 };
@@ -397,6 +401,11 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
        if (!pq->reqs)
                goto pq_reqs_nomem;
 
+       memsize = BITS_TO_LONGS(hfi1_sdma_comp_ring_size) * sizeof(long);
+       pq->req_in_use = kzalloc(memsize, GFP_KERNEL);
+       if (!pq->req_in_use)
+               goto pq_reqs_no_in_use;
+
        INIT_LIST_HEAD(&pq->list);
        pq->dd = dd;
        pq->ctxt = uctxt->ctxt;
@@ -405,9 +414,8 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
        pq->state = SDMA_PKT_Q_INACTIVE;
        atomic_set(&pq->n_reqs, 0);
        init_waitqueue_head(&pq->wait);
-       pq->sdma_rb_root = RB_ROOT;
-       INIT_LIST_HEAD(&pq->evict);
-       spin_lock_init(&pq->evict_lock);
+       atomic_set(&pq->n_locked, 0);
+       pq->mm = fd->mm;
 
        iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
                    activate_packet_queue, NULL);
@@ -437,7 +445,8 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
        cq->nentries = hfi1_sdma_comp_ring_size;
        fd->cq = cq;
 
-       ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops);
+       ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
+                                  &pq->handler);
        if (ret) {
                dd_dev_err(dd, "Failed to register with MMU %d", ret);
                goto done;
@@ -453,6 +462,8 @@ cq_comps_nomem:
 cq_nomem:
        kmem_cache_destroy(pq->txreq_cache);
 pq_txreq_nomem:
+       kfree(pq->req_in_use);
+pq_reqs_no_in_use:
        kfree(pq->reqs);
 pq_reqs_nomem:
        kfree(pq);
@@ -472,8 +483,9 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
        hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
                  uctxt->ctxt, fd->subctxt);
        pq = fd->pq;
-       hfi1_mmu_rb_unregister(&pq->sdma_rb_root);
        if (pq) {
+               if (pq->handler)
+                       hfi1_mmu_rb_unregister(pq->handler);
                spin_lock_irqsave(&uctxt->sdma_qlock, flags);
                if (!list_empty(&pq->list))
                        list_del_init(&pq->list);
@@ -484,6 +496,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
                        pq->wait,
                        (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
                kfree(pq->reqs);
+               kfree(pq->req_in_use);
                kmem_cache_destroy(pq->txreq_cache);
                kfree(pq);
                fd->pq = NULL;
@@ -496,10 +509,31 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
        return 0;
 }
 
+static u8 dlid_to_selector(u16 dlid)
+{
+       static u8 mapping[256];
+       static int initialized;
+       static u8 next;
+       int hash;
+
+       if (!initialized) {
+               memset(mapping, 0xFF, 256);
+               initialized = 1;
+       }
+
+       hash = ((dlid >> 8) ^ dlid) & 0xFF;
+       if (mapping[hash] == 0xFF) {
+               mapping[hash] = next;
+               next = (next + 1) & 0x7F;
+       }
+
+       return mapping[hash];
+}
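
dlid_to_selector() lazily hands out a small selector per DLID hash bucket, so requests to different destinations get different engine-selection keys further down (uctxt->ctxt + fd->subctxt + selector). The static table is shared by all callers, and which bucket gets which value depends purely on call order; with hypothetical DLIDs:

	u8 s1 = dlid_to_selector(0x0010);  /* bucket 0x10 unseen: assigned 0 */
	u8 s2 = dlid_to_selector(0x0023);  /* bucket 0x23 unseen: assigned 1 */
	u8 s3 = dlid_to_selector(0x0010);  /* same bucket: 0 again */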
+
 int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
                                   unsigned long dim, unsigned long *count)
 {
-       int ret = 0, i = 0;
+       int ret = 0, i;
        struct hfi1_filedata *fd = fp->private_data;
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        struct hfi1_user_sdma_pkt_q *pq = fd->pq;
@@ -511,6 +545,8 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
        struct user_sdma_request *req;
        u8 opcode, sc, vl;
        int req_queued = 0;
+       u16 dlid;
+       u8 selector;
 
        if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
                hfi1_cdbg(
@@ -529,30 +565,48 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 
        trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
                                     (u16 *)&info);
-       if (cq->comps[info.comp_idx].status == QUEUED ||
-           test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) {
-               hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
-                         dd->unit, uctxt->ctxt, fd->subctxt,
-                         info.comp_idx);
-               return -EBADSLT;
+
+       if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
+               hfi1_cdbg(SDMA,
+                         "[%u:%u:%u:%u] Invalid comp index",
+                         dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
+               return -EINVAL;
        }
+
+       /*
+        * Sanity check the header io vector count: at least one vector
+        * (the header) is required, and it cannot exceed the actual count.
+        */
+       if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
+               hfi1_cdbg(SDMA,
+                         "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
+                         dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
+                         req_iovcnt(info.ctrl), dim);
+               return -EINVAL;
+       }
+
        if (!info.fragsize) {
                hfi1_cdbg(SDMA,
                          "[%u:%u:%u:%u] Request does not specify fragsize",
                          dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
                return -EINVAL;
        }
+
+       /* Try to claim the request. */
+       if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
+               hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
+                         dd->unit, uctxt->ctxt, fd->subctxt,
+                         info.comp_idx);
+               return -EBADSLT;
+       }
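
This replaces the old per-request SDMA_REQ_IN_USE flag with one bit per completion-ring slot in the pq->req_in_use bitmap allocated above, so claiming a slot and detecting a busy slot are a single atomic operation; the matching release is the clear_bit() in user_sdma_free_request() near the end of this file. The pattern in isolation:

	if (test_and_set_bit(idx, pq->req_in_use))	/* atomic claim */
		return -EBADSLT;			/* slot already owned */
	/* ... fill in and submit pq->reqs[idx] ... */
	clear_bit(idx, pq->req_in_use);			/* release when done */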
        /*
-        * We've done all the safety checks that we can up to this point,
-        * "allocate" the request entry.
+        * All safety checks have been done and this request has been claimed.
         */
        hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
                  uctxt->ctxt, fd->subctxt, info.comp_idx);
        req = pq->reqs + info.comp_idx;
        memset(req, 0, sizeof(*req));
-       /* Mark the request as IN_USE before we start filling it in. */
-       set_bit(SDMA_REQ_IN_USE, &req->flags);
-       req->data_iovs = req_iovcnt(info.ctrl) - 1;
+       req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
        req->pq = pq;
        req->cq = cq;
        req->status = -1;
@@ -560,13 +614,22 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 
        memcpy(&req->info, &info, sizeof(info));
 
-       if (req_opcode(info.ctrl) == EXPECTED)
+       if (req_opcode(info.ctrl) == EXPECTED) {
+               /* expected must have a TID info vector and at least one data vector */
+               if (req->data_iovs < 2) {
+                       SDMA_DBG(req,
+                                "Not enough vectors for expected request");
+                       ret = -EINVAL;
+                       goto free_req;
+               }
                req->data_iovs--;
+       }
 
        if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
                SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
                         MAX_VECTORS_PER_REQ);
-               return -EINVAL;
+               ret = -EINVAL;
+               goto free_req;
        }
        /* Copy the header from the user buffer */
        ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
@@ -634,7 +697,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
        idx++;
 
        /* Save all the IO vector structures */
-       while (i < req->data_iovs) {
+       for (i = 0; i < req->data_iovs; i++) {
                INIT_LIST_HEAD(&req->iovs[i].list);
                memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
                ret = pin_vector_pages(req, &req->iovs[i]);
@@ -642,7 +705,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
                        req->status = ret;
                        goto free_req;
                }
-               req->data_len += req->iovs[i++].iov.iov_len;
+               req->data_len += req->iovs[i].iov.iov_len;
        }
        SDMA_DBG(req, "total data length %u", req->data_len);
 
@@ -686,9 +749,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
                idx++;
        }
 
+       dlid = be16_to_cpu(req->hdr.lrh[1]);
+       selector = dlid_to_selector(dlid);
+
        /* Have to select the engine */
        req->sde = sdma_select_engine_vl(dd,
-                                        (u32)(uctxt->ctxt + fd->subctxt),
+                                        (u32)(uctxt->ctxt + fd->subctxt +
+                                              selector),
                                         vl);
        if (!req->sde || !sdma_running(req->sde)) {
                ret = -ECOMM;
@@ -766,14 +833,21 @@ static inline u32 compute_data_length(struct user_sdma_request *req,
         * The size of the data of the first packet is in the header
         * template. However, it includes the header and ICRC, which need
         * to be subtracted.
+        * The minimum representable packet data length in a header is 4 bytes,
+        * so when the request data length is less than 4 bytes, there is
+        * only one packet, and the packet data length is equal to the
+        * request data length.
         * The size of the remaining packets is the minimum of the frag
         * size (MTU) or remaining data in the request.
         */
        u32 len;
 
        if (!req->seqnum) {
-               len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
-                      (sizeof(tx->hdr) - 4));
+               if (req->data_len < sizeof(u32))
+                       len = req->data_len;
+               else
+                       len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
+                              (sizeof(tx->hdr) - 4));
        } else if (req_opcode(req->info.ctrl) == EXPECTED) {
                u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
                        PAGE_SIZE;
@@ -803,6 +877,13 @@ static inline u32 compute_data_length(struct user_sdma_request *req,
        return len;
 }
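
A worked example of the first-packet computation, with hypothetical header sizes (an 8-byte PBC inside tx->hdr, plus the 4-byte ICRC):

	/*
	 * Suppose sizeof(tx->hdr) == 40 (8B PBC + 32B IB header) and
	 * lrh[2] == 18 dwords.  The LRH length covers header-minus-PBC,
	 * payload and ICRC, so:
	 *
	 *   wire bytes = 18 << 2 = 72 = (40 - 8) + payload + 4
	 *   payload    = 72 - (sizeof(tx->hdr) - 4) = 72 - 36 = 36
	 *
	 * and when req->data_len < 4 the request fits in one packet, so
	 * len is simply req->data_len, per the new special case.
	 */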
 
+static inline u32 pad_len(u32 len)
+{
+       if (len & (sizeof(u32) - 1))
+               len += sizeof(u32) - (len & (sizeof(u32) - 1));
+       return len;
+}
+
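
pad_len() rounds a payload length up to the next 4-byte (dword) boundary so the LRH length computed from it accounts for wire padding; it behaves like ALIGN(len, 4):

	/* pad_len(0) == 0, pad_len(1) == 4, pad_len(5) == 8, pad_len(8) == 8 */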
 static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
 {
        /* (Size of complete header - size of PBC) + 4B ICRC + data length */
@@ -894,7 +975,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
                if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
                        if (!req->seqnum) {
                                u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
-                               u32 lrhlen = get_lrh_len(req->hdr, datalen);
+                               u32 lrhlen = get_lrh_len(req->hdr,
+                                                        pad_len(datalen));
                                /*
                                 * Copy the request header into the tx header
                                 * because the HW needs a cacheline-aligned
@@ -1048,39 +1130,24 @@ static inline int num_user_pages(const struct iovec *iov)
 
 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
 {
-       u32 cleared = 0;
-       struct sdma_mmu_node *node, *ptr;
-       struct list_head to_evict = LIST_HEAD_INIT(to_evict);
-
-       spin_lock(&pq->evict_lock);
-       list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
-               /* Make sure that no one is still using the node. */
-               if (!atomic_read(&node->refcount)) {
-                       set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
-                       list_del_init(&node->list);
-                       list_add(&node->list, &to_evict);
-                       cleared += node->npages;
-                       if (cleared >= npages)
-                               break;
-               }
-       }
-       spin_unlock(&pq->evict_lock);
-
-       list_for_each_entry_safe(node, ptr, &to_evict, list)
-               hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
+       struct evict_data evict_data;
 
-       return cleared;
+       evict_data.cleared = 0;
+       evict_data.target = npages;
+       hfi1_mmu_rb_evict(pq->handler, &evict_data);
+       return evict_data.cleared;
 }
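
Eviction is now delegated to the shared mmu_rb handler: this function only packages the target page count into an evict_data and lets the handler walk its nodes, invoking the client's .evict op (sdma_rb_evict(), further down) until the op sets *stop. The handler body lives in mmu_rb.c, outside this hunk; a hedged sketch of its presumed shape, where the lock and list member names are assumptions:

	static void mmu_rb_evict_sketch(struct mmu_rb_handler *handler,
					void *evict_arg)
	{
		struct mmu_rb_node *node, *ptr;
		struct list_head del_list = LIST_HEAD_INIT(del_list);
		unsigned long flags;
		bool stop = false;

		spin_lock_irqsave(&handler->lock, flags);
		list_for_each_entry_safe(node, ptr, &handler->lru_list, list) {
			/* client returns 1 to remove the node, 0 to keep it */
			if (handler->ops->evict(handler->ops_arg, node,
						evict_arg, &stop))
				list_move(&node->list, &del_list);
			if (stop)
				break;
		}
		spin_unlock_irqrestore(&handler->lock, flags);
		/* ops->remove() then runs on each node in del_list,
		 * outside the lock */
	}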
 
 static int pin_vector_pages(struct user_sdma_request *req,
-                           struct user_sdma_iovec *iovec) {
+                           struct user_sdma_iovec *iovec)
+{
        int ret = 0, pinned, npages, cleared;
        struct page **pages;
        struct hfi1_user_sdma_pkt_q *pq = req->pq;
        struct sdma_mmu_node *node = NULL;
        struct mmu_rb_node *rb_node;
 
-       rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
+       rb_node = hfi1_mmu_rb_extract(pq->handler,
                                      (unsigned long)iovec->iov.iov_base,
                                      iovec->iov.iov_len);
        if (rb_node && !IS_ERR(rb_node))
@@ -1096,7 +1163,6 @@ static int pin_vector_pages(struct user_sdma_request *req,
                node->rb.addr = (unsigned long)iovec->iov.iov_base;
                node->pq = pq;
                atomic_set(&node->refcount, 0);
-               INIT_LIST_HEAD(&node->list);
        }
 
        npages = num_user_pages(&iovec->iov);
@@ -1111,28 +1177,14 @@ static int pin_vector_pages(struct user_sdma_request *req,
 
                npages -= node->npages;
 
-               /*
-                * If rb_node is NULL, it means that this is brand new node
-                * and, therefore not on the eviction list.
-                * If, however, the rb_node is non-NULL, it means that the
-                * node is already in RB tree and, therefore on the eviction
-                * list (nodes are unconditionally inserted in the eviction
-                * list). In that case, we have to remove the node prior to
-                * calling the eviction function in order to prevent it from
-                * freeing this node.
-                */
-               if (rb_node) {
-                       spin_lock(&pq->evict_lock);
-                       list_del_init(&node->list);
-                       spin_unlock(&pq->evict_lock);
-               }
 retry:
-               if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
+               if (!hfi1_can_pin_pages(pq->dd, pq->mm,
+                                       atomic_read(&pq->n_locked), npages)) {
                        cleared = sdma_cache_evict(pq, npages);
                        if (cleared >= npages)
                                goto retry;
                }
-               pinned = hfi1_acquire_user_pages(
+               pinned = hfi1_acquire_user_pages(pq->mm,
                        ((unsigned long)iovec->iov.iov_base +
                         (node->npages * PAGE_SIZE)), npages, 0,
                        pages + node->npages);
@@ -1142,7 +1194,7 @@ retry:
                        goto bail;
                }
                if (pinned != npages) {
-                       unpin_vector_pages(current->mm, pages, node->npages,
+                       unpin_vector_pages(pq->mm, pages, node->npages,
                                           pinned);
                        ret = -EFAULT;
                        goto bail;
@@ -1152,28 +1204,22 @@ retry:
                node->pages = pages;
                node->npages += pinned;
                npages = node->npages;
-               spin_lock(&pq->evict_lock);
-               list_add(&node->list, &pq->evict);
-               pq->n_locked += pinned;
-               spin_unlock(&pq->evict_lock);
+               atomic_add(pinned, &pq->n_locked);
        }
        iovec->pages = node->pages;
        iovec->npages = npages;
        iovec->node = node;
 
-       ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
+       ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
        if (ret) {
-               spin_lock(&pq->evict_lock);
-               if (!list_empty(&node->list))
-                       list_del(&node->list);
-               pq->n_locked -= node->npages;
-               spin_unlock(&pq->evict_lock);
+               atomic_sub(node->npages, &pq->n_locked);
+               iovec->node = NULL;
                goto bail;
        }
        return 0;
 bail:
        if (rb_node)
-               unpin_vector_pages(current->mm, node->pages, 0, node->npages);
+               unpin_vector_pages(pq->mm, node->pages, 0, node->npages);
        kfree(node);
        return ret;
 }
@@ -1181,7 +1227,7 @@ bail:
 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
                               unsigned start, unsigned npages)
 {
-       hfi1_release_user_pages(mm, pages + start, npages, 0);
+       hfi1_release_user_pages(mm, pages + start, npages, false);
        kfree(pages);
 }
 
@@ -1192,16 +1238,14 @@ static int check_header_template(struct user_sdma_request *req,
        /*
         * Perform safety checks for any type of packet:
         *    - transfer size is multiple of 64 bytes
-        *    - packet length is multiple of 4bytes
-        *    - entire request length is multiple of 4bytes
+        *    - packet length is multiple of 4 bytes
         *    - packet length is not larger than MTU size
         *
         * These checks are only done for the first packet of the
         * transfer since the header is "given" to us by user space.
         * For the remainder of the packets we compute the values.
         */
-       if (req->info.fragsize % PIO_BLOCK_SIZE ||
-           lrhlen & 0x3 || req->data_len & 0x3  ||
+       if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
            lrhlen > get_lrh_len(*hdr, req->info.fragsize))
                return -EINVAL;
 
@@ -1263,7 +1307,7 @@ static int set_txreq_header(struct user_sdma_request *req,
        struct hfi1_pkt_header *hdr = &tx->hdr;
        u16 pbclen;
        int ret;
-       u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
+       u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
 
        /* Copy the header template to the request before modification */
        memcpy(hdr, &req->hdr, sizeof(*hdr));
@@ -1374,7 +1418,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
        struct hfi1_user_sdma_pkt_q *pq = req->pq;
        struct hfi1_pkt_header *hdr = &req->hdr;
        u16 pbclen = le16_to_cpu(hdr->pbc[0]);
-       u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
+       u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len));
 
        if (PBC2LRH(pbclen) != lrhlen) {
                /* PBC.PbcLengthDWs */
@@ -1534,14 +1578,14 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
                                continue;
 
                        if (unpin)
-                               hfi1_mmu_rb_remove(&req->pq->sdma_rb_root,
+                               hfi1_mmu_rb_remove(req->pq->handler,
                                                   &node->rb);
                        else
                                atomic_dec(&node->refcount);
                }
        }
        kfree(req->tids);
-       clear_bit(SDMA_REQ_IN_USE, &req->flags);
+       clear_bit(req->info.comp_idx, req->pq->req_in_use);
 }
 
 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
@@ -1564,7 +1608,7 @@ static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
        return (bool)(node->addr == addr);
 }
 
-static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
+static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
 {
        struct sdma_mmu_node *node =
                container_of(mnode, struct sdma_mmu_node, rb);
@@ -1573,48 +1617,45 @@ static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
        return 0;
 }
 
-static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
-                          struct mm_struct *mm)
+/*
+ * Return 1 to remove the node from the rb tree and call the remove op.
+ *
+ * Called with the rb tree lock held.
+ */
+static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
+                        void *evict_arg, bool *stop)
+{
+       struct sdma_mmu_node *node =
+               container_of(mnode, struct sdma_mmu_node, rb);
+       struct evict_data *evict_data = evict_arg;
+
+       /* is this node still being used? */
+       if (atomic_read(&node->refcount))
+               return 0; /* keep this node */
+
+       /* this node will be evicted, add its pages to our count */
+       evict_data->cleared += node->npages;
+
+       /* have enough pages been cleared? */
+       if (evict_data->cleared >= evict_data->target)
+               *stop = true;
+
+       return 1; /* remove this node */
+}
+
+static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
 {
        struct sdma_mmu_node *node =
                container_of(mnode, struct sdma_mmu_node, rb);
 
-       spin_lock(&node->pq->evict_lock);
-       /*
-        * We've been called by the MMU notifier but this node has been
-        * scheduled for eviction. The eviction function will take care
-        * of freeing this node.
-        * We have to take the above lock first because we are racing
-        * against the setting of the bit in the eviction function.
-        */
-       if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
-               spin_unlock(&node->pq->evict_lock);
-               return;
-       }
+       atomic_sub(node->npages, &node->pq->n_locked);
 
-       if (!list_empty(&node->list))
-               list_del(&node->list);
-       node->pq->n_locked -= node->npages;
-       spin_unlock(&node->pq->evict_lock);
+       unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
 
-       /*
-        * If mm is set, we are being called by the MMU notifier and we
-        * should not pass a mm_struct to unpin_vector_page(). This is to
-        * prevent a deadlock when hfi1_release_user_pages() attempts to
-        * take the mmap_sem, which the MMU notifier has already taken.
-        */
-       unpin_vector_pages(mm ? NULL : current->mm, node->pages, 0,
-                          node->npages);
-       /*
-        * If called by the MMU notifier, we have to adjust the pinned
-        * page count ourselves.
-        */
-       if (mm)
-               mm->pinned_vm -= node->npages;
        kfree(node);
 }
 
-static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
+static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
 {
        struct sdma_mmu_node *node =
                container_of(mnode, struct sdma_mmu_node, rb);
index b9240e351161c9dad40e253c9f045e99b912a858..39001714f5517ecc8c2e74a4769a589d7e6ee06a 100644 (file)
@@ -63,14 +63,14 @@ struct hfi1_user_sdma_pkt_q {
        struct hfi1_devdata *dd;
        struct kmem_cache *txreq_cache;
        struct user_sdma_request *reqs;
+       unsigned long *req_in_use;
        struct iowait busy;
        unsigned state;
        wait_queue_head_t wait;
        unsigned long unpinned;
-       struct rb_root sdma_rb_root;
-       u32 n_locked;
-       struct list_head evict;
-       spinlock_t evict_lock; /* protect evict and n_locked */
+       struct mmu_rb_handler *handler;
+       atomic_t n_locked;
+       struct mm_struct *mm;
 };
 
 struct hfi1_user_sdma_comp_q {
index dd4be3c2b2254d11a0045ac6adf6739ba658ea3c..2b359540901db3dd4c42cad718cc938ff7139d6a 100644 (file)
@@ -306,7 +306,10 @@ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
        [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
        [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
        [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
-       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
+       [IB_WR_SEND_WITH_INV] = IB_WC_SEND,
+       [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV,
+       [IB_WR_REG_MR] = IB_WC_REG_MR
 };
 
 /*
@@ -378,6 +381,8 @@ static const opcode_handler opcode_handler_tbl[256] = {
        [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
        [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
        [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = &hfi1_rc_rcv,
        /* UC */
        [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
        [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
@@ -540,19 +545,15 @@ void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release)
 /*
  * Make sure the QP is ready and able to accept the given opcode.
  */
-static inline int qp_ok(int opcode, struct hfi1_packet *packet)
+static inline opcode_handler qp_ok(int opcode, struct hfi1_packet *packet)
 {
-       struct hfi1_ibport *ibp;
-
        if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
-               goto dropit;
+               return NULL;
        if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
            (opcode == IB_OPCODE_CNP))
-               return 1;
-dropit:
-       ibp = &packet->rcd->ppd->ibport_data;
-       ibp->rvp.n_pkt_drops++;
-       return 0;
+               return opcode_handler_tbl[opcode];
+
+       return NULL;
 }
 
 /**
@@ -571,6 +572,7 @@ void hfi1_ib_rcv(struct hfi1_packet *packet)
        struct hfi1_pportdata *ppd = rcd->ppd;
        struct hfi1_ibport *ibp = &ppd->ibport_data;
        struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+       opcode_handler packet_handler;
        unsigned long flags;
        u32 qp_num;
        int lnh;
@@ -616,8 +618,11 @@ void hfi1_ib_rcv(struct hfi1_packet *packet)
                list_for_each_entry_rcu(p, &mcast->qp_list, list) {
                        packet->qp = p->qp;
                        spin_lock_irqsave(&packet->qp->r_lock, flags);
-                       if (likely((qp_ok(opcode, packet))))
-                               opcode_handler_tbl[opcode](packet);
+                       packet_handler = qp_ok(opcode, packet);
+                       if (likely(packet_handler))
+                               packet_handler(packet);
+                       else
+                               ibp->rvp.n_pkt_drops++;
                        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
                }
                /*
@@ -634,8 +639,11 @@ void hfi1_ib_rcv(struct hfi1_packet *packet)
                        goto drop;
                }
                spin_lock_irqsave(&packet->qp->r_lock, flags);
-               if (likely((qp_ok(opcode, packet))))
-                       opcode_handler_tbl[opcode](packet);
+               packet_handler = qp_ok(opcode, packet);
+               if (likely(packet_handler))
+                       packet_handler(packet);
+               else
+                       ibp->rvp.n_pkt_drops++;
                spin_unlock_irqrestore(&packet->qp->r_lock, flags);
                rcu_read_unlock();
        }
@@ -808,19 +816,19 @@ static int build_verbs_tx_desc(
        struct rvt_sge_state *ss,
        u32 length,
        struct verbs_txreq *tx,
-       struct ahg_ib_header *ahdr,
+       struct hfi1_ahg_info *ahg_info,
        u64 pbc)
 {
        int ret = 0;
-       struct hfi1_pio_header *phdr = &tx->phdr;
+       struct hfi1_sdma_header *phdr = &tx->phdr;
        u16 hdrbytes = tx->hdr_dwords << 2;
 
-       if (!ahdr->ahgcount) {
+       if (!ahg_info->ahgcount) {
                ret = sdma_txinit_ahg(
                        &tx->txreq,
-                       ahdr->tx_flags,
+                       ahg_info->tx_flags,
                        hdrbytes + length,
-                       ahdr->ahgidx,
+                       ahg_info->ahgidx,
                        0,
                        NULL,
                        0,
@@ -838,11 +846,11 @@ static int build_verbs_tx_desc(
        } else {
                ret = sdma_txinit_ahg(
                        &tx->txreq,
-                       ahdr->tx_flags,
+                       ahg_info->tx_flags,
                        length,
-                       ahdr->ahgidx,
-                       ahdr->ahgcount,
-                       ahdr->ahgdesc,
+                       ahg_info->ahgidx,
+                       ahg_info->ahgcount,
+                       ahg_info->ahgdesc,
                        hdrbytes,
                        verbs_sdma_complete);
                if (ret)
@@ -860,7 +868,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                        u64 pbc)
 {
        struct hfi1_qp_priv *priv = qp->priv;
-       struct ahg_ib_header *ahdr = priv->s_hdr;
+       struct hfi1_ahg_info *ahg_info = priv->s_ahg;
        u32 hdrwords = qp->s_hdrwords;
        struct rvt_sge_state *ss = qp->s_cur_sge;
        u32 len = qp->s_cur_size;
@@ -888,7 +896,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                                         plen);
                }
                tx->wqe = qp->s_wqe;
-               ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc);
+               ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahg_info, pbc);
                if (unlikely(ret))
                        goto bail_build;
        }
@@ -1300,13 +1308,15 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
        rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
                        IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
                        IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
-                       IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+                       IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE |
+                       IB_DEVICE_MEM_MGT_EXTENSIONS;
        rdi->dparms.props.page_size_cap = PAGE_SIZE;
        rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
        rdi->dparms.props.vendor_part_id = dd->pcidev->device;
        rdi->dparms.props.hw_ver = dd->minrev;
        rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid;
-       rdi->dparms.props.max_mr_size = ~0ULL;
+       rdi->dparms.props.max_mr_size = U64_MAX;
+       rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
        rdi->dparms.props.max_qp = hfi1_max_qps;
        rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
        rdi->dparms.props.max_sge = hfi1_max_sges;
@@ -1695,6 +1705,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
        dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
        dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
 
+       /* post send table */
+       dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
+
        ppd = dd->pport;
        for (i = 0; i < dd->num_pports; i++, ppd++)
                rvt_init_port(&dd->verbs_dev.rdi,
@@ -1745,8 +1758,7 @@ void hfi1_cnp_rcv(struct hfi1_packet *packet)
        struct rvt_qp *qp = packet->qp;
        u32 lqpn, rqpn = 0;
        u16 rlid = 0;
-       u8 sl, sc5, sc4_bit, svc_type;
-       bool sc4_set = has_sc4_bit(packet);
+       u8 sl, sc5, svc_type;
 
        switch (packet->qp->ibqp.qp_type) {
        case IB_QPT_UC:
@@ -1769,9 +1781,7 @@ void hfi1_cnp_rcv(struct hfi1_packet *packet)
                return;
        }
 
-       sc4_bit = sc4_set << 4;
-       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
-       sc5 |= sc4_bit;
+       sc5 = hdr2sc((struct hfi1_message_header *)hdr, packet->rhf);
        sl = ibp->sc_to_sl[sc5];
        lqpn = qp->ibqp.qp_num;
 
index 488356775627809599b6b7702b86d56ef396a2a9..d1b101c5482898b59db32ea895e433ee9bc1ce92 100644 (file)
@@ -178,16 +178,14 @@ struct hfi1_ib_header {
        } u;
 } __packed;
 
-struct ahg_ib_header {
-       struct sdma_engine *sde;
+struct hfi1_ahg_info {
        u32 ahgdesc[2];
        u16 tx_flags;
        u8 ahgcount;
        u8 ahgidx;
-       struct hfi1_ib_header ibh;
 };
 
-struct hfi1_pio_header {
+struct hfi1_sdma_header {
        __le64 pbc;
        struct hfi1_ib_header hdr;
 } __packed;
@@ -197,7 +195,7 @@ struct hfi1_pio_header {
  * pair is made common
  */
 struct hfi1_qp_priv {
-       struct ahg_ib_header *s_hdr;              /* next header to send */
+       struct hfi1_ahg_info *s_ahg;              /* ahg info for next header */
        struct sdma_engine *s_sde;                /* current sde */
        struct send_context *s_sendcontext;       /* current sendcontext */
        u8 s_sc;                                  /* SC[0..4] for next packet */
index a1d6e0807f97161f5336120ab7e6d0ffdb175b4e..5660897593ba4444f43661cba45d1ee06102ad20 100644 (file)
@@ -56,7 +56,7 @@
 #include "iowait.h"
 
 struct verbs_txreq {
-       struct hfi1_pio_header  phdr;
+       struct hfi1_sdma_header phdr;
        struct sdma_txreq       txreq;
        struct rvt_qp           *qp;
        struct rvt_swqe         *wqe;
index 575b737d9ef3d59441c35545f22ebdf53731b835..9cc0aae1d78191735b31eaed33bca1ee21de7d01 100644 (file)
@@ -106,6 +106,49 @@ static u32 credit_table[31] = {
        32768                   /* 1E */
 };
 
+const struct rvt_operation_params qib_post_parms[RVT_OPERATION_MAX] = {
+[IB_WR_RDMA_WRITE] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_RDMA_READ] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC,
+},
+
+[IB_WR_ATOMIC_CMP_AND_SWP] = {
+       .length = sizeof(struct ib_atomic_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_ATOMIC_FETCH_AND_ADD] = {
+       .length = sizeof(struct ib_atomic_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
+},
+
+[IB_WR_RDMA_WRITE_WITH_IMM] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+                      BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+[IB_WR_SEND_WITH_IMM] = {
+       .length = sizeof(struct ib_send_wr),
+       .qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) | BIT(IB_QPT_GSI) |
+                      BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
+},
+
+};
+
 static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map,
                         gfp_t gfp)
 {
index 846e6c726df7c67204f1dc1d5f1bfb066c36e149..10d062561bd96f49617a754d26873353b39d78a4 100644 (file)
@@ -169,8 +169,12 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe)
        }
 
        if (ah_attr->ah_flags & IB_AH_GRH) {
-               qib_copy_sge(&qp->r_sge, &ah_attr->grh,
-                            sizeof(struct ib_grh), 1);
+               struct ib_grh grh;
+               struct ib_global_route grd = ah_attr->grh;
+
+               qib_make_grh(ibp, &grh, &grd, 0, 0);
+               qib_copy_sge(&qp->r_sge, &grh,
+                            sizeof(grh), 1);
                wc.wc_flags |= IB_WC_GRH;
        } else
                qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
index cbf6200e6afc06b9ba7dc485e00cd807ea78872f..fd1dfbce5539742cb6b5a2ed994b213c50d5672a 100644 (file)
@@ -1582,6 +1582,8 @@ static void qib_fill_device_attr(struct qib_devdata *dd)
        rdi->dparms.props.max_total_mcast_qp_attach =
                                        rdi->dparms.props.max_mcast_qp_attach *
                                        rdi->dparms.props.max_mcast_grp;
+       /* post send table */
+       dd->verbs_dev.rdi.post_parms = qib_post_parms;
 }
 
 /**
index 4f878151f81ff43263ea9524f10b09918f94e565..736ced68484221de35433a9a95e6619fdddaeecc 100644 (file)
@@ -497,4 +497,6 @@ extern unsigned int ib_qib_max_srq_wrs;
 
 extern const u32 ib_qib_rnr_table[];
 
+extern const struct rvt_operation_params qib_post_parms[];
+
 #endif                          /* QIB_VERBS_H */
index 6ca6fa80dd6e7883c071003de9a10853dc593f10..f2f229efbe64d7d8ddcdf4f985ad23be76ec6977 100644 (file)
@@ -510,6 +510,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi)
 
        if (rdi->worker)
                return 0;
+       spin_lock_init(&rdi->n_cqs_lock);
        rdi->worker = kzalloc(sizeof(*rdi->worker), GFP_KERNEL);
        if (!rdi->worker)
                return -ENOMEM;
index 0f4d4500f45e25597322e00cd744c4d05cda9831..80c4b6b401b83af99d8a1dde1ada784daeb8b6cb 100644 (file)
@@ -140,6 +140,7 @@ static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd,
        init_completion(&mr->comp);
        /* count returning the ptr to user */
        atomic_set(&mr->refcount, 1);
+       atomic_set(&mr->lkey_invalid, 0);
        mr->pd = pd;
        mr->max_segs = count;
        return 0;
@@ -479,6 +480,123 @@ struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
        return &mr->ibmr;
 }
 
+/**
+ * rvt_set_page - page assignment function called by ib_sg_to_pages
+ * @ibmr: memory region
+ * @addr: dma address of mapped page
+ *
+ * Return: 0 on success
+ */
+static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
+{
+       struct rvt_mr *mr = to_imr(ibmr);
+       u32 ps = 1 << mr->mr.page_shift;
+       u32 mapped_segs = mr->mr.length >> mr->mr.page_shift;
+       int m, n;
+
+       if (unlikely(mapped_segs == mr->mr.max_segs))
+               return -ENOMEM;
+
+       if (mr->mr.length == 0) {
+               mr->mr.user_base = addr;
+               mr->mr.iova = addr;
+       }
+
+       m = mapped_segs / RVT_SEGSZ;
+       n = mapped_segs % RVT_SEGSZ;
+       mr->mr.map[m]->segs[n].vaddr = (void *)addr;
+       mr->mr.map[m]->segs[n].length = ps;
+       mr->mr.length += ps;
+
+       return 0;
+}
+
+/**
+ * rvt_map_mr_sg - map sg list and set it the memory region
+ * @ibmr: memory region
+ * @sg: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @sg_offset: offset in bytes into sg
+ *
+ * Return: number of sg elements mapped to the memory region
+ */
+int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+                 int sg_nents, unsigned int *sg_offset)
+{
+       struct rvt_mr *mr = to_imr(ibmr);
+
+       mr->mr.length = 0;
+       mr->mr.page_shift = PAGE_SHIFT;
+       return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
+                             rvt_set_page);
+}
+
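
rvt_set_page() and rvt_map_mr_sg() are the rdmavt half of the kernel's fast-registration flow; a ULP reaches them through the core verbs. A hedged consumer-side sketch using only generic verbs calls (names and error handling simplified):

	static int example_fast_reg(struct ib_pd *pd, struct ib_qp *qp,
				    struct scatterlist *sg, int sg_nents)
	{
		struct ib_send_wr *bad_wr;
		struct ib_reg_wr reg_wr = { };
		struct ib_mr *mr;
		int n;

		mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, sg_nents);
		if (IS_ERR(mr))
			return PTR_ERR(mr);

		/* ends up in rvt_map_mr_sg() -> ib_sg_to_pages() -> rvt_set_page() */
		n = ib_map_mr_sg(mr, sg, sg_nents, NULL, PAGE_SIZE);
		if (n < sg_nents)
			return -EINVAL;

		reg_wr.wr.opcode = IB_WR_REG_MR;
		reg_wr.wr.send_flags = IB_SEND_SIGNALED;
		reg_wr.mr = mr;
		reg_wr.key = mr->rkey;
		reg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;

		return ib_post_send(qp, &reg_wr.wr, &bad_wr);
	}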
+/**
+ * rvt_fast_reg_mr - fast register physical MR
+ * @qp: the queue pair where the work request comes from
+ * @ibmr: the memory region to be registered
+ * @key: updated key for this memory region
+ * @access: access flags for this memory region
+ *
+ * Returns 0 on success.
+ */
+int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
+                   int access)
+{
+       struct rvt_mr *mr = to_imr(ibmr);
+
+       if (qp->ibqp.pd != mr->mr.pd)
+               return -EACCES;
+
+       /* not applicable to dma MR or user MR */
+       if (!mr->mr.lkey || mr->umem)
+               return -EINVAL;
+
+       if ((key & 0xFFFFFF00) != (mr->mr.lkey & 0xFFFFFF00))
+               return -EINVAL;
+
+       ibmr->lkey = key;
+       ibmr->rkey = key;
+       mr->mr.lkey = key;
+       mr->mr.access_flags = access;
+       atomic_set(&mr->mr.lkey_invalid, 0);
+
+       return 0;
+}
+EXPORT_SYMBOL(rvt_fast_reg_mr);
+
+/**
+ * rvt_invalidate_rkey - invalidate an MR rkey
+ * @qp: queue pair associated with the invalidate op
+ * @rkey: rkey to invalidate
+ *
+ * Returns 0 on success.
+ */
+int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey)
+{
+       struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device);
+       struct rvt_lkey_table *rkt = &dev->lkey_table;
+       struct rvt_mregion *mr;
+
+       if (rkey == 0)
+               return -EINVAL;
+
+       rcu_read_lock();
+       mr = rcu_dereference(
+               rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]);
+       if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+               goto bail;
+
+       atomic_set(&mr->lkey_invalid, 1);
+       rcu_read_unlock();
+       return 0;
+
+bail:
+       rcu_read_unlock();
+       return -EINVAL;
+}
+EXPORT_SYMBOL(rvt_invalidate_rkey);
+
 /**
  * rvt_alloc_fmr - allocate a fast memory region
  * @pd: the protection domain for this memory region
@@ -682,7 +800,8 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
        }
        mr = rcu_dereference(
                rkt->table[(sge->lkey >> (32 - dev->dparms.lkey_table_size))]);
-       if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
+       if (unlikely(!mr || atomic_read(&mr->lkey_invalid) ||
+                    mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
                goto bail;
 
        off = sge->addr - mr->user_base;
@@ -782,7 +901,8 @@ int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
 
        mr = rcu_dereference(
                rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]);
-       if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+       if (unlikely(!mr || atomic_read(&mr->lkey_invalid) ||
+                    mr->lkey != rkey || qp->ibqp.pd != mr->pd))
                goto bail;
 
        off = vaddr - mr->iova;
index 69380512c6d16ce3769f43915c5d8d8606cbe14d..132800ee0205130ad73f3fcb3ae7f18a8cec6e7d 100644 (file)
@@ -82,6 +82,8 @@ int rvt_dereg_mr(struct ib_mr *ibmr);
 struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
                           enum ib_mr_type mr_type,
                           u32 max_num_sg);
+int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+                 int sg_nents, unsigned int *sg_offset);
 struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
                             struct ib_fmr_attr *fmr_attr);
 int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
index 41ba7e9cadaab29895fec78c0284ddfed2558949..bdb540f25a888dcc24b24cf96268c6d8f8a3c479 100644 (file)
@@ -435,8 +435,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
        for (n = 0; n < rvt_max_atomic(rdi); n++) {
                struct rvt_ack_entry *e = &qp->s_ack_queue[n];
 
-               if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
-                   e->rdma_sge.mr) {
+               if (e->rdma_sge.mr) {
                        rvt_put_mr(e->rdma_sge.mr);
                        e->rdma_sge.mr = NULL;
                }
@@ -584,6 +583,7 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
                qp->r_rq.wq->tail = 0;
        }
        qp->r_sge.num_sge = 0;
+       atomic_set(&qp->s_reserved_used, 0);
 }
 
 /**
@@ -613,6 +613,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
        struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
        void *priv = NULL;
        gfp_t gfp;
+       size_t sqsize;
 
        if (!rdi)
                return ERR_PTR(-EINVAL);
@@ -643,7 +644,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                    init_attr->cap.max_recv_wr == 0)
                        return ERR_PTR(-EINVAL);
        }
-
+       sqsize =
+               init_attr->cap.max_send_wr + 1 +
+               rdi->dparms.reserved_operations;
        switch (init_attr->qp_type) {
        case IB_QPT_SMI:
        case IB_QPT_GSI:
@@ -658,11 +661,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                        sizeof(struct rvt_swqe);
                if (gfp == GFP_NOIO)
                        swq = __vmalloc(
-                               (init_attr->cap.max_send_wr + 1) * sz,
+                               sqsize * sz,
                                gfp | __GFP_ZERO, PAGE_KERNEL);
                else
                        swq = vzalloc_node(
-                               (init_attr->cap.max_send_wr + 1) * sz,
+                               sqsize * sz,
                                rdi->dparms.node);
                if (!swq)
                        return ERR_PTR(-ENOMEM);
@@ -741,13 +744,14 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                spin_lock_init(&qp->s_lock);
                spin_lock_init(&qp->r_rq.lock);
                atomic_set(&qp->refcount, 0);
+               atomic_set(&qp->local_ops_pending, 0);
                init_waitqueue_head(&qp->wait);
                init_timer(&qp->s_timer);
                qp->s_timer.data = (unsigned long)qp;
                INIT_LIST_HEAD(&qp->rspwait);
                qp->state = IB_QPS_RESET;
                qp->s_wq = swq;
-               qp->s_size = init_attr->cap.max_send_wr + 1;
+               qp->s_size = sqsize;
                qp->s_avail = init_attr->cap.max_send_wr;
                qp->s_max_sge = init_attr->cap.max_send_sge;
                if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
@@ -1332,7 +1336,8 @@ int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask;
        attr->dest_qp_num = qp->remote_qpn;
        attr->qp_access_flags = qp->qp_access_flags;
-       attr->cap.max_send_wr = qp->s_size - 1;
+       attr->cap.max_send_wr = qp->s_size - 1 -
+               rdi->dparms.reserved_operations;
        attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
        attr->cap.max_send_sge = qp->s_max_sge;
        attr->cap.max_recv_sge = qp->r_rq.max_sge;
@@ -1440,25 +1445,116 @@ int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 }
 
 /**
- * qp_get_savail - return number of avail send entries
+ * rvt_qp_valid_operation - validate post send wr request
+ * @qp - the qp
+ * @post_parms - the post send table for the driver
+ * @wr - the work request
+ *
+ * The routine validates the operation based on the
+ * validation table and returns the length of the operation,
+ * which can extend beyond the ib_send_wr.  Operation-dependent
+ * flags in the table key the atomic operation validation.
  *
+ * There is an exception for UD QPs: the pd is validated, and
+ * the length is overridden to include the additional UD-specific
+ * length.
+ *
+ * Returns a negative error or the length of the work request
+ * for building the swqe.
+ */
+static inline int rvt_qp_valid_operation(
+       struct rvt_qp *qp,
+       const struct rvt_operation_params *post_parms,
+       struct ib_send_wr *wr)
+{
+       int len;
+
+       if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length)
+               return -EINVAL;
+       if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type)))
+               return -EINVAL;
+       if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) &&
+           ibpd_to_rvtpd(qp->ibqp.pd)->user)
+               return -EINVAL;
+       if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE &&
+           (wr->num_sge == 0 ||
+            wr->sg_list[0].length < sizeof(u64) ||
+            wr->sg_list[0].addr & (sizeof(u64) - 1)))
+               return -EINVAL;
+       if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC &&
+           !qp->s_max_rd_atomic)
+               return -EINVAL;
+       len = post_parms[wr->opcode].length;
+       /* UD specific */
+       if (qp->ibqp.qp_type != IB_QPT_UC &&
+           qp->ibqp.qp_type != IB_QPT_RC) {
+               if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
+                       return -EINVAL;
+               len = sizeof(struct ib_ud_wr);
+       }
+       return len;
+}
+
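
With the driver tables in place (e.g. qib_post_parms above), validation reduces to a lookup. For an RC QP posting IB_WR_ATOMIC_CMP_AND_SWP, for example:

	/*
	 *   post_parms[opcode].length     == sizeof(struct ib_atomic_wr)
	 *   qpt_support & BIT(IB_QPT_RC)  -> opcode allowed on this QP type
	 *   RVT_OPERATION_ATOMIC_SGE      -> sg_list[0] must be at least
	 *                                    8 bytes and 8-byte aligned
	 *   RVT_OPERATION_ATOMIC          -> qp->s_max_rd_atomic must be set
	 *
	 * The returned sizeof(struct ib_atomic_wr) becomes cplen in
	 * rvt_post_one_wr(), the number of bytes copied into the swqe.
	 */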
+/**
+ * rvt_qp_is_avail - determine queue capacity
  * @qp - the qp
+ * @rdi - the rdmavt device
+ * @reserved_op - true if this is a reserved operation
  *
  * This assumes the s_hlock is held but the s_last
  * qp variable is uncontrolled.
+ *
+ * For non-reserved operations, qp->s_avail
+ * may be changed.
+ *
+ * The return value is zero or -ENOMEM.
  */
-static inline u32 qp_get_savail(struct rvt_qp *qp)
+static inline int rvt_qp_is_avail(
+       struct rvt_qp *qp,
+       struct rvt_dev_info *rdi,
+       bool reserved_op)
 {
        u32 slast;
-       u32 ret;
-
+       u32 avail;
+       u32 reserved_used;
+
+       /* see rvt_qp_wqe_unreserve() */
+       smp_mb__before_atomic();
+       reserved_used = atomic_read(&qp->s_reserved_used);
+       if (unlikely(reserved_op)) {
+               /* see rvt_qp_wqe_unreserve() */
+               smp_mb__before_atomic();
+               if (reserved_used >= rdi->dparms.reserved_operations)
+                       return -ENOMEM;
+               return 0;
+       }
+       /* non-reserved operations */
+       if (likely(qp->s_avail))
+               return 0;
        smp_read_barrier_depends(); /* see rc.c */
        slast = ACCESS_ONCE(qp->s_last);
        if (qp->s_head >= slast)
-               ret = qp->s_size - (qp->s_head - slast);
+               avail = qp->s_size - (qp->s_head - slast);
        else
-               ret = slast - qp->s_head;
-       return ret - 1;
+               avail = slast - qp->s_head;
+
+       /* see rvt_qp_wqe_unreserve() */
+       smp_mb__before_atomic();
+       reserved_used = atomic_read(&qp->s_reserved_used);
+       avail = avail - 1 -
+               (rdi->dparms.reserved_operations - reserved_used);
+       /* ensure we don't assign a negative s_avail */
+       if ((s32)avail <= 0)
+               return -ENOMEM;
+       qp->s_avail = avail;
+       if (WARN_ON(qp->s_avail >
+                   (qp->s_size - 1 - rdi->dparms.reserved_operations)))
+               rvt_pr_err(rdi,
+                          "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
+                          qp->ibqp.qp_num, qp->s_size, qp->s_avail,
+                          qp->s_head, qp->s_tail, qp->s_cur,
+                          qp->s_acked, qp->s_last);
+       return 0;
 }
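
A worked example of the non-reserved path with hypothetical numbers, s_size = 11 (max_send_wr = 8, plus one, plus reserved_operations = 2), s_head = 5, slast = 9, and one reserved entry in use:

	/*
	 *   avail = slast - s_head = 9 - 5                     = 4
	 *   avail = avail - 1
	 *         - (reserved_operations - reserved_used)
	 *         = 4 - 1 - (2 - 1)                             = 2
	 *
	 * so qp->s_avail becomes 2, under the sanity cap of
	 * s_size - 1 - reserved_operations = 11 - 1 - 2 = 8.
	 */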
 
 /**
@@ -1480,49 +1576,64 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
        struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
        u8 log_pmtu;
        int ret;
+       size_t cplen;
+       bool reserved_op;
+       int local_ops_delayed = 0;
+
+       BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE));
 
        /* IB spec says that num_sge == 0 is OK. */
        if (unlikely(wr->num_sge > qp->s_max_sge))
                return -EINVAL;
 
+       ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr);
+       if (ret < 0)
+               return ret;
+       cplen = ret;
+
        /*
-        * Don't allow RDMA reads or atomic operations on UC or
-        * undefined operations.
-        * Make sure buffer is large enough to hold the result for atomics.
+        * Local operations include fast register and local invalidate.
+        * Fast register needs to be processed immediately because the
+        * registered lkey may be used by subsequent work requests and the
+        * lkey needs to be valid at the time those requests are posted.
+        * Local invalidate can be processed immediately if fencing is
+        * not required and no previous local invalidate ops are pending.
+        * Signaled local operations that have been processed immediately
+        * need to have requests with "completion only" flags set posted
+        * to the send queue in order to generate completions.
         */
-       if (qp->ibqp.qp_type == IB_QPT_UC) {
-               if ((unsigned)wr->opcode >= IB_WR_RDMA_READ)
-                       return -EINVAL;
-       } else if (qp->ibqp.qp_type != IB_QPT_RC) {
-               /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
-               if (wr->opcode != IB_WR_SEND &&
-                   wr->opcode != IB_WR_SEND_WITH_IMM)
-                       return -EINVAL;
-               /* Check UD destination address PD */
-               if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
+       if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) {
+               switch (wr->opcode) {
+               case IB_WR_REG_MR:
+                       ret = rvt_fast_reg_mr(qp,
+                                             reg_wr(wr)->mr,
+                                             reg_wr(wr)->key,
+                                             reg_wr(wr)->access);
+                       if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
+                               return ret;
+                       break;
+               case IB_WR_LOCAL_INV:
+                       if ((wr->send_flags & IB_SEND_FENCE) ||
+                           atomic_read(&qp->local_ops_pending)) {
+                               local_ops_delayed = 1;
+                       } else {
+                               ret = rvt_invalidate_rkey(
+                                       qp, wr->ex.invalidate_rkey);
+                               if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
+                                       return ret;
+                       }
+                       break;
+               default:
                        return -EINVAL;
-       } else if ((unsigned)wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) {
-               return -EINVAL;
-       } else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
-                  (wr->num_sge == 0 ||
-                   wr->sg_list[0].length < sizeof(u64) ||
-                   wr->sg_list[0].addr & (sizeof(u64) - 1))) {
-               return -EINVAL;
-       } else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) {
-               return -EINVAL;
+               }
        }
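
A hedged example of how the rules above look from a ULP: a local invalidate posted with a fence always takes the delayed path and is executed by the send engine; unfenced with nothing pending, rvt_invalidate_rkey() runs immediately and the wqe is queued only when a completion was requested (the rkey below stands for a hypothetical earlier registration):

	struct ib_send_wr inv_wr = {
		.opcode = IB_WR_LOCAL_INV,
		.send_flags = IB_SEND_FENCE | IB_SEND_SIGNALED,
		.ex.invalidate_rkey = rkey,
	};
	/* IB_SEND_FENCE forces local_ops_delayed = 1 here, so the
	 * invalidate happens when the send engine reaches this wqe. */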
+
+       reserved_op = rdi->post_parms[wr->opcode].flags &
+                       RVT_OPERATION_USE_RESERVE;
        /* check for avail */
-       if (unlikely(!qp->s_avail)) {
-               qp->s_avail = qp_get_savail(qp);
-               if (WARN_ON(qp->s_avail > (qp->s_size - 1)))
-                       rvt_pr_err(rdi,
-                                  "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
-                                  qp->ibqp.qp_num, qp->s_size, qp->s_avail,
-                                  qp->s_head, qp->s_tail, qp->s_cur,
-                                  qp->s_acked, qp->s_last);
-               if (!qp->s_avail)
-                       return -ENOMEM;
-       }
+       ret = rvt_qp_is_avail(qp, rdi, reserved_op);
+       if (ret)
+               return ret;
        next = qp->s_head + 1;
        if (next >= qp->s_size)
                next = 0;
@@ -1531,18 +1642,8 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
        pd = ibpd_to_rvtpd(qp->ibqp.pd);
        wqe = rvt_get_swqe_ptr(qp, qp->s_head);
 
-       if (qp->ibqp.qp_type != IB_QPT_UC &&
-           qp->ibqp.qp_type != IB_QPT_RC)
-               memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr));
-       else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
-                wr->opcode == IB_WR_RDMA_WRITE ||
-                wr->opcode == IB_WR_RDMA_READ)
-               memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr));
-       else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
-               memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr));
-       else
-               memcpy(&wqe->wr, wr, sizeof(wqe->wr));
+       /* cplen has length from above */
+       memcpy(&wqe->wr, wr, cplen);
 
        wqe->length = 0;
        j = 0;
@@ -1585,14 +1686,29 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
                atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
        }
 
-       wqe->ssn = qp->s_ssn++;
-       wqe->psn = qp->s_next_psn;
-       wqe->lpsn = wqe->psn +
-                       (wqe->length ? ((wqe->length - 1) >> log_pmtu) : 0);
-       qp->s_next_psn = wqe->lpsn + 1;
+       if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
+               if (local_ops_delayed)
+                       atomic_inc(&qp->local_ops_pending);
+               else
+                       wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY;
+               wqe->ssn = 0;
+               wqe->psn = 0;
+               wqe->lpsn = 0;
+       } else {
+               wqe->ssn = qp->s_ssn++;
+               wqe->psn = qp->s_next_psn;
+               wqe->lpsn = wqe->psn +
+                               (wqe->length ?
+                                       ((wqe->length - 1) >> log_pmtu) :
+                                       0);
+               qp->s_next_psn = wqe->lpsn + 1;
+       }
        trace_rvt_post_one_wr(qp, wqe);
+       if (unlikely(reserved_op))
+               rvt_qp_wqe_reserve(qp, wqe);
+       else
+               qp->s_avail--;
        smp_wmb(); /* see request builders */
-       qp->s_avail--;
        qp->s_head = next;
 
        return 0;
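
The helper this hunk delegates to, rvt_qp_is_avail(), is introduced earlier in the patch and is not visible above. A minimal sketch of its expected shape, assuming the reserved-operation accounting added by this series (body illustrative, not quoted from the tree):

	/*
	 * Sketch: reserved operations draw on a separate budget sized by
	 * rdi->dparms.reserved_operations, so they can proceed even when
	 * the normal send queue is full.
	 */
	static inline int rvt_qp_is_avail_sketch(struct rvt_qp *qp,
						 struct rvt_dev_info *rdi,
						 bool reserved_op)
	{
		if (unlikely(reserved_op)) {
			/* pairs with smp_mb__after_atomic() in rvt_qp_wqe_unreserve() */
			smp_mb__before_atomic();
			if (atomic_read(&qp->s_reserved_used) >=
			    rdi->dparms.reserved_operations)
				return -ENOMEM;
			return 0;
		}
		/* the real helper recomputes s_avail from s_last before failing */
		return qp->s_avail ? 0 : -ENOMEM;
	}
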
index 30c4fda7a05a6552e5a26d5dc1e52af200959a17..d430c2f7cec4cea4fc24f465dedd0b30e27f29ce 100644 (file)
@@ -370,6 +370,7 @@ enum {
        REG_USER_MR,
        DEREG_MR,
        ALLOC_MR,
+       MAP_MR_SG,
        ALLOC_FMR,
        MAP_PHYS_FMR,
        UNMAP_FMR,
@@ -528,7 +529,8 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
                                                         post_send),
                                           rvt_post_send))
                        if (!rdi->driver_f.schedule_send ||
-                           !rdi->driver_f.do_send)
+                           !rdi->driver_f.do_send ||
+                           !rdi->post_parms)
                                return -EINVAL;
                break;
 
@@ -633,6 +635,12 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
                                      rvt_alloc_mr);
                break;
 
+       case MAP_MR_SG:
+               check_driver_override(rdi, offsetof(struct ib_device,
+                                                   map_mr_sg),
+                                     rvt_map_mr_sg);
+               break;
+
        case MAP_PHYS_FMR:
                check_driver_override(rdi, offsetof(struct ib_device,
                                                    map_phys_fmr),
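
For context, check_driver_override() installs the rdmavt default only when the driver has not supplied its own callback; its assumed shape (illustrative, not quoted from the tree):

	/* assumed shape: fill in the rdmavt default at the given callback offset */
	static int check_driver_override(struct rvt_dev_info *rdi,
					 size_t offset, void *func)
	{
		void **cb = (void **)((char *)&rdi->ibdev + offset);

		if (!*cb) {
			*cb = func;	/* no driver override; use the rdmavt default */
			return 0;
		}
		return 1;		/* keep the driver-provided callback */
	}
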
index a990c04208c94e1d79a28442381c5c6fbb522077..ba6be060a476b19d608c7797bed4526b7df04332 100644 (file)
@@ -137,8 +137,6 @@ isert_create_qp(struct isert_conn *isert_conn,
        attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1;
        attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX;
        attr.cap.max_send_sge = device->ib_device->attrs.max_sge;
-       isert_conn->max_sge = min(device->ib_device->attrs.max_sge,
-                                 device->ib_device->attrs.max_sge_rd);
        attr.cap.max_recv_sge = 1;
        attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        attr.qp_type = IB_QPT_RC;
index e512ba941f2f980d659b3e75ec31fb708964ff01..fc791efe3a108178f1949f386548ed84ae81ebc8 100644 (file)
@@ -138,7 +138,6 @@ struct isert_conn {
        u32                     responder_resources;
        u32                     initiator_depth;
        bool                    pi_support;
-       u32                     max_sge;
        struct iser_rx_desc     *login_req_buf;
        char                    *login_rsp_buf;
        u64                     login_req_dma;
index 4a4155640d516252fba99c5728bb794e005218cf..dfa23b075a88469b73c3f199f73c8b8ed571187f 100644 (file)
@@ -1601,6 +1601,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
        struct ib_qp_init_attr *qp_init;
        struct srpt_port *sport = ch->sport;
        struct srpt_device *sdev = sport->sdev;
+       const struct ib_device_attr *attrs = &sdev->device->attrs;
        u32 srp_sq_size = sport->port_attrib.srp_sq_size;
        int ret;
 
@@ -1638,7 +1639,7 @@ retry:
         */
        qp_init->cap.max_send_wr = srp_sq_size / 2;
        qp_init->cap.max_rdma_ctxs = srp_sq_size / 2;
-       qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE;
+       qp_init->cap.max_send_sge = min(attrs->max_sge, SRPT_MAX_SG_PER_WQE);
        qp_init->port_num = ch->sport->port;
 
        ch->qp = ib_create_qp(sdev->pd, qp_init);
@@ -2261,7 +2262,7 @@ static void srpt_queue_response(struct se_cmd *cmd)
                container_of(cmd, struct srpt_send_ioctx, cmd);
        struct srpt_rdma_ch *ch = ioctx->ch;
        struct srpt_device *sdev = ch->sport->sdev;
-       struct ib_send_wr send_wr, *first_wr = NULL, *bad_wr;
+       struct ib_send_wr send_wr, *first_wr = &send_wr, *bad_wr;
        struct ib_sge sge;
        enum srpt_command_state state;
        unsigned long flags;
@@ -2302,11 +2303,8 @@ static void srpt_queue_response(struct se_cmd *cmd)
                        struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i];
 
                        first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp,
-                                       ch->sport->port, NULL,
-                                       first_wr ? first_wr : &send_wr);
+                                       ch->sport->port, NULL, first_wr);
                }
-       } else {
-               first_wr = &send_wr;
        }
 
        if (state != SRPT_STATE_MGMT)
index 389030487da7eecb521342e8a83859b39b3523a6..581878782854322301dc85ba1e6c763888da0827 100644 (file)
@@ -106,7 +106,11 @@ enum {
        SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2,
 
        SRPT_DEF_SG_TABLESIZE = 128,
-       SRPT_DEF_SG_PER_WQE = 16,
+       /*
+        * An experimentally determined value that avoids QP creation
+        * failures due to "swiotlb buffer is full" on systems using the swiotlb.
+        */
+       SRPT_MAX_SG_PER_WQE = 16,
 
        MIN_SRPT_SQ_SIZE = 16,
        DEF_SRPT_SQ_SIZE = 4096,
index 94a0bc5b5bdd47b16678148a04690793321b85c2..8e90dd28bb7536d16058d711b096f8125bd42874 100644 (file)
@@ -1490,6 +1490,10 @@ struct ib_rwq_ind_table_init_attr {
        struct ib_wq    **ind_tbl;
 };
 
+/*
+ * @max_write_sge: Maximum SGE elements per RDMA WRITE request.
+ * @max_read_sge:  Maximum SGE elements per RDMA READ request.
+ */
 struct ib_qp {
        struct ib_device       *device;
        struct ib_pd           *pd;
@@ -1511,6 +1515,8 @@ struct ib_qp {
        void                  (*event_handler)(struct ib_event *, void *);
        void                   *qp_context;
        u32                     qp_num;
+       u32                     max_write_sge;
+       u32                     max_read_sge;
        enum ib_qp_type         qp_type;
        struct ib_rwq_ind_table *rwq_ind_tbl;
 };
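
The verbs.c hunk that initializes these fields is not shown above; a sketch of the presumed derivation at QP creation time, consistent with the max_sge_rd device attribute these SG-limit fixes key off:

	/* presumed initialization in ib_create_qp() (sketch) */
	static void set_qp_sge_limits(struct ib_qp *qp,
				      const struct ib_qp_init_attr *attr,
				      const struct ib_device *device)
	{
		qp->max_write_sge = attr->cap.max_send_sge;
		qp->max_read_sge = min_t(u32, attr->cap.max_send_sge,
					 device->attrs.max_sge_rd);
	}
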
index 2b95c2c336eb85db88c1054f991547c11a11ec4e..9303e0e4f508d8c524a34b979da7a359cb15b30c 100644 (file)
 #if !defined(OPA_PORT_INFO_H)
 #define OPA_PORT_INFO_H
 
-/* Temporary until HFI driver is updated */
-#ifndef USE_PI_LED_ENABLE
-#define USE_PI_LED_ENABLE 0
-#endif
-
 #define OPA_PORT_LINK_MODE_NOP 0               /* No change */
 #define OPA_PORT_LINK_MODE_OPA 4               /* Port mode is OPA */
 
@@ -274,23 +269,12 @@ enum port_info_field_masks {
        OPA_PI_MASK_MTU_CAP                       = 0x0F,
 };
 
-#if USE_PI_LED_ENABLE
 struct opa_port_states {
        u8     reserved;
        u8     ledenable_offlinereason;   /* 1 res, 1 bit, 6 bits */
        u8     reserved2;
        u8     portphysstate_portstate;   /* 4 bits, 4 bits */
 };
-#define PI_LED_ENABLE_SUP 1
-#else
-struct opa_port_states {
-       u8     reserved;
-       u8     offline_reason;            /* 2 res, 6 bits */
-       u8     reserved2;
-       u8     portphysstate_portstate;   /* 4 bits, 4 bits */
-};
-#define PI_LED_ENABLE_SUP 0
-#endif
 
 struct opa_port_state_info {
        struct opa_port_states port_states;
index 9c9a27d42aaa5e89aa778d3597cb54b13c270ecc..e31502107a58ca115e0ac1543899ed728f792fc4 100644 (file)
@@ -158,6 +158,7 @@ struct rvt_driver_params {
        u32 max_mad_size;
        u8 qos_shift;
        u8 max_rdma_atomic;
+       u8 reserved_operations;
 };
 
 /* Protection domain */
@@ -351,6 +352,9 @@ struct rvt_dev_info {
        /* Driver specific properties */
        struct rvt_driver_params dparms;
 
+       /* post send table */
+       const struct rvt_operation_params *post_parms;
+
        struct rvt_mregion __rcu *dma_mr;
        struct rvt_lkey_table lkey_table;
 
@@ -484,6 +488,9 @@ void rvt_unregister_device(struct rvt_dev_info *rvd);
 int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
 int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port,
                  int port_index, u16 *pkey_table);
+int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
+                   int access);
+int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey);
 int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
                u32 len, u64 vaddr, u32 rkey, int acc);
 int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
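
Taken together, the new post_parms pointer and reserved_operations parameter imply driver-side wiring at registration time along these lines (names hypothetical, modeled on hfi1):

	/* hypothetical registration fragment (dd and my_post_parms are illustrative) */
	dd->verbs_dev.rdi.post_parms = my_post_parms;
	dd->verbs_dev.rdi.dparms.reserved_operations = 0;	/* none reserved yet */
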
index 5edffdca8c53b201a78bb503a6c6b63381672652..6b3c6c8b6b772a1774012d5b4f32b827a03f5537 100644 (file)
@@ -81,6 +81,7 @@ struct rvt_mregion {
        u32 mapsz;              /* size of the map array */
        u8  page_shift;         /* 0 - non-uniform/non-power-of-2 sizes */
        u8  lkey_published;     /* in global table */
+       atomic_t lkey_invalid;  /* true if current lkey is invalid */
        struct completion comp; /* complete when refcount goes to zero */
        atomic_t refcount;
        struct rvt_segarray *map[0];    /* the segments */
index 6d23b879416ac0f5249f33d7fd4713323e704961..bd34d0b56bf770feea3c7d4822cf8e3de8ec0fd3 100644 (file)
 #define RVT_PROCESS_OR_FLUSH_SEND \
        (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND)
 
+/*
+ * Internal send flags
+ */
+#define RVT_SEND_RESERVE_USED           IB_SEND_RESERVED_START
+#define RVT_SEND_COMPLETION_ONLY       (IB_SEND_RESERVED_START << 1)
+
 /*
  * Send work request queue entry.
  * The size of the sg_list is determined when the QP is created and stored
@@ -216,23 +222,43 @@ struct rvt_mmap_info {
  * to send a RDMA read response or atomic operation.
  */
 struct rvt_ack_entry {
-       u8 opcode;
-       u8 sent;
+       struct rvt_sge rdma_sge;
+       u64 atomic_data;
        u32 psn;
        u32 lpsn;
-       union {
-               struct rvt_sge rdma_sge;
-               u64 atomic_data;
-       };
+       u8 opcode;
+       u8 sent;
 };
 
 #define        RC_QP_SCALING_INTERVAL  5
 
-/*
- * Variables prefixed with s_ are for the requester (sender).
- * Variables prefixed with r_ are for the responder (receiver).
- * Variables prefixed with ack_ are for responder replies.
+#define RVT_OPERATION_PRIV        0x00000001
+#define RVT_OPERATION_ATOMIC      0x00000002
+#define RVT_OPERATION_ATOMIC_SGE  0x00000004
+#define RVT_OPERATION_LOCAL       0x00000008
+#define RVT_OPERATION_USE_RESERVE 0x00000010
+
+#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1)
+
+/**
+ * rvt_operation_params - op table entry
+ * @length - the length to copy into the swqe entry
+ * @qpt_support - a bit mask indicating QP type support
+ * @flags - RVT_OPERATION flags (see above)
  *
+ * This supports table-driven post send so that
+ * each driver can support a potentially different
+ * set of operations.
+ */
+
+struct rvt_operation_params {
+       size_t length;
+       u32 qpt_support;
+       u32 flags;
+};
+
+/*
  * Common variables are protected by both r_rq.lock and s_lock in that order
  * which only happens in modify_qp() or changing the QP 'state'.
  */
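
The table that post_parms points at has one entry per IB opcode; an abbreviated sketch modeled on the table this series adds to hfi1:

	/* abbreviated driver op table sketch (modeled on hfi1's) */
	static const struct rvt_operation_params example_post_parms[RVT_OPERATION_MAX] = {
		[IB_WR_RDMA_WRITE] = {
			.length = sizeof(struct ib_rdma_wr),
			.qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
		},
		[IB_WR_ATOMIC_CMP_AND_SWP] = {
			.length = sizeof(struct ib_atomic_wr),
			.qpt_support = BIT(IB_QPT_RC),
			.flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
		},
		[IB_WR_LOCAL_INV] = {
			.length = sizeof(struct ib_send_wr),
			.qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
			.flags = RVT_OPERATION_LOCAL,
		},
	};
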
@@ -307,6 +333,7 @@ struct rvt_qp {
        u32 s_next_psn;         /* PSN for next request */
        u32 s_avail;            /* number of entries avail */
        u32 s_ssn;              /* SSN of tail entry */
+       atomic_t s_reserved_used; /* reserved entries in use */
 
        spinlock_t s_lock ____cacheline_aligned_in_smp;
        u32 s_flags;
@@ -343,6 +370,8 @@ struct rvt_qp {
        struct rvt_sge_state s_ack_rdma_sge;
        struct timer_list s_timer;
 
+       atomic_t local_ops_pending; /* number of fast_reg/local_inv reqs */
+
        /*
         * This sge list MUST be last. Do not add anything below here.
         */
@@ -436,6 +465,49 @@ static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n)
                  rq->max_sge * sizeof(struct ib_sge)) * n);
 }
 
+/**
+ * rvt_qp_wqe_reserve - reserve operation
+ * @qp - the rvt qp
+ * @wqe - the send wqe
+ *
+ * This routine is used in post send to record
+ * that a wqe consumes a reserved operation slot.
+ */
+static inline void rvt_qp_wqe_reserve(
+       struct rvt_qp *qp,
+       struct rvt_swqe *wqe)
+{
+       wqe->wr.send_flags |= RVT_SEND_RESERVE_USED;
+       atomic_inc(&qp->s_reserved_used);
+}
+
+/**
+ * rvt_qp_wqe_unreserve - clean reserved operation
+ * @qp - the rvt qp
+ * @wqe - the send wqe
+ *
+ * This decrements the reserve use count.
+ *
+ * This call MUST precede the change to
+ * s_last to ensure that post send sees a stable
+ * s_avail.
+ *
+ * An smp_mb__after_atomic() is used to ensure
+ * the compiler does not reorder the update of the
+ * s_last ring index and the decrement of s_reserved_used.
+ */
+static inline void rvt_qp_wqe_unreserve(
+       struct rvt_qp *qp,
+       struct rvt_swqe *wqe)
+{
+       if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) {
+               wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED;
+               atomic_dec(&qp->s_reserved_used);
+               /* ensure no compiler reordering before the s_last change */
+               smp_mb__after_atomic();
+       }
+}
+
 extern const int  ib_rvt_state_ops[];
 
 struct rvt_dev_info;
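
The pairing rule in the comment above translates into a completion path that unreserves before advancing the ring index; a sketch:

	/* completion-path sketch honoring the ordering rule above */
	static void complete_swqe_sketch(struct rvt_qp *qp,
					 struct rvt_swqe *wqe, u32 next)
	{
		rvt_qp_wqe_unreserve(qp, wqe);	/* decrement s_reserved_used first */
		/* the barrier inside unreserve keeps this store ordered after it */
		qp->s_last = next;
	}
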
index 98bebf8bef55e41b4e602646394c0b45dfd87d52..d15e7289d8356ec36bdb9d14b5d6e341de228519 100644 (file)
@@ -75,7 +75,7 @@
  * may not be implemented; the user code must deal with this if it
  * cares, or it must abort after initialization reports the difference.
  */
-#define HFI1_USER_SWMINOR 1
+#define HFI1_USER_SWMINOR 2
 
 /*
  * We will encode the major/minor inside a single 32bit version number.
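
A sketch consistent with that encoding (the shift value is assumed, not quoted from the header):

	/* assumed packing: major in the upper 16 bits, minor in the lower 16 */
	#define HFI1_SWMAJOR_SHIFT 16

	static inline u32 hfi1_user_version(u16 major, u16 minor)
	{
		return ((u32)major << HFI1_SWMAJOR_SHIFT) | minor;
	}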