habanalabs: Add debugfs node for engines status
author Tomer Tayar <ttayar@habana.ai>
Mon, 1 Jul 2019 13:59:45 +0000 (13:59 +0000)
committer Oded Gabbay <oded.gabbay@gmail.com>
Mon, 1 Jul 2019 13:59:45 +0000 (13:59 +0000)
Command submissions sent to the device are composed of command buffers
which are targeted to different device engines, like DMA and compute
entities. When a command submission gets stuck, knowing which engine it
is stuck in is crucial for debugging.
This patch adds a debugfs node that exports this information by
displaying the various engine registers from which the idle/busy
status of each engine is derived.
The information retrieval is based on the is_device_idle ASIC function.
The printout in this function of the first busy engine detected is
removed, because it is redundant now that the new debugfs node provides
more elaborate information.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Documentation/ABI/testing/debugfs-driver-habanalabs
drivers/misc/habanalabs/debugfs.c
drivers/misc/habanalabs/goya/goya.c
drivers/misc/habanalabs/habanalabs.h
drivers/misc/habanalabs/habanalabs_ioctl.c

index 18191c2becab00fd18c831e8556e672176903588..f0ac14b70ecbf4107afe8dd36aa60fed2d2dfc90 100644 (file)
@@ -51,6 +51,13 @@ Description:    Enables the root user to set the device to specific state.
                 Valid values are "disable", "enable", "suspend", "resume".
                 User can read this property to see the valid values
 
+What:           /sys/kernel/debug/habanalabs/hl<n>/engines
+Date:           Jul 2019
+KernelVersion:  5.3
+Contact:        oded.gabbay@gmail.com
+Description:    Displays the status registers values of the device engines and
+                their derived idle status
+
 What:           /sys/kernel/debug/habanalabs/hl<n>/i2c_addr
 Date:           Jan 2019
 KernelVersion:  5.1
index 17974919b7607e253a07823f56b3a4bf8fe765bb..6a5dfb14eca1bb2180595edf517ce6a543b55ed1 100644 (file)
@@ -500,6 +500,17 @@ err:
        return -EINVAL;
 }
 
+static int engines_show(struct seq_file *s, void *data)
+{
+       struct hl_debugfs_entry *entry = s->private;
+       struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
+       struct hl_device *hdev = dev_entry->hdev;
+
+       hdev->asic_funcs->is_device_idle(hdev, s);
+
+       return 0;
+}
+
 static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
 {
        struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -893,6 +904,7 @@ static const struct hl_info_list hl_debugfs_list[] = {
        {"userptr", userptr_show, NULL},
        {"vm", vm_show, NULL},
        {"mmu", mmu_show, mmu_write},
+       {"engines", engines_show, NULL}
 };
 
 static int hl_debugfs_open(struct inode *inode, struct file *file)
index 8653aa914724f79eed17fe62773072836ad26a3a..41e97531f300c9baf2163982c21e46d78478155f 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/hwmon.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/iommu.h>
+#include <linux/seq_file.h>
 
 /*
  * GOYA security scheme:
 #define GOYA_CB_POOL_CB_CNT            512
 #define GOYA_CB_POOL_CB_SIZE           0x20000         /* 128KB */
 
+#define IS_QM_IDLE(engine, qm_glbl_sts0) \
+       (((qm_glbl_sts0) & engine##_QM_IDLE_MASK) == engine##_QM_IDLE_MASK)
+#define IS_DMA_QM_IDLE(qm_glbl_sts0)   IS_QM_IDLE(DMA, qm_glbl_sts0)
+#define IS_TPC_QM_IDLE(qm_glbl_sts0)   IS_QM_IDLE(TPC, qm_glbl_sts0)
+#define IS_MME_QM_IDLE(qm_glbl_sts0)   IS_QM_IDLE(MME, qm_glbl_sts0)
+
+#define IS_CMDQ_IDLE(engine, cmdq_glbl_sts0) \
+       (((cmdq_glbl_sts0) & engine##_CMDQ_IDLE_MASK) == \
+                       engine##_CMDQ_IDLE_MASK)
+#define IS_TPC_CMDQ_IDLE(cmdq_glbl_sts0) \
+       IS_CMDQ_IDLE(TPC, cmdq_glbl_sts0)
+#define IS_MME_CMDQ_IDLE(cmdq_glbl_sts0) \
+       IS_CMDQ_IDLE(MME, cmdq_glbl_sts0)
+
+#define IS_DMA_IDLE(dma_core_sts0) \
+       !((dma_core_sts0) & DMA_CH_0_STS0_DMA_BUSY_MASK)
+
+#define IS_TPC_IDLE(tpc_cfg_sts) \
+       (((tpc_cfg_sts) & TPC_CFG_IDLE_MASK) == TPC_CFG_IDLE_MASK)
+
+#define IS_MME_IDLE(mme_arch_sts) \
+       (((mme_arch_sts) & MME_ARCH_IDLE_MASK) == MME_ARCH_IDLE_MASK)
+
+
 static const char goya_irq_name[GOYA_MSIX_ENTRIES][GOYA_MAX_STRING_LEN] = {
                "goya cq 0", "goya cq 1", "goya cq 2", "goya cq 3",
                "goya cq 4", "goya cpu eq"
@@ -2796,7 +2821,6 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
        dma_addr_t fence_dma_addr;
        struct hl_cb *cb;
        u32 tmp, timeout;
-       char buf[16] = {};
        int rc;
 
        if (hdev->pldm)
@@ -2804,10 +2828,9 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
        else
                timeout = HL_DEVICE_TIMEOUT_USEC;
 
-       if (!hdev->asic_funcs->is_device_idle(hdev, buf, sizeof(buf))) {
+       if (!hdev->asic_funcs->is_device_idle(hdev, NULL)) {
                dev_err_ratelimited(hdev->dev,
-                       "Can't send KMD job on QMAN0 because %s is busy\n",
-                       buf);
+                       "Can't send KMD job on QMAN0 because the device is not idle\n");
                return -EBUSY;
        }
 
@@ -4891,59 +4914,75 @@ int goya_armcp_info_get(struct hl_device *hdev)
        return 0;
 }
 
-static bool goya_is_device_idle(struct hl_device *hdev, char *buf, size_t size)
+static bool goya_is_device_idle(struct hl_device *hdev, struct seq_file *s)
 {
-       u64 offset, dma_qm_reg, tpc_qm_reg, tpc_cmdq_reg, tpc_cfg_reg,
-               dma_core_sts;
+       const char *fmt = "%-5d%-9s%#-14x%#-16x%#x\n";
+       const char *dma_fmt = "%-5d%-9s%#-14x%#x\n";
+       u32 qm_glbl_sts0, cmdq_glbl_sts0, dma_core_sts0, tpc_cfg_sts,
+               mme_arch_sts;
+       bool is_idle = true, is_eng_idle;
+       u64 offset;
        int i;
 
+       if (s)
+               seq_puts(s, "\nDMA  is_idle  QM_GLBL_STS0  DMA_CORE_STS0\n"
+                               "---  -------  ------------  -------------\n");
+
        offset = mmDMA_QM_1_GLBL_STS0 - mmDMA_QM_0_GLBL_STS0;
 
        for (i = 0 ; i < DMA_MAX_NUM ; i++) {
-               dma_qm_reg = mmDMA_QM_0_GLBL_STS0 + i * offset;
-               dma_core_sts = mmDMA_CH_0_STS0 + i * offset;
+               qm_glbl_sts0 = RREG32(mmDMA_QM_0_GLBL_STS0 + i * offset);
+               dma_core_sts0 = RREG32(mmDMA_CH_0_STS0 + i * offset);
+               is_eng_idle = IS_DMA_QM_IDLE(qm_glbl_sts0) &&
+                               IS_DMA_IDLE(dma_core_sts0);
+               is_idle &= is_eng_idle;
 
-               if ((RREG32(dma_qm_reg) & DMA_QM_IDLE_MASK) !=
-                               DMA_QM_IDLE_MASK)
-                       return HL_ENG_BUSY(buf, size, "DMA%d_QM", i);
-
-               if (RREG32(dma_core_sts) & DMA_CH_0_STS0_DMA_BUSY_MASK)
-                       return HL_ENG_BUSY(buf, size, "DMA%d_CORE", i);
+               if (s)
+                       seq_printf(s, dma_fmt, i, is_eng_idle ? "Y" : "N",
+                                       qm_glbl_sts0, dma_core_sts0);
        }
 
+       if (s)
+               seq_puts(s,
+                       "\nTPC  is_idle  QM_GLBL_STS0  CMDQ_GLBL_STS0  CFG_STATUS\n"
+                       "---  -------  ------------  --------------  ----------\n");
+
        offset = mmTPC1_QM_GLBL_STS0 - mmTPC0_QM_GLBL_STS0;
 
        for (i = 0 ; i < TPC_MAX_NUM ; i++) {
-               tpc_qm_reg = mmTPC0_QM_GLBL_STS0 + i * offset;
-               tpc_cmdq_reg = mmTPC0_CMDQ_GLBL_STS0 + i * offset;
-               tpc_cfg_reg = mmTPC0_CFG_STATUS + i * offset;
-
-               if ((RREG32(tpc_qm_reg) & TPC_QM_IDLE_MASK) !=
-                               TPC_QM_IDLE_MASK)
-                       return HL_ENG_BUSY(buf, size, "TPC%d_QM", i);
-
-               if ((RREG32(tpc_cmdq_reg) & TPC_CMDQ_IDLE_MASK) !=
-                               TPC_CMDQ_IDLE_MASK)
-                       return HL_ENG_BUSY(buf, size, "TPC%d_CMDQ", i);
-
-               if ((RREG32(tpc_cfg_reg) & TPC_CFG_IDLE_MASK) !=
-                               TPC_CFG_IDLE_MASK)
-                       return HL_ENG_BUSY(buf, size, "TPC%d_CFG", i);
-       }
-
-       if ((RREG32(mmMME_QM_GLBL_STS0) & MME_QM_IDLE_MASK) !=
-                       MME_QM_IDLE_MASK)
-               return HL_ENG_BUSY(buf, size, "MME_QM");
-
-       if ((RREG32(mmMME_CMDQ_GLBL_STS0) & MME_CMDQ_IDLE_MASK) !=
-                       MME_CMDQ_IDLE_MASK)
-               return HL_ENG_BUSY(buf, size, "MME_CMDQ");
-
-       if ((RREG32(mmMME_ARCH_STATUS) & MME_ARCH_IDLE_MASK) !=
-                       MME_ARCH_IDLE_MASK)
-               return HL_ENG_BUSY(buf, size, "MME_ARCH");
-
-       return true;
+               qm_glbl_sts0 = RREG32(mmTPC0_QM_GLBL_STS0 + i * offset);
+               cmdq_glbl_sts0 = RREG32(mmTPC0_CMDQ_GLBL_STS0 + i * offset);
+               tpc_cfg_sts = RREG32(mmTPC0_CFG_STATUS + i * offset);
+               is_eng_idle = IS_TPC_QM_IDLE(qm_glbl_sts0) &&
+                               IS_TPC_CMDQ_IDLE(cmdq_glbl_sts0) &&
+                               IS_TPC_IDLE(tpc_cfg_sts);
+               is_idle &= is_eng_idle;
+
+               if (s)
+                       seq_printf(s, fmt, i, is_eng_idle ? "Y" : "N",
+                               qm_glbl_sts0, cmdq_glbl_sts0, tpc_cfg_sts);
+       }
+
+       if (s)
+               seq_puts(s,
+                       "\nMME  is_idle  QM_GLBL_STS0  CMDQ_GLBL_STS0  ARCH_STATUS\n"
+                       "---  -------  ------------  --------------  -----------\n");
+
+       qm_glbl_sts0 = RREG32(mmMME_QM_GLBL_STS0);
+       cmdq_glbl_sts0 = RREG32(mmMME_CMDQ_GLBL_STS0);
+       mme_arch_sts = RREG32(mmMME_ARCH_STATUS);
+       is_eng_idle = IS_MME_QM_IDLE(qm_glbl_sts0) &&
+                       IS_MME_CMDQ_IDLE(cmdq_glbl_sts0) &&
+                       IS_MME_IDLE(mme_arch_sts);
+       is_idle &= is_eng_idle;
+
+       if (s) {
+               seq_printf(s, fmt, 0, is_eng_idle ? "Y" : "N", qm_glbl_sts0,
+                               cmdq_glbl_sts0, mme_arch_sts);
+               seq_puts(s, "\n");
+       }
+
+       return is_idle;
 }
 
 static void goya_hw_queues_lock(struct hl_device *hdev)
index 5e4a631b3d88c582a8a51339cd000ea462486063..2c9ea61099b4c08e6e5ba2caf8f6c4d33b3d1b28 100644 (file)
@@ -557,7 +557,7 @@ struct hl_asic_funcs {
                        u32 asid, u64 va, u64 size);
        int (*send_heartbeat)(struct hl_device *hdev);
        int (*debug_coresight)(struct hl_device *hdev, void *data);
-       bool (*is_device_idle)(struct hl_device *hdev, char *buf, size_t size);
+       bool (*is_device_idle)(struct hl_device *hdev, struct seq_file *s);
        int (*soft_reset_late_init)(struct hl_device *hdev);
        void (*hw_queues_lock)(struct hl_device *hdev);
        void (*hw_queues_unlock)(struct hl_device *hdev);
@@ -1112,12 +1112,6 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
        (cond) ? 0 : -ETIMEDOUT; \
 })
 
-#define HL_ENG_BUSY(buf, size, fmt, ...) ({ \
-               if (buf) \
-                       snprintf(buf, size, fmt, ##__VA_ARGS__); \
-               false; \
-       })
-
 struct hwmon_chip_info;
 
 /**
index c641c7eb6f7cbc2dbc9ff7a9cf46414c815744aa..b04585af27ad720455c0b494e93e7515fc6f5334 100644 (file)
@@ -119,7 +119,7 @@ static int hw_idle(struct hl_device *hdev, struct hl_info_args *args)
        if ((!max_size) || (!out))
                return -EINVAL;
 
-       hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev, NULL, 0);
+       hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev, NULL);
 
        return copy_to_user(out, &hw_idle,
                min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0;