summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorViacheslav Ovsiienko <viacheslavo@mellanox.com>2019-05-27 04:58:32 +0000
committerFerruh Yigit <ferruh.yigit@intel.com>2019-06-06 20:21:20 +0900
commit5897ac139355e2d6602c89b0a1d28e609d6f6ebc (patch)
tree8f37960856152ec3c1a559304885ad4c1065e558 /drivers
parent0d51a5ec26ccca4582f4762854ac3b28208c4a04 (diff)
downloaddpdk-5897ac139355e2d6602c89b0a1d28e609d6f6ebc.zip
dpdk-5897ac139355e2d6602c89b0a1d28e609d6f6ebc.tar.gz
dpdk-5897ac139355e2d6602c89b0a1d28e609d6f6ebc.tar.xz
net/mlx5: fix event handler uninstall
When device is being closed and tries to unregister interrupt callback, there is a chance the handler is still active (called in context of eal_intr_thread_main thread). If so the rte_intr_callback_unregister returns -EAGAIN and keeps the handler registered, causing crash when underlaying resourse is gone away. This race condition may happen if event handling in application takes a long time. We should check the return code of unregistering routine and try again to unregister the handler. The diagnostic messages are shown once a second, while trying to unregister. Fixes: 028b2a28c3cb ("net/mlx5: update event handler for multiport IB devices") Cc: stable@dpdk.org Signed-off-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com> Acked-by: Yongseok Koh <yskoh@mellanox.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/net/mlx5/mlx5.c2
-rw-r--r--drivers/net/mlx5/mlx5.h2
-rw-r--r--drivers/net/mlx5/mlx5_ethdev.c80
3 files changed, 79 insertions, 5 deletions
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index aee9f97..3ef33c9 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -313,7 +313,7 @@ mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
**/
assert(!sh->intr_cnt);
if (sh->intr_cnt)
- rte_intr_callback_unregister
+ mlx5_intr_callback_unregister
(&sh->intr_handle, mlx5_dev_interrupt_handler, sh);
pthread_mutex_destroy(&sh->intr_mutex);
if (sh->pd)
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index caabdfc..01cfd65 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -438,6 +438,8 @@ void mlx5_nl_check_switch_info(bool nun_vf_set,
struct mlx5_switch_info *switch_info);
void mlx5_translate_port_name(const char *port_name_in,
struct mlx5_switch_info *port_info_out);
+void mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
+ rte_intr_callback_fn cb_fn, void *cb_arg);
/* mlx5_mac.c */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 7785341..ac0500a 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -33,6 +33,7 @@
#include <rte_malloc.h>
#include <rte_string_fns.h>
#include <rte_rwlock.h>
+#include <rte_cycles.h>
#include "mlx5.h"
#include "mlx5_glue.h"
@@ -1256,9 +1257,80 @@ mlx5_dev_interrupt_handler(void *cb_arg)
}
}
+/*
+ * Unregister callback handler safely. The handler may be active
+ * while we are trying to unregister it, in this case code -EAGAIN
+ * is returned by rte_intr_callback_unregister(). This routine checks
+ * the return code and tries to unregister handler again.
+ *
+ * @param handle
+ * interrupt handle
+ * @param cb_fn
+ * pointer to callback routine
+ * @cb_arg
+ * opaque callback parameter
+ */
+void
+mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
+ rte_intr_callback_fn cb_fn, void *cb_arg)
+{
+ /*
+ * Try to reduce timeout management overhead by not calling
+ * the timer related routines on the first iteration. If the
+ * unregistering succeeds on first call there will be no
+ * timer calls at all.
+ */
+ uint64_t twait = 0;
+ uint64_t start = 0;
+
+ do {
+ int ret;
+
+ ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
+ if (ret >= 0)
+ return;
+ if (ret != -EAGAIN) {
+ DRV_LOG(INFO, "failed to unregister interrupt"
+ " handler (error: %d)", ret);
+ assert(false);
+ return;
+ }
+ if (twait) {
+ struct timespec onems;
+
+ /* Wait one millisecond and try again. */
+ onems.tv_sec = 0;
+ onems.tv_nsec = NS_PER_S / MS_PER_S;
+ nanosleep(&onems, 0);
+ /* Check whether one second elapsed. */
+ if ((rte_get_timer_cycles() - start) <= twait)
+ continue;
+ } else {
+ /*
+ * We get the amount of timer ticks for one second.
+ * If this amount elapsed it means we spent one
+ * second in waiting. This branch is executed once
+ * on first iteration.
+ */
+ twait = rte_get_timer_hz();
+ assert(twait);
+ }
+ /*
+ * Timeout elapsed, show message (once a second) and retry.
+ * We have no other acceptable option here, if we ignore
+ * the unregistering return code the handler will not
+ * be unregistered, fd will be closed and we may get the
+ * crush. Hanging and messaging in the loop seems not to be
+ * the worst choice.
+ */
+ DRV_LOG(INFO, "Retrying to unregister interrupt handler");
+ start = rte_get_timer_cycles();
+ } while (true);
+}
+
/**
* Uninstall shared asynchronous device events handler.
- * This function is implemeted to support event sharing
+ * This function is implemented to support event sharing
* between multiple ports of single IB device.
*
* @param dev
@@ -1284,7 +1356,7 @@ mlx5_dev_shared_handler_uninstall(struct rte_eth_dev *dev)
sh->port[priv->ibv_port - 1].ih_port_id = RTE_MAX_ETHPORTS;
if (!sh->intr_cnt || --sh->intr_cnt)
goto exit;
- rte_intr_callback_unregister(&sh->intr_handle,
+ mlx5_intr_callback_unregister(&sh->intr_handle,
mlx5_dev_interrupt_handler, sh);
sh->intr_handle.fd = 0;
sh->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
@@ -1293,8 +1365,8 @@ exit:
}
/**
- * Install shared asyncronous device events handler.
- * This function is implemeted to support event sharing
+ * Install shared asynchronous device events handler.
+ * This function is implemented to support event sharing
* between multiple ports of single IB device.
*
* @param dev