We have one report of a crash in xs_tcp_setup_socket.
The call path to the crash is:
xs_tcp_setup_socket -> inet_stream_connect -> lock_sock_nested.
The 'sock' passed to that last function is NULL.
The only way I can see this happening is a concurrent call to
xs_close:
xs_close -> xs_reset_transport -> sock_release -> inet_release
inet_release sets:
sock->sk = NULL;
inet_stream_connect calls
lock_sock(sock->sk);
which gets NULL.
All calls to xs_close are protected by XPRT_LOCKED as are most
activations of the workqueue which runs xs_tcp_setup_socket.
The exception is xs_tcp_schedule_linger_timeout.
So presumably the timeout queued by the latter fires exactly when some
other code runs xs_close().
To protect against this we can move the cancel_delayed_work_sync()
call from xs_destroy() to xs_close().
As xs_close is never called from the worker scheduled on
->connect_worker, this can never deadlock.
Signed-off-by: NeilBrown <neilb@suse.de>
[Trond: Make it safe to call cancel_delayed_work_sync() on AF_LOCAL sockets]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
dprintk("RPC: xs_close xprt %p\n", xprt);
dprintk("RPC: xs_close xprt %p\n", xprt);
+ cancel_delayed_work_sync(&transport->connect_worker);
+
xs_reset_transport(transport);
xprt->reestablish_timeout = 0;
xs_reset_transport(transport);
xprt->reestablish_timeout = 0;
*/
static void xs_destroy(struct rpc_xprt *xprt)
{
*/
static void xs_destroy(struct rpc_xprt *xprt)
{
- struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
-
dprintk("RPC: xs_destroy xprt %p\n", xprt);
dprintk("RPC: xs_destroy xprt %p\n", xprt);
- cancel_delayed_work_sync(&transport->connect_worker);
-
xs_local_destroy(xprt);
}
xs_local_destroy(xprt);
}
+static void xs_dummy_setup_socket(struct work_struct *work)
+{
+}
+
static struct socket *xs_create_sock(struct rpc_xprt *xprt,
struct sock_xprt *transport, int family, int type, int protocol)
{
static struct socket *xs_create_sock(struct rpc_xprt *xprt,
struct sock_xprt *transport, int family, int type, int protocol)
{
xprt->ops = &xs_local_ops;
xprt->timeout = &xs_local_default_timeout;
xprt->ops = &xs_local_ops;
xprt->timeout = &xs_local_default_timeout;
+ INIT_DELAYED_WORK(&transport->connect_worker,
+ xs_dummy_setup_socket);
+
switch (sun->sun_family) {
case AF_LOCAL:
if (sun->sun_path[0] != '/') {
switch (sun->sun_family) {
case AF_LOCAL:
if (sun->sun_path[0] != '/') {