Fix: ust-consumer: segfault on snapshot after regenerate metadata
authorJonathan Rajotte <jonathan.rajotte-julien@efficios.com>
Wed, 17 Nov 2021 21:18:59 +0000 (16:18 -0500)
committerJérémie Galarneau <jeremie.galarneau@efficios.com>
Fri, 17 Dec 2021 05:31:09 +0000 (00:31 -0500)
Observed issue
==============

lttng-consumerd segfaults for the following scenario:

  $ lttng create test --snapshot
  $ lttng enable-event -u -an
  $ lttng start
  # Run an app just to have some events
  $ lttng regenerate metadata
  $ lttng snapshot record

The following backtrace is obtained:

 (gdb) bt
 #0  __GI___pthread_mutex_lock (mutex=0x130) at ../nptl/pthread_mutex_lock.c:67
 #1  0x000055b383cfaed3 in lttng_ustconsumer_recv_metadata (sock=29, key=4, offset=0, len=12391, version=1, channel=0x7fe90000a760, timer=0, wait=1) at ust-consumer.c:1347
 #2  0x000055b383d00197 in lttng_ustconsumer_request_metadata (ctx=0x55b3855a1e50, channel=0x7fe90000a760, timer=0, wait=1) at ust-consumer.c:3336
 #3  0x000055b383cf9e76 in snapshot_metadata (metadata_channel=0x7fe90000a760, key=4, path=0x7fe911a09944 "uid/1000/64-bit", relayd_id=18446744073709551615, ctx=0x55b3855a1e50) at ust-consum
 #4  0x000055b383cfbe73 in lttng_ustconsumer_recv_cmd (ctx=0x55b3855a1e50, sock=28, consumer_sockpoll=0x7fe911a0dbb0) at ust-consumer.c:1853
 #5  0x000055b383ccf9b7 in lttng_consumer_recv_cmd (ctx=0x55b3855a1e50, sock=28, consumer_sockpoll=0x7fe911a0dbb0) at consumer.c:2097
 #6  0x000055b383cd3bfd in consumer_thread_sessiond_poll (data=0x55b3855a1e50) at consumer.c:3284
 #7  0x00007fe914c22609 in start_thread (arg=<optimized out>) at pthread_create.c:477
 #8  0x00007fe914b47293 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
 (gdb) up
 #1  0x000055b383cfaed3 in lttng_ustconsumer_recv_metadata (sock=29, key=4, offset=0, len=12391, version=1, channel=0x7fe90000a760, timer=0, wait=1) at ust-consumer.c:1347
 1347                    pthread_mutex_lock(&channel->metadata_stream->lock);
 (gdb) print channel->metadata_stream
 $1 = (struct lttng_consumer_stream *) 0x0

Note that the following scenario also leads to a similar backtrace:

  $ lttng create test --snapshot
  $ lttng enable-event -u -an
  $ lttng start
  # Run an app just to have some events with a short duration
  $ lttng regenerate metadata
  # Run a second app just to have some events and a metadata flush while
  # the metadata cache status is set as `invalidated`.
  ^ lttng-consumerd segfault on app termination.

The backtrace:

 (gdb) bt
 #0  __GI___pthread_mutex_lock (mutex=0x130) at ../nptl/pthread_mutex_lock.c:67
 #1  0x00005612a5a13ed3 in lttng_ustconsumer_recv_metadata (sock=28, key=2, offset=0, len=12391, version=1, channel=0x7f3978005400, timer=0, wait=1) at ust-consumer.c:1347
 #2  0x00005612a5a14d0a in lttng_ustconsumer_recv_cmd (ctx=0x5612a6feee50, sock=28, consumer_sockpoll=0x7f3989494bb0) at ust-consumer.c:1818
 #3  0x00005612a59e89b7 in lttng_consumer_recv_cmd (ctx=0x5612a6feee50, sock=28, consumer_sockpoll=0x7f3989494bb0) at consumer.c:2097
 #4  0x00005612a59ecbfd in consumer_thread_sessiond_poll (data=0x5612a6feee50) at consumer.c:3284
 #5  0x00007f398c6a9609 in start_thread (arg=<optimized out>) at pthread_create.c:477
 #6  0x00007f398c5ce293 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
 (gdb) up
 #1  0x00005612a5a13ed3 in lttng_ustconsumer_recv_metadata (sock=28, key=2, offset=0, len=12391, version=1, channel=0x7f3978005400, timer=0, wait=1) at ust-consumer.c:1347
 1347                    pthread_mutex_lock(&channel->metadata_stream->lock);
 (gdb) print channel->metadata_stream
 $1 = (struct lttng_consumer_stream *) 0x0
 (gdb)

Cause
=====

For session configured in snapshot mode the metadata channel
metadata_stream field is NULL except for a "short" window during the
actual snapshot record action (snapshot_metadata).

The `regenerate metadata` effectively flag the metadata cache as invalid
leading to handling the cache invalidation state
(`CONSUMER_METADATA_CACHE_WRITE_STATUS_INVALIDATED`) in
`lttng_ustconsumer_recv_metadata`. This was introduced by
b1316da1ffbd276fc8271e7a9438e683ad352781 [1].

At that point the function expects the `channel->metadata_stream` to be
non null. This is simply not true for snapshot session metadata channels.

Note that we cannot simply swap `lttng_ustconsumer_request_metadata` and
`create_ust_streams` in `snapshot_metadata` to ensure that the
`channel->metadata_stream` is non null since
`lttng_ustconsumer_recv_metadata` ends up being called on metadata flush when
an app quit. This sequence of events corresponds to the second scenario
put forward in the `Observed Issue` section.

Solution
========

Null check `channel->metadata_stream` and perform only the operation
when it is non null. This partly mirror what is done in `consumer_metadata_wakeup_pipe`.

I am not sure if the check on `channel->monitor` is required but it
seems irrelevant to the notion of resetting the stream consumed position
when the stream exists.

With this taken care off, we find ourself with another
backtrace:

 #0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50                                                                                                             [93/122]
 #1  0x00007f75cf9b3859 in __GI_abort () at abort.c:79
 #2  0x00007f75cf9b3729 in __assert_fail_base (fmt=0x7f75cfb49588 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x55ab004e9c68 "pthread_mutex_trylock(&stream->lock)", file=0x55ab004
 #3  0x00007f75cf9c4f36 in __GI___assert_fail (assertion=0x55ab004e9c68 "pthread_mutex_trylock(&stream->lock)", file=0x55ab004e8d7a "ust-consumer.c", line=1285, function=0x55ab004eb8a0 <__PR
 #4  0x000055ab004b1b9c in metadata_stream_reset_cache_consumed_position (stream=0x7f75b400a850) at ust-consumer.c:1285
 #5  0x000055ab004b4fef in commit_one_metadata_packet (stream=0x7f75b400a850) at ust-consumer.c:2551
 #6  0x000055ab004b5f46 in get_next_subbuffer_metadata (stream=0x7f75b400a850, subbuffer=0x7f75cc972630) at ust-consumer.c:2927
 #7  0x000055ab0048b6a9 in lttng_consumer_read_subbuffer (stream=0x7f75b400a850, ctx=0x55ab01d4ee50, locked_by_caller=true) at consumer.c:3522
 #8  0x000055ab004b0f66 in snapshot_metadata (metadata_channel=0x7f75b4005400, key=2, path=0x7f75cc972944 "uid/1000/64-bit", relayd_id=18446744073709551615, ctx=0x55ab01d4ee50) at ust-consum
 #9  0x000055ab004b2e86 in lttng_ustconsumer_recv_cmd (ctx=0x55ab01d4ee50, sock=28, consumer_sockpoll=0x7f75cc976bb0) at ust-consumer.c:1861
 #10 0x000055ab004869b7 in lttng_consumer_recv_cmd (ctx=0x55ab01d4ee50, sock=28, consumer_sockpoll=0x7f75cc976bb0) at consumer.c:2097
 #11 0x000055ab0048abfd in consumer_thread_sessiond_poll (data=0x55ab01d4ee50) at consumer.c:3284
 #12 0x00007f75cfb8b609 in start_thread (arg=<optimized out>) at pthread_create.c:477
 #13 0x00007f75cfab0293 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

Which is also caused in part to the invalidation of the cache.
`metadata_stream_reset_cache_consumed_position` expect that the stream
at that point be locked. Which is not the case despite what the last argument
to `lttng_consumer_read_subbuffer` indicates. To alleviate that we hold
the lock during the call to `lttng_consumer_read_subbuffer`.

Known drawbacks
=========

None.

References
=========

[1] https://git.lttng.org/?p=lttng-tools.git;a=commit;h=b1316da1ffbd276fc8271e7a9438e683ad352781

Signed-off-by: Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
Signed-off-by: Jérémie Galarneau <jeremie.galarneau@efficios.com>
Change-Id: Ib09fdc989b1baae26d0f496afeabc3e7cb27228c

src/common/consumer/consumer-stream.c
src/common/consumer/consumer.c
src/common/consumer/consumer.h
src/common/kernel-consumer/kernel-consumer.c
src/common/ust-consumer/ust-consumer.c

index ad0f7024d524f4d82d076d3963d086c93e607b89..09543abb78b49aa0469ae641cdeb386f15c758b0 100644 (file)
 #include <unistd.h>
 
 #include <common/common.h>
+#include <common/consumer/consumer-timer.h>
+#include <common/consumer/consumer-timer.h>
+#include <common/consumer/consumer.h>
+#include <common/consumer/consumer.h>
+#include <common/consumer/metadata-bucket.h>
+#include <common/consumer/metadata-bucket.h>
 #include <common/index/index.h>
 #include <common/kernel-consumer/kernel-consumer.h>
+#include <common/kernel-ctl/kernel-ctl.h>
+#include <common/macros.h>
 #include <common/relayd/relayd.h>
 #include <common/ust-consumer/ust-consumer.h>
 #include <common/utils.h>
-#include <common/consumer/consumer.h>
-#include <common/consumer/consumer-timer.h>
-#include <common/consumer/metadata-bucket.h>
-#include <common/kernel-ctl/kernel-ctl.h>
 
 #include "consumer-stream.h"
 
@@ -52,6 +56,12 @@ static void consumer_stream_data_unlock_all(struct lttng_consumer_stream *stream
        pthread_mutex_unlock(&stream->chan->lock);
 }
 
+static void consumer_stream_data_assert_locked_all(struct lttng_consumer_stream *stream)
+{
+       ASSERT_LOCKED(stream->lock);
+       ASSERT_LOCKED(stream->chan->lock);
+}
+
 static void consumer_stream_metadata_lock_all(struct lttng_consumer_stream *stream)
 {
        consumer_stream_data_lock_all(stream);
@@ -64,6 +74,12 @@ static void consumer_stream_metadata_unlock_all(struct lttng_consumer_stream *st
        consumer_stream_data_unlock_all(stream);
 }
 
+static void consumer_stream_metadata_assert_locked_all(struct lttng_consumer_stream *stream)
+{
+       ASSERT_LOCKED(stream->metadata_rdv_lock);
+       consumer_stream_data_assert_locked_all(stream);
+}
+
 /* Only used for data streams. */
 static int consumer_stream_update_stats(struct lttng_consumer_stream *stream,
                const struct stream_subbuffer *subbuf)
@@ -712,6 +728,8 @@ struct lttng_consumer_stream *consumer_stream_create(
                                consumer_stream_metadata_lock_all;
                stream->read_subbuffer_ops.unlock =
                                consumer_stream_metadata_unlock_all;
+               stream->read_subbuffer_ops.assert_locked =
+                               consumer_stream_metadata_assert_locked_all;
                stream->read_subbuffer_ops.pre_consume_subbuffer =
                                metadata_stream_check_version;
        } else {
@@ -738,6 +756,8 @@ struct lttng_consumer_stream *consumer_stream_create(
                stream->read_subbuffer_ops.lock = consumer_stream_data_lock_all;
                stream->read_subbuffer_ops.unlock =
                                consumer_stream_data_unlock_all;
+               stream->read_subbuffer_ops.assert_locked =
+                               consumer_stream_data_assert_locked_all;
                stream->read_subbuffer_ops.pre_consume_subbuffer =
                                consumer_stream_update_stats;
        }
index a903ff9d2a643cda5c090f23ea84829416a55292..ec6af8cc14926876dc7621117b888cc4087752ad 100644 (file)
@@ -3386,6 +3386,8 @@ ssize_t lttng_consumer_read_subbuffer(struct lttng_consumer_stream *stream,
 
        if (!locked_by_caller) {
                stream->read_subbuffer_ops.lock(stream);
+       } else {
+               stream->read_subbuffer_ops.assert_locked(stream);
        }
 
        if (stream->read_subbuffer_ops.on_wake_up) {
index 0c83baa8cec451fce5671c1bc51e0c2cd3970e5e..42f09f98bdfe60260776015b2a55ae4d9c20582b 100644 (file)
@@ -396,6 +396,13 @@ typedef void (*lock_cb)(struct lttng_consumer_stream *);
  */
 typedef void (*unlock_cb)(struct lttng_consumer_stream *);
 
+/*
+ * Assert that the stream and channel lock and any other stream type specific
+ * lock that need to be acquired during the processing of a read_subbuffer
+ * operation is acquired.
+ */
+typedef void (*assert_locked_cb)(struct lttng_consumer_stream *);
+
 /*
  * Invoked when a subbuffer's metadata version does not match the last
  * known metadata version.
@@ -647,6 +654,7 @@ struct lttng_consumer_stream {
                send_live_beacon_cb send_live_beacon;
                on_sleep_cb on_sleep;
                unlock_cb unlock;
+               assert_locked_cb assert_locked;
        } read_subbuffer_ops;
        struct metadata_bucket *metadata_bucket;
 };
index 5334d13585842b0d63ab7f61d2b8cfa33e2b3b5e..8d7570d1736100f281471cf596314e475846ba9e 100644 (file)
@@ -139,7 +139,7 @@ error:
 /*
  * Take a snapshot of all the stream of a channel
  * RCU read-side lock must be held across this function to ensure existence of
- * channel. The channel lock must be held by the caller.
+ * channel.
  *
  * Returns 0 on success, < 0 on error
  */
@@ -154,6 +154,9 @@ static int lttng_kconsumer_snapshot_channel(
 
        DBG("Kernel consumer snapshot channel %" PRIu64, key);
 
+       /* Prevent channel modifications while we perform the snapshot.*/
+       pthread_mutex_lock(&channel->lock);
+
        rcu_read_lock();
 
        /* Splice is not supported yet for channel snapshot. */
@@ -347,13 +350,14 @@ end_unlock:
        pthread_mutex_unlock(&stream->lock);
 end:
        rcu_read_unlock();
+       pthread_mutex_unlock(&channel->lock);
        return ret;
 }
 
 /*
  * Read the whole metadata available for a snapshot.
  * RCU read-side lock must be held across this function to ensure existence of
- * metadata_channel. The channel lock must be held by the caller.
+ * metadata_channel.
  *
  * Returns 0 on success, < 0 on error
  */
@@ -376,7 +380,7 @@ static int lttng_kconsumer_snapshot_metadata(
        metadata_stream = metadata_channel->metadata_stream;
        assert(metadata_stream);
 
-       pthread_mutex_lock(&metadata_stream->lock);
+       metadata_stream->read_subbuffer_ops.lock(metadata_stream);
        assert(metadata_channel->trace_chunk);
        assert(metadata_stream->trace_chunk);
 
@@ -431,7 +435,7 @@ static int lttng_kconsumer_snapshot_metadata(
 
        ret = 0;
 error_snapshot:
-       pthread_mutex_unlock(&metadata_stream->lock);
+       metadata_stream->read_subbuffer_ops.unlock(metadata_stream);
        cds_list_del(&metadata_stream->send_node);
        consumer_stream_destroy(metadata_stream, NULL);
        metadata_channel->metadata_stream = NULL;
@@ -961,7 +965,6 @@ error_streams_sent_nosignal:
                        ERR("Channel %" PRIu64 " not found", key);
                        ret_code = LTTCOMM_CONSUMERD_CHAN_NOT_FOUND;
                } else {
-                       pthread_mutex_lock(&channel->lock);
                        if (msg.u.snapshot_channel.metadata == 1) {
                                int ret_snapshot;
 
@@ -989,7 +992,6 @@ error_streams_sent_nosignal:
                                        ret_code = LTTCOMM_CONSUMERD_SNAPSHOT_FAILED;
                                }
                        }
-                       pthread_mutex_unlock(&channel->lock);
                }
                health_code_update();
 
index 2948fda5043ad7b8aa4dd177e0beb04b25d38d48..bfecf09e2a6ae36bf578052f91b2b4c95855ebff 100644 (file)
@@ -1005,7 +1005,7 @@ static int snapshot_metadata(struct lttng_consumer_channel *metadata_channel,
        metadata_stream = metadata_channel->metadata_stream;
        assert(metadata_stream);
 
-       pthread_mutex_lock(&metadata_stream->lock);
+       metadata_stream->read_subbuffer_ops.lock(metadata_stream);
        if (relayd_id != (uint64_t) -1ULL) {
                metadata_stream->net_seq_idx = relayd_id;
                ret = consumer_send_relayd_stream(metadata_stream, path);
@@ -1013,14 +1013,12 @@ static int snapshot_metadata(struct lttng_consumer_channel *metadata_channel,
                ret = consumer_stream_create_output_files(metadata_stream,
                                false);
        }
-       pthread_mutex_unlock(&metadata_stream->lock);
        if (ret < 0) {
                goto error_stream;
        }
 
        do {
                health_code_update();
-
                ret = lttng_consumer_read_subbuffer(metadata_stream, ctx, true);
                if (ret < 0) {
                        goto error_stream;
@@ -1028,9 +1026,10 @@ static int snapshot_metadata(struct lttng_consumer_channel *metadata_channel,
        } while (ret > 0);
 
 error_stream:
+       metadata_stream->read_subbuffer_ops.unlock(metadata_stream);
        /*
-        * Clean up the stream completly because the next snapshot will use a new
-        * metadata stream.
+        * Clean up the stream completely because the next snapshot will use a
+        * new metadata stream.
         */
        consumer_stream_destroy(metadata_stream, NULL);
        cds_list_del(&metadata_stream->send_node);
@@ -1324,10 +1323,21 @@ int lttng_ustconsumer_recv_metadata(int sock, uint64_t key, uint64_t offset,
                 * metadata position to ensure the metadata poll thread consumes
                 * the whole cache.
                 */
-               pthread_mutex_lock(&channel->metadata_stream->lock);
-               metadata_stream_reset_cache_consumed_position(
-                               channel->metadata_stream);
-               pthread_mutex_unlock(&channel->metadata_stream->lock);
+
+               /*
+                * channel::metadata_stream can be null when the metadata
+                * channel is under a snapshot session type. No need to update
+                * the stream position in that scenario.
+                */
+               if (channel->metadata_stream != NULL) {
+                       pthread_mutex_lock(&channel->metadata_stream->lock);
+                       metadata_stream_reset_cache_consumed_position(
+                                       channel->metadata_stream);
+                       pthread_mutex_unlock(&channel->metadata_stream->lock);
+               } else {
+                       /* Validate we are in snapshot mode. */
+                       assert(!channel->monitor);
+               }
                /* Fall-through. */
        case CONSUMER_METADATA_CACHE_WRITE_STATUS_APPENDED_CONTENT:
                /*
This page took 0.033589 seconds and 5 git commands to generate.