From 569744c5a629e609de3c1ff2c269107610915c8e Mon Sep 17 00:00:00 2001 From: Jonathan Rajotte Date: Tue, 22 Jun 2021 22:17:03 -0400 Subject: [PATCH] Fix: ust: UST communication can return -EAGAIN MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Observed issue ============== The following scenario lead to an abort on event creation. The problem manifest itself when an application is unresponsive. Note that the default timeout for ust communication is 5 seconds. # Start an instrumented app ./app gdb lttng-sessiond # put a breakpoint on ustctl_create_event. lttng create my_session lttng enable-event -u -a lttng start # The tracepoint should hit. Do not continue. kill -s SIGSTOP $(pgrep app) # Continue lttng-sessiond. # lttng-sessiond will abort. Note that for UST this is not an expected behaviour. Expected communication failure with a single app should not invalidate the complete channel, compromise its setup or result in an abort. Note that a similar scenario for the following ustctl call sites also lead to scenario where failure of a single app lead to error reporting and/or error propagation to upper level object. Problematic callsites: ustctl_set_exclusion ustctl_set_filter ustctl_disable_channel These callsites are also fixed by this patch. Cause ===== For an unresponsive application, EAGAIN is returned and is treated as an "unknown" hard error. In this particular case the abort() call was introduced by commit: 88e3c2f5610b9ac89b0923d448fee34140fc46fb [1]. It is not clear if this is a leftover from debugging session since this is the only callsite where an abort is issued on communication failure via ustctl. Solution ======== Handle EAGAIN coming from ustctl_* and treat it the same way a dying application is handled. The only minor difference is that we WARN on communication time out. Albeit not the most useful thing for a CLI client, it could help overall user of lttng-sessiond in time out situation. Most call site already handled "unknown" error correctly. For those call site we simply end up bringing more info in regards to the timeout issue instead of mentioning that "-11" was returned. Note, the reclamation of "app" is handled by the poll loop and ust_app_unregister since the socket is shutdown by lttng-ust internally on error, including EAGAIN. Note that the application will try to register itself back to the lttng-sessiond based on its configuration. Known drawbacks ========= None Note ========== Some logging call sites used the ppid of the app instead of the pid. Those have been changed to pid. References ========== [1] https://github.com/lttng/lttng-tools/commit/88e3c2f5610b9ac89b0923d448fee34140fc46fb Fixes: #1384 Change-Id: If364b5d48e7fd2b664276a0fb1b7eec2c45ed683 Signed-off-by: Jonathan Rajotte Signed-off-by: Jérémie Galarneau --- src/bin/lttng-sessiond/ust-app.c | 628 ++++++++++++++++---------- src/bin/lttng-sessiond/ust-consumer.c | 26 +- 2 files changed, 410 insertions(+), 244 deletions(-) diff --git a/src/bin/lttng-sessiond/ust-app.c b/src/bin/lttng-sessiond/ust-app.c index ae2db857d..3040d0a3b 100644 --- a/src/bin/lttng-sessiond/ust-app.c +++ b/src/bin/lttng-sessiond/ust-app.c @@ -292,9 +292,18 @@ void delete_ust_app_ctx(int sock, struct ust_app_ctx *ua_ctx, pthread_mutex_lock(&app->sock_lock); ret = lttng_ust_ctl_release_object(sock, ua_ctx->obj); pthread_mutex_unlock(&app->sock_lock); - if (ret < 0 && ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app sock %d release ctx obj handle %d failed with ret %d", - sock, ua_ctx->obj->handle, ret); + if (ret < 0) { + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app release ctx failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app release ctx failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app release ctx obj handle %d failed with ret %d: pid = %d, sock = %d", + ua_ctx->obj->handle, ret, + app->pid, app->sock); + } } free(ua_ctx->obj); } @@ -320,9 +329,17 @@ void delete_ust_app_event(int sock, struct ust_app_event *ua_event, pthread_mutex_lock(&app->sock_lock); ret = lttng_ust_ctl_release_object(sock, ua_event->obj); pthread_mutex_unlock(&app->sock_lock); - if (ret < 0 && ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app sock %d release event obj failed with ret %d", - sock, ret); + if (ret < 0) { + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app release event failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app release event failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app release event obj failed with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); + } } free(ua_event->obj); } @@ -361,9 +378,17 @@ static void delete_ust_app_event_notifier_rule(int sock, pthread_mutex_lock(&app->sock_lock); ret = lttng_ust_ctl_release_object(sock, ua_event_notifier_rule->obj); pthread_mutex_unlock(&app->sock_lock); - if (ret < 0 && ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("Failed to release event notifier object: app = '%s' (ppid %d), ret = %d", - app->name, (int) app->ppid, ret); + if (ret < 0) { + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app release event notifier failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app release event notifier failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app release event notifier failed with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); + } } free(ua_event_notifier_rule->obj); @@ -390,9 +415,17 @@ static int release_ust_app_stream(int sock, struct ust_app_stream *stream, pthread_mutex_lock(&app->sock_lock); ret = lttng_ust_ctl_release_object(sock, stream->obj); pthread_mutex_unlock(&app->sock_lock); - if (ret < 0 && ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app sock %d release stream obj failed with ret %d", - sock, ret); + if (ret < 0) { + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app release stream failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app release stream failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app release stream obj failed with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); + } } lttng_fd_put(LTTNG_FD_APPS, 2); free(stream->obj); @@ -566,9 +599,20 @@ void delete_ust_app_channel(int sock, struct ust_app_channel *ua_chan, pthread_mutex_lock(&app->sock_lock); ret = lttng_ust_ctl_release_object(sock, ua_chan->obj); pthread_mutex_unlock(&app->sock_lock); - if (ret < 0 && ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app sock %d release channel obj failed with ret %d", - sock, ret); + if (ret < 0) { + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app channel %s release failed. Application is dead: pid = %d, sock = %d", + ua_chan->name, app->pid, + app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app channel %s release failed. Communication time out: pid = %d, sock = %d", + ua_chan->name, app->pid, + app->sock); + } else { + ERR("UST app channel %s release failed with ret %d: pid = %d, sock = %d", + ua_chan->name, ret, app->pid, + app->sock); + } } lttng_fd_put(LTTNG_FD_APPS, 1); free(ua_chan->obj); @@ -931,10 +975,19 @@ void delete_ust_app_session(int sock, struct ust_app_session *ua_sess, pthread_mutex_lock(&app->sock_lock); ret = lttng_ust_ctl_release_handle(sock, ua_sess->handle); pthread_mutex_unlock(&app->sock_lock); - if (ret < 0 && ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app sock %d release session handle failed with ret %d", - sock, ret); + if (ret < 0) { + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app release session handle failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app release session handle failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app release session handle failed with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); + } } + /* Remove session from application UST object descriptor. */ iter.iter.node = &ua_sess->ust_objd_node.node; ret = lttng_ht_del(app->ust_sessions_objd, &iter); @@ -1526,17 +1579,17 @@ int create_ust_channel_context(struct ust_app_channel *ua_chan, ua_chan->obj, &ua_ctx->obj); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app create channel context failed for app (pid: %d) " - "with ret %d", app->pid, ret); - } else { - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + ret = 0; + DBG3("UST app create channel context failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { ret = 0; - DBG3("UST app add context failed. Application is dead."); + WARN("UST app create channel context failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app create channel context failed with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); } goto error; } @@ -1573,17 +1626,17 @@ static int set_ust_object_filter(struct ust_app *app, ust_object); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app set object filter failed: object = %p of app pid = %d, ret = %d", - ust_object, app->pid, ret); - } else { - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { ret = 0; - DBG3("Failed to set UST app object filter. Application is dead."); + DBG3("UST app set filter failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + ret = 0; + WARN("UST app set filter failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app set filter failed with ret %d: pid = %d, sock = %d, object = %p", + ret, app->pid, app->sock, ust_object); } goto error; } @@ -1627,17 +1680,18 @@ static int set_ust_capture(struct ust_app *app, ust_object); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app set object capture failed: object = %p of app pid = %d, ret = %d", - ust_object, app->pid, ret); - } else { - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + ret = 0; + DBG3("UST app set capture failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { ret = 0; - DBG3("Failed to set UST app object capture. Application is dead."); + DBG3("UST app set capture failed. Communication timeout: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app event set capture failed with ret %d: pid = %d, sock = %d", + ret, app->pid, + app->sock); } goto error; @@ -1696,17 +1750,17 @@ static int set_ust_object_exclusions(struct ust_app *app, ret = lttng_ust_ctl_set_exclusion(app->sock, ust_exclusions, ust_object); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("Failed to set UST app exclusions for object %p of app (pid: %d) " - "with ret %d", ust_object, app->pid, ret); - } else { - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + ret = 0; + DBG3("UST app event exclusion failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { ret = 0; - DBG3("Failed to set UST app object exclusions. Application is dead."); + WARN("UST app event exclusion failed. Communication time out(pid: %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app event exclusions failed with ret %d: pid = %d, sock = %d, object = %p", + ret, app->pid, app->sock, ust_object); } goto error; } @@ -1733,22 +1787,22 @@ static int disable_ust_object(struct ust_app *app, ret = lttng_ust_ctl_disable(app->sock, object); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("Failed to disable UST app object %p app (pid: %d) with ret %d", - object, app->pid, ret); - } else { - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { ret = 0; - DBG3("Failed to disable UST app object. Application is dead."); + DBG3("UST app disable object failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + ret = 0; + WARN("UST app disable object failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app disable object failed with ret %d: pid = %d, sock = %d, object = %p", + ret, app->pid, app->sock, object); } goto error; } - DBG2("UST app object %p disabled successfully for app (pid: %d)", + DBG2("UST app object %p disabled successfully for app: pid = %d", object, app->pid); error: @@ -1770,23 +1824,23 @@ static int disable_ust_channel(struct ust_app *app, ret = lttng_ust_ctl_disable(app->sock, ua_chan->obj); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app channel %s disable failed for app (pid: %d) " - "and session handle %d with ret %d", - ua_chan->name, app->pid, ua_sess->handle, ret); - } else { - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + ret = 0; + DBG3("UST app disable channel failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { ret = 0; - DBG3("UST app disable channel failed. Application is dead."); + WARN("UST app disable channel failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app channel %s disable failed, session handle %d, with ret %d: pid = %d, sock = %d", + ua_chan->name, ua_sess->handle, ret, + app->pid, app->sock); } goto error; } - DBG2("UST app channel %s disabled successfully for app (pid: %d)", + DBG2("UST app channel %s disabled successfully for app: pid = %d", ua_chan->name, app->pid); error: @@ -1808,25 +1862,25 @@ static int enable_ust_channel(struct ust_app *app, ret = lttng_ust_ctl_enable(app->sock, ua_chan->obj); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app channel %s enable failed for app (pid: %d) " - "and session handle %d with ret %d", - ua_chan->name, app->pid, ua_sess->handle, ret); - } else { - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { ret = 0; - DBG3("UST app enable channel failed. Application is dead."); + DBG3("UST app channel %s enable failed. Application is dead: pid = %d, sock = %d", + ua_chan->name, app->pid, app->sock); + } else if (ret == -EAGAIN) { + ret = 0; + WARN("UST app channel %s enable failed. Communication time out: pid = %d, sock = %d", + ua_chan->name, app->pid, app->sock); + } else { + ERR("UST app channel %s enable failed, session handle %d, with ret %d: pid = %d, sock = %d", + ua_chan->name, ua_sess->handle, ret, + app->pid, app->sock); } goto error; } ua_chan->enabled = 1; - DBG2("UST app channel %s enabled successfully for app (pid: %d)", + DBG2("UST app channel %s enabled successfully for app: pid = %d", ua_chan->name, app->pid); error: @@ -1848,22 +1902,22 @@ static int enable_ust_object( ret = lttng_ust_ctl_enable(app->sock, ust_object); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app enable failed for object %p app (pid: %d) with ret %d", - ust_object, app->pid, ret); - } else { - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + ret = 0; + DBG3("UST app enable object failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { ret = 0; - DBG3("Failed to enable UST app object. Application is dead."); + WARN("UST app enable object failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app enable object failed with ret %d: pid = %d, sock = %d, object = %p", + ret, app->pid, app->sock, ust_object); } goto error; } - DBG2("UST app object %p enabled successfully for app (pid: %d)", + DBG2("UST app object %p enabled successfully for app: pid = %d", ust_object, app->pid); error: @@ -1896,6 +1950,13 @@ static int send_channel_pid_to_ust(struct ust_app *app, if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { ret = -ENOTCONN; /* Caused by app exiting. */ goto error; + } else if (ret == -EAGAIN) { + /* Caused by timeout. */ + WARN("Communication with application %d timed out on send_channel for channel \"%s\" of session \"%" PRIu64 "\".", + app->pid, ua_chan->name, ua_sess->tracing_id); + /* Treat this the same way as an application that is exiting. */ + ret = -ENOTCONN; + goto error; } else if (ret < 0) { goto error; } @@ -1906,8 +1967,18 @@ static int send_channel_pid_to_ust(struct ust_app *app, cds_list_for_each_entry_safe(stream, stmp, &ua_chan->streams.head, list) { ret = ust_consumer_send_stream_to_ust(app, ua_chan, stream); if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { - ret = -ENOTCONN; /* Caused by app exiting. */ + ret = -ENOTCONN; /* Caused by app exiting. */ goto error; + } else if (ret == -EAGAIN) { + /* Caused by timeout. */ + WARN("Communication with application %d timed out on send_stream for stream \"%s\" of channel \"%s\" of session \"%" PRIu64 "\".", + app->pid, stream->name, ua_chan->name, + ua_sess->tracing_id); + /* + * Treat this the same way as an application that is + * exiting. + */ + ret = -ENOTCONN; } else if (ret < 0) { goto error; } @@ -1942,25 +2013,25 @@ int create_ust_event(struct ust_app *app, struct ust_app_session *ua_sess, &ua_event->obj); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - abort(); - ERR("Error ustctl create event %s for app pid: %d with ret %d", - ua_event->attr.name, app->pid, ret); - } else { - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { ret = 0; - DBG3("UST app create event failed. Application is dead."); + DBG3("UST app create event failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + ret = 0; + WARN("UST app create event failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app create event '%s' failed with ret %d: pid = %d, sock = %d", + ua_event->attr.name, ret, app->pid, + app->sock); } goto error; } ua_event->handle = ua_event->obj->handle; - DBG2("UST app event %s created successfully for pid:%d object: %p", + DBG2("UST app event %s created successfully for pid:%d object = %p", ua_event->attr.name, app->pid, ua_event->obj); health_code_update(); @@ -2143,28 +2214,26 @@ static int create_ust_event_notifier(struct ust_app *app, &ua_event_notifier_rule->obj); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("Error ustctl create event notifier: name = '%s', app = '%s' (ppid: %d), ret = %d", - event_notifier.event.name, app->name, - app->ppid, ret); - } else { - /* - * This is normal behavior, an application can die - * during the creation process. Don't report an error so - * the execution can continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + ret = 0; + DBG3("UST app create event notifier failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { ret = 0; - DBG3("UST app create event notifier failed (application is dead): app = '%s' (ppid = %d)", - app->name, app->ppid); + WARN("UST app create event notifier failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app create event notifier '%s' failed with ret %d: pid = %d, sock = %d", + event_notifier.event.name, ret, app->pid, + app->sock); } - goto error; } ua_event_notifier_rule->handle = ua_event_notifier_rule->obj->handle; - DBG2("UST app event notifier %s created successfully: app = '%s' (ppid: %d), object: %p", - event_notifier.event.name, app->name, app->ppid, + DBG2("UST app event notifier %s created successfully: app = '%s': pid = %d), object = %p", + event_notifier.event.name, app->name, app->pid, ua_event_notifier_rule->obj); health_code_update(); @@ -2640,18 +2709,17 @@ static int find_or_create_ust_app_session(struct ltt_ust_session *usess, ret = lttng_ust_ctl_create_session(app->sock); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("Creating session for app pid %d with ret %d", - app->pid, ret); - } else { - DBG("UST app creating session failed. Application is dead"); - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. This will get flagged ENOTCONN and the - * caller will handle it. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG("UST app creating session failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + ret = 0; + } else if (ret == -EAGAIN) { + DBG("UST app creating session failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); ret = 0; + } else { + ERR("UST app creating session failed with ret %d: pid = %d, sock =%d", + ret, app->pid, app->sock); } delete_ust_app_session(-1, ua_sess, app); if (ret != -ENOMEM) { @@ -3233,6 +3301,13 @@ static int send_channel_uid_to_ust(struct buffer_reg_channel *buf_reg_chan, if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { ret = -ENOTCONN; /* Caused by app exiting. */ goto error; + } else if (ret == -EAGAIN) { + /* Caused by timeout. */ + WARN("Communication with application %d timed out on send_channel for channel \"%s\" of session \"%" PRIu64 "\".", + app->pid, ua_chan->name, ua_sess->tracing_id); + /* Treat this the same way as an application that is exiting. */ + ret = -ENOTCONN; + goto error; } else if (ret < 0) { goto error; } @@ -3251,10 +3326,21 @@ static int send_channel_uid_to_ust(struct buffer_reg_channel *buf_reg_chan, ret = ust_consumer_send_stream_to_ust(app, ua_chan, &stream); if (ret < 0) { - (void) release_ust_app_stream(-1, &stream, app); if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { ret = -ENOTCONN; /* Caused by app exiting. */ + } else if (ret == -EAGAIN) { + /* + * Caused by timeout. + * Treat this the same way as an application + * that is exiting. + */ + WARN("Communication with application %d timed out on send_stream for stream \"%s\" of channel \"%s\" of session \"%" PRIu64 "\".", + app->pid, stream.name, + ua_chan->name, + ua_sess->tracing_id); + ret = -ENOTCONN; } + (void) release_ust_app_stream(-1, &stream, app); goto error_stream_unlock; } @@ -3649,8 +3735,8 @@ int create_ust_app_event(struct ust_app_session *ua_sess, add_unique_ust_app_event(ua_chan, ua_event); - DBG2("UST app create event completed: app = '%s' (ppid: %d)", - app->name, app->ppid); + DBG2("UST app create event completed: app = '%s' pid = %d", + app->name, app->pid); end: return ret; @@ -3702,8 +3788,8 @@ int create_ust_app_event_notifier_rule(struct lttng_trigger *trigger, lttng_ht_add_unique_u64(app->token_to_event_notifier_rule_ht, &ua_event_notifier_rule->node); - DBG2("UST app create token event rule completed: app = '%s' (ppid: %d), token = %" PRIu64, - app->name, app->ppid, lttng_trigger_get_tracer_token(trigger)); + DBG2("UST app create token event rule completed: app = '%s', pid = %d), token = %" PRIu64, + app->name, app->pid, lttng_trigger_get_tracer_token(trigger)); goto end; @@ -3885,15 +3971,15 @@ struct ust_app *ust_app_create(struct ust_register_msg *msg, int sock) */ ret = lttng_fd_get(LTTNG_FD_APPS, 2); if (ret) { - ERR("Failed to reserve two file descriptors for the event source pipe while creating a new application instance: app = '%s' (ppid: %d)", - msg->name, (int) msg->ppid); + ERR("Failed to reserve two file descriptors for the event source pipe while creating a new application instance: app = '%s', pid = %d", + msg->name, (int) msg->pid); goto error; } event_notifier_event_source_pipe = lttng_pipe_open(FD_CLOEXEC); if (!event_notifier_event_source_pipe) { - PERROR("Failed to open application event source pipe: '%s' (ppid = %d)", - msg->name, msg->ppid); + PERROR("Failed to open application event source pipe: '%s' (pid = %d)", + msg->name, msg->pid); goto error; } @@ -3981,8 +4067,8 @@ void ust_app_add(struct ust_app *app) lttng_ht_node_init_ulong(&app->notify_sock_n, app->notify_sock); lttng_ht_add_unique_ulong(ust_app_ht_by_notify_sock, &app->notify_sock_n); - DBG("App registered with pid:%d ppid:%d uid:%d gid:%d sock:%d name:%s " - "notify_sock:%d (version %d.%d)", app->pid, app->ppid, app->uid, + DBG("App registered with pid:%d ppid:%d uid:%d gid:%d sock =%d name:%s " + "notify_sock =%d (version %d.%d)", app->pid, app->ppid, app->uid, app->gid, app->sock, app->name, app->notify_sock, app->v_major, app->v_minor); @@ -4005,10 +4091,15 @@ int ust_app_version(struct ust_app *app) ret = lttng_ust_ctl_tracer_version(app->sock, &app->version); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -LTTNG_UST_ERR_EXITING && ret != -EPIPE) { - ERR("UST app %d version failed with ret %d", app->sock, ret); + if (ret == -LTTNG_UST_ERR_EXITING || ret == -EPIPE) { + DBG3("UST app version failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app version failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); } else { - DBG3("UST app %d version failed. Application is dead", app->sock); + ERR("UST app version failed with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); } } @@ -4055,21 +4146,25 @@ int ust_app_setup_event_notifier_group(struct ust_app *app) event_pipe_write_fd, &event_notifier_group); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -LTTNG_UST_ERR_EXITING && ret != -EPIPE) { - ERR("Failed to create application event notifier group: ret = %d, app socket fd = %d, event_pipe_write_fd = %d", - ret, app->sock, event_pipe_write_fd); + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + ret = 0; + DBG3("UST app create event notifier group failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + ret = 0; + WARN("UST app create event notifier group failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); } else { - DBG("Failed to create application event notifier group (application is dead): app socket fd = %d", - app->sock); + ERR("UST app create event notifier group failed with ret %d: pid = %d, sock = %d, event_pipe_write_fd: %d", + ret, app->pid, app->sock, event_pipe_write_fd); } - goto error; } ret = lttng_pipe_write_close(app->event_notifier_group.event_pipe); if (ret) { - ERR("Failed to close write end of the application's event source pipe: app = '%s' (ppid = %d)", - app->name, app->ppid); + ERR("Failed to close write end of the application's event source pipe: app = '%s' (pid = %d)", + app->name, app->pid); goto error; } @@ -4099,13 +4194,13 @@ int ust_app_setup_event_notifier_group(struct ust_app *app) case EVENT_NOTIFIER_ERROR_ACCOUNTING_STATUS_OK: break; case EVENT_NOTIFIER_ERROR_ACCOUNTING_STATUS_UNSUPPORTED: - DBG3("Failed to setup event notifier error accounting (application does not support notifier error accounting): app socket fd = %d, app name = '%s', app ppid = %d", - app->sock, app->name, (int) app->ppid); + DBG3("Failed to setup event notifier error accounting (application does not support notifier error accounting): app socket fd = %d, app name = '%s', app pid = %d", + app->sock, app->name, (int) app->pid); ret = 0; goto error_accounting; case EVENT_NOTIFIER_ERROR_ACCOUNTING_STATUS_APP_DEAD: - DBG3("Failed to setup event notifier error accounting (application is dead): app socket fd = %d, app name = '%s', app ppid = %d", - app->sock, app->name, (int) app->ppid); + DBG3("Failed to setup event notifier error accounting (application is dead): app socket fd = %d, app name = '%s', app pid = %d", + app->sock, app->name, (int) app->pid); ret = 0; goto error_accounting; default: @@ -4307,11 +4402,6 @@ int ust_app_list_events(struct lttng_event **events) app->sock, ret); } else { DBG3("UST app tp list get failed. Application is dead"); - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. Continue normal execution. - */ break; } free(tmp_event); @@ -4366,8 +4456,17 @@ int ust_app_list_events(struct lttng_event **events) } ret = lttng_ust_ctl_release_handle(app->sock, handle); pthread_mutex_unlock(&app->sock_lock); - if (ret < 0 && ret != -LTTNG_UST_ERR_EXITING && ret != -EPIPE) { - ERR("Error releasing app handle for app %d with ret %d", app->sock, ret); + if (ret < 0) { + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("Error releasing app handle. Application died: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("Error releasing app handle. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("Error releasing app handle with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); + } } } @@ -4438,11 +4537,6 @@ int ust_app_list_event_fields(struct lttng_event_field **fields) app->sock, ret); } else { DBG3("UST app tp list field failed. Application is dead"); - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. Reset list and count for next app. - */ break; } free(tmp_event); @@ -5047,18 +5141,20 @@ skip_setup: ret = lttng_ust_ctl_start_session(app->sock, ua_sess->handle); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("Error starting tracing for app pid: %d (ret: %d)", - app->pid, ret); - } else { - DBG("UST app start session failed. Application is dead."); - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app start session failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + pthread_mutex_unlock(&ua_sess->lock); + goto end; + } else if (ret == -EAGAIN) { + WARN("UST app start session failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); pthread_mutex_unlock(&ua_sess->lock); goto end; + + } else { + ERR("UST app start session failed with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); } goto error_unlock; } @@ -5075,9 +5171,17 @@ skip_setup: pthread_mutex_lock(&app->sock_lock); ret = lttng_ust_ctl_wait_quiescent(app->sock); pthread_mutex_unlock(&app->sock_lock); - if (ret < 0 && ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app wait quiescent failed for app pid %d ret %d", - app->pid, ret); + if (ret < 0) { + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app wait quiescent failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app wait quiescent failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); + } else { + ERR("UST app wait quiescent failed with ret %d: pid %d, sock = %d", + ret, app->pid, app->sock); + } } end: @@ -5139,17 +5243,18 @@ int ust_app_stop_trace(struct ltt_ust_session *usess, struct ust_app *app) ret = lttng_ust_ctl_stop_session(app->sock, ua_sess->handle); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("Error stopping tracing for app pid: %d (ret: %d)", - app->pid, ret); - } else { - DBG("UST app stop session failed. Application is dead."); - /* - * This is normal behavior, an application can die during the - * creation process. Don't report an error so the execution can - * continue normally. - */ + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app stop session failed. Application is dead: pid = %d, sock = %d", + app->pid, app->sock); + goto end_unlock; + } else if (ret == -EAGAIN) { + WARN("UST app stop session failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); goto end_unlock; + + } else { + ERR("UST app stop session failed with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); } goto error_rcu_unlock; } @@ -5161,9 +5266,17 @@ int ust_app_stop_trace(struct ltt_ust_session *usess, struct ust_app *app) pthread_mutex_lock(&app->sock_lock); ret = lttng_ust_ctl_wait_quiescent(app->sock); pthread_mutex_unlock(&app->sock_lock); - if (ret < 0 && ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app wait quiescent failed for app pid %d ret %d", - app->pid, ret); + if (ret < 0) { + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app wait quiescent failed. Application is dead: pid= %d, sock = %d)", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app wait quiescent failed. Communication time out: pid= %d, sock = %d)", + app->pid, app->sock); + } else { + ERR("UST app wait quiescent failed with ret %d: pid= %d, sock = %d)", + ret, app->pid, app->sock); + } } health_code_update(); @@ -5511,9 +5624,17 @@ static int destroy_trace(struct ltt_ust_session *usess, struct ust_app *app) pthread_mutex_lock(&app->sock_lock); ret = lttng_ust_ctl_wait_quiescent(app->sock); pthread_mutex_unlock(&app->sock_lock); - if (ret < 0 && ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app wait quiescent failed for app pid %d ret %d", - app->pid, ret); + if (ret < 0) { + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app wait quiescent failed. Application is dead: pid= %d, sock = %d)", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app wait quiescent failed. Communication time out: pid= %d, sock = %d)", + app->pid, app->sock); + } else { + ERR("UST app wait quiescent failed with ret %d: pid= %d, sock = %d)", + ret, app->pid, app->sock); + } } end: rcu_read_unlock(); @@ -5999,16 +6120,16 @@ void ust_app_global_update(struct ltt_ust_session *usess, struct ust_app *app) */ void ust_app_global_update_event_notifier_rules(struct ust_app *app) { - DBG2("UST application global event notifier rules update: app = '%s' (ppid: %d)", - app->name, app->ppid); + DBG2("UST application global event notifier rules update: app = '%s', pid = %d)", + app->name, app->pid); if (!app->compatible || !ust_app_supports_notifiers(app)) { return; } if (app->event_notifier_group.object == NULL) { - WARN("UST app global update of event notifiers for app skipped since communication handle is null: app = '%s' (ppid: %d)", - app->name, app->ppid); + WARN("UST app global update of event notifiers for app skipped since communication handle is null: app = '%s' pid = %d)", + app->name, app->pid); return; } @@ -6294,15 +6415,20 @@ static int reply_ust_register_channel(int sock, int cobjd, reply: DBG3("UST app replying to register channel key %" PRIu64 - " with id %u, type: %d, ret: %d", chan_reg_key, chan_id, type, + " with id %u, type = %d, ret = %d", chan_reg_key, chan_id, type, ret_code); ret = lttng_ust_ctl_reply_register_channel(sock, chan_id, type, ret_code); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app reply channel failed with ret %d", ret); + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app reply channel failed. Application died: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app reply channel failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); } else { - DBG3("UST app reply channel failed. Application died"); + ERR("UST app reply channel failed with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); } goto error; } @@ -6396,10 +6522,15 @@ static int add_event_ust_registry(int sock, int sobjd, int cobjd, char *name, */ ret = lttng_ust_ctl_reply_register_event(sock, event_id, ret_code); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app reply event failed with ret %d", ret); + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app reply event failed. Application died: pid = %d, sock = %d.", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app reply event failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); } else { - DBG3("UST app reply event failed. Application died"); + ERR("UST app reply event failed with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); } /* * No need to wipe the create event since the application socket will @@ -6484,10 +6615,15 @@ static int add_enum_ust_registry(int sock, int sobjd, char *name, */ ret = lttng_ust_ctl_reply_register_enum(sock, enum_id, ret_code); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app reply enum failed with ret %d", ret); + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app reply enum failed. Application died: pid = %d, sock = %d", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app reply enum failed. Communication time out: pid = %d, sock = %d", + app->pid, app->sock); } else { - DBG3("UST app reply enum failed. Application died"); + ERR("UST app reply enum failed with ret %d: pid = %d, sock = %d", + ret, app->pid, app->sock); } /* * No need to wipe the create enum since the application socket will @@ -6519,10 +6655,15 @@ int ust_app_recv_notify(int sock) ret = lttng_ust_ctl_recv_notify(sock, &cmd); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app recv notify failed with ret %d", ret); + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app recv notify failed. Application died: sock = %d", + sock); + } else if (ret == -EAGAIN) { + WARN("UST app recv notify failed. Communication time out: sock = %d", + sock); } else { - DBG3("UST app recv notify failed. Application died"); + ERR("UST app recv notify failed with ret %d: sock = %d", + ret, sock); } goto error; } @@ -6541,10 +6682,15 @@ int ust_app_recv_notify(int sock) &loglevel_value, &sig, &nr_fields, &fields, &model_emf_uri); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app recv event failed with ret %d", ret); + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app recv event failed. Application died: sock = %d", + sock); + } else if (ret == -EAGAIN) { + WARN("UST app recv event failed. Communication time out: sock = %d", + sock); } else { - DBG3("UST app recv event failed. Application died"); + ERR("UST app recv event failed with ret %d: sock = %d", + ret, sock); } goto error; } @@ -6574,10 +6720,15 @@ int ust_app_recv_notify(int sock) ret = lttng_ust_ctl_recv_register_channel(sock, &sobjd, &cobjd, &nr_fields, &fields); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app recv channel failed with ret %d", ret); + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app recv channel failed. Application died: sock = %d", + sock); + } else if (ret == -EAGAIN) { + WARN("UST app recv channel failed. Communication time out: sock = %d", + sock); } else { - DBG3("UST app recv channel failed. Application died"); + ERR("UST app recv channel failed with ret %d: sock = %d)", + ret, sock); } goto error; } @@ -6607,10 +6758,15 @@ int ust_app_recv_notify(int sock) ret = lttng_ust_ctl_recv_register_enum(sock, &sobjd, name, &entries, &nr_entries); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("UST app recv enum failed with ret %d", ret); + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app recv enum failed. Application died: sock = %d", + sock); + } else if (ret == -EAGAIN) { + WARN("UST app recv enum failed. Communication time out: sock = %d", + sock); } else { - DBG3("UST app recv enum failed. Application died"); + ERR("UST app recv enum failed with ret %d: sock = %d", + ret, sock); } goto error; } diff --git a/src/bin/lttng-sessiond/ust-consumer.c b/src/bin/lttng-sessiond/ust-consumer.c index 14cce9b4f..379aa5884 100644 --- a/src/bin/lttng-sessiond/ust-consumer.c +++ b/src/bin/lttng-sessiond/ust-consumer.c @@ -374,11 +374,16 @@ int ust_consumer_send_stream_to_ust(struct ust_app *app, ret = lttng_ust_ctl_send_stream_to_ust(app->sock, channel->obj, stream->obj); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("ustctl send stream handle %d to app pid: %d with ret %d", - stream->obj->handle, app->pid, ret); + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app send stream to ust failed. Application is dead. (pid: %d, sock: %d).", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app send stream to ust failed. Communication time out (pid: %d, sock: %d).", + app->pid, app->sock); } else { - DBG3("UST app send stream to ust failed. Application is dead."); + ERR("UST app send stream, handle %d, to ust failed with ret %d (pid: %d, sock: %d).", + stream->obj->handle, ret, app->pid, + app->sock); } goto error; } @@ -411,11 +416,16 @@ int ust_consumer_send_channel_to_ust(struct ust_app *app, ret = lttng_ust_ctl_send_channel_to_ust(app->sock, ua_sess->handle, channel->obj); pthread_mutex_unlock(&app->sock_lock); if (ret < 0) { - if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) { - ERR("Error ustctl send channel %s to app pid: %d with ret %d", - channel->name, app->pid, ret); + if (ret == -EPIPE || ret == -LTTNG_UST_ERR_EXITING) { + DBG3("UST app send channel to ust failed. Application is dead (pid: %d, sock: %d).", + app->pid, app->sock); + } else if (ret == -EAGAIN) { + WARN("UST app send channel to ust failed. Communication timeout (pid: %d, sock: %d).", + app->pid, app->sock); } else { - DBG3("UST app send channel to ust failed. Application is dead."); + ERR("UST app send channel %s, to ust failed with ret %d (pid: %d, sock: %d).", + channel->name, ret, app->pid, + app->sock); } goto error; } -- 2.34.1