/*
 * Copyright (C) 2017 - Julien Desfossez <jdesfossez@efficios.com>
 * Copyright (C) 2018 - Jérémie Galarneau <jeremie.galarneau@efficios.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License, version 2 only, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 51
 * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>

#include <lttng/trigger/trigger.h>
#include <common/error.h>
#include <common/config/session-config.h>
#include <common/defaults.h>
#include <common/utils.h>
#include <common/futex.h>
#include <common/align.h>
#include <common/time.h>
#include <common/hashtable/utils.h>

#include <common/kernel-ctl/kernel-ctl.h>
#include <lttng/notification/channel-internal.h>
#include <lttng/rotate-internal.h>

#include "rotation-thread.h"
#include "lttng-sessiond.h"
#include "health-sessiond.h"
#include "notification-thread-commands.h"

#include <urcu/list.h>
/*
 * Notification channel used by the rotation thread. It is created by
 * init_thread_state() and destroyed by fini_thread_state().
 */
struct lttng_notification_channel *rotate_notification_channel = NULL;
55 struct rotation_thread
{
56 struct lttng_poll_event events
;
59 struct rotation_thread_job
{
60 enum rotation_thread_job_type type
;
61 struct ltt_session
*session
;
62 /* List member in struct rotation_thread_timer_queue. */
63 struct cds_list_head head
;
67 * The timer thread enqueues jobs and wakes up the rotation thread.
68 * When the rotation thread wakes up, it empties the queue.
70 struct rotation_thread_timer_queue
{
71 struct lttng_pipe
*event_pipe
;
72 struct cds_list_head list
;
/* Handle given to the rotation thread at launch. */
struct rotation_thread_handle {
	struct rotation_thread_timer_queue *rotation_timer_queue;
	/* Access to the notification thread cmd_queue */
	struct notification_thread_handle *notification_thread_handle;
	/* Thread-specific quit pipe. */
	struct lttng_pipe *quit_pipe;
};
85 const char *get_job_type_str(enum rotation_thread_job_type job_type
)
88 case ROTATION_THREAD_JOB_TYPE_CHECK_PENDING_ROTATION
:
89 return "CHECK_PENDING_ROTATION";
90 case ROTATION_THREAD_JOB_TYPE_SCHEDULED_ROTATION
:
91 return "SCHEDULED_ROTATION";
97 struct rotation_thread_timer_queue
*rotation_thread_timer_queue_create(void)
99 struct rotation_thread_timer_queue
*queue
= NULL
;
101 queue
= zmalloc(sizeof(*queue
));
103 PERROR("Failed to allocate timer rotate queue");
107 queue
->event_pipe
= lttng_pipe_open(FD_CLOEXEC
| O_NONBLOCK
);
108 CDS_INIT_LIST_HEAD(&queue
->list
);
109 pthread_mutex_init(&queue
->lock
, NULL
);
114 void rotation_thread_timer_queue_destroy(
115 struct rotation_thread_timer_queue
*queue
)
121 lttng_pipe_destroy(queue
->event_pipe
);
123 pthread_mutex_lock(&queue
->lock
);
124 assert(cds_list_empty(&queue
->list
));
125 pthread_mutex_unlock(&queue
->lock
);
126 pthread_mutex_destroy(&queue
->lock
);
131 * Destroy the thread data previously created by the init function.
133 void rotation_thread_handle_destroy(
134 struct rotation_thread_handle
*handle
)
136 lttng_pipe_destroy(handle
->quit_pipe
);
140 struct rotation_thread_handle
*rotation_thread_handle_create(
141 struct rotation_thread_timer_queue
*rotation_timer_queue
,
142 struct notification_thread_handle
*notification_thread_handle
)
144 struct rotation_thread_handle
*handle
;
146 handle
= zmalloc(sizeof(*handle
));
151 handle
->rotation_timer_queue
= rotation_timer_queue
;
152 handle
->notification_thread_handle
= notification_thread_handle
;
153 handle
->quit_pipe
= lttng_pipe_open(FD_CLOEXEC
);
154 if (!handle
->quit_pipe
) {
161 rotation_thread_handle_destroy(handle
);
166 * Called with the rotation_thread_timer_queue lock held.
167 * Return true if the same timer job already exists in the queue, false if not.
170 bool timer_job_exists(const struct rotation_thread_timer_queue
*queue
,
171 enum rotation_thread_job_type job_type
,
172 struct ltt_session
*session
)
175 struct rotation_thread_job
*job
;
177 cds_list_for_each_entry(job
, &queue
->list
, head
) {
178 if (job
->session
== session
&& job
->type
== job_type
) {
187 void rotation_thread_enqueue_job(struct rotation_thread_timer_queue
*queue
,
188 enum rotation_thread_job_type job_type
,
189 struct ltt_session
*session
)
192 const char dummy
= '!';
193 struct rotation_thread_job
*job
= NULL
;
194 const char *job_type_str
= get_job_type_str(job_type
);
196 pthread_mutex_lock(&queue
->lock
);
197 if (timer_job_exists(queue
, job_type
, session
)) {
199 * This timer job is already pending, we don't need to add
205 job
= zmalloc(sizeof(struct rotation_thread_job
));
207 PERROR("Failed to allocate rotation thread job of type \"%s\" for session \"%s\"",
208 job_type_str
, session
->name
);
211 /* No reason for this to fail as the caller must hold a reference. */
212 (void) session_get(session
);
214 job
->session
= session
;
215 job
->type
= job_type
;
216 cds_list_add_tail(&job
->head
, &queue
->list
);
218 ret
= lttng_write(lttng_pipe_get_writefd(queue
->event_pipe
), &dummy
,
222 * We do not want to block in the timer handler, the job has
223 * been enqueued in the list, the wakeup pipe is probably full,
224 * the job will be processed when the rotation_thread catches
227 if (errno
== EAGAIN
|| errno
== EWOULDBLOCK
) {
229 * Not an error, but would be surprising and indicate
230 * that the rotation thread can't keep up with the
233 DBG("Wake-up pipe of rotation thread job queue is full");
236 PERROR("Failed to wake-up the rotation thread after pushing a job of type \"%s\" for session \"%s\"",
237 job_type_str
, session
->name
);
242 pthread_mutex_unlock(&queue
->lock
);
246 int init_poll_set(struct lttng_poll_event
*poll_set
,
247 struct rotation_thread_handle
*handle
)
252 * Create pollset with size 3:
253 * - rotation thread quit pipe,
254 * - rotation thread timer queue pipe,
255 * - notification channel sock,
257 ret
= lttng_poll_create(poll_set
, 5, LTTNG_CLOEXEC
);
262 ret
= lttng_poll_add(poll_set
,
263 lttng_pipe_get_readfd(handle
->quit_pipe
),
266 ERR("[rotation-thread] Failed to add quit pipe read fd to poll set");
270 ret
= lttng_poll_add(poll_set
,
271 lttng_pipe_get_readfd(handle
->rotation_timer_queue
->event_pipe
),
274 ERR("[rotation-thread] Failed to add rotate_pending fd to poll set");
280 lttng_poll_clean(poll_set
);
285 void fini_thread_state(struct rotation_thread
*state
)
287 lttng_poll_clean(&state
->events
);
288 if (rotate_notification_channel
) {
289 lttng_notification_channel_destroy(rotate_notification_channel
);
294 int init_thread_state(struct rotation_thread_handle
*handle
,
295 struct rotation_thread
*state
)
299 memset(state
, 0, sizeof(*state
));
300 lttng_poll_init(&state
->events
);
302 ret
= init_poll_set(&state
->events
, handle
);
304 ERR("[rotation-thread] Failed to initialize rotation thread poll set");
308 rotate_notification_channel
= lttng_notification_channel_create(
309 lttng_session_daemon_notification_endpoint
);
310 if (!rotate_notification_channel
) {
311 ERR("[rotation-thread] Could not create notification channel");
315 ret
= lttng_poll_add(&state
->events
, rotate_notification_channel
->socket
,
318 ERR("[rotation-thread] Failed to add notification fd to pollset");
327 void check_session_rotation_pending_on_consumers(struct ltt_session
*session
,
328 bool *_rotation_completed
)
331 struct consumer_socket
*socket
;
332 struct cds_lfht_iter iter
;
333 enum consumer_trace_chunk_exists_status exists_status
;
335 bool chunk_exists_on_peer
= false;
336 enum lttng_trace_chunk_status chunk_status
;
338 assert(session
->chunk_being_archived
);
341 * Check for a local pending rotation on all consumers (32-bit
342 * user space, 64-bit user space, and kernel).
345 if (!session
->ust_session
) {
348 cds_lfht_for_each_entry(session
->ust_session
->consumer
->socks
->ht
,
349 &iter
, socket
, node
.node
) {
350 relayd_id
= session
->ust_session
->consumer
->type
== CONSUMER_DST_LOCAL
?
352 session
->ust_session
->consumer
->net_seq_index
;
354 pthread_mutex_lock(socket
->lock
);
355 ret
= consumer_trace_chunk_exists(socket
,
357 session
->id
, session
->chunk_being_archived
,
360 pthread_mutex_unlock(socket
->lock
);
361 ERR("Error occurred while checking rotation status on consumer daemon");
365 if (exists_status
!= CONSUMER_TRACE_CHUNK_EXISTS_STATUS_UNKNOWN_CHUNK
) {
366 pthread_mutex_unlock(socket
->lock
);
367 chunk_exists_on_peer
= true;
370 pthread_mutex_unlock(socket
->lock
);
374 if (!session
->kernel_session
) {
377 cds_lfht_for_each_entry(session
->kernel_session
->consumer
->socks
->ht
,
378 &iter
, socket
, node
.node
) {
379 pthread_mutex_lock(socket
->lock
);
380 relayd_id
= session
->kernel_session
->consumer
->type
== CONSUMER_DST_LOCAL
?
382 session
->kernel_session
->consumer
->net_seq_index
;
384 ret
= consumer_trace_chunk_exists(socket
,
386 session
->id
, session
->chunk_being_archived
,
389 pthread_mutex_unlock(socket
->lock
);
390 ERR("Error occurred while checking rotation status on consumer daemon");
394 if (exists_status
!= CONSUMER_TRACE_CHUNK_EXISTS_STATUS_UNKNOWN_CHUNK
) {
395 pthread_mutex_unlock(socket
->lock
);
396 chunk_exists_on_peer
= true;
399 pthread_mutex_unlock(socket
->lock
);
405 if (!chunk_exists_on_peer
) {
406 uint64_t chunk_being_archived_id
;
408 chunk_status
= lttng_trace_chunk_get_id(
409 session
->chunk_being_archived
,
410 &chunk_being_archived_id
);
411 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
412 DBG("[rotation-thread] Rotation of trace archive %" PRIu64
" of session \"%s\" is complete on all consumers",
413 chunk_being_archived_id
,
416 *_rotation_completed
= !chunk_exists_on_peer
;
418 ret
= session_reset_rotation_state(session
,
419 LTTNG_ROTATION_STATE_ERROR
);
421 ERR("Failed to reset rotation state of session \"%s\"",
428 * Check if the last rotation was completed, called with session lock held.
429 * Should only return non-zero in the event of a fatal error. Doing so will
430 * shutdown the thread.
433 int check_session_rotation_pending(struct ltt_session
*session
,
434 struct notification_thread_handle
*notification_thread_handle
)
437 struct lttng_trace_archive_location
*location
;
438 enum lttng_trace_chunk_status chunk_status
;
439 bool rotation_completed
= false;
440 const char *archived_chunk_name
;
441 uint64_t chunk_being_archived_id
;
443 if (!session
->chunk_being_archived
) {
448 chunk_status
= lttng_trace_chunk_get_id(session
->chunk_being_archived
,
449 &chunk_being_archived_id
);
450 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
452 DBG("[rotation-thread] Checking for pending rotation on session \"%s\", trace archive %" PRIu64
,
453 session
->name
, chunk_being_archived_id
);
456 * The rotation-pending check timer of a session is launched in
457 * one-shot mode. If the rotation is incomplete, the rotation
458 * thread will re-enable the pending-check timer.
460 * The timer thread can't stop the timer itself since it is involved
461 * in the check for the timer's quiescence.
463 ret
= timer_session_rotation_pending_check_stop(session
);
465 goto check_ongoing_rotation
;
468 check_session_rotation_pending_on_consumers(session
,
469 &rotation_completed
);
470 if (!rotation_completed
||
471 session
->rotation_state
== LTTNG_ROTATION_STATE_ERROR
) {
472 goto check_ongoing_rotation
;
476 * Now we can clear the "ONGOING" state in the session. New
477 * rotations can start now.
479 chunk_status
= lttng_trace_chunk_get_name(session
->chunk_being_archived
,
480 &archived_chunk_name
, NULL
);
481 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
482 free(session
->last_archived_chunk_name
);
483 session
->last_archived_chunk_name
= strdup(archived_chunk_name
);
484 if (!session
->last_archived_chunk_name
) {
485 PERROR("Failed to duplicate archived chunk name");
487 session_reset_rotation_state(session
, LTTNG_ROTATION_STATE_COMPLETED
);
489 if (!session
->quiet_rotation
) {
490 location
= session_get_trace_archive_location(session
);
491 /* Ownership of location is transferred. */
492 ret
= notification_thread_command_session_rotation_completed(
493 notification_thread_handle
,
497 session
->last_archived_chunk_id
.value
,
499 if (ret
!= LTTNG_OK
) {
500 ERR("[rotation-thread] Failed to notify notification thread of completed rotation for session %s",
506 check_ongoing_rotation
:
507 if (session
->rotation_state
== LTTNG_ROTATION_STATE_ONGOING
) {
508 uint64_t chunk_being_archived_id
;
510 chunk_status
= lttng_trace_chunk_get_id(
511 session
->chunk_being_archived
,
512 &chunk_being_archived_id
);
513 assert(chunk_status
== LTTNG_TRACE_CHUNK_STATUS_OK
);
515 DBG("[rotation-thread] Rotation of trace archive %" PRIu64
" is still pending for session %s",
516 chunk_being_archived_id
, session
->name
);
517 ret
= timer_session_rotation_pending_check_start(session
,
518 DEFAULT_ROTATE_PENDING_TIMER
);
520 ERR("Failed to re-enable rotation pending timer");
530 /* Call with the session and session_list locks held. */
532 int launch_session_rotation(struct ltt_session
*session
)
535 struct lttng_rotate_session_return rotation_return
;
537 DBG("[rotation-thread] Launching scheduled time-based rotation on session \"%s\"",
540 ret
= cmd_rotate_session(session
, &rotation_return
, false,
541 LTTNG_TRACE_CHUNK_COMMAND_TYPE_MOVE_TO_COMPLETED
);
542 if (ret
== LTTNG_OK
) {
543 DBG("[rotation-thread] Scheduled time-based rotation successfully launched on session \"%s\"",
546 /* Don't consider errors as fatal. */
547 DBG("[rotation-thread] Scheduled time-based rotation aborted for session %s: %s",
548 session
->name
, lttng_strerror(ret
));
554 int run_job(struct rotation_thread_job
*job
, struct ltt_session
*session
,
555 struct notification_thread_handle
*notification_thread_handle
)
560 case ROTATION_THREAD_JOB_TYPE_SCHEDULED_ROTATION
:
561 ret
= launch_session_rotation(session
);
563 case ROTATION_THREAD_JOB_TYPE_CHECK_PENDING_ROTATION
:
564 ret
= check_session_rotation_pending(session
,
565 notification_thread_handle
);
574 int handle_job_queue(struct rotation_thread_handle
*handle
,
575 struct rotation_thread
*state
,
576 struct rotation_thread_timer_queue
*queue
)
581 struct ltt_session
*session
;
582 struct rotation_thread_job
*job
;
584 /* Take the queue lock only to pop an element from the list. */
585 pthread_mutex_lock(&queue
->lock
);
586 if (cds_list_empty(&queue
->list
)) {
587 pthread_mutex_unlock(&queue
->lock
);
590 job
= cds_list_first_entry(&queue
->list
,
592 cds_list_del(&job
->head
);
593 pthread_mutex_unlock(&queue
->lock
);
596 session
= job
->session
;
598 DBG("[rotation-thread] Session \"%s\" not found",
601 * This is a non-fatal error, and we cannot report it to
602 * the user (timer), so just print the error and
603 * continue the processing.
605 * While the timer thread will purge pending signals for
606 * a session on the session's destruction, it is
607 * possible for a job targeting that session to have
608 * already been queued before it was destroyed.
611 session_put(session
);
612 session_unlock_list();
616 session_lock(session
);
617 ret
= run_job(job
, session
, handle
->notification_thread_handle
);
618 session_unlock(session
);
619 /* Release reference held by the job. */
620 session_put(session
);
621 session_unlock_list();
635 int handle_condition(const struct lttng_condition
*condition
,
636 const struct lttng_evaluation
*evaluation
,
637 struct notification_thread_handle
*notification_thread_handle
)
640 const char *condition_session_name
= NULL
;
641 enum lttng_condition_type condition_type
;
642 enum lttng_condition_status condition_status
;
643 enum lttng_evaluation_status evaluation_status
;
645 struct ltt_session
*session
;
647 condition_type
= lttng_condition_get_type(condition
);
649 if (condition_type
!= LTTNG_CONDITION_TYPE_SESSION_CONSUMED_SIZE
) {
651 ERR("[rotation-thread] Condition type and session usage type are not the same");
655 /* Fetch info to test */
656 condition_status
= lttng_condition_session_consumed_size_get_session_name(
657 condition
, &condition_session_name
);
658 if (condition_status
!= LTTNG_CONDITION_STATUS_OK
) {
659 ERR("[rotation-thread] Session name could not be fetched");
663 evaluation_status
= lttng_evaluation_session_consumed_size_get_consumed_size(evaluation
,
665 if (evaluation_status
!= LTTNG_EVALUATION_STATUS_OK
) {
666 ERR("[rotation-thread] Failed to get evaluation");
672 session
= session_find_by_name(condition_session_name
);
675 session_unlock_list();
676 ERR("[rotation-thread] Session \"%s\" not found",
677 condition_session_name
);
680 session_lock(session
);
682 ret
= unsubscribe_session_consumed_size_rotation(session
,
683 notification_thread_handle
);
688 ret
= cmd_rotate_session(session
, NULL
, false,
689 LTTNG_TRACE_CHUNK_COMMAND_TYPE_MOVE_TO_COMPLETED
);
690 if (ret
== -LTTNG_ERR_ROTATION_PENDING
) {
691 DBG("Rotate already pending, subscribe to the next threshold value");
692 } else if (ret
!= LTTNG_OK
) {
693 ERR("[rotation-thread] Failed to rotate on size notification with error: %s",
694 lttng_strerror(ret
));
698 ret
= subscribe_session_consumed_size_rotation(session
,
699 consumed
+ session
->rotate_size
,
700 notification_thread_handle
);
702 ERR("[rotation-thread] Failed to subscribe to session consumed size condition");
708 session_unlock(session
);
709 session_put(session
);
710 session_unlock_list();
716 int handle_notification_channel(int fd
,
717 struct rotation_thread_handle
*handle
,
718 struct rotation_thread
*state
)
721 bool notification_pending
;
722 struct lttng_notification
*notification
= NULL
;
723 enum lttng_notification_channel_status status
;
724 const struct lttng_evaluation
*notification_evaluation
;
725 const struct lttng_condition
*notification_condition
;
727 status
= lttng_notification_channel_has_pending_notification(
728 rotate_notification_channel
, ¬ification_pending
);
729 if (status
!= LTTNG_NOTIFICATION_CHANNEL_STATUS_OK
) {
730 ERR("[rotation-thread ]Error occurred while checking for pending notification");
735 if (!notification_pending
) {
740 /* Receive the next notification. */
741 status
= lttng_notification_channel_get_next_notification(
742 rotate_notification_channel
,
746 case LTTNG_NOTIFICATION_CHANNEL_STATUS_OK
:
748 case LTTNG_NOTIFICATION_CHANNEL_STATUS_NOTIFICATIONS_DROPPED
:
749 /* Not an error, we will wait for the next one */
752 case LTTNG_NOTIFICATION_CHANNEL_STATUS_CLOSED
:
753 ERR("Notification channel was closed");
757 /* Unhandled conditions / errors. */
758 ERR("Unknown notification channel status");
763 notification_condition
= lttng_notification_get_condition(notification
);
764 notification_evaluation
= lttng_notification_get_evaluation(notification
);
766 ret
= handle_condition(notification_condition
, notification_evaluation
,
767 handle
->notification_thread_handle
);
770 lttng_notification_destroy(notification
);
775 void *thread_rotation(void *data
)
778 struct rotation_thread_handle
*handle
= data
;
779 struct rotation_thread thread
;
782 DBG("[rotation-thread] Started rotation thread");
783 rcu_register_thread();
785 health_register(health_sessiond
, HEALTH_SESSIOND_TYPE_ROTATION
);
786 health_code_update();
789 ERR("[rotation-thread] Invalid thread context provided");
793 queue_pipe_fd
= lttng_pipe_get_readfd(
794 handle
->rotation_timer_queue
->event_pipe
);
797 ret
= init_thread_state(handle
, &thread
);
806 DBG("[rotation-thread] Entering poll wait");
807 ret
= lttng_poll_wait(&thread
.events
, -1);
808 DBG("[rotation-thread] Poll wait returned (%i)", ret
);
812 * Restart interrupted system call.
814 if (errno
== EINTR
) {
817 ERR("[rotation-thread] Error encountered during lttng_poll_wait (%i)", ret
);
822 for (i
= 0; i
< fd_count
; i
++) {
823 int fd
= LTTNG_POLL_GETFD(&thread
.events
, i
);
824 uint32_t revents
= LTTNG_POLL_GETEV(&thread
.events
, i
);
826 DBG("[rotation-thread] Handling fd (%i) activity (%u)",
829 if (revents
& LPOLLERR
) {
830 ERR("[rotation-thread] Polling returned an error on fd %i", fd
);
834 if (fd
== rotate_notification_channel
->socket
) {
835 ret
= handle_notification_channel(fd
, handle
,
838 ERR("[rotation-thread] Error occurred while handling activity on notification channel socket");
842 /* Job queue or quit pipe activity. */
845 * The job queue is serviced if there is
846 * activity on the quit pipe to ensure it is
847 * flushed and all references held in the queue
850 ret
= handle_job_queue(handle
, &thread
,
851 handle
->rotation_timer_queue
);
853 ERR("[rotation-thread] Failed to handle rotation timer pipe event");
857 if (fd
== queue_pipe_fd
) {
860 ret
= lttng_read(fd
, &buf
, 1);
862 ERR("[rotation-thread] Failed to read from wakeup pipe (fd = %i)", fd
);
866 DBG("[rotation-thread] Quit pipe activity");
874 DBG("[rotation-thread] Exit");
875 fini_thread_state(&thread
);
877 health_unregister(health_sessiond
);
878 rcu_thread_offline();
879 rcu_unregister_thread();
884 bool shutdown_rotation_thread(void *thread_data
)
886 struct rotation_thread_handle
*handle
= thread_data
;
887 const int write_fd
= lttng_pipe_get_writefd(handle
->quit_pipe
);
889 return notify_thread_pipe(write_fd
) == 1;
892 bool launch_rotation_thread(struct rotation_thread_handle
*handle
)
894 struct lttng_thread
*thread
;
896 thread
= lttng_thread_create("Rotation",
898 shutdown_rotation_thread
,
904 lttng_thread_put(thread
);