Commit | Line | Data |
---|---|---|
f5db4af4 JB |
1 | /* |
2 | * Copyright (C) 2006-2009 Red Hat, Inc. | |
3 | * | |
4 | * This file is released under the LGPL. | |
5 | */ | |
6 | ||
7 | #include <linux/bio.h> | |
5a0e3ad6 | 8 | #include <linux/slab.h> |
0f30af98 | 9 | #include <linux/jiffies.h> |
f5db4af4 JB |
10 | #include <linux/dm-dirty-log.h> |
11 | #include <linux/device-mapper.h> | |
12 | #include <linux/dm-log-userspace.h> | |
056075c7 | 13 | #include <linux/module.h> |
5066a4df | 14 | #include <linux/workqueue.h> |
f5db4af4 JB |
15 | |
16 | #include "dm-log-userspace-transfer.h" | |
17 | ||
/* Version string printed when the module is loaded and unloaded. */
#define DM_LOG_USERSPACE_VSN "1.3.0"

/* Size argument handed to mempool_create_slab_pool() for each log. */
#define FLUSH_ENTRY_POOL_SIZE 16
21 | ||
22 | struct dm_dirty_log_flush_entry { | |
f5db4af4 JB |
23 | int type; |
24 | region_t region; | |
25 | struct list_head list; | |
26 | }; | |
27 | ||
085ae065 JB |
/*
 * Upper bound on the number of mark or clear requests that are sent in
 * one message.  The value is, to a degree, arbitrary.  However, there
 * is some basis for the choice in the limits imposed on the size of the
 * data payload by dm-log-userspace-transfer.c: dm_consult_userspace().
 */
#define MAX_FLUSH_GROUP_COUNT 32
35 | ||
f5db4af4 JB |
36 | struct log_c { |
37 | struct dm_target *ti; | |
5a25f0eb | 38 | struct dm_dev *log_dev; |
f5db4af4 JB |
39 | |
40 | char *usr_argv_str; | |
41 | uint32_t usr_argc; | |
42 | ||
ac1f9ef2 MS |
43 | uint32_t region_size; |
44 | region_t region_count; | |
45 | uint64_t luid; | |
46 | char uuid[DM_UUID_LEN]; | |
f5db4af4 | 47 | |
909cc4fb JB |
48 | /* |
49 | * Mark and clear requests are held until a flush is issued | |
50 | * so that we can group, and thereby limit, the amount of | |
51 | * network traffic between kernel and userspace. The 'flush_lock' | |
52 | * is used to protect these lists. | |
53 | */ | |
f5db4af4 | 54 | spinlock_t flush_lock; |
909cc4fb JB |
55 | struct list_head mark_list; |
56 | struct list_head clear_list; | |
5066a4df | 57 | |
ac1f9ef2 MS |
58 | /* |
59 | * in_sync_hint gets set when doing is_remote_recovering. It | |
60 | * represents the first region that needs recovery. IOW, the | |
61 | * first zero bit of sync_bits. This can be useful for to limit | |
62 | * traffic for calls like is_remote_recovering and get_resync_work, | |
63 | * but be take care in its use for anything else. | |
64 | */ | |
65 | uint64_t in_sync_hint; | |
66 | ||
5066a4df DZ |
67 | /* |
68 | * Workqueue for flush of clear region requests. | |
69 | */ | |
70 | struct workqueue_struct *dmlog_wq; | |
71 | struct delayed_work flush_log_work; | |
72 | atomic_t sched_flush; | |
73 | ||
74 | /* | |
75 | * Combine userspace flush and mark requests for efficiency. | |
76 | */ | |
77 | uint32_t integrated_flush; | |
f5db4af4 | 78 | |
ac1f9ef2 MS |
79 | mempool_t *flush_entry_pool; |
80 | }; | |
f5db4af4 | 81 | |
/* Slab cache behind every log's flush_entry_pool; created at module init. */
static struct kmem_cache *_flush_entry_cache;
f5db4af4 JB |
83 | |
84 | static int userspace_do_request(struct log_c *lc, const char *uuid, | |
85 | int request_type, char *data, size_t data_size, | |
86 | char *rdata, size_t *rdata_size) | |
87 | { | |
88 | int r; | |
89 | ||
90 | /* | |
91 | * If the server isn't there, -ESRCH is returned, | |
92 | * and we must keep trying until the server is | |
93 | * restored. | |
94 | */ | |
95 | retry: | |
7ec23d50 | 96 | r = dm_consult_userspace(uuid, lc->luid, request_type, data, |
f5db4af4 JB |
97 | data_size, rdata, rdata_size); |
98 | ||
99 | if (r != -ESRCH) | |
100 | return r; | |
101 | ||
102 | DMERR(" Userspace log server not found."); | |
103 | while (1) { | |
104 | set_current_state(TASK_INTERRUPTIBLE); | |
105 | schedule_timeout(2*HZ); | |
106 | DMWARN("Attempting to contact userspace log server..."); | |
7ec23d50 JB |
107 | r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR, |
108 | lc->usr_argv_str, | |
f5db4af4 JB |
109 | strlen(lc->usr_argv_str) + 1, |
110 | NULL, NULL); | |
111 | if (!r) | |
112 | break; | |
113 | } | |
114 | DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); | |
7ec23d50 | 115 | r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL, |
f5db4af4 JB |
116 | 0, NULL, NULL); |
117 | if (!r) | |
118 | goto retry; | |
119 | ||
120 | DMERR("Error trying to resume userspace log: %d", r); | |
121 | ||
122 | return -ESRCH; | |
123 | } | |
124 | ||
125 | static int build_constructor_string(struct dm_target *ti, | |
126 | unsigned argc, char **argv, | |
127 | char **ctr_str) | |
128 | { | |
129 | int i, str_size; | |
130 | char *str = NULL; | |
131 | ||
132 | *ctr_str = NULL; | |
133 | ||
5066a4df DZ |
134 | /* |
135 | * Determine overall size of the string. | |
136 | */ | |
f5db4af4 JB |
137 | for (i = 0, str_size = 0; i < argc; i++) |
138 | str_size += strlen(argv[i]) + 1; /* +1 for space between args */ | |
139 | ||
140 | str_size += 20; /* Max number of chars in a printed u64 number */ | |
141 | ||
142 | str = kzalloc(str_size, GFP_KERNEL); | |
143 | if (!str) { | |
144 | DMWARN("Unable to allocate memory for constructor string"); | |
145 | return -ENOMEM; | |
146 | } | |
147 | ||
b8313b6d JB |
148 | str_size = sprintf(str, "%llu", (unsigned long long)ti->len); |
149 | for (i = 0; i < argc; i++) | |
150 | str_size += sprintf(str + str_size, " %s", argv[i]); | |
f5db4af4 JB |
151 | |
152 | *ctr_str = str; | |
153 | return str_size; | |
154 | } | |
155 | ||
5066a4df DZ |
156 | static void do_flush(struct work_struct *work) |
157 | { | |
158 | int r; | |
159 | struct log_c *lc = container_of(work, struct log_c, flush_log_work.work); | |
160 | ||
161 | atomic_set(&lc->sched_flush, 0); | |
162 | ||
163 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL); | |
164 | ||
165 | if (r) | |
166 | dm_table_event(lc->ti->table); | |
167 | } | |
168 | ||
f5db4af4 JB |
169 | /* |
170 | * userspace_ctr | |
171 | * | |
172 | * argv contains: | |
5066a4df DZ |
173 | * <UUID> [integrated_flush] <other args> |
174 | * Where 'other args' are the userspace implementation-specific log | |
175 | * arguments. | |
176 | * | |
177 | * Example: | |
178 | * <UUID> [integrated_flush] clustered-disk <arg count> <log dev> | |
179 | * <region_size> [[no]sync] | |
180 | * | |
181 | * This module strips off the <UUID> and uses it for identification | |
182 | * purposes when communicating with userspace about a log. | |
f5db4af4 | 183 | * |
5066a4df DZ |
184 | * If integrated_flush is defined, the kernel combines flush |
185 | * and mark requests. | |
186 | * | |
187 | * The rest of the line, beginning with 'clustered-disk', is passed | |
188 | * to the userspace ctr function. | |
f5db4af4 JB |
189 | */ |
190 | static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | |
191 | unsigned argc, char **argv) | |
192 | { | |
193 | int r = 0; | |
194 | int str_size; | |
195 | char *ctr_str = NULL; | |
196 | struct log_c *lc = NULL; | |
197 | uint64_t rdata; | |
198 | size_t rdata_size = sizeof(rdata); | |
5a25f0eb JB |
199 | char *devices_rdata = NULL; |
200 | size_t devices_rdata_size = DM_NAME_LEN; | |
f5db4af4 JB |
201 | |
202 | if (argc < 3) { | |
203 | DMWARN("Too few arguments to userspace dirty log"); | |
204 | return -EINVAL; | |
205 | } | |
206 | ||
5a25f0eb | 207 | lc = kzalloc(sizeof(*lc), GFP_KERNEL); |
f5db4af4 JB |
208 | if (!lc) { |
209 | DMWARN("Unable to allocate userspace log context."); | |
210 | return -ENOMEM; | |
211 | } | |
212 | ||
7ec23d50 | 213 | /* The ptr value is sufficient for local unique id */ |
bca915aa | 214 | lc->luid = (unsigned long)lc; |
7ec23d50 | 215 | |
f5db4af4 JB |
216 | lc->ti = ti; |
217 | ||
218 | if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { | |
219 | DMWARN("UUID argument too long."); | |
220 | kfree(lc); | |
221 | return -EINVAL; | |
222 | } | |
223 | ||
5066a4df DZ |
224 | lc->usr_argc = argc; |
225 | ||
f5db4af4 | 226 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); |
5066a4df DZ |
227 | argc--; |
228 | argv++; | |
f5db4af4 | 229 | spin_lock_init(&lc->flush_lock); |
909cc4fb JB |
230 | INIT_LIST_HEAD(&lc->mark_list); |
231 | INIT_LIST_HEAD(&lc->clear_list); | |
f5db4af4 | 232 | |
5066a4df DZ |
233 | if (!strcasecmp(argv[0], "integrated_flush")) { |
234 | lc->integrated_flush = 1; | |
235 | argc--; | |
236 | argv++; | |
237 | } | |
238 | ||
239 | str_size = build_constructor_string(ti, argc, argv, &ctr_str); | |
f5db4af4 JB |
240 | if (str_size < 0) { |
241 | kfree(lc); | |
242 | return str_size; | |
243 | } | |
244 | ||
5a25f0eb JB |
245 | devices_rdata = kzalloc(devices_rdata_size, GFP_KERNEL); |
246 | if (!devices_rdata) { | |
247 | DMERR("Failed to allocate memory for device information"); | |
248 | r = -ENOMEM; | |
249 | goto out; | |
250 | } | |
251 | ||
ac1f9ef2 MS |
252 | lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE, |
253 | _flush_entry_cache); | |
254 | if (!lc->flush_entry_pool) { | |
255 | DMERR("Failed to create flush_entry_pool"); | |
256 | r = -ENOMEM; | |
257 | goto out; | |
258 | } | |
259 | ||
5a25f0eb JB |
260 | /* |
261 | * Send table string and get back any opened device. | |
262 | */ | |
7ec23d50 | 263 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, |
5a25f0eb JB |
264 | ctr_str, str_size, |
265 | devices_rdata, &devices_rdata_size); | |
f5db4af4 | 266 | |
4a038677 JB |
267 | if (r < 0) { |
268 | if (r == -ESRCH) | |
269 | DMERR("Userspace log server not found"); | |
270 | else | |
271 | DMERR("Userspace log server failed to create log"); | |
f5db4af4 JB |
272 | goto out; |
273 | } | |
274 | ||
275 | /* Since the region size does not change, get it now */ | |
276 | rdata_size = sizeof(rdata); | |
7ec23d50 | 277 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE, |
f5db4af4 JB |
278 | NULL, 0, (char *)&rdata, &rdata_size); |
279 | ||
280 | if (r) { | |
281 | DMERR("Failed to get region size of dirty log"); | |
282 | goto out; | |
283 | } | |
284 | ||
285 | lc->region_size = (uint32_t)rdata; | |
286 | lc->region_count = dm_sector_div_up(ti->len, lc->region_size); | |
287 | ||
5a25f0eb JB |
288 | if (devices_rdata_size) { |
289 | if (devices_rdata[devices_rdata_size - 1] != '\0') { | |
290 | DMERR("DM_ULOG_CTR device return string not properly terminated"); | |
291 | r = -EINVAL; | |
292 | goto out; | |
293 | } | |
294 | r = dm_get_device(ti, devices_rdata, | |
295 | dm_table_get_mode(ti->table), &lc->log_dev); | |
296 | if (r) | |
297 | DMERR("Failed to register %s with device-mapper", | |
298 | devices_rdata); | |
299 | } | |
5066a4df DZ |
300 | |
301 | if (lc->integrated_flush) { | |
302 | lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0); | |
303 | if (!lc->dmlog_wq) { | |
304 | DMERR("couldn't start dmlogd"); | |
305 | r = -ENOMEM; | |
306 | goto out; | |
307 | } | |
308 | ||
309 | INIT_DELAYED_WORK(&lc->flush_log_work, do_flush); | |
310 | atomic_set(&lc->sched_flush, 0); | |
311 | } | |
312 | ||
f5db4af4 | 313 | out: |
5a25f0eb | 314 | kfree(devices_rdata); |
f5db4af4 | 315 | if (r) { |
ac1f9ef2 MS |
316 | if (lc->flush_entry_pool) |
317 | mempool_destroy(lc->flush_entry_pool); | |
f5db4af4 JB |
318 | kfree(lc); |
319 | kfree(ctr_str); | |
320 | } else { | |
321 | lc->usr_argv_str = ctr_str; | |
f5db4af4 JB |
322 | log->context = lc; |
323 | } | |
324 | ||
325 | return r; | |
326 | } | |
327 | ||
328 | static void userspace_dtr(struct dm_dirty_log *log) | |
329 | { | |
f5db4af4 JB |
330 | struct log_c *lc = log->context; |
331 | ||
5066a4df DZ |
332 | if (lc->integrated_flush) { |
333 | /* flush workqueue */ | |
334 | if (atomic_read(&lc->sched_flush)) | |
335 | flush_delayed_work(&lc->flush_log_work); | |
336 | ||
337 | destroy_workqueue(lc->dmlog_wq); | |
338 | } | |
339 | ||
4a038677 | 340 | (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, |
5066a4df | 341 | NULL, 0, NULL, NULL); |
f5db4af4 | 342 | |
5a25f0eb JB |
343 | if (lc->log_dev) |
344 | dm_put_device(lc->ti, lc->log_dev); | |
345 | ||
ac1f9ef2 MS |
346 | mempool_destroy(lc->flush_entry_pool); |
347 | ||
f5db4af4 JB |
348 | kfree(lc->usr_argv_str); |
349 | kfree(lc); | |
350 | ||
351 | return; | |
352 | } | |
353 | ||
354 | static int userspace_presuspend(struct dm_dirty_log *log) | |
355 | { | |
356 | int r; | |
357 | struct log_c *lc = log->context; | |
358 | ||
7ec23d50 | 359 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, |
5066a4df | 360 | NULL, 0, NULL, NULL); |
f5db4af4 JB |
361 | |
362 | return r; | |
363 | } | |
364 | ||
365 | static int userspace_postsuspend(struct dm_dirty_log *log) | |
366 | { | |
367 | int r; | |
368 | struct log_c *lc = log->context; | |
369 | ||
5066a4df DZ |
370 | /* |
371 | * Run planned flush earlier. | |
372 | */ | |
373 | if (lc->integrated_flush && atomic_read(&lc->sched_flush)) | |
374 | flush_delayed_work(&lc->flush_log_work); | |
375 | ||
7ec23d50 | 376 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, |
5066a4df | 377 | NULL, 0, NULL, NULL); |
f5db4af4 JB |
378 | |
379 | return r; | |
380 | } | |
381 | ||
382 | static int userspace_resume(struct dm_dirty_log *log) | |
383 | { | |
384 | int r; | |
385 | struct log_c *lc = log->context; | |
386 | ||
387 | lc->in_sync_hint = 0; | |
7ec23d50 | 388 | r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, |
5066a4df | 389 | NULL, 0, NULL, NULL); |
f5db4af4 JB |
390 | |
391 | return r; | |
392 | } | |
393 | ||
394 | static uint32_t userspace_get_region_size(struct dm_dirty_log *log) | |
395 | { | |
396 | struct log_c *lc = log->context; | |
397 | ||
398 | return lc->region_size; | |
399 | } | |
400 | ||
401 | /* | |
402 | * userspace_is_clean | |
403 | * | |
404 | * Check whether a region is clean. If there is any sort of | |
405 | * failure when consulting the server, we return not clean. | |
406 | * | |
407 | * Returns: 1 if clean, 0 otherwise | |
408 | */ | |
409 | static int userspace_is_clean(struct dm_dirty_log *log, region_t region) | |
410 | { | |
411 | int r; | |
412 | uint64_t region64 = (uint64_t)region; | |
413 | int64_t is_clean; | |
414 | size_t rdata_size; | |
415 | struct log_c *lc = log->context; | |
416 | ||
417 | rdata_size = sizeof(is_clean); | |
418 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, | |
419 | (char *)®ion64, sizeof(region64), | |
420 | (char *)&is_clean, &rdata_size); | |
421 | ||
422 | return (r) ? 0 : (int)is_clean; | |
423 | } | |
424 | ||
425 | /* | |
426 | * userspace_in_sync | |
427 | * | |
428 | * Check if the region is in-sync. If there is any sort | |
429 | * of failure when consulting the server, we assume that | |
430 | * the region is not in sync. | |
431 | * | |
432 | * If 'can_block' is set, return immediately | |
433 | * | |
434 | * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK | |
435 | */ | |
436 | static int userspace_in_sync(struct dm_dirty_log *log, region_t region, | |
437 | int can_block) | |
438 | { | |
439 | int r; | |
440 | uint64_t region64 = region; | |
441 | int64_t in_sync; | |
442 | size_t rdata_size; | |
443 | struct log_c *lc = log->context; | |
444 | ||
445 | /* | |
446 | * We can never respond directly - even if in_sync_hint is | |
447 | * set. This is because another machine could see a device | |
448 | * failure and mark the region out-of-sync. If we don't go | |
449 | * to userspace to ask, we might think the region is in-sync | |
450 | * and allow a read to pick up data that is stale. (This is | |
451 | * very unlikely if a device actually fails; but it is very | |
452 | * likely if a connection to one device from one machine fails.) | |
453 | * | |
454 | * There still might be a problem if the mirror caches the region | |
455 | * state as in-sync... but then this call would not be made. So, | |
456 | * that is a mirror problem. | |
457 | */ | |
458 | if (!can_block) | |
459 | return -EWOULDBLOCK; | |
460 | ||
461 | rdata_size = sizeof(in_sync); | |
462 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, | |
463 | (char *)®ion64, sizeof(region64), | |
464 | (char *)&in_sync, &rdata_size); | |
465 | return (r) ? 0 : (int)in_sync; | |
466 | } | |
467 | ||
085ae065 JB |
468 | static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) |
469 | { | |
470 | int r = 0; | |
ac1f9ef2 | 471 | struct dm_dirty_log_flush_entry *fe; |
085ae065 JB |
472 | |
473 | list_for_each_entry(fe, flush_list, list) { | |
474 | r = userspace_do_request(lc, lc->uuid, fe->type, | |
475 | (char *)&fe->region, | |
476 | sizeof(fe->region), | |
477 | NULL, NULL); | |
478 | if (r) | |
479 | break; | |
480 | } | |
481 | ||
482 | return r; | |
483 | } | |
484 | ||
5066a4df DZ |
485 | static int flush_by_group(struct log_c *lc, struct list_head *flush_list, |
486 | int flush_with_payload) | |
085ae065 JB |
487 | { |
488 | int r = 0; | |
489 | int count; | |
490 | uint32_t type = 0; | |
ac1f9ef2 | 491 | struct dm_dirty_log_flush_entry *fe, *tmp_fe; |
085ae065 JB |
492 | LIST_HEAD(tmp_list); |
493 | uint64_t group[MAX_FLUSH_GROUP_COUNT]; | |
494 | ||
495 | /* | |
496 | * Group process the requests | |
497 | */ | |
498 | while (!list_empty(flush_list)) { | |
499 | count = 0; | |
500 | ||
501 | list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { | |
502 | group[count] = fe->region; | |
503 | count++; | |
504 | ||
6c9b27ab | 505 | list_move(&fe->list, &tmp_list); |
085ae065 JB |
506 | |
507 | type = fe->type; | |
508 | if (count >= MAX_FLUSH_GROUP_COUNT) | |
509 | break; | |
510 | } | |
511 | ||
5066a4df DZ |
512 | if (flush_with_payload) { |
513 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, | |
514 | (char *)(group), | |
515 | count * sizeof(uint64_t), | |
516 | NULL, NULL); | |
517 | /* | |
518 | * Integrated flush failed. | |
519 | */ | |
520 | if (r) | |
521 | break; | |
522 | } else { | |
523 | r = userspace_do_request(lc, lc->uuid, type, | |
524 | (char *)(group), | |
525 | count * sizeof(uint64_t), | |
526 | NULL, NULL); | |
527 | if (r) { | |
528 | /* | |
529 | * Group send failed. Attempt one-by-one. | |
530 | */ | |
531 | list_splice_init(&tmp_list, flush_list); | |
532 | r = flush_one_by_one(lc, flush_list); | |
533 | break; | |
534 | } | |
085ae065 JB |
535 | } |
536 | } | |
537 | ||
538 | /* | |
539 | * Must collect flush_entrys that were successfully processed | |
540 | * as a group so that they will be free'd by the caller. | |
541 | */ | |
542 | list_splice_init(&tmp_list, flush_list); | |
543 | ||
544 | return r; | |
545 | } | |
546 | ||
f5db4af4 JB |
547 | /* |
548 | * userspace_flush | |
549 | * | |
550 | * This function is ok to block. | |
551 | * The flush happens in two stages. First, it sends all | |
552 | * clear/mark requests that are on the list. Then it | |
553 | * tells the server to commit them. This gives the | |
554 | * server a chance to optimise the commit, instead of | |
555 | * doing it for every request. | |
556 | * | |
557 | * Additionally, we could implement another thread that | |
558 | * sends the requests up to the server - reducing the | |
559 | * load on flush. Then the flush would have less in | |
560 | * the list and be responsible for the finishing commit. | |
561 | * | |
562 | * Returns: 0 on success, < 0 on failure | |
563 | */ | |
564 | static int userspace_flush(struct dm_dirty_log *log) | |
565 | { | |
566 | int r = 0; | |
567 | unsigned long flags; | |
568 | struct log_c *lc = log->context; | |
909cc4fb JB |
569 | LIST_HEAD(mark_list); |
570 | LIST_HEAD(clear_list); | |
5066a4df DZ |
571 | int mark_list_is_empty; |
572 | int clear_list_is_empty; | |
ac1f9ef2 MS |
573 | struct dm_dirty_log_flush_entry *fe, *tmp_fe; |
574 | mempool_t *flush_entry_pool = lc->flush_entry_pool; | |
f5db4af4 JB |
575 | |
576 | spin_lock_irqsave(&lc->flush_lock, flags); | |
909cc4fb JB |
577 | list_splice_init(&lc->mark_list, &mark_list); |
578 | list_splice_init(&lc->clear_list, &clear_list); | |
f5db4af4 JB |
579 | spin_unlock_irqrestore(&lc->flush_lock, flags); |
580 | ||
5066a4df DZ |
581 | mark_list_is_empty = list_empty(&mark_list); |
582 | clear_list_is_empty = list_empty(&clear_list); | |
583 | ||
584 | if (mark_list_is_empty && clear_list_is_empty) | |
f5db4af4 JB |
585 | return 0; |
586 | ||
5066a4df | 587 | r = flush_by_group(lc, &clear_list, 0); |
085ae065 | 588 | if (r) |
5066a4df DZ |
589 | goto out; |
590 | ||
591 | if (!lc->integrated_flush) { | |
592 | r = flush_by_group(lc, &mark_list, 0); | |
593 | if (r) | |
594 | goto out; | |
595 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, | |
596 | NULL, 0, NULL, NULL); | |
597 | goto out; | |
598 | } | |
909cc4fb | 599 | |
5066a4df DZ |
600 | /* |
601 | * Send integrated flush request with mark_list as payload. | |
602 | */ | |
603 | r = flush_by_group(lc, &mark_list, 1); | |
085ae065 | 604 | if (r) |
5066a4df | 605 | goto out; |
f5db4af4 | 606 | |
5066a4df DZ |
607 | if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) { |
608 | /* | |
609 | * When there are only clear region requests, | |
610 | * we schedule a flush in the future. | |
611 | */ | |
612 | queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ); | |
613 | atomic_set(&lc->sched_flush, 1); | |
614 | } else { | |
615 | /* | |
616 | * Cancel pending flush because we | |
617 | * have already flushed in mark_region. | |
618 | */ | |
619 | cancel_delayed_work(&lc->flush_log_work); | |
620 | atomic_set(&lc->sched_flush, 0); | |
621 | } | |
f5db4af4 | 622 | |
5066a4df | 623 | out: |
f5db4af4 | 624 | /* |
5066a4df | 625 | * We can safely remove these entries, even after failure. |
f5db4af4 JB |
626 | * Calling code will receive an error and will know that |
627 | * the log facility has failed. | |
628 | */ | |
909cc4fb JB |
629 | list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { |
630 | list_del(&fe->list); | |
631 | mempool_free(fe, flush_entry_pool); | |
632 | } | |
633 | list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { | |
f5db4af4 JB |
634 | list_del(&fe->list); |
635 | mempool_free(fe, flush_entry_pool); | |
636 | } | |
637 | ||
638 | if (r) | |
639 | dm_table_event(lc->ti->table); | |
640 | ||
641 | return r; | |
642 | } | |
643 | ||
644 | /* | |
645 | * userspace_mark_region | |
646 | * | |
647 | * This function should avoid blocking unless absolutely required. | |
648 | * (Memory allocation is valid for blocking.) | |
649 | */ | |
650 | static void userspace_mark_region(struct dm_dirty_log *log, region_t region) | |
651 | { | |
652 | unsigned long flags; | |
653 | struct log_c *lc = log->context; | |
ac1f9ef2 | 654 | struct dm_dirty_log_flush_entry *fe; |
f5db4af4 JB |
655 | |
656 | /* Wait for an allocation, but _never_ fail */ | |
ac1f9ef2 | 657 | fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO); |
f5db4af4 JB |
658 | BUG_ON(!fe); |
659 | ||
660 | spin_lock_irqsave(&lc->flush_lock, flags); | |
661 | fe->type = DM_ULOG_MARK_REGION; | |
662 | fe->region = region; | |
909cc4fb | 663 | list_add(&fe->list, &lc->mark_list); |
f5db4af4 JB |
664 | spin_unlock_irqrestore(&lc->flush_lock, flags); |
665 | ||
666 | return; | |
667 | } | |
668 | ||
669 | /* | |
670 | * userspace_clear_region | |
671 | * | |
672 | * This function must not block. | |
673 | * So, the alloc can't block. In the worst case, it is ok to | |
674 | * fail. It would simply mean we can't clear the region. | |
675 | * Does nothing to current sync context, but does mean | |
676 | * the region will be re-sync'ed on a reload of the mirror | |
677 | * even though it is in-sync. | |
678 | */ | |
679 | static void userspace_clear_region(struct dm_dirty_log *log, region_t region) | |
680 | { | |
681 | unsigned long flags; | |
682 | struct log_c *lc = log->context; | |
ac1f9ef2 | 683 | struct dm_dirty_log_flush_entry *fe; |
f5db4af4 JB |
684 | |
685 | /* | |
686 | * If we fail to allocate, we skip the clearing of | |
687 | * the region. This doesn't hurt us in any way, except | |
688 | * to cause the region to be resync'ed when the | |
689 | * device is activated next time. | |
690 | */ | |
ac1f9ef2 | 691 | fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC); |
f5db4af4 JB |
692 | if (!fe) { |
693 | DMERR("Failed to allocate memory to clear region."); | |
694 | return; | |
695 | } | |
696 | ||
697 | spin_lock_irqsave(&lc->flush_lock, flags); | |
698 | fe->type = DM_ULOG_CLEAR_REGION; | |
699 | fe->region = region; | |
909cc4fb | 700 | list_add(&fe->list, &lc->clear_list); |
f5db4af4 JB |
701 | spin_unlock_irqrestore(&lc->flush_lock, flags); |
702 | ||
703 | return; | |
704 | } | |
705 | ||
706 | /* | |
707 | * userspace_get_resync_work | |
708 | * | |
709 | * Get a region that needs recovery. It is valid to return | |
710 | * an error for this function. | |
711 | * | |
712 | * Returns: 1 if region filled, 0 if no work, <0 on error | |
713 | */ | |
714 | static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) | |
715 | { | |
716 | int r; | |
717 | size_t rdata_size; | |
718 | struct log_c *lc = log->context; | |
719 | struct { | |
720 | int64_t i; /* 64-bit for mix arch compatibility */ | |
721 | region_t r; | |
722 | } pkg; | |
723 | ||
724 | if (lc->in_sync_hint >= lc->region_count) | |
725 | return 0; | |
726 | ||
727 | rdata_size = sizeof(pkg); | |
728 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, | |
5066a4df | 729 | NULL, 0, (char *)&pkg, &rdata_size); |
f5db4af4 JB |
730 | |
731 | *region = pkg.r; | |
732 | return (r) ? r : (int)pkg.i; | |
733 | } | |
734 | ||
735 | /* | |
736 | * userspace_set_region_sync | |
737 | * | |
738 | * Set the sync status of a given region. This function | |
739 | * must not fail. | |
740 | */ | |
741 | static void userspace_set_region_sync(struct dm_dirty_log *log, | |
742 | region_t region, int in_sync) | |
743 | { | |
f5db4af4 JB |
744 | struct log_c *lc = log->context; |
745 | struct { | |
746 | region_t r; | |
747 | int64_t i; | |
748 | } pkg; | |
749 | ||
750 | pkg.r = region; | |
751 | pkg.i = (int64_t)in_sync; | |
752 | ||
18cc980a NMG |
753 | (void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, |
754 | (char *)&pkg, sizeof(pkg), NULL, NULL); | |
f5db4af4 JB |
755 | |
756 | /* | |
757 | * It would be nice to be able to report failures. | |
18cc980a | 758 | * However, it is easy enough to detect and resolve. |
f5db4af4 JB |
759 | */ |
760 | return; | |
761 | } | |
762 | ||
763 | /* | |
764 | * userspace_get_sync_count | |
765 | * | |
766 | * If there is any sort of failure when consulting the server, | |
767 | * we assume that the sync count is zero. | |
768 | * | |
769 | * Returns: sync count on success, 0 on failure | |
770 | */ | |
771 | static region_t userspace_get_sync_count(struct dm_dirty_log *log) | |
772 | { | |
773 | int r; | |
774 | size_t rdata_size; | |
775 | uint64_t sync_count; | |
776 | struct log_c *lc = log->context; | |
777 | ||
778 | rdata_size = sizeof(sync_count); | |
779 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, | |
5066a4df | 780 | NULL, 0, (char *)&sync_count, &rdata_size); |
f5db4af4 JB |
781 | |
782 | if (r) | |
783 | return 0; | |
784 | ||
785 | if (sync_count >= lc->region_count) | |
786 | lc->in_sync_hint = lc->region_count; | |
787 | ||
788 | return (region_t)sync_count; | |
789 | } | |
790 | ||
791 | /* | |
792 | * userspace_status | |
793 | * | |
794 | * Returns: amount of space consumed | |
795 | */ | |
796 | static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, | |
797 | char *result, unsigned maxlen) | |
798 | { | |
799 | int r = 0; | |
b8313b6d | 800 | char *table_args; |
f5db4af4 JB |
801 | size_t sz = (size_t)maxlen; |
802 | struct log_c *lc = log->context; | |
803 | ||
804 | switch (status_type) { | |
805 | case STATUSTYPE_INFO: | |
806 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, | |
5066a4df | 807 | NULL, 0, result, &sz); |
f5db4af4 JB |
808 | |
809 | if (r) { | |
810 | sz = 0; | |
811 | DMEMIT("%s 1 COM_FAILURE", log->type->name); | |
812 | } | |
813 | break; | |
814 | case STATUSTYPE_TABLE: | |
815 | sz = 0; | |
0d03d59d | 816 | table_args = strchr(lc->usr_argv_str, ' '); |
b8313b6d JB |
817 | BUG_ON(!table_args); /* There will always be a ' ' */ |
818 | table_args++; | |
819 | ||
5066a4df DZ |
820 | DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid); |
821 | if (lc->integrated_flush) | |
822 | DMEMIT("integrated_flush "); | |
823 | DMEMIT("%s ", table_args); | |
f5db4af4 JB |
824 | break; |
825 | } | |
826 | return (r) ? 0 : (int)sz; | |
827 | } | |
828 | ||
829 | /* | |
830 | * userspace_is_remote_recovering | |
831 | * | |
832 | * Returns: 1 if region recovering, 0 otherwise | |
833 | */ | |
834 | static int userspace_is_remote_recovering(struct dm_dirty_log *log, | |
835 | region_t region) | |
836 | { | |
837 | int r; | |
838 | uint64_t region64 = region; | |
839 | struct log_c *lc = log->context; | |
0f30af98 | 840 | static unsigned long limit; |
f5db4af4 JB |
841 | struct { |
842 | int64_t is_recovering; | |
843 | uint64_t in_sync_hint; | |
844 | } pkg; | |
845 | size_t rdata_size = sizeof(pkg); | |
846 | ||
847 | /* | |
848 | * Once the mirror has been reported to be in-sync, | |
849 | * it will never again ask for recovery work. So, | |
850 | * we can safely say there is not a remote machine | |
851 | * recovering if the device is in-sync. (in_sync_hint | |
852 | * must be reset at resume time.) | |
853 | */ | |
854 | if (region < lc->in_sync_hint) | |
855 | return 0; | |
0f30af98 | 856 | else if (time_after(limit, jiffies)) |
f5db4af4 JB |
857 | return 1; |
858 | ||
859 | limit = jiffies + (HZ / 4); | |
860 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, | |
861 | (char *)®ion64, sizeof(region64), | |
862 | (char *)&pkg, &rdata_size); | |
863 | if (r) | |
864 | return 1; | |
865 | ||
866 | lc->in_sync_hint = pkg.in_sync_hint; | |
867 | ||
868 | return (int)pkg.is_recovering; | |
869 | } | |
870 | ||
871 | static struct dm_dirty_log_type _userspace_type = { | |
872 | .name = "userspace", | |
873 | .module = THIS_MODULE, | |
874 | .ctr = userspace_ctr, | |
875 | .dtr = userspace_dtr, | |
876 | .presuspend = userspace_presuspend, | |
877 | .postsuspend = userspace_postsuspend, | |
878 | .resume = userspace_resume, | |
879 | .get_region_size = userspace_get_region_size, | |
880 | .is_clean = userspace_is_clean, | |
881 | .in_sync = userspace_in_sync, | |
882 | .flush = userspace_flush, | |
883 | .mark_region = userspace_mark_region, | |
884 | .clear_region = userspace_clear_region, | |
885 | .get_resync_work = userspace_get_resync_work, | |
886 | .set_region_sync = userspace_set_region_sync, | |
887 | .get_sync_count = userspace_get_sync_count, | |
888 | .status = userspace_status, | |
889 | .is_remote_recovering = userspace_is_remote_recovering, | |
890 | }; | |
891 | ||
892 | static int __init userspace_dirty_log_init(void) | |
893 | { | |
894 | int r = 0; | |
895 | ||
ac1f9ef2 MS |
896 | _flush_entry_cache = KMEM_CACHE(dm_dirty_log_flush_entry, 0); |
897 | if (!_flush_entry_cache) { | |
898 | DMWARN("Unable to create flush_entry_cache: No memory."); | |
f5db4af4 JB |
899 | return -ENOMEM; |
900 | } | |
901 | ||
902 | r = dm_ulog_tfr_init(); | |
903 | if (r) { | |
904 | DMWARN("Unable to initialize userspace log communications"); | |
ac1f9ef2 | 905 | kmem_cache_destroy(_flush_entry_cache); |
f5db4af4 JB |
906 | return r; |
907 | } | |
908 | ||
909 | r = dm_dirty_log_type_register(&_userspace_type); | |
910 | if (r) { | |
911 | DMWARN("Couldn't register userspace dirty log type"); | |
912 | dm_ulog_tfr_exit(); | |
ac1f9ef2 | 913 | kmem_cache_destroy(_flush_entry_cache); |
f5db4af4 JB |
914 | return r; |
915 | } | |
916 | ||
86a54a48 | 917 | DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); |
f5db4af4 JB |
918 | return 0; |
919 | } | |
920 | ||
921 | static void __exit userspace_dirty_log_exit(void) | |
922 | { | |
923 | dm_dirty_log_type_unregister(&_userspace_type); | |
924 | dm_ulog_tfr_exit(); | |
ac1f9ef2 | 925 | kmem_cache_destroy(_flush_entry_cache); |
f5db4af4 | 926 | |
86a54a48 | 927 | DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); |
f5db4af4 JB |
928 | return; |
929 | } | |
930 | ||
931 | module_init(userspace_dirty_log_init); | |
932 | module_exit(userspace_dirty_log_exit); | |
933 | ||
934 | MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); | |
935 | MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); | |
936 | MODULE_LICENSE("GPL"); |