Commit | Line | Data |
---|---|---|
f392ba88 KU |
1 | /* |
2 | * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. | |
3 | * | |
4 | * Module Author: Kiyoshi Ueda | |
5 | * | |
6 | * This file is released under the GPL. | |
7 | * | |
8 | * Throughput oriented path selector. | |
9 | */ | |
10 | ||
11 | #include "dm.h" | |
12 | #include "dm-path-selector.h" | |
13 | ||
5a0e3ad6 | 14 | #include <linux/slab.h> |
056075c7 | 15 | #include <linux/module.h> |
5a0e3ad6 | 16 | |
f392ba88 KU |
/* Message prefix; presumably picked up by the DMERR/DMWARN/DMINFO macros. */
#define DM_MSG_PREFIX "multipath service-time"

/* Default repeat count when the table line does not supply one. */
#define ST_MIN_IO 1

/* Largest relative_throughput value st_add_path() accepts. */
#define ST_MAX_RELATIVE_THROUGHPUT 100

/*
 * Number of bits in-flight sizes are shifted down by before being
 * multiplied with a relative_throughput value (2^7 = 128 > 100).
 */
#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7

/* In-flight sizes at or above this limit are shifted down before use. */
#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)

/* Version string reported when the module is loaded. */
#define ST_VERSION "0.3.0"
f392ba88 KU |
23 | |
24 | struct selector { | |
25 | struct list_head valid_paths; | |
26 | struct list_head failed_paths; | |
9659f811 | 27 | spinlock_t lock; |
f392ba88 KU |
28 | }; |
29 | ||
30 | struct path_info { | |
31 | struct list_head list; | |
32 | struct dm_path *path; | |
33 | unsigned repeat_count; | |
34 | unsigned relative_throughput; | |
35 | atomic_t in_flight_size; /* Total size of in-flight I/Os */ | |
36 | }; | |
37 | ||
38 | static struct selector *alloc_selector(void) | |
39 | { | |
40 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | |
41 | ||
42 | if (s) { | |
43 | INIT_LIST_HEAD(&s->valid_paths); | |
44 | INIT_LIST_HEAD(&s->failed_paths); | |
9659f811 | 45 | spin_lock_init(&s->lock); |
f392ba88 KU |
46 | } |
47 | ||
48 | return s; | |
49 | } | |
50 | ||
51 | static int st_create(struct path_selector *ps, unsigned argc, char **argv) | |
52 | { | |
53 | struct selector *s = alloc_selector(); | |
54 | ||
55 | if (!s) | |
56 | return -ENOMEM; | |
57 | ||
58 | ps->context = s; | |
59 | return 0; | |
60 | } | |
61 | ||
62 | static void free_paths(struct list_head *paths) | |
63 | { | |
64 | struct path_info *pi, *next; | |
65 | ||
66 | list_for_each_entry_safe(pi, next, paths, list) { | |
67 | list_del(&pi->list); | |
68 | kfree(pi); | |
69 | } | |
70 | } | |
71 | ||
72 | static void st_destroy(struct path_selector *ps) | |
73 | { | |
74 | struct selector *s = ps->context; | |
75 | ||
76 | free_paths(&s->valid_paths); | |
77 | free_paths(&s->failed_paths); | |
78 | kfree(s); | |
79 | ps->context = NULL; | |
80 | } | |
81 | ||
82 | static int st_status(struct path_selector *ps, struct dm_path *path, | |
83 | status_type_t type, char *result, unsigned maxlen) | |
84 | { | |
85 | unsigned sz = 0; | |
86 | struct path_info *pi; | |
87 | ||
88 | if (!path) | |
89 | DMEMIT("0 "); | |
90 | else { | |
91 | pi = path->pscontext; | |
92 | ||
93 | switch (type) { | |
94 | case STATUSTYPE_INFO: | |
95 | DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), | |
96 | pi->relative_throughput); | |
97 | break; | |
98 | case STATUSTYPE_TABLE: | |
99 | DMEMIT("%u %u ", pi->repeat_count, | |
100 | pi->relative_throughput); | |
101 | break; | |
102 | } | |
103 | } | |
104 | ||
105 | return sz; | |
106 | } | |
107 | ||
108 | static int st_add_path(struct path_selector *ps, struct dm_path *path, | |
109 | int argc, char **argv, char **error) | |
110 | { | |
111 | struct selector *s = ps->context; | |
112 | struct path_info *pi; | |
113 | unsigned repeat_count = ST_MIN_IO; | |
114 | unsigned relative_throughput = 1; | |
31998ef1 | 115 | char dummy; |
9659f811 | 116 | unsigned long flags; |
f392ba88 KU |
117 | |
118 | /* | |
119 | * Arguments: [<repeat_count> [<relative_throughput>]] | |
120 | * <repeat_count>: The number of I/Os before switching path. | |
121 | * If not given, default (ST_MIN_IO) is used. | |
122 | * <relative_throughput>: The relative throughput value of | |
123 | * the path among all paths in the path-group. | |
124 | * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> | |
125 | * If not given, minimum value '1' is used. | |
126 | * If '0' is given, the path isn't selected while | |
127 | * other paths having a positive value are | |
128 | * available. | |
129 | */ | |
130 | if (argc > 2) { | |
131 | *error = "service-time ps: incorrect number of arguments"; | |
132 | return -EINVAL; | |
133 | } | |
134 | ||
31998ef1 | 135 | if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { |
f392ba88 KU |
136 | *error = "service-time ps: invalid repeat count"; |
137 | return -EINVAL; | |
138 | } | |
139 | ||
21136f89 MS |
140 | if (repeat_count > 1) { |
141 | DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); | |
142 | repeat_count = 1; | |
143 | } | |
144 | ||
f392ba88 | 145 | if ((argc == 2) && |
31998ef1 | 146 | (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || |
f392ba88 KU |
147 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { |
148 | *error = "service-time ps: invalid relative_throughput value"; | |
149 | return -EINVAL; | |
150 | } | |
151 | ||
152 | /* allocate the path */ | |
153 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | |
154 | if (!pi) { | |
155 | *error = "service-time ps: Error allocating path context"; | |
156 | return -ENOMEM; | |
157 | } | |
158 | ||
159 | pi->path = path; | |
160 | pi->repeat_count = repeat_count; | |
161 | pi->relative_throughput = relative_throughput; | |
162 | atomic_set(&pi->in_flight_size, 0); | |
163 | ||
164 | path->pscontext = pi; | |
165 | ||
9659f811 | 166 | spin_lock_irqsave(&s->lock, flags); |
f392ba88 | 167 | list_add_tail(&pi->list, &s->valid_paths); |
9659f811 | 168 | spin_unlock_irqrestore(&s->lock, flags); |
f392ba88 KU |
169 | |
170 | return 0; | |
171 | } | |
172 | ||
173 | static void st_fail_path(struct path_selector *ps, struct dm_path *path) | |
174 | { | |
175 | struct selector *s = ps->context; | |
176 | struct path_info *pi = path->pscontext; | |
9659f811 | 177 | unsigned long flags; |
f392ba88 | 178 | |
9659f811 | 179 | spin_lock_irqsave(&s->lock, flags); |
f392ba88 | 180 | list_move(&pi->list, &s->failed_paths); |
9659f811 | 181 | spin_unlock_irqrestore(&s->lock, flags); |
f392ba88 KU |
182 | } |
183 | ||
184 | static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) | |
185 | { | |
186 | struct selector *s = ps->context; | |
187 | struct path_info *pi = path->pscontext; | |
9659f811 | 188 | unsigned long flags; |
f392ba88 | 189 | |
9659f811 | 190 | spin_lock_irqsave(&s->lock, flags); |
f392ba88 | 191 | list_move_tail(&pi->list, &s->valid_paths); |
9659f811 | 192 | spin_unlock_irqrestore(&s->lock, flags); |
f392ba88 KU |
193 | |
194 | return 0; | |
195 | } | |
196 | ||
197 | /* | |
198 | * Compare the estimated service time of 2 paths, pi1 and pi2, | |
199 | * for the incoming I/O. | |
200 | * | |
201 | * Returns: | |
202 | * < 0 : pi1 is better | |
203 | * 0 : no difference between pi1 and pi2 | |
204 | * > 0 : pi2 is better | |
205 | * | |
206 | * Description: | |
207 | * Basically, the service time is estimated by: | |
208 | * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' | |
209 | * To reduce the calculation, some optimizations are made. | |
210 | * (See comments inline) | |
211 | */ | |
212 | static int st_compare_load(struct path_info *pi1, struct path_info *pi2, | |
213 | size_t incoming) | |
214 | { | |
215 | size_t sz1, sz2, st1, st2; | |
216 | ||
217 | sz1 = atomic_read(&pi1->in_flight_size); | |
218 | sz2 = atomic_read(&pi2->in_flight_size); | |
219 | ||
220 | /* | |
221 | * Case 1: Both have same throughput value. Choose less loaded path. | |
222 | */ | |
223 | if (pi1->relative_throughput == pi2->relative_throughput) | |
224 | return sz1 - sz2; | |
225 | ||
226 | /* | |
227 | * Case 2a: Both have same load. Choose higher throughput path. | |
228 | * Case 2b: One path has no throughput value. Choose the other one. | |
229 | */ | |
230 | if (sz1 == sz2 || | |
231 | !pi1->relative_throughput || !pi2->relative_throughput) | |
232 | return pi2->relative_throughput - pi1->relative_throughput; | |
233 | ||
234 | /* | |
235 | * Case 3: Calculate service time. Choose faster path. | |
236 | * Service time using pi1: | |
237 | * st1 = (sz1 + incoming) / pi1->relative_throughput | |
238 | * Service time using pi2: | |
239 | * st2 = (sz2 + incoming) / pi2->relative_throughput | |
240 | * | |
241 | * To avoid the division, transform the expression to use | |
242 | * multiplication. | |
243 | * Because ->relative_throughput > 0 here, if st1 < st2, | |
244 | * the expressions below are the same meaning: | |
245 | * (sz1 + incoming) / pi1->relative_throughput < | |
246 | * (sz2 + incoming) / pi2->relative_throughput | |
247 | * (sz1 + incoming) * pi2->relative_throughput < | |
248 | * (sz2 + incoming) * pi1->relative_throughput | |
249 | * So use the later one. | |
250 | */ | |
251 | sz1 += incoming; | |
252 | sz2 += incoming; | |
253 | if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || | |
254 | sz2 >= ST_MAX_INFLIGHT_SIZE)) { | |
255 | /* | |
256 | * Size may be too big for multiplying pi->relative_throughput | |
257 | * and overflow. | |
258 | * To avoid the overflow and mis-selection, shift down both. | |
259 | */ | |
260 | sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | |
261 | sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; | |
262 | } | |
263 | st1 = sz1 * pi2->relative_throughput; | |
264 | st2 = sz2 * pi1->relative_throughput; | |
265 | if (st1 != st2) | |
266 | return st1 - st2; | |
267 | ||
268 | /* | |
269 | * Case 4: Service time is equal. Choose higher throughput path. | |
270 | */ | |
271 | return pi2->relative_throughput - pi1->relative_throughput; | |
272 | } | |
273 | ||
90a4323c | 274 | static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) |
f392ba88 KU |
275 | { |
276 | struct selector *s = ps->context; | |
277 | struct path_info *pi = NULL, *best = NULL; | |
9659f811 MS |
278 | struct dm_path *ret = NULL; |
279 | unsigned long flags; | |
f392ba88 | 280 | |
9659f811 | 281 | spin_lock_irqsave(&s->lock, flags); |
f392ba88 | 282 | if (list_empty(&s->valid_paths)) |
9659f811 | 283 | goto out; |
f392ba88 KU |
284 | |
285 | /* Change preferred (first in list) path to evenly balance. */ | |
286 | list_move_tail(s->valid_paths.next, &s->valid_paths); | |
287 | ||
288 | list_for_each_entry(pi, &s->valid_paths, list) | |
289 | if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) | |
290 | best = pi; | |
291 | ||
292 | if (!best) | |
9659f811 | 293 | goto out; |
f392ba88 | 294 | |
9659f811 MS |
295 | ret = best->path; |
296 | out: | |
297 | spin_unlock_irqrestore(&s->lock, flags); | |
298 | return ret; | |
f392ba88 KU |
299 | } |
300 | ||
301 | static int st_start_io(struct path_selector *ps, struct dm_path *path, | |
302 | size_t nr_bytes) | |
303 | { | |
304 | struct path_info *pi = path->pscontext; | |
305 | ||
306 | atomic_add(nr_bytes, &pi->in_flight_size); | |
307 | ||
308 | return 0; | |
309 | } | |
310 | ||
311 | static int st_end_io(struct path_selector *ps, struct dm_path *path, | |
312 | size_t nr_bytes) | |
313 | { | |
314 | struct path_info *pi = path->pscontext; | |
315 | ||
316 | atomic_sub(nr_bytes, &pi->in_flight_size); | |
317 | ||
318 | return 0; | |
319 | } | |
320 | ||
321 | static struct path_selector_type st_ps = { | |
322 | .name = "service-time", | |
323 | .module = THIS_MODULE, | |
324 | .table_args = 2, | |
325 | .info_args = 2, | |
326 | .create = st_create, | |
327 | .destroy = st_destroy, | |
328 | .status = st_status, | |
329 | .add_path = st_add_path, | |
330 | .fail_path = st_fail_path, | |
331 | .reinstate_path = st_reinstate_path, | |
332 | .select_path = st_select_path, | |
333 | .start_io = st_start_io, | |
334 | .end_io = st_end_io, | |
335 | }; | |
336 | ||
337 | static int __init dm_st_init(void) | |
338 | { | |
339 | int r = dm_register_path_selector(&st_ps); | |
340 | ||
341 | if (r < 0) | |
342 | DMERR("register failed %d", r); | |
343 | ||
344 | DMINFO("version " ST_VERSION " loaded"); | |
345 | ||
346 | return r; | |
347 | } | |
348 | ||
349 | static void __exit dm_st_exit(void) | |
350 | { | |
351 | int r = dm_unregister_path_selector(&st_ps); | |
352 | ||
353 | if (r < 0) | |
354 | DMERR("unregister failed %d", r); | |
355 | } | |
356 | ||
357 | module_init(dm_st_init); | |
358 | module_exit(dm_st_exit); | |
359 | ||
360 | MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); | |
361 | MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); | |
362 | MODULE_LICENSE("GPL"); |