Commit | Line | Data |
---|---|---|
5bc1421e NH |
1 | /* |
2 | * net/core/netprio_cgroup.c Priority Control Group | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or | |
5 | * modify it under the terms of the GNU General Public License | |
6 | * as published by the Free Software Foundation; either version | |
7 | * 2 of the License, or (at your option) any later version. | |
8 | * | |
9 | * Authors: Neil Horman <nhorman@tuxdriver.com> | |
10 | */ | |
11 | ||
e005d193 JP |
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
13 | ||
5bc1421e NH |
14 | #include <linux/module.h> |
15 | #include <linux/slab.h> | |
16 | #include <linux/types.h> | |
17 | #include <linux/string.h> | |
18 | #include <linux/errno.h> | |
19 | #include <linux/skbuff.h> | |
20 | #include <linux/cgroup.h> | |
21 | #include <linux/rcupdate.h> | |
22 | #include <linux/atomic.h> | |
23 | #include <net/rtnetlink.h> | |
24 | #include <net/pkt_cls.h> | |
25 | #include <net/sock.h> | |
26 | #include <net/netprio_cgroup.h> | |
27 | ||
406a3c63 JF |
28 | #include <linux/fdtable.h> |
29 | ||
/* Initial byte size tried when sizing a per-device priomap allocation. */
#define PRIOMAP_MIN_SZ		128
/* Number of unsigned long words backing the priority-index bitmap. */
#define PRIOIDX_SZ		128

/* Bitmap of priority indices currently handed out to cgroups. */
static unsigned long prioidx_map[PRIOIDX_SZ];
/* Serializes every search/set/clear on prioidx_map. */
static DEFINE_SPINLOCK(prioidx_map_lock);
5bc1421e NH |
35 | |
36 | static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) | |
37 | { | |
38 | return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), | |
39 | struct cgroup_netprio_state, css); | |
40 | } | |
41 | ||
42 | static int get_prioidx(u32 *prio) | |
43 | { | |
44 | unsigned long flags; | |
45 | u32 prioidx; | |
46 | ||
47 | spin_lock_irqsave(&prioidx_map_lock, flags); | |
48 | prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); | |
5962b35c NH |
49 | if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) { |
50 | spin_unlock_irqrestore(&prioidx_map_lock, flags); | |
51 | return -ENOSPC; | |
52 | } | |
5bc1421e NH |
53 | set_bit(prioidx, prioidx_map); |
54 | spin_unlock_irqrestore(&prioidx_map_lock, flags); | |
5bc1421e NH |
55 | *prio = prioidx; |
56 | return 0; | |
57 | } | |
58 | ||
59 | static void put_prioidx(u32 idx) | |
60 | { | |
61 | unsigned long flags; | |
62 | ||
63 | spin_lock_irqsave(&prioidx_map_lock, flags); | |
64 | clear_bit(idx, prioidx_map); | |
65 | spin_unlock_irqrestore(&prioidx_map_lock, flags); | |
66 | } | |
67 | ||
4a6ee25c TH |
68 | /* |
69 | * Extend @dev->priomap so that it's large enough to accomodate | |
70 | * @target_idx. @dev->priomap.priomap_len > @target_idx after successful | |
71 | * return. Must be called under rtnl lock. | |
72 | */ | |
73 | static int extend_netdev_table(struct net_device *dev, u32 target_idx) | |
5bc1421e | 74 | { |
4a6ee25c TH |
75 | struct netprio_map *old, *new; |
76 | size_t new_sz, new_len; | |
5bc1421e | 77 | |
4a6ee25c | 78 | /* is the existing priomap large enough? */ |
52bca930 | 79 | old = rtnl_dereference(dev->priomap); |
4a6ee25c TH |
80 | if (old && old->priomap_len > target_idx) |
81 | return 0; | |
82 | ||
83 | /* | |
84 | * Determine the new size. Let's keep it power-of-two. We start | |
85 | * from PRIOMAP_MIN_SZ and double it until it's large enough to | |
86 | * accommodate @target_idx. | |
87 | */ | |
88 | new_sz = PRIOMAP_MIN_SZ; | |
89 | while (true) { | |
90 | new_len = (new_sz - offsetof(struct netprio_map, priomap)) / | |
91 | sizeof(new->priomap[0]); | |
92 | if (new_len > target_idx) | |
93 | break; | |
94 | new_sz *= 2; | |
95 | /* overflowed? */ | |
96 | if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) | |
97 | return -ENOSPC; | |
98 | } | |
5bc1421e | 99 | |
4a6ee25c TH |
100 | /* allocate & copy */ |
101 | new = kzalloc(new_sz, GFP_KERNEL); | |
52bca930 | 102 | if (!new) { |
e005d193 | 103 | pr_warn("Unable to alloc new priomap!\n"); |
ef209f15 | 104 | return -ENOMEM; |
5bc1421e NH |
105 | } |
106 | ||
52bca930 TH |
107 | if (old) |
108 | memcpy(new->priomap, old->priomap, | |
109 | old->priomap_len * sizeof(old->priomap[0])); | |
5bc1421e | 110 | |
52bca930 | 111 | new->priomap_len = new_len; |
5bc1421e | 112 | |
4a6ee25c | 113 | /* install the new priomap */ |
52bca930 TH |
114 | rcu_assign_pointer(dev->priomap, new); |
115 | if (old) | |
116 | kfree_rcu(old, rcu); | |
ef209f15 G |
117 | return 0; |
118 | } | |
119 | ||
92fb9748 | 120 | static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) |
5bc1421e NH |
121 | { |
122 | struct cgroup_netprio_state *cs; | |
ef209f15 | 123 | int ret = -EINVAL; |
5bc1421e NH |
124 | |
125 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); | |
126 | if (!cs) | |
127 | return ERR_PTR(-ENOMEM); | |
128 | ||
ef209f15 G |
129 | if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) |
130 | goto out; | |
5bc1421e NH |
131 | |
132 | ret = get_prioidx(&cs->prioidx); | |
ef209f15 | 133 | if (ret < 0) { |
e005d193 | 134 | pr_warn("No space in priority index array\n"); |
ef209f15 G |
135 | goto out; |
136 | } | |
137 | ||
5bc1421e | 138 | return &cs->css; |
ef209f15 G |
139 | out: |
140 | kfree(cs); | |
141 | return ERR_PTR(ret); | |
5bc1421e NH |
142 | } |
143 | ||
92fb9748 | 144 | static void cgrp_css_free(struct cgroup *cgrp) |
5bc1421e NH |
145 | { |
146 | struct cgroup_netprio_state *cs; | |
147 | struct net_device *dev; | |
148 | struct netprio_map *map; | |
149 | ||
150 | cs = cgrp_netprio_state(cgrp); | |
151 | rtnl_lock(); | |
152 | for_each_netdev(&init_net, dev) { | |
153 | map = rtnl_dereference(dev->priomap); | |
91c68ce2 | 154 | if (map && cs->prioidx < map->priomap_len) |
5bc1421e NH |
155 | map->priomap[cs->prioidx] = 0; |
156 | } | |
157 | rtnl_unlock(); | |
158 | put_prioidx(cs->prioidx); | |
159 | kfree(cs); | |
160 | } | |
161 | ||
162 | static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) | |
163 | { | |
164 | return (u64)cgrp_netprio_state(cgrp)->prioidx; | |
165 | } | |
166 | ||
167 | static int read_priomap(struct cgroup *cont, struct cftype *cft, | |
168 | struct cgroup_map_cb *cb) | |
169 | { | |
170 | struct net_device *dev; | |
171 | u32 prioidx = cgrp_netprio_state(cont)->prioidx; | |
172 | u32 priority; | |
173 | struct netprio_map *map; | |
174 | ||
175 | rcu_read_lock(); | |
176 | for_each_netdev_rcu(&init_net, dev) { | |
177 | map = rcu_dereference(dev->priomap); | |
91c68ce2 | 178 | priority = (map && prioidx < map->priomap_len) ? map->priomap[prioidx] : 0; |
5bc1421e NH |
179 | cb->fill(cb, dev->name, priority); |
180 | } | |
181 | rcu_read_unlock(); | |
182 | return 0; | |
183 | } | |
184 | ||
185 | static int write_priomap(struct cgroup *cgrp, struct cftype *cft, | |
186 | const char *buffer) | |
187 | { | |
5bc1421e | 188 | u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; |
6d5759dd | 189 | char devname[IFNAMSIZ + 1]; |
5bc1421e NH |
190 | struct net_device *dev; |
191 | struct netprio_map *map; | |
6d5759dd TH |
192 | u32 prio; |
193 | int ret; | |
5bc1421e | 194 | |
6d5759dd TH |
195 | if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) |
196 | return -EINVAL; | |
5bc1421e NH |
197 | |
198 | dev = dev_get_by_name(&init_net, devname); | |
199 | if (!dev) | |
6d5759dd | 200 | return -ENODEV; |
5bc1421e | 201 | |
476ad154 | 202 | rtnl_lock(); |
6d5759dd | 203 | |
4a6ee25c | 204 | ret = extend_netdev_table(dev, prioidx); |
6d5759dd TH |
205 | if (ret) |
206 | goto out_unlock; | |
ef209f15 | 207 | |
476ad154 | 208 | map = rtnl_dereference(dev->priomap); |
5bc1421e | 209 | if (map) |
6d5759dd TH |
210 | map->priomap[prioidx] = prio; |
211 | out_unlock: | |
476ad154 | 212 | rtnl_unlock(); |
5bc1421e | 213 | dev_put(dev); |
5bc1421e NH |
214 | return ret; |
215 | } | |
216 | ||
c3c073f8 AV |
217 | static int update_netprio(const void *v, struct file *file, unsigned n) |
218 | { | |
219 | int err; | |
220 | struct socket *sock = sock_from_file(file, &err); | |
221 | if (sock) | |
222 | sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; | |
223 | return 0; | |
224 | } | |
225 | ||
406a3c63 JF |
226 | void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
227 | { | |
228 | struct task_struct *p; | |
c3c073f8 | 229 | void *v; |
406a3c63 JF |
230 | |
231 | cgroup_taskset_for_each(p, cgrp, tset) { | |
406a3c63 | 232 | task_lock(p); |
c3c073f8 AV |
233 | v = (void *)(unsigned long)task_netprioidx(p); |
234 | iterate_fd(p->files, 0, update_netprio, v); | |
406a3c63 JF |
235 | task_unlock(p); |
236 | } | |
406a3c63 JF |
237 | } |
238 | ||
5bc1421e NH |
239 | static struct cftype ss_files[] = { |
240 | { | |
241 | .name = "prioidx", | |
242 | .read_u64 = read_prioidx, | |
243 | }, | |
244 | { | |
245 | .name = "ifpriomap", | |
246 | .read_map = read_priomap, | |
247 | .write_string = write_priomap, | |
248 | }, | |
4baf6e33 | 249 | { } /* terminate */ |
5bc1421e NH |
250 | }; |
251 | ||
676f7c8f TH |
252 | struct cgroup_subsys net_prio_subsys = { |
253 | .name = "net_prio", | |
92fb9748 TH |
254 | .css_alloc = cgrp_css_alloc, |
255 | .css_free = cgrp_css_free, | |
406a3c63 | 256 | .attach = net_prio_attach, |
676f7c8f | 257 | .subsys_id = net_prio_subsys_id, |
4baf6e33 | 258 | .base_cftypes = ss_files, |
8c7f6edb TH |
259 | .module = THIS_MODULE, |
260 | ||
261 | /* | |
262 | * net_prio has artificial limit on the number of cgroups and | |
263 | * disallows nesting making it impossible to co-mount it with other | |
264 | * hierarchical subsystems. Remove the artificially low PRIOIDX_SZ | |
265 | * limit and properly nest configuration such that children follow | |
266 | * their parents' configurations by default and are allowed to | |
267 | * override and remove the following. | |
268 | */ | |
269 | .broken_hierarchy = true, | |
676f7c8f | 270 | }; |
5bc1421e NH |
271 | |
272 | static int netprio_device_event(struct notifier_block *unused, | |
273 | unsigned long event, void *ptr) | |
274 | { | |
275 | struct net_device *dev = ptr; | |
276 | struct netprio_map *old; | |
5bc1421e NH |
277 | |
278 | /* | |
279 | * Note this is called with rtnl_lock held so we have update side | |
280 | * protection on our rcu assignments | |
281 | */ | |
282 | ||
283 | switch (event) { | |
5bc1421e NH |
284 | case NETDEV_UNREGISTER: |
285 | old = rtnl_dereference(dev->priomap); | |
2cfa5a04 | 286 | RCU_INIT_POINTER(dev->priomap, NULL); |
5bc1421e NH |
287 | if (old) |
288 | kfree_rcu(old, rcu); | |
289 | break; | |
290 | } | |
291 | return NOTIFY_DONE; | |
292 | } | |
293 | ||
294 | static struct notifier_block netprio_device_notifier = { | |
295 | .notifier_call = netprio_device_event | |
296 | }; | |
297 | ||
298 | static int __init init_cgroup_netprio(void) | |
299 | { | |
300 | int ret; | |
301 | ||
302 | ret = cgroup_load_subsys(&net_prio_subsys); | |
303 | if (ret) | |
304 | goto out; | |
5bc1421e NH |
305 | |
306 | register_netdevice_notifier(&netprio_device_notifier); | |
307 | ||
308 | out: | |
309 | return ret; | |
310 | } | |
311 | ||
312 | static void __exit exit_cgroup_netprio(void) | |
313 | { | |
314 | struct netprio_map *old; | |
315 | struct net_device *dev; | |
316 | ||
317 | unregister_netdevice_notifier(&netprio_device_notifier); | |
318 | ||
319 | cgroup_unload_subsys(&net_prio_subsys); | |
320 | ||
5bc1421e NH |
321 | rtnl_lock(); |
322 | for_each_netdev(&init_net, dev) { | |
323 | old = rtnl_dereference(dev->priomap); | |
2cfa5a04 | 324 | RCU_INIT_POINTER(dev->priomap, NULL); |
5bc1421e NH |
325 | if (old) |
326 | kfree_rcu(old, rcu); | |
327 | } | |
328 | rtnl_unlock(); | |
329 | } | |
330 | ||
331 | module_init(init_cgroup_netprio); | |
332 | module_exit(exit_cgroup_netprio); | |
333 | MODULE_LICENSE("GPL v2"); |