kernel/bpf/syscall.c

   1 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
   2  *
   3  * This program is free software; you can redistribute it and/or
   4  * modify it under the terms of version 2 of the GNU General Public
   5  * License as published by the Free Software Foundation.
   6  *
   7  * This program is distributed in the hope that it will be useful, but
   8  * WITHOUT ANY WARRANTY; without even the implied warranty of
   9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  10  * General Public License for more details.
  11  */
  12 #include <linux/bpf.h>
  13 #include <linux/syscalls.h>
  14 #include <linux/slab.h>
  15 #include <linux/anon_inodes.h>
  16 #include <linux/file.h>
  17
  18 static LIST_HEAD(bpf_map_types);
  19
  20 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
  21 {
  22         struct bpf_map_type_list *tl;
  23         struct bpf_map *map;
  24
  25         list_for_each_entry(tl, &bpf_map_types, list_node) {
  26                 if (tl->type == attr->map_type) {
  27                         map = tl->ops->map_alloc(attr);
  28                         if (IS_ERR(map))
  29                                 return map;
  30                         map->ops = tl->ops;
  31                         map->map_type = attr->map_type;
  32                         return map;
  33                 }
  34         }
  35         return ERR_PTR(-EINVAL);
  36 }
  37
  38 /* boot time registration of different map implementations */
  39 void bpf_register_map_type(struct bpf_map_type_list *tl)
  40 {
  41         list_add(&tl->list_node, &bpf_map_types);
  42 }
  43
  44 /* called from workqueue */
  45 static void bpf_map_free_deferred(struct work_struct *work)
  46 {
  47         struct bpf_map *map = container_of(work, struct bpf_map, work);
  48
  49         /* implementation dependent freeing */
  50         map->ops->map_free(map);
  51 }
  52
  53 /* decrement map refcnt and schedule it for freeing via workqueue
  54  * (unrelying map implementation ops->map_free() might sleep)
  55  */
  56 void bpf_map_put(struct bpf_map *map)
  57 {
  58         if (atomic_dec_and_test(&map->refcnt)) {
  59                 INIT_WORK(&map->work, bpf_map_free_deferred);
  60                 schedule_work(&map->work);
  61         }
  62 }
  63
  64 static int bpf_map_release(struct inode *inode, struct file *filp)
  65 {
  66         struct bpf_map *map = filp->private_data;
  67
  68         bpf_map_put(map);
  69         return 0;
  70 }
  71
  72 static const struct file_operations bpf_map_fops = {
  73         .release = bpf_map_release,
  74 };
  75
  76 /* helper macro to check that unused fields 'union bpf_attr' are zero */
  77 #define CHECK_ATTR(CMD) \
  78         memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
  79                    sizeof(attr->CMD##_LAST_FIELD), 0, \
  80                    sizeof(*attr) - \
  81                    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
  82                    sizeof(attr->CMD##_LAST_FIELD)) != NULL
  83
  84 #define BPF_MAP_CREATE_LAST_FIELD max_entries
  85 /* called via syscall */
  86 static int map_create(union bpf_attr *attr)
  87 {
  88         struct bpf_map *map;
  89         int err;
  90
  91         err = CHECK_ATTR(BPF_MAP_CREATE);
  92         if (err)
  93                 return -EINVAL;
  94
  95         /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
  96         map = find_and_alloc_map(attr);
  97         if (IS_ERR(map))
  98                 return PTR_ERR(map);
  99
 100         atomic_set(&map->refcnt, 1);
 101
 102         err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
 103
 104         if (err < 0)
 105                 /* failed to allocate fd */
 106                 goto free_map;
 107
 108         return err;
 109
 110 free_map:
 111         map->ops->map_free(map);
 112         return err;
 113 }
 114
 115 /* if error is returned, fd is released.
 116  * On success caller should complete fd access with matching fdput()
 117  */
 118 struct bpf_map *bpf_map_get(struct fd f)
 119 {
 120         struct bpf_map *map;
 121
 122         if (!f.file)
 123                 return ERR_PTR(-EBADF);
 124
 125         if (f.file->f_op != &bpf_map_fops) {
 126                 fdput(f);
 127                 return ERR_PTR(-EINVAL);
 128         }
 129
 130         map = f.file->private_data;
 131
 132         return map;
 133 }
 134
 135 /* helper to convert user pointers passed inside __aligned_u64 fields */
 136 static void __user *u64_to_ptr(__u64 val)
 137 {
 138         return (void __user *) (unsigned long) val;
 139 }
 140
 141 /* last field in 'union bpf_attr' used by this command */
 142 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
 143
 144 static int map_lookup_elem(union bpf_attr *attr)
 145 {
 146         void __user *ukey = u64_to_ptr(attr->key);
 147         void __user *uvalue = u64_to_ptr(attr->value);
 148         int ufd = attr->map_fd;
 149         struct fd f = fdget(ufd);
 150         struct bpf_map *map;
 151         void *key, *value;
 152         int err;
 153
 154         if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
 155                 return -EINVAL;
 156
 157         map = bpf_map_get(f);
 158         if (IS_ERR(map))
 159                 return PTR_ERR(map);
 160
 161         err = -ENOMEM;
 162         key = kmalloc(map->key_size, GFP_USER);
 163         if (!key)
 164                 goto err_put;
 165
 166         err = -EFAULT;
 167         if (copy_from_user(key, ukey, map->key_size) != 0)
 168                 goto free_key;
 169
 170         err = -ESRCH;
 171         rcu_read_lock();
 172         value = map->ops->map_lookup_elem(map, key);
 173         if (!value)
 174                 goto err_unlock;
 175
 176         err = -EFAULT;
 177         if (copy_to_user(uvalue, value, map->value_size) != 0)
 178                 goto err_unlock;
 179
 180         err = 0;
 181
 182 err_unlock:
 183         rcu_read_unlock();
 184 free_key:
 185         kfree(key);
 186 err_put:
 187         fdput(f);
 188         return err;
 189 }
 190
 191 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
 192
 193 static int map_update_elem(union bpf_attr *attr)
 194 {
 195         void __user *ukey = u64_to_ptr(attr->key);
 196         void __user *uvalue = u64_to_ptr(attr->value);
 197         int ufd = attr->map_fd;
 198         struct fd f = fdget(ufd);
 199         struct bpf_map *map;
 200         void *key, *value;
 201         int err;
 202
 203         if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
 204                 return -EINVAL;
 205
 206         map = bpf_map_get(f);
 207         if (IS_ERR(map))
 208                 return PTR_ERR(map);
 209
 210         err = -ENOMEM;
 211         key = kmalloc(map->key_size, GFP_USER);
 212         if (!key)
 213                 goto err_put;
 214
 215         err = -EFAULT;
 216         if (copy_from_user(key, ukey, map->key_size) != 0)
 217                 goto free_key;
 218
 219         err = -ENOMEM;
 220         value = kmalloc(map->value_size, GFP_USER);
 221         if (!value)
 222                 goto free_key;
 223
 224         err = -EFAULT;
 225         if (copy_from_user(value, uvalue, map->value_size) != 0)
 226                 goto free_value;
 227
 228         /* eBPF program that use maps are running under rcu_read_lock(),
 229          * therefore all map accessors rely on this fact, so do the same here
 230          */
 231         rcu_read_lock();
 232         err = map->ops->map_update_elem(map, key, value);
 233         rcu_read_unlock();
 234
 235 free_value:
 236         kfree(value);
 237 free_key:
 238         kfree(key);
 239 err_put:
 240         fdput(f);
 241         return err;
 242 }
 243
 244 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
 245
 246 static int map_delete_elem(union bpf_attr *attr)
 247 {
 248         void __user *ukey = u64_to_ptr(attr->key);
 249         int ufd = attr->map_fd;
 250         struct fd f = fdget(ufd);
 251         struct bpf_map *map;
 252         void *key;
 253         int err;
 254
 255         if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
 256                 return -EINVAL;
 257
 258         map = bpf_map_get(f);
 259         if (IS_ERR(map))
 260                 return PTR_ERR(map);
 261
 262         err = -ENOMEM;
 263         key = kmalloc(map->key_size, GFP_USER);
 264         if (!key)
 265                 goto err_put;
 266
 267         err = -EFAULT;
 268         if (copy_from_user(key, ukey, map->key_size) != 0)
 269                 goto free_key;
 270
 271         rcu_read_lock();
 272         err = map->ops->map_delete_elem(map, key);
 273         rcu_read_unlock();
 274
 275 free_key:
 276         kfree(key);
 277 err_put:
 278         fdput(f);
 279         return err;
 280 }
 281
 282 /* last field in 'union bpf_attr' used by this command */
 283 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
 284
 285 static int map_get_next_key(union bpf_attr *attr)
 286 {
 287         void __user *ukey = u64_to_ptr(attr->key);
 288         void __user *unext_key = u64_to_ptr(attr->next_key);
 289         int ufd = attr->map_fd;
 290         struct fd f = fdget(ufd);
 291         struct bpf_map *map;
 292         void *key, *next_key;
 293         int err;
 294
 295         if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
 296                 return -EINVAL;
 297
 298         map = bpf_map_get(f);
 299         if (IS_ERR(map))
 300                 return PTR_ERR(map);
 301
 302         err = -ENOMEM;
 303         key = kmalloc(map->key_size, GFP_USER);
 304         if (!key)
 305                 goto err_put;
 306
 307         err = -EFAULT;
 308         if (copy_from_user(key, ukey, map->key_size) != 0)
 309                 goto free_key;
 310
 311         err = -ENOMEM;
 312         next_key = kmalloc(map->key_size, GFP_USER);
 313         if (!next_key)
 314                 goto free_key;
 315
 316         rcu_read_lock();
 317         err = map->ops->map_get_next_key(map, key, next_key);
 318         rcu_read_unlock();
 319         if (err)
 320                 goto free_next_key;
 321
 322         err = -EFAULT;
 323         if (copy_to_user(unext_key, next_key, map->key_size) != 0)
 324                 goto free_next_key;
 325
 326         err = 0;
 327
 328 free_next_key:
 329         kfree(next_key);
 330 free_key:
 331         kfree(key);
 332 err_put:
 333         fdput(f);
 334         return err;
 335 }
 336
 337 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 338 {
 339         union bpf_attr attr = {};
 340         int err;
 341
 342         /* the syscall is limited to root temporarily. This restriction will be
 343          * lifted when security audit is clean. Note that eBPF+tracing must have
 344          * this restriction, since it may pass kernel data to user space
 345          */
 346         if (!capable(CAP_SYS_ADMIN))
 347                 return -EPERM;
 348
 349         if (!access_ok(VERIFY_READ, uattr, 1))
 350                 return -EFAULT;
 351
 352         if (size > PAGE_SIZE)   /* silly large */
 353                 return -E2BIG;
 354
 355         /* If we're handed a bigger struct than we know of,
 356          * ensure all the unknown bits are 0 - i.e. new
 357          * user-space does not rely on any kernel feature
 358          * extensions we dont know about yet.
 359          */
 360         if (size > sizeof(attr)) {
 361                 unsigned char __user *addr;
 362                 unsigned char __user *end;
 363                 unsigned char val;
 364
 365                 addr = (void __user *)uattr + sizeof(attr);
 366                 end  = (void __user *)uattr + size;
 367
 368                 for (; addr < end; addr++) {
 369                         err = get_user(val, addr);
 370                         if (err)
 371                                 return err;
 372                         if (val)
 373                                 return -E2BIG;
 374                 }
 375                 size = sizeof(attr);
 376         }
 377
 378         /* copy attributes from user space, may be less than sizeof(bpf_attr) */
 379         if (copy_from_user(&attr, uattr, size) != 0)
 380                 return -EFAULT;
 381
 382         switch (cmd) {
 383         case BPF_MAP_CREATE:
 384                 err = map_create(&attr);
 385                 break;
 386         case BPF_MAP_LOOKUP_ELEM:
 387                 err = map_lookup_elem(&attr);
 388                 break;
 389         case BPF_MAP_UPDATE_ELEM:
 390                 err = map_update_elem(&attr);
 391                 break;
 392         case BPF_MAP_DELETE_ELEM:
 393                 err = map_delete_elem(&attr);
 394                 break;
 395         case BPF_MAP_GET_NEXT_KEY:
 396                 err = map_get_next_key(&attr);
 397                 break;
 398         default:
 399                 err = -EINVAL;
 400                 break;
 401         }
 402
 403         return err;
 404 }