/* Extracted from Linux kernel commit 99c55f7d ("bpf: introduce BPF syscall and maps") */
1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
2 | * | |
3 | * This program is free software; you can redistribute it and/or | |
4 | * modify it under the terms of version 2 of the GNU General Public | |
5 | * License as published by the Free Software Foundation. | |
6 | * | |
7 | * This program is distributed in the hope that it will be useful, but | |
8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
10 | * General Public License for more details. | |
11 | */ | |
12 | #include <linux/bpf.h> | |
13 | #include <linux/syscalls.h> | |
14 | #include <linux/slab.h> | |
15 | #include <linux/anon_inodes.h> | |
16 | ||
/* Global registry of available map implementations; entries are added at
 * boot via bpf_register_map_type() and searched by find_and_alloc_map().
 */
static LIST_HEAD(bpf_map_types);
19 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) | |
20 | { | |
21 | struct bpf_map_type_list *tl; | |
22 | struct bpf_map *map; | |
23 | ||
24 | list_for_each_entry(tl, &bpf_map_types, list_node) { | |
25 | if (tl->type == attr->map_type) { | |
26 | map = tl->ops->map_alloc(attr); | |
27 | if (IS_ERR(map)) | |
28 | return map; | |
29 | map->ops = tl->ops; | |
30 | map->map_type = attr->map_type; | |
31 | return map; | |
32 | } | |
33 | } | |
34 | return ERR_PTR(-EINVAL); | |
35 | } | |
36 | ||
/* boot time registration of different map implementations.
 * NOTE(review): no locking is taken here — presumably safe only because all
 * callers run during early init, before the syscall is reachable; confirm.
 */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
	list_add(&tl->list_node, &bpf_map_types);
}
42 | ||
/* called from workqueue; runs in process context so the implementation's
 * ->map_free() callback is allowed to sleep.
 */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	/* implementation dependent freeing */
	map->ops->map_free(map);
}
51 | ||
/* decrement map refcnt and schedule it for freeing via workqueue
 * (underlying map implementation ops->map_free() might sleep, and this
 * function may be called from atomic context, hence the deferral)
 */
void bpf_map_put(struct bpf_map *map)
{
	if (atomic_dec_and_test(&map->refcnt)) {
		/* last reference dropped: free asynchronously */
		INIT_WORK(&map->work, bpf_map_free_deferred);
		schedule_work(&map->work);
	}
}
62 | ||
/* ->release handler for a map fd: drop the reference taken at creation
 * time by map_create(); the map itself is freed once refcnt hits zero.
 */
static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	bpf_map_put(map);
	return 0;
}
70 | ||
/* File operations backing a map fd.  Only ->release is needed: all other
 * interaction with the map goes through the bpf() syscall, not read/write.
 */
static const struct file_operations bpf_map_fops = {
	.release = bpf_map_release,
};
74 | ||
/* helper macro to check that unused fields 'union bpf_attr' are zero.
 *
 * Evaluates to true (an error) when any byte AFTER the command's last
 * meaningful field is non-zero.  memchr_inv() scans the tail of the union
 * — from just past CMD##_LAST_FIELD to the end of the union — and returns
 * non-NULL if it finds a byte != 0.  Each command defines its own
 * CMD##_LAST_FIELD below; the macro relies on a local 'attr' pointer
 * being in scope at the expansion site.
 */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL

/* BPF_MAP_CREATE consumes attr fields up to and including max_entries */
#define BPF_MAP_CREATE_LAST_FIELD max_entries
84 | /* called via syscall */ | |
85 | static int map_create(union bpf_attr *attr) | |
86 | { | |
87 | struct bpf_map *map; | |
88 | int err; | |
89 | ||
90 | err = CHECK_ATTR(BPF_MAP_CREATE); | |
91 | if (err) | |
92 | return -EINVAL; | |
93 | ||
94 | /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ | |
95 | map = find_and_alloc_map(attr); | |
96 | if (IS_ERR(map)) | |
97 | return PTR_ERR(map); | |
98 | ||
99 | atomic_set(&map->refcnt, 1); | |
100 | ||
101 | err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); | |
102 | ||
103 | if (err < 0) | |
104 | /* failed to allocate fd */ | |
105 | goto free_map; | |
106 | ||
107 | return err; | |
108 | ||
109 | free_map: | |
110 | map->ops->map_free(map); | |
111 | return err; | |
112 | } | |
113 | ||
/* Entry point for the bpf(2) syscall: copy 'attr' from user space in a
 * size-extensible way, then dispatch on 'cmd'.
 */
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when security audit is clean. Note that eBPF+tracing must have
	 * this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* cheap sanity check on the user pointer before the byte-wise scan */
	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	if (size > PAGE_SIZE)	/* silly large */
		return -E2BIG;

	/* If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we dont know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(attr);
		end  = (void __user *)uattr + size;

		/* every trailing byte beyond sizeof(attr) must be zero */
		for (; addr < end; addr++) {
			err = get_user(val, addr);
			if (err)
				return err;
			if (val)
				return -E2BIG;
		}
		/* trailing bytes verified zero: only copy what we understand */
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr);
	 * attr was zero-initialized, so a short copy leaves the rest zeroed
	 */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	default:
		err = -EINVAL;
		break;
	}

	return err;
}