/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *		- Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * the dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via the top-level device
 *   spinlock dev->queue_lock.
 * - ingress filtering is serialized via the top-level device
 *   spinlock dev->ingress_lock.
 * - updates to the tree and tree walking are only done under the rtnl mutex.
 */

void qdisc_lock_tree(struct net_device *dev)
	__acquires(dev->queue_lock)
	__acquires(dev->ingress_lock)
{
	spin_lock_bh(&dev->queue_lock);
	spin_lock(&dev->ingress_lock);
}
EXPORT_SYMBOL(qdisc_lock_tree);

void qdisc_unlock_tree(struct net_device *dev)
	__releases(dev->ingress_lock)
	__releases(dev->queue_lock)
{
	spin_unlock(&dev->ingress_lock);
	spin_unlock_bh(&dev->queue_lock);
}
EXPORT_SYMBOL(qdisc_unlock_tree);

static inline int qdisc_qlen(struct Qdisc *q)
{
	return q->q.qlen;
}
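
/* Requeue a packet for a later retry: a partially sent GSO segment list
 * is stashed in dev->gso_skb, anything else goes back through the qdisc's
 * ->requeue() hook; either way the queue is rescheduled.
 */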
static inline int dev_requeue_skb(struct sk_buff *skb, struct net_device *dev,
				  struct Qdisc *q)
{
	if (unlikely(skb->next))
		dev->gso_skb = skb;
	else
		q->ops->requeue(skb, q);

	netif_schedule(dev);
	return 0;
}
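
/* Fetch the next packet to transmit: a GSO skb left over from a previous
 * attempt takes precedence over the qdisc's own queue.
 */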
static inline struct sk_buff *dev_dequeue_skb(struct net_device *dev,
					      struct Qdisc *q)
{
	struct sk_buff *skb;

	if ((skb = dev->gso_skb))
		dev->gso_skb = NULL;
	else
		skb = q->dequeue(q);

	return skb;
}
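
/* Handle NETDEV_TX_LOCKED: the driver's transmit routine is busy on
 * another CPU.  If the "other" CPU is in fact this one, the driver has
 * recursed and the packet is dropped; otherwise it is requeued and the
 * collision is counted in netdev_rx_stat.
 */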
static inline int handle_dev_cpu_collision(struct sk_buff *skb,
					   struct net_device *dev,
					   struct Qdisc *q)
{
	int ret;

	if (unlikely(dev->xmit_lock_owner == smp_processor_id())) {
		/*
		 * Same CPU holding the lock. It may be a transient
		 * configuration error, when hard_start_xmit() recurses. We
		 * detect it by checking xmit owner and drop the packet when
		 * deadloop is detected. Return OK to try the next skb.
		 */
		kfree_skb(skb);
		if (net_ratelimit())
			printk(KERN_WARNING "Dead loop on netdevice %s, "
			       "fix it urgently!\n", dev->name);
		ret = qdisc_qlen(q);
	} else {
		/*
		 * Another cpu is holding lock, requeue & delay xmits for
		 * some time.
		 */
		__get_cpu_var(netdev_rx_stat).cpu_collision++;
		ret = dev_requeue_skb(skb, dev, q);
	}

	return ret;
}

/*
 * NOTE: Called under dev->queue_lock with locally disabled BH.
 *
 * __LINK_STATE_QDISC_RUNNING guarantees that only one CPU can process this
 * device at a time. dev->queue_lock serializes queue accesses for
 * this device AND the dev->qdisc pointer itself.
 *
 * netif_tx_lock serializes accesses to the device driver.
 *
 * dev->queue_lock and netif_tx_lock are mutually exclusive:
 * if one is grabbed, the other must be free.
 *
 * Note that this procedure can be called by a watchdog timer.
 *
 * Returns to the caller:
 *	0  - queue is empty or throttled.
 *	>0 - queue is not empty.
 */
static inline int qdisc_restart(struct net_device *dev)
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;
	int ret = NETDEV_TX_BUSY;

	/* Dequeue packet */
	if (unlikely((skb = dev_dequeue_skb(dev, q)) == NULL))
		return 0;

	/* And release queue */
	spin_unlock(&dev->queue_lock);

	HARD_TX_LOCK(dev, smp_processor_id());
	if (!netif_subqueue_stopped(dev, skb))
		ret = dev_hard_start_xmit(skb, dev);
	HARD_TX_UNLOCK(dev);

	spin_lock(&dev->queue_lock);
	q = dev->qdisc;

	switch (ret) {
	case NETDEV_TX_OK:
		/* Driver sent out skb successfully */
		ret = qdisc_qlen(q);
		break;

	case NETDEV_TX_LOCKED:
		/* Driver try lock failed */
		ret = handle_dev_cpu_collision(skb, dev, q);
		break;

	default:
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY && net_ratelimit()))
			printk(KERN_WARNING "BUG %s code %d qlen %d\n",
			       dev->name, ret, q->q.qlen);

		ret = dev_requeue_skb(skb, dev, q);
		break;
	}

	return ret;
}
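
/* Transmit packets until the queue empties, the device stops us, or the
 * time slice is up; in the latter cases netif_schedule() re-arms the TX
 * softirq.  Clears __LINK_STATE_QDISC_RUNNING on exit so another CPU may
 * take over.
 */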
void __qdisc_run(struct net_device *dev)
{
	unsigned long start_time = jiffies;

	while (qdisc_restart(dev)) {
		if (netif_queue_stopped(dev))
			break;

		/*
		 * Postpone processing if
		 * 1. another process needs the CPU;
		 * 2. we've been doing it for too long.
		 */
		if (need_resched() || jiffies != start_time) {
			netif_schedule(dev);
			break;
		}
	}

	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}
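
/* Watchdog timer callback: if the device is still stopped after
 * dev->watchdog_timeo jiffies without a transmission, report a transmit
 * timeout to the driver, then re-arm the timer.
 */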
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}
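
/* (Re)arm the watchdog timer for drivers that implement ->tx_timeout; a
 * newly pending timer holds a reference on the device.
 */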
void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 *	Device has detected acquisition of carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 *	Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}
EXPORT_SYMBOL(netif_carrier_off);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};
EXPORT_SYMBOL(noop_qdisc);

static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};
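
/* Map skb->priority (masked with TC_PRIO_MAX) onto the three pfifo_fast
 * bands.  Band 0 is dequeued first, so TC_PRIO_INTERACTIVE and
 * TC_PRIO_CONTROL traffic gets strict priority over best-effort (band 1)
 * and bulk (band 2) traffic.
 */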
static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   the generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}
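
/* Enqueue at the tail of the band selected by skb->priority; once the
 * band already holds tx_queue_len packets the skb is dropped instead.
 */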
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}
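
/* Dequeue from the head of the lowest-numbered non-empty band, giving
 * band 0 strict priority over bands 1 and 2.
 */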
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}
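
/* Report the band count and priority-to-band map to userspace as a
 * tc_prio_qopt option attribute.
 */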
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};
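
/* Allocate a zeroed Qdisc together with its private area in one block,
 * aligned to QDISC_ALIGNTO bytes; sch->padded records the alignment
 * offset so the original allocation can be freed again.
 */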
struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);	/* err is already negative; ERR_PTR(-err) would
				 * yield a positive value that IS_ERR() misses */
}

struct Qdisc *qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
				unsigned int parentid)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->stats_lock = &dev->queue_lock;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}
EXPORT_SYMBOL(qdisc_reset);

/* This is the RCU callback that frees a qdisc once there are
 * no further references to it.
 */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	list_del(&qdisc->list);
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc->dev);
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}
EXPORT_SYMBOL(qdisc_destroy);

void dev_activate(struct net_device *dev)
{
	/* No queueing discipline is attached to the device;
	   create a default one, i.e. pfifo_fast for devices
	   that need queueing and noqueue_qdisc for
	   virtual interfaces.
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
						  TC_H_ROOT);
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			list_add_tail(&qdisc->list, &dev->qdisc_list);
		} else {
			qdisc = &noqueue_qdisc;
		}
		dev->qdisc_sleeping = qdisc;
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}
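
/* Detach the active qdisc (pointing dev->qdisc at noop_qdisc), flush any
 * pending packets, stop the watchdog, then wait until no CPU is still
 * inside a qdisc-less dev_queue_xmit() or an outstanding qdisc_run().
 */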
void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;
	struct sk_buff *skb;
	int running;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	skb = dev->gso_skb;
	dev->gso_skb = NULL;
	spin_unlock_bh(&dev->queue_lock);

	kfree_skb(skb);

	dev_watchdog_down(dev);

	/* Wait for outstanding qdisc-less dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	do {
		while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
			yield();

		/*
		 * Double-check inside queue lock to ensure that all effects
		 * of the queue run are visible when we return.
		 */
		spin_lock_bh(&dev->queue_lock);
		running = test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
		spin_unlock_bh(&dev->queue_lock);

		/*
		 * The running flag should never be set at this point because
		 * we've already set dev->qdisc to noop_qdisc *inside* the same
		 * pair of spin locks. That is, if any qdisc_run starts after
		 * our initial test it should see the noop_qdisc and then
		 * clear the RUNNING bit before dropping the queue lock. So
		 * if it is set here then we've found a bug.
		 */
	} while (WARN_ON_ONCE(running));
}

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	INIT_LIST_HEAD(&dev->qdisc_list);
	qdisc_unlock_tree(dev);

	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}
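
/* Tear down the egress (and, if configured, ingress) qdiscs and point
 * both dev->qdisc and dev->qdisc_sleeping back at noop_qdisc.
 */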
void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);
	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	if ((qdisc = dev->qdisc_ingress) != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}