home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
ftp.ee.lbl.gov
/
2014.05.ftp.ee.lbl.gov.tar
/
ftp.ee.lbl.gov
/
cbq-1.1.tar.Z
/
cbq-1.1.tar
/
rm_class.c
< prev
next >
Wrap
C/C++ Source or Header
|
1995-11-03
|
19KB
|
581 lines
/*
* Copyright (c) 1995 The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the Network Research
* Group at Lawrence Berkeley National Laboratory.
* 4. Neither the name of the University nor of the Laboratory may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* There are 5 externally visible routines associated with resource
* management:
*
* rmc_init() is called once at system startup by the interface
* 'attach' routine. It initializes all the resource
* management data structures associated with a particular
* network interface.
*
* rmc_queue_packet() is called by the interface output routine
* (ifp->if_output) to queue a packet on some resource
* class and, if the interface isn't busy, start output.
*
* rmc_dequeue_next() is called by the routine that actually starts
* output on an interface (e.g., lestart) to get the
* next packet to be output.
*
* rmc_update_util() is called by the interface transmit done interrupt
* service code to update the class bandwidth utilization
* accounting for the packet that just completed.
*
* rmc_newclass() is called by a setsockopt or protocol specific routine
* to create a new resource class.
*
* The remaining routines in this file are 'internal' routines used
* by the above 5.
*
* Note that this is a research prototype. It was designed to be easy
* to graft onto an existing system running BSD networking code and to
* be easy to debug. This code is not particularly efficient -- a
* `production' version could easily be sped up by a factor of somewhere
* between 2 and 10. But we still have a lot of research to do and our
* plan is to make it right before we make it fast. - vj
*/
#ifndef lint
static char rcsid[] =
"@(#) $Header: rm_class.c,v 1.3 95/08/09 18:58:10 van Locked $ (LBL)";
#endif
/* Plus changes by Sally in the comments for setting maxidle, 10/95. */
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/buf.h>
#include <sys/map.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include "rm_class.h"
/*
* Macros for dealing with time values. We assume all times are
* 'timevals'. `microtime' is used to get the best available clock
* resolution. If `microtime' *doesn't* return a value that's about
* ten times smaller than the average packet time on the fastest
* link that will use these routines, a slightly different clock
* scheme than this one should be used.
* (Bias due to truncation error in this scheme will overestimate utilization
* and discriminate against high bandwidth classes. To remove this bias an
* integrator needs to be added. The simplest integrator uses a history of
* 10 * avg.packet.time / min.tick.time packet completion entries. This is
* straight forward to add but we don't want to pay the extra memory
* traffic to maintain it if it's not necessary (occasionally a vendor
* accidentally builds a workstation with a decent clock - e.g., Sun & HP).)
*/
#define RM_GETTIME(now) microtime(&now)
#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
#define TV_DELTA(a, b, delta) { \
register int xxs; \
\
delta = (a).tv_usec - (b).tv_usec; \
if ((xxs = (a).tv_sec - (b).tv_sec)) { \
switch (xxs) { \
default: \
if (xxs < 0) \
panic("rm_class: bogus time values"); \
delta = 0; \
/* fall through */ \
case 2: \
delta += 1000000; \
/* fall through */ \
case 1: \
delta += 1000000; \
break; \
} \
} \
}
#define TV_ADD_DELTA(a, delta, res) { \
register int xxus = (a).tv_usec + (delta); \
\
res.tv_sec = a.tv_sec; \
while (xxus >= 1000000) { \
++((res).tv_sec); \
xxus -= 1000000; \
} \
(res).tv_usec = xxus; \
}
/*
* Table for mapping a bit mask (with one bit per priority level)
* into the number of the highest priority set bit.
*/
u_char rmc_mask2pri[] = {
0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, /* 00 - 0f */
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 10 - 1f */
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 20 - 2f */
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 30 - 3f */
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 40 - 4f */
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 50 - 5f */
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 60 - 6f */
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 70 - 7f */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 80 - 8f */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 90 - 9f */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* a0 - af */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* b0 - bf */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* c0 - cf */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* d0 - df */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* e0 - ef */
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 /* f0 - ff */
};
/*
* Create a new resource management class at priority 'pri' on the
* interface given by 'ifdat'.
*
* nsecPerByte is the data rate of the interface in nanoseconds/byte.
* E.g., 800 for a 10Mb/s ethernet. If the class gets less
* than 100% of the bandwidth, this number should be the
* 'effective' rate for the class. Let f be the
* bandwidth fraction allocated to this class, and let
* nsPerByte be the data rate of the output link in
* nanoseconds/byte. Then nsecPerByte is set to
* nsPerByte / f. E.g., 1600 (= 800 / .5)
* for a class that gets 50% of an ethernet's bandwidth.
*
* action the routine to call when the class is over limit.
*
* maxq max allowable queue size for class (in packets).
*
* parent parent class pointer.
*
* borrow class to borrow from (should be either 'parent' or null).
*
* maxidle max value allowed for class 'idle' time estimate (this
* parameter determines how large an initial burst of packets
* can be before overlimit action is invoked.
*
* offtime how long 'delay' action will delay when class goes over
* limit (this parameter determines the steady-state burst
* size when a class is running over its limit).
*
* Maxidle and offtime have to be computed from the following: If the
* average packet size is s, the bandwidth fraction allocated to this
* class is f, we want to allow b packet bursts, and the gain of the
* averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
*
* ptime = s * nsPerByte * (1 - f) / f
* maxidle = ptime * (1 - g^b) / g^b
* offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1)
*
* Operationally, it's convenient to specify maxidle & offtime in units
* independent of the link bandwidth so the maxidle & offtime passed to
* this routine are the above values multiplied by 8*f/(1000*nsPerByte).
* (The constant factor is a scale factor needed to make the parameters
* integers. This scaling also means that the 'unscaled' values of
* maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
* not nanoseconds.) Also note that the 'idle' filter computation keeps
* an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
* maxidle also must be scaled upward by this value. Thus, the passed
* values for maxidle and offtime can be computed as follows:
*
* maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
* offtime = offtime * 8 / (1000 * nsecPerByte)
*/
struct rm_class *
rmc_newclass(pri, ifdat, nsecPerByte, action, maxq,
parent, borrow, maxidle, offtime)
register int pri;
register struct rm_ifdat *ifdat;
register u_int nsecPerByte;
void (*action)();
register int maxq;
register struct rm_class *parent;
register struct rm_class *borrow;
register u_int maxidle;
register u_int offtime;
{
register struct rm_class *cl;
register struct rm_class *peer;
register int i, tim;
cl = (struct rm_class *)new_kmem_zalloc(sizeof(*cl), KMEM_SLEEP);
if (cl == NULL) {
panic("no space for resource management data structures.");
}
if (peer = ifdat->classes[pri]) {
/* find the last class at this pri */
cl->peer = peer;
while (peer->peer != ifdat->classes[pri])
peer = peer->peer;
peer->peer = cl;
} else {
ifdat->classes[pri] = cl;
ifdat->classlist[pri] = cl;
cl->peer = cl;
}
cl->parent = parent;
cl->borrow = borrow;
cl->ifdat = ifdat;
cl->priority = pri;
cl->qmax = maxq;
maxidle = (maxidle * nsecPerByte) >> 3;
if (maxidle == 0)
maxidle = 1;
cl->maxidle = maxidle;
cl->avgidle = maxidle;
offtime = (offtime * nsecPerByte) >> 3;
if (offtime == 0)
offtime = 1;
cl->offtime = offtime;
cl->overlimit = action;
tim = 0;
for (i = 0; i < sizeof(cl->len2time)/sizeof(cl->len2time[0]); ++i) {
register int v = tim / 1000;
cl->len2time[i] = v == 0? 1 : v;
tim += nsecPerByte;
}
return (cl);
}
void rmc_root_overlimit();
/*
* Initialize the resource management data structures associated
* with the output portion of interface 'ifp'. 'ifdat' is where
* the structures will be built (for backwards compatibility, the
* structures aren't kept in the ifnet struct). 'nsecPerByte'
* gives the link speed (inverse of bandwidth) in nanoseconds/byte.
* 'restart' is the driver-specific routine that the generic 'delay
* until under limit' action will call to restart output. `maxq'
* is the queue size of the 'link' & 'default' classes. 'maxqueued'
* is the maximum number of packets that the resource management
* code will allow to be queued 'downstream' (this is typically 1
* or 2 -- just enough to keep the interface busy during the packet
* completion interrupt latency).
*/
void
rmc_init(ifp, ifdat, nsecPerByte, restart, maxq, maxqueued)
register struct ifnet *ifp;
register struct rm_ifdat *ifdat;
register u_int nsecPerByte;
void (*restart)();
int maxq, maxqueued;
{
register struct rm_class *cl;
bzero(ifdat, sizeof(*ifdat));
ifdat->ifp = ifp;
ifdat->restart = restart;
ifdat->maxqueued = maxqueued;
/*
* allocate space for the root class with ~96% avail to borrow
* and 128 packet bursts.
*/
cl = rmc_newclass(0, ifdat, (nsecPerByte * 100) / 96,
rmc_root_overlimit, maxq, 0, 0, 879, 61);
/* default - low priority with 95% allocation */
ifdat->defaultclass = rmc_newclass(1, ifdat, (nsecPerByte * 100) / 95,
rmc_delay_action, maxq, cl, cl,
6, 5);
}
/*
* Add packet given by mbuf 'm' to queue for resource class 'cl'.
* This routine is called by a driver's if_output routine.
* If the limit on packets queued to the interface hardware hasn't
* been reached, we call the interface 'restart' routine to try
* to output another packet.
*
* This routine must be called with output packet completion
* interrupts locked out (to avoid racing with rmc_dequeue_next).
*/
void
rmc_queue_packet(cl, m)
register struct rm_class *cl;
register struct mbuf* m;
{
register struct mbuf* m0;
register struct rm_ifdat *ifd = cl->ifdat;
if ((m0 = cl->tail) != NULL)
m->m_act = m0->m_act;
else {
register int cpri = cl->priority;
m0 = m;
if (++ifd->activecnt[cpri] == 1)
ifd->csum |= (1 << cpri);
}
m0->m_act = m;
cl->tail = m;
if (++cl->qcnt >= cl->qmax)
rmc_drop_action(cl);
else if (ifd->queued < ifd->maxqueued)
(ifd->restart)(ifd->ifp);
}
/*
* Return 1 if class 'cl' is under limit or can borrow from a parent,
* 0 if overlimit. As a side-effect, this routine will invoke the
* class overlimit action if the class if overlimit.
*/
int
rmc_under_limit(cl)
register struct rm_class *cl;
{
register struct rm_class *lcl = cl;
struct timeval now;
if (cl->parent == 0)
/* root class is always under limit */
return (1);
RM_GETTIME(now);
if (cl->sleeping) {
if (TV_LT(now, cl->undertime))
return (0);
cl->sleeping = 0;
return (1);
}
while (cl->undertime.tv_sec && TV_LT(now, cl->undertime)) {
++cl->borrows;
if ((cl = cl->borrow) == 0) {
++lcl->overactions;
(lcl->overlimit)(lcl);
return (0);
}
}
return (1);
}
/*
* Dequeue & return next packet from the highest priority class that
* has a packet to send & has enough allocation to send it. This
* routine is called by a driver whenever it needs a new packet to
* output, typically in the XXstart routine (e.g., lestart, bfstart,
* etc.). 0 is returned if there is no packet to send. As a side
* effect of calling this routine, `overlimit' actions are called
* for classes that have packets to send but are over their bandwidth
* limit & can't borrow from their parent.
*/
struct mbuf *
rmc_dequeue_next(ifd)
register struct rm_ifdat *ifd;
{
register struct mbuf *m, *m0;
register int csum, cpri;
register struct rm_class *cl;
csum = ifd->csum;
while (csum) {
register struct rm_class *clh;
cpri = rmc_mask2pri[csum];
clh = cl = ifd->classlist[cpri];
do {
if ((m = cl->tail) &&
(cl->undertime.tv_sec == 0 || rmc_under_limit(cl)))
goto out;
cl = cl->peer;
} while (cl != clh);
csum &=~ (1 << cpri);
}
return (0);
out:
ifd->classlist[cpri] = cl->peer;
ifd->class = cl;
if ((m0 = m->m_act) != m)
m->m_act = m0->m_act;
else {
cl->tail = 0;
if (--ifd->activecnt[cpri] <= 0)
ifd->csum &=~ (1 << cpri);
}
--cl->qcnt;
++ifd->queued;
return (m0);
}
/*
* Update the utilization estimate for the packet that just completed.
* The packet's class & the parent(s) of that class all get their
* estimators updated. This routine is called by the driver's output-
* packet-completion interrupt service routine.
*/
void
rmc_update_util(ifd)
register struct rm_ifdat *ifd;
{
register struct rm_class *cl = ifd->class;
register int pktlen = ifd->curlen;
register int idle, avgidle;
struct timeval now;
--ifd->queued;
++cl->npackets;
cl->nbytes += pktlen;
RM_GETTIME(now);
do {
TV_DELTA(now, cl->last, idle);
idle -= cl->len2time[pktlen];
avgidle = cl->avgidle;
avgidle += idle - (avgidle >> RM_FILTER_GAIN);
if (avgidle <= 0) {
cl->avgidle = 0;
cl->undertime = now;
TV_ADD_DELTA(now, cl->offtime, cl->undertime);
++cl->over;
} else {
cl->avgidle = (avgidle > cl->maxidle)? cl->maxidle :
avgidle;
cl->undertime.tv_sec = 0;
}
cl->last = now;
} while (cl = cl->parent);
}
/*
* Generic (not protocol-specific) over-limit action routines. These
* get invoked by rmc_under_limit() if a class with packets to send
* is over its bandwidth limit & can't borrow from a parent class.
*/
void
rmc_drop_action(cl)
register struct rm_class *cl;
{
register struct mbuf *m, *m0;
if ((m = cl->tail) == NULL)
panic("rmc_drop_action: empty queue");
if ((m0 = m->m_act) != m)
m->m_act = m0->m_act;
else {
register struct rm_ifdat *ifd = cl->ifdat;
register int cpri = cl->priority;
cl->tail = 0;
if (--ifd->activecnt[cpri] <= 0)
ifd->csum &=~ (1 << cpri);
}
--cl->qcnt;
++cl->drops;
m_freem(m0);
}
/*
* rmc_delay_action() implements the 'delay' (i.e., rate-limiting) action
* for a class that goes over it's bandwidth limit. It simply schedules
* a timeout to restart sending for the class at the future time when
* the class would go under its limit.
*/
void
rmc_delay_action(cl)
register struct rm_class *cl;
{
register int t = hzto(cl->undertime);
/*
* since packets are phased randomly with respect to the
* clock, 1 tick (the next clock tick) can be an arbitrarily
* short time so we have to wait for at least two ticks.
* Note that we always start a timer even though in the usual
* case packet traffic will cause this class be be rescanned
* and restarted rather than the relatively coarse system
* timer: If there's no other traffic, we need the timer as
* a 'backstop' to restart this class.
*/
if (t <= 1)
t = 2;
cl->sleeping = 1;
timeout(rmc_restart, (caddr_t)cl, t);
}
/*
* rmc_restart() is just a helper routine for rmc_delay_action -- it is
* called by the system timer code & is responsible checking if the
* class is still sleeping (it might have been restarted as a side
* effect of the queue scan on a packet arrival) and, if so, restarting
* output for the class. Inspecting the class state & restarting output
* require locking the class structure. In general the driver is
* responsible for locking but this is the only routine that is not
* called directly or indirectly from the interface driver so it has
* know about system locking conventions. Under bsd, locking is done
* by raising IPL to splimp so that's what's implemented here. On a
* different system this would probably need to be changed.
void
rmc_restart(cl)
register struct rm_class *cl;
{
register int s = splimp();
if (cl->sleeping) {
register struct rm_ifdat *ifd = cl->ifdat;
cl->sleeping = 0;
if (ifd->queued < ifd->maxqueued)
(ifd->restart)(ifd->ifp);
}
splx(s);
}
/*
* We should never get here (it should not be possible for the root
* class to go overlimit) so panic & get a crash dump if we do.
*/
void
rmc_root_overlimit(cl)
register struct rm_class *cl;
{
panic("rmc_root_overlimit");
}