ftp.ee.lbl.gov

home *** CD-ROM | disk | FTP | other *** search

/ ftp.ee.lbl.gov / 2014.05.ftp.ee.lbl.gov.tar / ftp.ee.lbl.gov / cbq-1.1.tar.Z / cbq-1.1.tar / rm_class.c < prev next >

Wrap

C/C++ Source or Header | 1995-11-03 | 19KB | 581 lines

/* * Copyright (c) 1995 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the Network Research * Group at Lawrence Berkeley National Laboratory. * 4. Neither the name of the University nor of the Laboratory may be used * to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * There are 5 externally visible routines associated with resource * management: * * rmc_init() is called once at system startup by the interface * 'attach' routine. It initializes all the resource * management data structures associated with a particular * network interface. * * rmc_queue_packet() is called by the interface output routine * (ifp->if_output) to queue a packet on some resource * class and, if the interface isn't busy, start output. * * rmc_dequeue_next() is called by the routine that actually starts * output on an interface (e.g., lestart) to get the * next packet to be output. * * rmc_update_util() is called by the interface transmit done interrupt * service code to update the class bandwidth utilization * accounting for the packet that just completed. * * rmc_newclass() is called by a setsockopt or protocol specific routine * to create a new resource class. * * The remaining routines in this file are 'internal' routines used * by the above 5. * * Note that this is a research prototype. It was designed to be easy * to graft onto an existing system running BSD networking code and to * be easy to debug. This code is not particularly efficient -- a * `production' version could easily be sped up by a factor of somewhere * between 2 and 10. But we still have a lot of research to do and our * plan is to make it right before we make it fast. - vj */ #ifndef lint static char rcsid[] = "@(#) $Header: rm_class.c,v 1.3 95/08/09 18:58:10 van Locked $ (LBL)"; #endif /* Plus changes by Sally in the comments for setting maxidle, 10/95. */ #include <sys/param.h> #include <sys/mbuf.h> #include <sys/buf.h> #include <sys/map.h> #include <sys/socket.h> #include <sys/systm.h> #include <sys/errno.h> #include <sys/debug.h> #include <sys/time.h> #include <sys/kernel.h> #include <net/if.h> #include <net/if_arp.h> #include <net/netisr.h> #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include "rm_class.h" /* * Macros for dealing with time values. We assume all times are * 'timevals'. `microtime' is used to get the best available clock * resolution. If `microtime' *doesn't* return a value that's about * ten times smaller than the average packet time on the fastest * link that will use these routines, a slightly different clock * scheme than this one should be used. * (Bias due to truncation error in this scheme will overestimate utilization * and discriminate against high bandwidth classes. To remove this bias an * integrator needs to be added. The simplest integrator uses a history of * 10 * avg.packet.time / min.tick.time packet completion entries. This is * straight forward to add but we don't want to pay the extra memory * traffic to maintain it if it's not necessary (occasionally a vendor * accidentally builds a workstation with a decent clock - e.g., Sun & HP).) */ #define RM_GETTIME(now) microtime(&now) #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) #define TV_DELTA(a, b, delta) { \ register int xxs; \ \ delta = (a).tv_usec - (b).tv_usec; \ if ((xxs = (a).tv_sec - (b).tv_sec)) { \ switch (xxs) { \ default: \ if (xxs < 0) \ panic("rm_class: bogus time values"); \ delta = 0; \ /* fall through */ \ case 2: \ delta += 1000000; \ /* fall through */ \ case 1: \ delta += 1000000; \ break; \ } \ } \ } #define TV_ADD_DELTA(a, delta, res) { \ register int xxus = (a).tv_usec + (delta); \ \ res.tv_sec = a.tv_sec; \ while (xxus >= 1000000) { \ ++((res).tv_sec); \ xxus -= 1000000; \ } \ (res).tv_usec = xxus; \ } /* * Table for mapping a bit mask (with one bit per priority level) * into the number of the highest priority set bit. */ u_char rmc_mask2pri[] = { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, /* 00 - 0f */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 10 - 1f */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 20 - 2f */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 30 - 3f */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 40 - 4f */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 50 - 5f */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 60 - 6f */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, /* 70 - 7f */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 80 - 8f */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 90 - 9f */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* a0 - af */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* b0 - bf */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* c0 - cf */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* d0 - df */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* e0 - ef */ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 /* f0 - ff */ }; /* * Create a new resource management class at priority 'pri' on the * interface given by 'ifdat'. * * nsecPerByte is the data rate of the interface in nanoseconds/byte. * E.g., 800 for a 10Mb/s ethernet. If the class gets less * than 100% of the bandwidth, this number should be the * 'effective' rate for the class. Let f be the * bandwidth fraction allocated to this class, and let * nsPerByte be the data rate of the output link in * nanoseconds/byte. Then nsecPerByte is set to * nsPerByte / f. E.g., 1600 (= 800 / .5) * for a class that gets 50% of an ethernet's bandwidth. * * action the routine to call when the class is over limit. * * maxq max allowable queue size for class (in packets). * * parent parent class pointer. * * borrow class to borrow from (should be either 'parent' or null). * * maxidle max value allowed for class 'idle' time estimate (this * parameter determines how large an initial burst of packets * can be before overlimit action is invoked. * * offtime how long 'delay' action will delay when class goes over * limit (this parameter determines the steady-state burst * size when a class is running over its limit). * * Maxidle and offtime have to be computed from the following: If the * average packet size is s, the bandwidth fraction allocated to this * class is f, we want to allow b packet bursts, and the gain of the * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then: * * ptime = s * nsPerByte * (1 - f) / f * maxidle = ptime * (1 - g^b) / g^b * offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1) * * Operationally, it's convenient to specify maxidle & offtime in units * independent of the link bandwidth so the maxidle & offtime passed to * this routine are the above values multiplied by 8*f/(1000*nsPerByte). * (The constant factor is a scale factor needed to make the parameters * integers. This scaling also means that the 'unscaled' values of * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds, * not nanoseconds.) Also note that the 'idle' filter computation keeps * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of * maxidle also must be scaled upward by this value. Thus, the passed * values for maxidle and offtime can be computed as follows: * * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte) * offtime = offtime * 8 / (1000 * nsecPerByte) */ struct rm_class * rmc_newclass(pri, ifdat, nsecPerByte, action, maxq, parent, borrow, maxidle, offtime) register int pri; register struct rm_ifdat *ifdat; register u_int nsecPerByte; void (*action)(); register int maxq; register struct rm_class *parent; register struct rm_class *borrow; register u_int maxidle; register u_int offtime; { register struct rm_class *cl; register struct rm_class *peer; register int i, tim; cl = (struct rm_class *)new_kmem_zalloc(sizeof(*cl), KMEM_SLEEP); if (cl == NULL) { panic("no space for resource management data structures."); } if (peer = ifdat->classes[pri]) { /* find the last class at this pri */ cl->peer = peer; while (peer->peer != ifdat->classes[pri]) peer = peer->peer; peer->peer = cl; } else { ifdat->classes[pri] = cl; ifdat->classlist[pri] = cl; cl->peer = cl; } cl->parent = parent; cl->borrow = borrow; cl->ifdat = ifdat; cl->priority = pri; cl->qmax = maxq; maxidle = (maxidle * nsecPerByte) >> 3; if (maxidle == 0) maxidle = 1; cl->maxidle = maxidle; cl->avgidle = maxidle; offtime = (offtime * nsecPerByte) >> 3; if (offtime == 0) offtime = 1; cl->offtime = offtime; cl->overlimit = action; tim = 0; for (i = 0; i < sizeof(cl->len2time)/sizeof(cl->len2time[0]); ++i) { register int v = tim / 1000; cl->len2time[i] = v == 0? 1 : v; tim += nsecPerByte; } return (cl); } void rmc_root_overlimit(); /* * Initialize the resource management data structures associated * with the output portion of interface 'ifp'. 'ifdat' is where * the structures will be built (for backwards compatibility, the * structures aren't kept in the ifnet struct). 'nsecPerByte' * gives the link speed (inverse of bandwidth) in nanoseconds/byte. * 'restart' is the driver-specific routine that the generic 'delay * until under limit' action will call to restart output. `maxq' * is the queue size of the 'link' & 'default' classes. 'maxqueued' * is the maximum number of packets that the resource management * code will allow to be queued 'downstream' (this is typically 1 * or 2 -- just enough to keep the interface busy during the packet * completion interrupt latency). */ void rmc_init(ifp, ifdat, nsecPerByte, restart, maxq, maxqueued) register struct ifnet *ifp; register struct rm_ifdat *ifdat; register u_int nsecPerByte; void (*restart)(); int maxq, maxqueued; { register struct rm_class *cl; bzero(ifdat, sizeof(*ifdat)); ifdat->ifp = ifp; ifdat->restart = restart; ifdat->maxqueued = maxqueued; /* * allocate space for the root class with ~96% avail to borrow * and 128 packet bursts. */ cl = rmc_newclass(0, ifdat, (nsecPerByte * 100) / 96, rmc_root_overlimit, maxq, 0, 0, 879, 61); /* default - low priority with 95% allocation */ ifdat->defaultclass = rmc_newclass(1, ifdat, (nsecPerByte * 100) / 95, rmc_delay_action, maxq, cl, cl, 6, 5); } /* * Add packet given by mbuf 'm' to queue for resource class 'cl'. * This routine is called by a driver's if_output routine. * If the limit on packets queued to the interface hardware hasn't * been reached, we call the interface 'restart' routine to try * to output another packet. * * This routine must be called with output packet completion * interrupts locked out (to avoid racing with rmc_dequeue_next). */ void rmc_queue_packet(cl, m) register struct rm_class *cl; register struct mbuf* m; { register struct mbuf* m0; register struct rm_ifdat *ifd = cl->ifdat; if ((m0 = cl->tail) != NULL) m->m_act = m0->m_act; else { register int cpri = cl->priority; m0 = m; if (++ifd->activecnt[cpri] == 1) ifd->csum |= (1 << cpri); } m0->m_act = m; cl->tail = m; if (++cl->qcnt >= cl->qmax) rmc_drop_action(cl); else if (ifd->queued < ifd->maxqueued) (ifd->restart)(ifd->ifp); } /* * Return 1 if class 'cl' is under limit or can borrow from a parent, * 0 if overlimit. As a side-effect, this routine will invoke the * class overlimit action if the class if overlimit. */ int rmc_under_limit(cl) register struct rm_class *cl; { register struct rm_class *lcl = cl; struct timeval now; if (cl->parent == 0) /* root class is always under limit */ return (1); RM_GETTIME(now); if (cl->sleeping) { if (TV_LT(now, cl->undertime)) return (0); cl->sleeping = 0; return (1); } while (cl->undertime.tv_sec && TV_LT(now, cl->undertime)) { ++cl->borrows; if ((cl = cl->borrow) == 0) { ++lcl->overactions; (lcl->overlimit)(lcl); return (0); } } return (1); } /* * Dequeue & return next packet from the highest priority class that * has a packet to send & has enough allocation to send it. This * routine is called by a driver whenever it needs a new packet to * output, typically in the XXstart routine (e.g., lestart, bfstart, * etc.). 0 is returned if there is no packet to send. As a side * effect of calling this routine, `overlimit' actions are called * for classes that have packets to send but are over their bandwidth * limit & can't borrow from their parent. */ struct mbuf * rmc_dequeue_next(ifd) register struct rm_ifdat *ifd; { register struct mbuf *m, *m0; register int csum, cpri; register struct rm_class *cl; csum = ifd->csum; while (csum) { register struct rm_class *clh; cpri = rmc_mask2pri[csum]; clh = cl = ifd->classlist[cpri]; do { if ((m = cl->tail) && (cl->undertime.tv_sec == 0 || rmc_under_limit(cl))) goto out; cl = cl->peer; } while (cl != clh); csum &=~ (1 << cpri); } return (0); out: ifd->classlist[cpri] = cl->peer; ifd->class = cl; if ((m0 = m->m_act) != m) m->m_act = m0->m_act; else { cl->tail = 0; if (--ifd->activecnt[cpri] <= 0) ifd->csum &=~ (1 << cpri); } --cl->qcnt; ++ifd->queued; return (m0); } /* * Update the utilization estimate for the packet that just completed. * The packet's class & the parent(s) of that class all get their * estimators updated. This routine is called by the driver's output- * packet-completion interrupt service routine. */ void rmc_update_util(ifd) register struct rm_ifdat *ifd; { register struct rm_class *cl = ifd->class; register int pktlen = ifd->curlen; register int idle, avgidle; struct timeval now; --ifd->queued; ++cl->npackets; cl->nbytes += pktlen; RM_GETTIME(now); do { TV_DELTA(now, cl->last, idle); idle -= cl->len2time[pktlen]; avgidle = cl->avgidle; avgidle += idle - (avgidle >> RM_FILTER_GAIN); if (avgidle <= 0) { cl->avgidle = 0; cl->undertime = now; TV_ADD_DELTA(now, cl->offtime, cl->undertime); ++cl->over; } else { cl->avgidle = (avgidle > cl->maxidle)? cl->maxidle : avgidle; cl->undertime.tv_sec = 0; } cl->last = now; } while (cl = cl->parent); } /* * Generic (not protocol-specific) over-limit action routines. These * get invoked by rmc_under_limit() if a class with packets to send * is over its bandwidth limit & can't borrow from a parent class. */ void rmc_drop_action(cl) register struct rm_class *cl; { register struct mbuf *m, *m0; if ((m = cl->tail) == NULL) panic("rmc_drop_action: empty queue"); if ((m0 = m->m_act) != m) m->m_act = m0->m_act; else { register struct rm_ifdat *ifd = cl->ifdat; register int cpri = cl->priority; cl->tail = 0; if (--ifd->activecnt[cpri] <= 0) ifd->csum &=~ (1 << cpri); } --cl->qcnt; ++cl->drops; m_freem(m0); } /* * rmc_delay_action() implements the 'delay' (i.e., rate-limiting) action * for a class that goes over it's bandwidth limit. It simply schedules * a timeout to restart sending for the class at the future time when * the class would go under its limit. */ void rmc_delay_action(cl) register struct rm_class *cl; { register int t = hzto(cl->undertime); /* * since packets are phased randomly with respect to the * clock, 1 tick (the next clock tick) can be an arbitrarily * short time so we have to wait for at least two ticks. * Note that we always start a timer even though in the usual * case packet traffic will cause this class be be rescanned * and restarted rather than the relatively coarse system * timer: If there's no other traffic, we need the timer as * a 'backstop' to restart this class. */ if (t <= 1) t = 2; cl->sleeping = 1; timeout(rmc_restart, (caddr_t)cl, t); } /* * rmc_restart() is just a helper routine for rmc_delay_action -- it is * called by the system timer code & is responsible checking if the * class is still sleeping (it might have been restarted as a side * effect of the queue scan on a packet arrival) and, if so, restarting * output for the class. Inspecting the class state & restarting output * require locking the class structure. In general the driver is * responsible for locking but this is the only routine that is not * called directly or indirectly from the interface driver so it has * know about system locking conventions. Under bsd, locking is done * by raising IPL to splimp so that's what's implemented here. On a * different system this would probably need to be changed. void rmc_restart(cl) register struct rm_class *cl; { register int s = splimp(); if (cl->sleeping) { register struct rm_ifdat *ifd = cl->ifdat; cl->sleeping = 0; if (ifd->queued < ifd->maxqueued) (ifd->restart)(ifd->ifp); } splx(s); } /* * We should never get here (it should not be possible for the root * class to go overlimit) so panic & get a crash dump if we do. */ void rmc_root_overlimit(cl) register struct rm_class *cl; { panic("rmc_root_overlimit"); }