OS/2 Shareware BBS: 10 Tools

home *** CD-ROM | disk | FTP | other *** search

/ OS/2 Shareware BBS: 10 Tools / 10-Tools.zip / crypl200.zip / BNLIB / LBN68000.C < prev next >

Wrap

C/C++ Source or Header | 1996-05-16 | 15KB | 461 lines

/* * lbn68000.c - 16-bit bignum primitives for the 68000 (or 68010) processors. * * Copyright (c) 1995 Colin Plumb. All rights reserved. * For licensing and other legal details, see the file legal.c. * * This was written for Metrowerks C, and while it should be reasonably * portable, NOTE that Metrowerks lets a callee trash a0, a1, d0, d1, and d2. * Some 680x0 compilers make d2 callee-save, so instructions to save it * will have to be added. * * This code supports 16 or 32-bit ints, based on UINT_MAX. * Regardless of UINT_MAX, only bignums up to 64K words (1 million bits) * are supported. (68k hackers will recognize this as a consequence of * using dbra.) * * These primitives use little-endian word order. * (The order of bytes within words is irrelevant to this issue.) */ #include <limits.h> #include "lbn.h" /* Should include lbn68000.h */ /* * The Metrowerks C compiler (1.2.2) produces bad 68k code for the * following input, which happens to be the inner loop of lbnSub1, * so a few less than critical routines have been recoded in assembly * to avoid the bug. (Optimizer on or off does not matter.) * * unsigned * decrement(unsigned *num, unsigned len) * { * do { * if ((*num++)-- != 0) * return 0; * } while (--len); * return 1; * } */ asm BNWORD16 lbnSub1_16(BNWORD16 *num, unsigned len, BNWORD16 borrow) { movea.l 4(sp),a0 /* num */ #if UINT_MAX == 0xffff move.w 10(sp),d0 /* borrow */ #else move.w 12(sp),d0 /* borrow */ #endif sub.w d0,(a0)+ bcc done #if UINT_MAX == 0xffff move.w 8(sp),d0 /* len */ #else move.w 10(sp),d0 /* len */ #endif subq.w #2,d0 bcs done loop: subq.w #1,(a0)+ dbcc d0,loop done: moveq.l #0,d0 addx.w d0,d0 rts } asm BNWORD16 lbnAdd1_16(BNWORD16 *num, unsigned len, BNWORD16 carry) { movea.l 4(sp),a0 /* num */ #if UINT_MAX == 0xffff move.w 10(sp),d0 /* carry */ #else move.w 12(sp),d0 /* carry */ #endif add.w d0,(a0)+ bcc done #if UINT_MAX == 0xffff move.w 8(sp),d0 /* len */ #else move.w 10(sp),d0 /* len */ #endif subq.w #2,d0 bcs done loop: addq.w #1,(a0)+ dbcc d0,loop done: moveq.l #0,d0 addx.w d0,d0 rts } asm void lbnMulN1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k) { move.w d3,-(sp) /* 2 bytes of stack frame */ move.l 2+4(sp),a1 /* out */ move.l 2+8(sp),a0 /* in */ #if UINT_MAX == 0xffff move.w 2+12(sp),d3 /* len */ move.w 2+14(sp),d2 /* k */ #else move.w 2+14(sp),d3 /* len (low 16 bits) */ move.w 2+16(sp),d2 /* k */ #endif move.w (a0)+,d1 /* First multiply */ mulu.w d2,d1 move.w d1,(a1)+ clr.w d1 swap d1 subq.w #1,d3 /* Setup for loop unrolling */ lsr.w #1,d3 bcs.s m16_even beq.s m16_short subq.w #1,d3 /* Set up software pipeline properly */ move.l d1,d0 m16_loop: move.w (a0)+,d1 mulu.w d2,d1 add.l d0,d1 move.w d1,(a1)+ clr.w d1 swap d1 m16_even: move.w (a0)+,d0 mulu.w d2,d0 add.l d1,d0 move.w d0,(a1)+ clr.w d0 swap d0 dbra d3,m16_loop move.w d0,(a1) move.w (sp)+,d3 rts m16_short: move.w d1,(a1) move.w (sp)+,d3 rts } asm BNWORD16 lbnMulAdd1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k) { move.w d4,-(sp) clr.w d4 move.w d3,-(sp) /* 4 bytes of stack frame */ move.l 4+4(sp),a1 /* out */ move.l 4+8(sp),a0 /* in */ #if UINT_MAX == 0xffff move.w 4+12(sp),d3 /* len */ move.w 4+14(sp),d2 /* k */ #else move.w 4+14(sp),d3 /* len (low 16 bits) */ move.w 4+16(sp),d2 /* k */ #endif move.w (a0)+,d1 /* First multiply */ mulu.w d2,d1 add.w d1,(a1)+ clr.w d1 swap d1 addx.w d4,d1 subq.w #1,d3 /* Setup for loop unrolling */ lsr.w #1,d3 bcs.s ma16_even beq.s ma16_short subq.w #1,d3 /* Set up software pipeline properly */ move.l d1,d0 ma16_loop: move.w (a0)+,d1 mulu.w d2,d1 add.l d0,d1 add.w d1,(a1)+ clr.w d1 swap d1 addx.w d4,d1 ma16_even: move.w (a0)+,d0 mulu.w d2,d0 add.l d1,d0 add.w d0,(a1)+ clr.w d0 swap d0 addx.w d4,d0 dbra d3,ma16_loop move.w (sp)+,d3 move.w (sp)+,d4 rts ma16_short: move.w (sp)+,d3 move.l d1,d0 move.w (sp)+,d4 rts } asm BNWORD16 lbnMulSub1_16(BNWORD16 *out, BNWORD16 const *in, unsigned len, BNWORD16 k) { move.w d4,-(sp) clr.w d4 move.w d3,-(sp) /* 4 bytes of stack frame */ move.l 4+4(sp),a1 /* out */ move.l 4+8(sp),a0 /* in */ #if UINT_MAX == 0xffff move.w 4+12(sp),d3 /* len */ move.w 4+14(sp),d2 /* k */ #else move.w 4+14(sp),d3 /* len (low 16 bits) */ move.w 4+16(sp),d2 /* k */ #endif move.w (a0)+,d1 /* First multiply */ mulu.w d2,d1 sub.w d1,(a1)+ clr.w d1 swap d1 addx.w d4,d1 subq.w #1,d3 /* Setup for loop unrolling */ lsr.w #1,d3 bcs.s ms16_even beq.s ms16_short subq.w #1,d3 /* Set up software pipeline properly */ move.l d1,d0 ms16_loop: move.w (a0)+,d1 mulu.w d2,d1 add.l d0,d1 sub.w d1,(a1)+ clr.w d1 swap d1 addx.w d4,d1 ms16_even: move.w (a0)+,d0 mulu.w d2,d0 add.l d1,d0 sub.w d0,(a1)+ clr.w d0 swap d0 addx.w d4,d0 dbra d3,ms16_loop move.w (sp)+,d3 move.w (sp)+,d4 rts ms16_short: move.w (sp)+,d3 move.l d1,d0 move.w (sp)+,d4 rts } /* The generic long/short divide doesn't know that nh < d */ asm BNWORD16 lbnDiv21_16(BNWORD16 *q, BNWORD16 nh, BNWORD16 nl, BNWORD16 d) { move.l 8(sp),d0 /* nh *and* nl */ divu.w 12(sp),d0 move.l 4(sp),a0 move.w d0,(a0) clr.w d0 swap d0 rts } asm unsigned lbnModQ_16(BNWORD16 const *n, unsigned len, BNWORD16 d) { move.l 4(sp),a0 /* n */ moveq.l #0,d1 #if UINT_MAX == 0xffff move.w 8(sp),d1 /* len */ move.w 10(sp),d2 /* d */ #else move.w 10(sp),d1 /* len (low 16 bits) */ move.w 12(sp),d2 /* d */ #endif add.l d1,a0 add.l d1,a0 /* n += len */ moveq.l #0,d0 subq.w #1,d1 mq16_loop: move.w -(a0),d0 /* Assemble remainder and new word */ divu.w d2,d0 /* Put remainder in high half of d0 */ dbra d1,mq16_loop mq16_done: clr.w d0 swap d0 rts } /* * Detect if this is a 32-bit processor (68020+ *or* CPU32). * Both the 68020+ and CPU32 processors (which have 32x32->64-bit * multiply, what the 32-bit math library wants) support scaled indexed * addressing. The 68000 and 68010 ignore the scale selection * bits, treating it as *1 all the time. So a 32-bit processor * will evaluate -2(a0,a0.w*2) as 1+1*2-2 = 1. * A 16-bit processor will compute 1+1-2 = 0. * * Thus, the return value will indicate whether the chip this is * running on supports 32x32->64-bit multiply (mulu.l). */ asm int is68020(void) { machine 68020 lea 1,a0 #if 0 lea -2(a0,a0.w*2),a0 /* Metrowerks won't assemble this, arrgh */ #else dc.w 0x41f0, 0x82fe #endif move.l a0,d0 rts } /* * Since I had to hand-assemble that fancy addressing mode, I had to study * up on 680x0 addressing modes. * A summary of 680x0 addressing modes. * A 68000 effective address specifies an operand on an instruction, which * may be a register or in memory. It is made up of a 3-bit mode and a * 3-bit register specifier. The meanings of the various modes are: * * 000 reg - Dn, n specified by "reg" * 001 reg - An, n specified by "reg" * 010 reg - (An) * 011 reg - (An)+ * 100 reg - -(An) * 101 reg - d16(An), one 16-bit displacement word follows, sign-extended * 110 reg - Fancy addressing mode off of An, see extension word below * 111 000 - abs.W, one 16-bit signed absolute address follows * 111 001 - abs.L, one 32-bit absolute address follows * 111 010 - d16(PC), one 16-bit displacemnt word follows, sign-extended * 111 011 - Fancy addressing mode off of PC, see extension word below * 111 100 - #immediate, followed by 16 or 32 bits of immediate value * 111 101 - unused, reserved * 111 110 - unused, reserved * 111 111 - unused, reserved * * Memory references are to data space, except that PC-relative references * are to program space, and are read-only. * * Fancy addressing modes are followed by a 16-bit extension word, and come * in "brief" and "full" forms. * The "brief" form looks like this. Bit 8 is 0 to indicate this form: * * 1 1 1 1 1 1 1 * 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ * |A/D| register |L/W| scale | 0 | 8-bit signed displacement | * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ * * The basic effective address specifies a 32-bit base register - A0 through * A7 or PC (the address of the following instruction). * The A/D and register fields specify an index register. A/D is 1 for * address registers, and 0 for data registers. L/W specifies the length * of the index register, 1 for 32 bits, and 0 for 16 bits (sign-extended). * The scale field is a left shift amount (0 to 3 bits) to apply to the * sign-extended index register. The final address is d8(An,Rn.X*SCALE), * also written (d8,An,Rn.X*SCALE). X is "W" or "L", SCALE is 1, 2, 4 or 8. * "*1" may be omitted, as may a d8 of 0. * * The 68000 supports this form, but only with a scale field of 0. * It does NOT (says the MC68030 User's Manual MC68030UM/AD, section 2.7) * decode the scale field and the following format bit. They are treated * as 0. * I recall (I don't have the data book handy) that the CPU32 processor * core used in the 683xx series processors supports variable scales, * but only the brief extension word form. I suspect it decodes the * format bit and traps if it is not zero, but I don't recall. * * The "full" form (680x0, x >= 2 processors only) looks like this: * * 1 1 1 1 1 1 1 * 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ * |A/D| register |L/W| scale | 1 | BS| IS|BD size| 0 | P |OD size| * +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ * * The first 8 bits are interpreted the same way as in the brief form, * except that bit 8 is set to 1 to indicate the full form. * BS, Base Suppress, if set, causes a value of 0 to be used in place of * the base register value. If this is set, the base register * specified is irrelevant, except that if it is the PC, the fetch is * still done from program space. The specifier "ZPC" can be used in * place of "PC" in the effective address mnemonic to represent this * case. * IS, Index Suppress, if set, causes a value of 0 to be used in place * of the scaled index register. In this case, the first 7 bits of the * extension word are irrelevant. * BD size specifies the base displacement size. A value of 00 * in this field is illegal, while 01, 10 and 11 indicate that the * extension word is followed by 0, 1 or 2 16-bit words of base displacement * (zero, sign-extended to 32 bits, and most-significant word first, * respectively) to add to the base register value. * Bit 3 is unused. * The P bit is the pre/post indexing bit, and only applies if an outer * displacement is used. This is explained later. * OD size specifies the size of an outer displacement. In the simple * case, this field is set to 00 and the effective address is * (disp,An,Rn.X*SCALE) or (disp,PC,Rn.X*SCALE). * In this case the P bit must be 0. Any of those compnents may be * suppressed, with a BD size of 01, the BS bit, or the IS bit. * If the OD size is not 00, it encodes an outer displacement in the same * manner as the BD size, and 0, 1 or 2 16-bit words of outer displacement * follow the base displacement in the instruction stream. In this case, * this is a double-indirect addressing mode. The base, base displacement, * and possibly the index, specify a 32-bit memory word which holds a value * which is fetched, and the outer displacement and possibly the index are * added to produce the address of the operand. * If the P bit is 0, this is pre-indexed, and the index value is added * before the fetch of the indirect word, producing an effective address * of ([disp,An,Rn.X*SCALE],disp). If the P bit is 1, the post-indexed case, * the memory word is fectched from base+base displacement, then the index * and outer displacement are added to compute the address of the operand. * This effective address is written ([disp,An],Rn.X*SCALE,disp). * (In both cases, "An" may also be "PC" or "ZPC".) * Any of the components may be omitted. If the index is omitted (using the * IS bit), the P bit is irrelevant, but must be written as 0. * Thus, legal combinations of IS, P and OD size are: * 0 0 00 - (disp,An,Rn.X*SCALE), also written disp(An,Rn.X*SCALE) * 0 0 01 - ([disp,An,Rn.X*SCALE]) * 0 0 10 - ([disp,An,Rn.X*SCALE],d16) * 0 0 11 - ([disp,An,Rn.X*SCALE],d32) * 0 1 01 - ([disp,An],Rn.X*SCALE) * 0 1 10 - ([disp,An],Rn.X*SCALE,d16) * 0 1 11 - ([disp,An],Rn.X*SCALE,d32) * 1 0 00 - (disp,An), also written disp(An) * 1 0 01 - ([disp,An]) * 1 0 10 - ([disp,An],d16) * 1 0 11 - ([disp,An],d32) */ /* 45678901234567890123456789012345678901234567890123456789012345678901234567 */