From 9f1c5491d27465adf2c26fb94e309355da7cbc95 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 14 Nov 2011 21:05:42 +0000 Subject: [PATCH] BN update from HEAD. --- crypto/bn/bn_div.c | 272 +++++----------------------------- crypto/bn/bn_exp.c | 240 +++++++++++++++++++++--------- crypto/bn/bn_gf2m.c | 106 +++++++++++--- crypto/bn/bn_lcl.h | 19 ++- crypto/bn/bn_mont.c | 116 ++++----------- crypto/bn/bn_nist.c | 338 ++++++++++++++++++++++++++++++++++++++----- crypto/bn/bn_shift.c | 27 ++-- 7 files changed, 654 insertions(+), 464 deletions(-) diff --git a/crypto/bn/bn_div.c b/crypto/bn/bn_div.c index 802a43d642..52b3304293 100644 --- a/crypto/bn/bn_div.c +++ b/crypto/bn/bn_div.c @@ -169,15 +169,13 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, #endif /* OPENSSL_NO_ASM */ -/* BN_div[_no_branch] computes dv := num / divisor, rounding towards +/* BN_div computes dv := num / divisor, rounding towards * zero, and sets up rm such that dv*divisor + rm = num holds. * Thus: * dv->neg == num->neg ^ divisor->neg (unless the result is zero) * rm->neg == num->neg (unless the remainder is zero) * If 'dv' or 'rm' is NULL, the respective value is not returned. 
*/ -static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, - const BIGNUM *divisor, BN_CTX *ctx); int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_CTX *ctx) { @@ -186,6 +184,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_ULONG *resp,*wnump; BN_ULONG d0,d1; int num_n,div_n; + int no_branch=0; /* Invalid zero-padding would have particularly bad consequences * in the case of 'num', so don't just rely on bn_check_top() for this one @@ -200,7 +199,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0)) { - return BN_div_no_branch(dv, rm, num, divisor, ctx); + no_branch=1; } bn_check_top(dv); @@ -214,7 +213,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, return(0); } - if (BN_ucmp(num,divisor) < 0) + if (!no_branch && BN_ucmp(num,divisor) < 0) { if (rm != NULL) { if (BN_copy(rm,num) == NULL) return(0); } @@ -239,242 +238,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, norm_shift+=BN_BITS2; if (!(BN_lshift(snum,num,norm_shift))) goto err; snum->neg=0; - div_n=sdiv->top; - num_n=snum->top; - loop=num_n-div_n; - /* Lets setup a 'window' into snum - * This is the part that corresponds to the current - * 'area' being divided */ - wnum.neg = 0; - wnum.d = &(snum->d[loop]); - wnum.top = div_n; - /* only needed when BN_ucmp messes up the values between top and max */ - wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */ - - /* Get the top 2 words of sdiv */ - /* div_n=sdiv->top; */ - d0=sdiv->d[div_n-1]; - d1=(div_n == 1)?0:sdiv->d[div_n-2]; - - /* pointer to the 'top' of snum */ - wnump= &(snum->d[num_n-1]); - - /* Setup to 'res' */ - res->neg= (num->neg^divisor->neg); - if (!bn_wexpand(res,(loop+1))) goto err; - res->top=loop; - resp= &(res->d[loop-1]); - - /* space for temp */ - if 
(!bn_wexpand(tmp,(div_n+1))) goto err; - if (BN_ucmp(&wnum,sdiv) >= 0) + if (no_branch) { - /* If BN_DEBUG_RAND is defined BN_ucmp changes (via - * bn_pollute) the const bignum arguments => - * clean the values between top and max again */ - bn_clear_top2max(&wnum); - bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n); - *resp=1; - } - else - res->top--; - /* if res->top == 0 then clear the neg value otherwise decrease - * the resp pointer */ - if (res->top == 0) - res->neg = 0; - else - resp--; - - for (i=0; i 0x%08X\n", - n0, n1, d0, q); -#endif -#endif - -#ifndef REMAINDER_IS_ALREADY_CALCULATED - /* - * rem doesn't have to be BN_ULLONG. The least we - * know it's less that d0, isn't it? - */ - rem=(n1-q*d0)&BN_MASK2; -#endif - t2=(BN_ULLONG)d1*q; - - for (;;) - { - if (t2 <= ((((BN_ULLONG)rem)< 0x%08X\n", - n0, n1, d0, q); -#endif -#ifndef REMAINDER_IS_ALREADY_CALCULATED - rem=(n1-q*d0)&BN_MASK2; -#endif - -#if defined(BN_UMULT_LOHI) - BN_UMULT_LOHI(t2l,t2h,d1,q); -#elif defined(BN_UMULT_HIGH) - t2l = d1 * q; - t2h = BN_UMULT_HIGH(d1,q); -#else + /* Since we don't know whether snum is larger than sdiv, + * we pad snum with enough zeroes without changing its + * value. 
+ */ + if (snum->top <= sdiv->top+1) { - BN_ULONG ql, qh; - t2l=LBITS(d1); t2h=HBITS(d1); - ql =LBITS(q); qh =HBITS(q); - mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */ + if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err; + for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0; + snum->top = sdiv->top + 2; } -#endif - - for (;;) - { - if ((t2h < rem) || - ((t2h == rem) && (t2l <= wnump[-2]))) - break; - q--; - rem += d0; - if (rem < d0) break; /* don't let rem overflow */ - if (t2l < d1) t2h--; t2l -= d1; - } -#endif /* !BN_LLONG */ - } -#endif /* !BN_DIV3W */ - - l0=bn_mul_words(tmp->d,sdiv->d,div_n,q); - tmp->d[div_n]=l0; - wnum.d--; - /* ingore top values of the bignums just sub the two - * BN_ULONG arrays with bn_sub_words */ - if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1)) + else { - /* Note: As we have considered only the leading - * two BN_ULONGs in the calculation of q, sdiv * q - * might be greater than wnum (but then (q-1) * sdiv - * is less or equal than wnum) - */ - q--; - if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) - /* we can't have an overflow here (assuming - * that q != 0, but if q == 0 then tmp is - * zero anyway) */ - (*wnump)++; + if (bn_wexpand(snum, snum->top + 1) == NULL) goto err; + snum->d[snum->top] = 0; + snum->top ++; } - /* store part of the result */ - *resp = q; - } - bn_correct_top(snum); - if (rm != NULL) - { - /* Keep a copy of the neg flag in num because if rm==num - * BN_rshift() will overwrite it. - */ - int neg = num->neg; - BN_rshift(rm,snum,norm_shift); - if (!BN_is_zero(rm)) - rm->neg = neg; - bn_check_top(rm); - } - BN_CTX_end(ctx); - return(1); -err: - bn_check_top(rm); - BN_CTX_end(ctx); - return(0); - } - - -/* BN_div_no_branch is a special version of BN_div. It does not contain - * branches that may leak sensitive information. 
- */ -static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, - const BIGNUM *divisor, BN_CTX *ctx) - { - int norm_shift,i,loop; - BIGNUM *tmp,wnum,*snum,*sdiv,*res; - BN_ULONG *resp,*wnump; - BN_ULONG d0,d1; - int num_n,div_n; - - bn_check_top(dv); - bn_check_top(rm); - /* bn_check_top(num); */ /* 'num' has been checked in BN_div() */ - bn_check_top(divisor); - - if (BN_is_zero(divisor)) - { - BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO); - return(0); - } - - BN_CTX_start(ctx); - tmp=BN_CTX_get(ctx); - snum=BN_CTX_get(ctx); - sdiv=BN_CTX_get(ctx); - if (dv == NULL) - res=BN_CTX_get(ctx); - else res=dv; - if (sdiv == NULL || res == NULL) goto err; - - /* First we normalise the numbers */ - norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2); - if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err; - sdiv->neg=0; - norm_shift+=BN_BITS2; - if (!(BN_lshift(snum,num,norm_shift))) goto err; - snum->neg=0; - - /* Since we don't know whether snum is larger than sdiv, - * we pad snum with enough zeroes without changing its - * value. 
- */ - if (snum->top <= sdiv->top+1) - { - if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err; - for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0; - snum->top = sdiv->top + 2; - } - else - { - if (bn_wexpand(snum, snum->top + 1) == NULL) goto err; - snum->d[snum->top] = 0; - snum->top ++; } div_n=sdiv->top; @@ -500,12 +282,27 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, /* Setup to 'res' */ res->neg= (num->neg^divisor->neg); if (!bn_wexpand(res,(loop+1))) goto err; - res->top=loop-1; + res->top=loop-no_branch; resp= &(res->d[loop-1]); /* space for temp */ if (!bn_wexpand(tmp,(div_n+1))) goto err; + if (!no_branch) + { + if (BN_ucmp(&wnum,sdiv) >= 0) + { + /* If BN_DEBUG_RAND is defined BN_ucmp changes (via + * bn_pollute) the const bignum arguments => + * clean the values between top and max again */ + bn_clear_top2max(&wnum); + bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n); + *resp=1; + } + else + res->top--; + } + /* if res->top == 0 then clear the neg value otherwise decrease * the resp pointer */ if (res->top == 0) @@ -638,7 +435,7 @@ X) -> 0x%08X\n", rm->neg = neg; bn_check_top(rm); } - bn_correct_top(res); + if (no_branch) bn_correct_top(res); BN_CTX_end(ctx); return(1); err: @@ -646,5 +443,4 @@ err: BN_CTX_end(ctx); return(0); } - #endif diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c index d9b6c737fc..2abf6fd678 100644 --- a/crypto/bn/bn_exp.c +++ b/crypto/bn/bn_exp.c @@ -113,6 +113,18 @@ #include "cryptlib.h" #include "bn_lcl.h" +#include +#ifdef _WIN32 +# include +# ifndef alloca +# define alloca _alloca +# endif +#elif defined(__GNUC__) +# ifndef alloca +# define alloca(s) __builtin_alloca((s)) +# endif +#endif + /* maximum precomputation table size for *variable* sliding windows */ #define TABLE_SIZE 32 @@ -522,23 +534,17 @@ err: * as cache lines are concerned. The following functions are used to transfer a BIGNUM * from/to that table. 
*/ -static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width) +static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width) { size_t i, j; - if (bn_wexpand(b, top) == NULL) - return 0; - while (b->top < top) - { - b->d[b->top++] = 0; - } - + if (top > b->top) + top = b->top; /* this works because 'buf' is explicitly zeroed */ for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width) { buf[j] = ((unsigned char*)b->d)[i]; } - bn_correct_top(b); return 1; } @@ -561,7 +567,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf /* Given a pointer value, compute the next address that is a cache line multiple. */ #define MOD_EXP_CTIME_ALIGN(x_) \ - ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK)))) + ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK)))) /* This variant of BN_mod_exp_mont() uses fixed windows and the special * precomputation memory layout to limit data-dependency to a minimum @@ -572,17 +578,15 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont) { - int i,bits,ret=0,idx,window,wvalue; + int i,bits,ret=0,window,wvalue; int top; - BIGNUM *r; - const BIGNUM *aa; BN_MONT_CTX *mont=NULL; int numPowers; unsigned char *powerbufFree=NULL; int powerbufLen = 0; unsigned char *powerbuf=NULL; - BIGNUM *computeTemp=NULL, *am=NULL; + BIGNUM tmp, am; bn_check_top(a); bn_check_top(p); @@ -602,10 +606,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, return ret; } - /* Initialize BIGNUM context and allocate intermediate result */ BN_CTX_start(ctx); - r = BN_CTX_get(ctx); - if (r == NULL) goto err; /* Allocate a montgomery context if it 
was not supplied by the caller. * If this is not done, things will break in the montgomery part. @@ -620,40 +621,154 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, /* Get the window size to use with size of p. */ window = BN_window_bits_for_ctime_exponent_size(bits); +#if defined(OPENSSL_BN_ASM_MONT5) + if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */ +#endif /* Allocate a buffer large enough to hold all of the pre-computed - * powers of a. + * powers of am, am itself and tmp. */ numPowers = 1 << window; - powerbufLen = sizeof(m->d[0])*top*numPowers; + powerbufLen = sizeof(m->d[0])*(top*numPowers + + ((2*top)>numPowers?(2*top):numPowers)); +#ifdef alloca + if (powerbufLen < 3072) + powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH); + else +#endif if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL) goto err; powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree); memset(powerbuf, 0, powerbufLen); - /* Initialize the intermediate result. Do this early to save double conversion, - * once each for a^0 and intermediate result. 
- */ - if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err; +#ifdef alloca + if (powerbufLen < 3072) + powerbufFree = NULL; +#endif - /* Initialize computeTemp as a^1 with montgomery precalcs */ - computeTemp = BN_CTX_get(ctx); - am = BN_CTX_get(ctx); - if (computeTemp==NULL || am==NULL) goto err; + /* lay down tmp and am right after powers table */ + tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers); + am.d = tmp.d + top; + tmp.top = am.top = 0; + tmp.dmax = am.dmax = top; + tmp.neg = am.neg = 0; + tmp.flags = am.flags = BN_FLG_STATIC_DATA; + + /* prepare a^0 in Montgomery domain */ +#if 1 + if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err; +#else + tmp.d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */ + for (i=1;id[i])&BN_MASK2; + tmp.top = top; +#endif + /* prepare a^1 in Montgomery domain */ if (a->neg || BN_ucmp(a,m) >= 0) { - if (!BN_mod(am,a,m,ctx)) - goto err; - aa= am; + if (!BN_mod(&am,a,m,ctx)) goto err; + if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err; } - else - aa=a; - if (!BN_to_montgomery(am,aa,mont,ctx)) goto err; - if (!BN_copy(computeTemp, am)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err; + else if (!BN_to_montgomery(&am,a,mont,ctx)) goto err; + +#if defined(OPENSSL_BN_ASM_MONT5) + /* This optimization uses ideas from http://eprint.iacr.org/2011/239, + * specifically optimization of cache-timing attack countermeasures + * and pre-computation optimization. */ + + /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as + * 512-bit RSA is hardly relevant, we omit it to spare size... 
*/ + if (window==5) + { + void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap, + const void *table,const BN_ULONG *np, + const BN_ULONG *n0,int num,int power); + void bn_scatter5(const BN_ULONG *inp,size_t num, + void *table,size_t power); + void bn_gather5(BN_ULONG *out,size_t num, + void *table,size_t power); + + BN_ULONG *np=mont->N.d, *n0=mont->n0; + + /* BN_to_montgomery can contaminate words above .top + * [in BN_DEBUG[_DEBUG] build]... */ + for (i=am.top; i=0; i--,bits--) + wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); + bn_gather5(tmp.d,top,powerbuf,wvalue); + + /* Scan the exponent one window at a time starting from the most + * significant bits. + */ + while (bits >= 0) + { + for (wvalue=0, i=0; i<5; i++,bits--) + wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); + + bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); + bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); + bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); + bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); + bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); + bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue); + } + + tmp.top=top; + bn_correct_top(&tmp); + } + else +#endif + { + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err; + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) goto err; /* If the window size is greater than 1, then calculate * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) @@ -662,62 +777,54 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, */ if (window > 1) { - for (i=2; i= 0) + bits--; + for (wvalue=0, i=bits%window; i>=0; i--,bits--) + wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); + if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err; + + /* Scan the exponent one window at a time starting from the most + * significant bits. 
+ */ + while (bits >= 0) { wvalue=0; /* The 'value' of the window */ /* Scan the window, squaring the result as we go */ - for (i=0; i> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF] #endif +#if !defined(OPENSSL_BN_ASM_GF2m) /* Product of two polynomials a, b each with degree < BN_BITS2 - 1, * result is a polynomial r with degree < 2 * BN_BITS - 1 * The caller MUST ensure that the variables have the right amount @@ -218,7 +219,9 @@ static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, c r[2] ^= m1 ^ r[1] ^ r[3]; /* h0 ^= m1 ^ l1 ^ h1; */ r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0; /* l1 ^= l0 ^ h0 ^ m0; */ } - +#else +void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); +#endif /* Add polynomials a and b and store result in r; r could be a or b, a and b * could be equal; r is the bitwise XOR of a and b. @@ -362,21 +365,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]) int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p) { int ret = 0; - const int max = BN_num_bits(p) + 1; - int *arr=NULL; + int arr[6]; bn_check_top(a); bn_check_top(p); - if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err; - ret = BN_GF2m_poly2arr(p, arr, max); - if (!ret || ret > max) + ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0])); + if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0]))) { BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH); - goto err; + return 0; } ret = BN_GF2m_mod_arr(r, a, arr); bn_check_top(r); -err: - if (arr) OPENSSL_free(arr); return ret; } @@ -531,18 +530,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) BN_CTX_start(ctx); - b = BN_CTX_get(ctx); - c = BN_CTX_get(ctx); - u = BN_CTX_get(ctx); - v = BN_CTX_get(ctx); - if (v == NULL) goto err; + if ((b = BN_CTX_get(ctx))==NULL) goto err; + if ((c = BN_CTX_get(ctx))==NULL) goto err; + if ((u = BN_CTX_get(ctx))==NULL) goto err; + if ((v = BN_CTX_get(ctx))==NULL) goto err; - if (!BN_one(b)) goto err; if 
(!BN_GF2m_mod(u, a, p)) goto err; - if (!BN_copy(v, p)) goto err; - if (BN_is_zero(u)) goto err; + if (!BN_copy(v, p)) goto err; +#if 0 + if (!BN_one(b)) goto err; + while (1) { while (!BN_is_odd(u)) @@ -567,13 +566,86 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) if (!BN_GF2m_add(u, u, v)) goto err; if (!BN_GF2m_add(b, b, c)) goto err; } +#else + { + int i, ubits = BN_num_bits(u), + vbits = BN_num_bits(v), /* v is copy of p */ + top = p->top; + BN_ULONG *udp,*bdp,*vdp,*cdp; + + bn_wexpand(u,top); udp = u->d; + for (i=u->top;itop = top; + bn_wexpand(b,top); bdp = b->d; + bdp[0] = 1; + for (i=1;itop = top; + bn_wexpand(c,top); cdp = c->d; + for (i=0;itop = top; + vdp = v->d; /* It pays off to "cache" *->d pointers, because + * it allows optimizer to be more aggressive. + * But we don't have to "cache" p->d, because *p + * is declared 'const'... */ + while (1) + { + while (ubits && !(udp[0]&1)) + { + BN_ULONG u0,u1,b0,b1,mask; + u0 = udp[0]; + b0 = bdp[0]; + mask = (BN_ULONG)0-(b0&1); + b0 ^= p->d[0]&mask; + for (i=0;i>1)|(u1<<(BN_BITS2-1)))&BN_MASK2; + u0 = u1; + b1 = bdp[i+1]^(p->d[i+1]&mask); + bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2; + b0 = b1; + } + udp[i] = u0>>1; + bdp[i] = b0>>1; + ubits--; + } + + if (ubits<=BN_BITS2 && udp[0]==1) break; + + if (ubitsd; + bdp = cdp; cdp = c->d; + } + for(i=0;i # define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b)) -# elif defined(__GNUC__) +# elif defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret; \ asm ("umulh %1,%2,%0" \ @@ -247,7 +247,7 @@ extern "C" { ret; }) # endif /* compiler */ # elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG) -# if defined(__GNUC__) +# if defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret; \ asm ("mulhdu %0,%1,%2" \ @@ -257,7 +257,7 @@ extern "C" { # endif /* compiler */ # elif (defined(__x86_64) || defined(__x86_64__)) && \ 
(defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT)) -# if defined(__GNUC__) +# if defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret,discard; \ asm ("mulq %3" \ @@ -280,6 +280,19 @@ extern "C" { # define BN_UMULT_HIGH(a,b) __umulh((a),(b)) # define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high))) # endif +# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)) +# if defined(__GNUC__) && __GNUC__>=2 +# define BN_UMULT_HIGH(a,b) ({ \ + register BN_ULONG ret; \ + asm ("dmultu %1,%2" \ + : "=h"(ret) \ + : "r"(a), "r"(b) : "l"); \ + ret; }) +# define BN_UMULT_LOHI(low,high,a,b) \ + asm ("dmultu %2,%3" \ + : "=l"(low),"=h"(high) \ + : "r"(a), "r"(b)); +# endif # endif /* cpu */ #endif /* OPENSSL_NO_ASM */ diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c index 1a866880f5..427b5cf4df 100644 --- a/crypto/bn/bn_mont.c +++ b/crypto/bn/bn_mont.c @@ -177,31 +177,26 @@ err: static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont) { BIGNUM *n; - BN_ULONG *ap,*np,*rp,n0,v,*nrp; - int al,nl,max,i,x,ri; + BN_ULONG *ap,*np,*rp,n0,v,carry; + int nl,max,i; n= &(mont->N); - /* mont->ri is the size of mont->N in bits (rounded up - to the word size) */ - al=ri=mont->ri/BN_BITS2; - nl=n->top; - if ((al == 0) || (nl == 0)) { ret->top=0; return(1); } + if (nl == 0) { ret->top=0; return(1); } - max=(nl+al+1); /* allow for overflow (no?) 
XXX */ + max=(2*nl); /* carry is stored separately */ if (bn_wexpand(r,max) == NULL) return(0); r->neg^=n->neg; np=n->d; rp=r->d; - nrp= &(r->d[nl]); /* clear the top words of T */ #if 1 for (i=r->top; id[i]=0; + rp[i]=0; #else - memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); + memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); #endif r->top=max; @@ -210,7 +205,7 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont) #ifdef BN_COUNT fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl); #endif - for (i=0; i= v) - continue; - else - { - if (((++nrp[0])&BN_MASK2) != 0) continue; - if (((++nrp[1])&BN_MASK2) != 0) continue; - for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ; - } - } - bn_correct_top(r); - - /* mont->ri will be a multiple of the word size and below code - * is kind of BN_rshift(ret,r,mont->ri) equivalent */ - if (r->top <= ri) - { - ret->top=0; - return(1); + v = (v+carry+rp[nl])&BN_MASK2; + carry |= (v != rp[nl]); + carry &= (v <= rp[nl]); + rp[nl]=v; } - al=r->top-ri; -#define BRANCH_FREE 1 -#if BRANCH_FREE - if (bn_wexpand(ret,ri) == NULL) return(0); - x=0-(((al-ri)>>(sizeof(al)*8-1))&1); - ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */ + if (bn_wexpand(ret,nl) == NULL) return(0); + ret->top=nl; ret->neg=r->neg; rp=ret->d; - ap=&(r->d[ri]); + ap=&(r->d[nl]); +#define BRANCH_FREE 1 +#if BRANCH_FREE { - size_t m1,m2; - - v=bn_sub_words(rp,ap,np,ri); - /* this ----------------^^ works even in alri) nrp=rp; else nrp=ap; */ - /* in other words if subtraction result is real, then + v=bn_sub_words(rp,ap,np,nl)-carry; + /* if subtraction result is real, then * trick unconditional memcpy below to perform in-place * "refresh" instead of actual copy. 
*/ - m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al>(sizeof(al)*8-1))&1); /* al>ri */ - m1|=m2; /* (al!=ri) */ - m1|=(0-(size_t)v); /* (al!=ri || v) */ - m1&=~m2; /* (al!=ri || v) && !al>ri */ - nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m1)|((PTR_SIZE_INT)ap&m1)); - } + m=(0-(size_t)v); + nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m)); - /* 'itop=al; - ret->neg=r->neg; - - rp=ret->d; - ap=&(r->d[ri]); - al-=4; - for (i=0; iN)) >= 0) - { - if (!BN_usub(ret,ret,&(mont->N))) return(0); - } + if (bn_sub_words (rp,ap,np,nl)-carry) + memcpy(rp,ap,nl*sizeof(BN_ULONG)); #endif + bn_correct_top(r); + bn_correct_top(ret); bn_check_top(ret); return(1); diff --git a/crypto/bn/bn_nist.c b/crypto/bn/bn_nist.c index c6de032696..880989dd5f 100644 --- a/crypto/bn/bn_nist.c +++ b/crypto/bn/bn_nist.c @@ -319,6 +319,13 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top) :(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l))) #define bn_32_set_0(to, n) (((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0)); #define bn_cp_32(to,n,from,m) ((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n) +# if defined(L_ENDIAN) +# if defined(__arch64__) +# define NIST_INT64 long +# else +# define NIST_INT64 long long +# endif +# endif #else #define bn_cp_64(to, n, from, m) \ { \ @@ -330,13 +337,15 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top) bn_32_set_0(to, (n)*2); \ bn_32_set_0(to, (n)*2+1); \ } -#if BN_BITS2 == 32 #define bn_cp_32(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0; #define bn_32_set_0(to, n) (to)[n] = (BN_ULONG)0; -#endif +# if defined(_WIN32) && !defined(__GNUC__) +# define NIST_INT64 __int64 +# else +# define NIST_INT64 long long +# endif #endif /* BN_BITS2 != 64 */ - #define nist_set_192(to, from, a1, a2, a3) \ { \ bn_cp_64(to, 0, from, (a3) - 3) \ @@ -350,9 +359,11 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, int top = a->top, i; int carry; register BN_ULONG *r_d, *a_d = a->d; - BN_ULONG t_d[BN_NIST_192_TOP], - 
buf[BN_NIST_192_TOP], - c_d[BN_NIST_192_TOP], + union { + BN_ULONG bn[BN_NIST_192_TOP]; + unsigned int ui[BN_NIST_192_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)]; + } buf; + BN_ULONG c_d[BN_NIST_192_TOP], *res; PTR_SIZE_INT mask; static const BIGNUM _bignum_nist_p_192_sqr = { @@ -385,15 +396,48 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, else r_d = a_d; - nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP); + nist_cp_bn_0(buf.bn, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP); + +#if defined(NIST_INT64) + { + NIST_INT64 acc; /* accumulator */ + unsigned int *rp=(unsigned int *)r_d; + const unsigned int *bp=(const unsigned int *)buf.ui; + + acc = rp[0]; acc += bp[3*2-6]; + acc += bp[5*2-6]; rp[0] = (unsigned int)acc; acc >>= 32; + + acc += rp[1]; acc += bp[3*2-5]; + acc += bp[5*2-5]; rp[1] = (unsigned int)acc; acc >>= 32; - nist_set_192(t_d, buf, 0, 3, 3); + acc += rp[2]; acc += bp[3*2-6]; + acc += bp[4*2-6]; + acc += bp[5*2-6]; rp[2] = (unsigned int)acc; acc >>= 32; + + acc += rp[3]; acc += bp[3*2-5]; + acc += bp[4*2-5]; + acc += bp[5*2-5]; rp[3] = (unsigned int)acc; acc >>= 32; + + acc += rp[4]; acc += bp[4*2-6]; + acc += bp[5*2-6]; rp[4] = (unsigned int)acc; acc >>= 32; + + acc += rp[5]; acc += bp[4*2-5]; + acc += bp[5*2-5]; rp[5] = (unsigned int)acc; + + carry = (int)(acc>>32); + } +#else + { + BN_ULONG t_d[BN_NIST_192_TOP]; + + nist_set_192(t_d, buf.bn, 0, 3, 3); carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP); - nist_set_192(t_d, buf, 4, 4, 0); + nist_set_192(t_d, buf.bn, 4, 4, 0); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP); - nist_set_192(t_d, buf, 5, 5, 5) + nist_set_192(t_d, buf.bn, 5, 5, 5) carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP); - + } +#endif if (carry > 0) carry = (int)bn_sub_words(r_d,r_d,_nist_p_192[carry-1],BN_NIST_192_TOP); else @@ -435,8 +479,7 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, int top = a->top, 
i; int carry; BN_ULONG *r_d, *a_d = a->d; - BN_ULONG t_d[BN_NIST_224_TOP], - buf[BN_NIST_224_TOP], + BN_ULONG buf[BN_NIST_224_TOP], c_d[BN_NIST_224_TOP], *res; PTR_SIZE_INT mask; @@ -474,14 +517,54 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, #if BN_BITS2==64 /* copy upper 256 bits of 448 bit number ... */ - nist_cp_bn_0(t_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP); + nist_cp_bn_0(c_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP); /* ... and right shift by 32 to obtain upper 224 bits */ - nist_set_224(buf, t_d, 14, 13, 12, 11, 10, 9, 8); + nist_set_224(buf, c_d, 14, 13, 12, 11, 10, 9, 8); /* truncate lower part to 224 bits too */ r_d[BN_NIST_224_TOP-1] &= BN_MASK2l; #else nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP); #endif + +#if defined(NIST_INT64) && BN_BITS2!=64 + { + NIST_INT64 acc; /* accumulator */ + unsigned int *rp=(unsigned int *)r_d; + const unsigned int *bp=(const unsigned int *)buf; + + acc = rp[0]; acc -= bp[7-7]; + acc -= bp[11-7]; rp[0] = (unsigned int)acc; acc >>= 32; + + acc += rp[1]; acc -= bp[8-7]; + acc -= bp[12-7]; rp[1] = (unsigned int)acc; acc >>= 32; + + acc += rp[2]; acc -= bp[9-7]; + acc -= bp[13-7]; rp[2] = (unsigned int)acc; acc >>= 32; + + acc += rp[3]; acc += bp[7-7]; + acc += bp[11-7]; + acc -= bp[10-7]; rp[3] = (unsigned int)acc; acc>>= 32; + + acc += rp[4]; acc += bp[8-7]; + acc += bp[12-7]; + acc -= bp[11-7]; rp[4] = (unsigned int)acc; acc >>= 32; + + acc += rp[5]; acc += bp[9-7]; + acc += bp[13-7]; + acc -= bp[12-7]; rp[5] = (unsigned int)acc; acc >>= 32; + + acc += rp[6]; acc += bp[10-7]; + acc -= bp[13-7]; rp[6] = (unsigned int)acc; + + carry = (int)(acc>>32); +# if BN_BITS2==64 + rp[7] = carry; +# endif + } +#else + { + BN_ULONG t_d[BN_NIST_224_TOP]; + nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0); carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP); nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0); @@ 
-493,6 +576,8 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, #if BN_BITS2==64 carry = (int)(r_d[BN_NIST_224_TOP-1]>>32); +#endif + } #endif u.f = bn_sub_words; if (carry > 0) @@ -548,9 +633,11 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, int i, top = a->top; int carry = 0; register BN_ULONG *a_d = a->d, *r_d; - BN_ULONG t_d[BN_NIST_256_TOP], - buf[BN_NIST_256_TOP], - c_d[BN_NIST_256_TOP], + union { + BN_ULONG bn[BN_NIST_256_TOP]; + unsigned int ui[BN_NIST_256_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)]; + } buf; + BN_ULONG c_d[BN_NIST_256_TOP], *res; PTR_SIZE_INT mask; union { bn_addsub_f f; PTR_SIZE_INT p; } u; @@ -584,12 +671,87 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, else r_d = a_d; - nist_cp_bn_0(buf, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP); + nist_cp_bn_0(buf.bn, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP); + +#if defined(NIST_INT64) + { + NIST_INT64 acc; /* accumulator */ + unsigned int *rp=(unsigned int *)r_d; + const unsigned int *bp=(const unsigned int *)buf.ui; + + acc = rp[0]; acc += bp[8-8]; + acc += bp[9-8]; + acc -= bp[11-8]; + acc -= bp[12-8]; + acc -= bp[13-8]; + acc -= bp[14-8]; rp[0] = (unsigned int)acc; acc >>= 32; + + acc += rp[1]; acc += bp[9-8]; + acc += bp[10-8]; + acc -= bp[12-8]; + acc -= bp[13-8]; + acc -= bp[14-8]; + acc -= bp[15-8]; rp[1] = (unsigned int)acc; acc >>= 32; + + acc += rp[2]; acc += bp[10-8]; + acc += bp[11-8]; + acc -= bp[13-8]; + acc -= bp[14-8]; + acc -= bp[15-8]; rp[2] = (unsigned int)acc; acc >>= 32; + + acc += rp[3]; acc += bp[11-8]; + acc += bp[11-8]; + acc += bp[12-8]; + acc += bp[12-8]; + acc += bp[13-8]; + acc -= bp[15-8]; + acc -= bp[8-8]; + acc -= bp[9-8]; rp[3] = (unsigned int)acc; acc >>= 32; + + acc += rp[4]; acc += bp[12-8]; + acc += bp[12-8]; + acc += bp[13-8]; + acc += bp[13-8]; + acc += bp[14-8]; + acc -= bp[9-8]; + acc -= bp[10-8]; rp[4] = (unsigned int)acc; acc >>= 32; + + acc += 
rp[5]; acc += bp[13-8]; + acc += bp[13-8]; + acc += bp[14-8]; + acc += bp[14-8]; + acc += bp[15-8]; + acc -= bp[10-8]; + acc -= bp[11-8]; rp[5] = (unsigned int)acc; acc >>= 32; + + acc += rp[6]; acc += bp[14-8]; + acc += bp[14-8]; + acc += bp[15-8]; + acc += bp[15-8]; + acc += bp[14-8]; + acc += bp[13-8]; + acc -= bp[8-8]; + acc -= bp[9-8]; rp[6] = (unsigned int)acc; acc >>= 32; + + acc += rp[7]; acc += bp[15-8]; + acc += bp[15-8]; + acc += bp[15-8]; + acc += bp[8 -8]; + acc -= bp[10-8]; + acc -= bp[11-8]; + acc -= bp[12-8]; + acc -= bp[13-8]; rp[7] = (unsigned int)acc; + + carry = (int)(acc>>32); + } +#else + { + BN_ULONG t_d[BN_NIST_256_TOP]; /*S1*/ - nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0); + nist_set_256(t_d, buf.bn, 15, 14, 13, 12, 11, 0, 0, 0); /*S2*/ - nist_set_256(c_d, buf, 0, 15, 14, 13, 12, 0, 0, 0); + nist_set_256(c_d, buf.bn, 0, 15, 14, 13, 12, 0, 0, 0); carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP); /* left shift */ { @@ -607,24 +769,26 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, } carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP); /*S3*/ - nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8); + nist_set_256(t_d, buf.bn, 15, 14, 0, 0, 0, 10, 9, 8); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP); /*S4*/ - nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9); + nist_set_256(t_d, buf.bn, 8, 13, 15, 14, 13, 11, 10, 9); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP); /*D1*/ - nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11); + nist_set_256(t_d, buf.bn, 10, 8, 0, 0, 0, 13, 12, 11); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP); /*D2*/ - nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12); + nist_set_256(t_d, buf.bn, 11, 9, 0, 0, 15, 14, 13, 12); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP); /*D3*/ - nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13); + nist_set_256(t_d, buf.bn, 12, 0, 10, 9, 8, 15, 14, 13); carry -= (int)bn_sub_words(r_d, r_d, t_d, 
BN_NIST_256_TOP); /*D4*/ - nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14); + nist_set_256(t_d, buf.bn, 13, 0, 11, 10, 9, 0, 15, 14); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP); + } +#endif /* see BN_nist_mod_224 for explanation */ u.f = bn_sub_words; if (carry > 0) @@ -672,9 +836,11 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, int i, top = a->top; int carry = 0; register BN_ULONG *r_d, *a_d = a->d; - BN_ULONG t_d[BN_NIST_384_TOP], - buf[BN_NIST_384_TOP], - c_d[BN_NIST_384_TOP], + union { + BN_ULONG bn[BN_NIST_384_TOP]; + unsigned int ui[BN_NIST_384_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)]; + } buf; + BN_ULONG c_d[BN_NIST_384_TOP], *res; PTR_SIZE_INT mask; union { bn_addsub_f f; PTR_SIZE_INT p; } u; @@ -709,10 +875,100 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, else r_d = a_d; - nist_cp_bn_0(buf, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP); + nist_cp_bn_0(buf.bn, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP); + +#if defined(NIST_INT64) + { + NIST_INT64 acc; /* accumulator */ + unsigned int *rp=(unsigned int *)r_d; + const unsigned int *bp=(const unsigned int *)buf.ui; + + acc = rp[0]; acc += bp[12-12]; + acc += bp[21-12]; + acc += bp[20-12]; + acc -= bp[23-12]; rp[0] = (unsigned int)acc; acc >>= 32; + + acc += rp[1]; acc += bp[13-12]; + acc += bp[22-12]; + acc += bp[23-12]; + acc -= bp[12-12]; + acc -= bp[20-12]; rp[1] = (unsigned int)acc; acc >>= 32; + + acc += rp[2]; acc += bp[14-12]; + acc += bp[23-12]; + acc -= bp[13-12]; + acc -= bp[21-12]; rp[2] = (unsigned int)acc; acc >>= 32; + + acc += rp[3]; acc += bp[15-12]; + acc += bp[12-12]; + acc += bp[20-12]; + acc += bp[21-12]; + acc -= bp[14-12]; + acc -= bp[22-12]; + acc -= bp[23-12]; rp[3] = (unsigned int)acc; acc >>= 32; + + acc += rp[4]; acc += bp[21-12]; + acc += bp[21-12]; + acc += bp[16-12]; + acc += bp[13-12]; + acc += bp[12-12]; + acc += bp[20-12]; + acc += bp[22-12]; + acc -= bp[15-12]; + acc 
-= bp[23-12]; + acc -= bp[23-12]; rp[4] = (unsigned int)acc; acc >>= 32; + + acc += rp[5]; acc += bp[22-12]; + acc += bp[22-12]; + acc += bp[17-12]; + acc += bp[14-12]; + acc += bp[13-12]; + acc += bp[21-12]; + acc += bp[23-12]; + acc -= bp[16-12]; rp[5] = (unsigned int)acc; acc >>= 32; + + acc += rp[6]; acc += bp[23-12]; + acc += bp[23-12]; + acc += bp[18-12]; + acc += bp[15-12]; + acc += bp[14-12]; + acc += bp[22-12]; + acc -= bp[17-12]; rp[6] = (unsigned int)acc; acc >>= 32; + + acc += rp[7]; acc += bp[19-12]; + acc += bp[16-12]; + acc += bp[15-12]; + acc += bp[23-12]; + acc -= bp[18-12]; rp[7] = (unsigned int)acc; acc >>= 32; + + acc += rp[8]; acc += bp[20-12]; + acc += bp[17-12]; + acc += bp[16-12]; + acc -= bp[19-12]; rp[8] = (unsigned int)acc; acc >>= 32; + + acc += rp[9]; acc += bp[21-12]; + acc += bp[18-12]; + acc += bp[17-12]; + acc -= bp[20-12]; rp[9] = (unsigned int)acc; acc >>= 32; + + acc += rp[10]; acc += bp[22-12]; + acc += bp[19-12]; + acc += bp[18-12]; + acc -= bp[21-12]; rp[10] = (unsigned int)acc; acc >>= 32; + + acc += rp[11]; acc += bp[23-12]; + acc += bp[20-12]; + acc += bp[19-12]; + acc -= bp[22-12]; rp[11] = (unsigned int)acc; + + carry = (int)(acc>>32); + } +#else + { + BN_ULONG t_d[BN_NIST_384_TOP]; /*S1*/ - nist_set_256(t_d, buf, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4); + nist_set_256(t_d, buf.bn, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4); /* left shift */ { register BN_ULONG *ap,t,c; @@ -729,29 +985,31 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, carry = (int)bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), t_d, BN_NIST_256_TOP); /*S2 */ - carry += (int)bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP); + carry += (int)bn_add_words(r_d, r_d, buf.bn, BN_NIST_384_TOP); /*S3*/ - nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21); + nist_set_384(t_d,buf.bn,20,19,18,17,16,15,14,13,12,23,22,21); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*S4*/ - nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0); + 
nist_set_384(t_d,buf.bn,19,18,17,16,15,14,13,12,20,0,23,0); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*S5*/ - nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0); + nist_set_384(t_d, buf.bn,0,0,0,0,23,22,21,20,0,0,0,0); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*S6*/ - nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20); + nist_set_384(t_d,buf.bn,0,0,0,0,0,0,23,22,21,0,0,20); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*D1*/ - nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23); + nist_set_384(t_d,buf.bn,22,21,20,19,18,17,16,15,14,13,12,23); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*D2*/ - nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0); + nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,22,21,20,0); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*D3*/ - nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0); + nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,23,0,0,0); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP); + } +#endif /* see BN_nist_mod_224 for explanation */ u.f = bn_sub_words; if (carry > 0) diff --git a/crypto/bn/bn_shift.c b/crypto/bn/bn_shift.c index c4d301afc4..a6fca2c424 100644 --- a/crypto/bn/bn_shift.c +++ b/crypto/bn/bn_shift.c @@ -99,7 +99,7 @@ int BN_lshift1(BIGNUM *r, const BIGNUM *a) int BN_rshift1(BIGNUM *r, const BIGNUM *a) { BN_ULONG *ap,*rp,t,c; - int i; + int i,j; bn_check_top(r); bn_check_top(a); @@ -109,22 +109,25 @@ int BN_rshift1(BIGNUM *r, const BIGNUM *a) BN_zero(r); return(1); } + i = a->top; + ap= a->d; + j = i-(ap[i-1]==1); if (a != r) { - if (bn_wexpand(r,a->top) == NULL) return(0); - r->top=a->top; + if (bn_wexpand(r,j) == NULL) return(0); r->neg=a->neg; } - ap=a->d; rp=r->d; - c=0; - for (i=a->top-1; i>=0; i--) + t=ap[--i]; + c=(t&1)?BN_TBIT:0; + if (t>>=1) rp[i]=t; + while (i>0) { - t=ap[i]; + t=ap[--i]; rp[i]=((t>>1)&BN_MASK2)|c; c=(t&1)?BN_TBIT:0; } - bn_correct_top(r); + r->top=j; bn_check_top(r); return(1); } @@ -182,10 +185,11 @@ int 
BN_rshift(BIGNUM *r, const BIGNUM *a, int n) BN_zero(r); return(1); } + i = (BN_num_bits(a)-n+(BN_BITS2-1))/BN_BITS2; if (r != a) { r->neg=a->neg; - if (bn_wexpand(r,a->top-nw+1) == NULL) return(0); + if (bn_wexpand(r,i) == NULL) return(0); } else { @@ -196,7 +200,7 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) f= &(a->d[nw]); t=r->d; j=a->top-nw; - r->top=j; + r->top=i; if (rb == 0) { @@ -212,9 +216,8 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) l= *(f++); *(t++) =(tmp|(l<<lb))&BN_MASK2; } - *(t++) =(l>>rb)&BN_MASK2; + if ((l = (l>>rb)&BN_MASK2)) *(t) = l; } - bn_correct_top(r); bn_check_top(r); return(1); } -- 2.40.0