From: Andy Polyakov Date: Thu, 13 Feb 2014 13:39:55 +0000 (+0100) Subject: evp/e_aes_cbc_hmac_sha*.c: improve cache locality. X-Git-Tag: master-pre-reformat~913 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9587429fa07a34066107e926fbc8708220f058fa;p=openssl evp/e_aes_cbc_hmac_sha*.c: improve cache locality. --- diff --git a/crypto/evp/e_aes_cbc_hmac_sha1.c b/crypto/evp/e_aes_cbc_hmac_sha1.c index 09f928190d..0b6f292f62 100644 --- a/crypto/evp/e_aes_cbc_hmac_sha1.c +++ b/crypto/evp/e_aes_cbc_hmac_sha1.c @@ -205,13 +205,15 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key, u32 d[32]; u8 c[128]; } blocks[8]; SHA1_MB_CTX *ctx; - unsigned int frag, last, packlen, i, x4=4*n4x; + unsigned int frag, last, packlen, i, x4=4*n4x, minblocks, processed=0; size_t ret = 0; u8 *IVs; #if defined(BSWAP8) u64 seqnum; #endif + RAND_bytes((IVs=blocks[0].c),16*x4); /* ask for IVs in bulk */ + ctx = (SHA1_MB_CTX *)(storage+32-((size_t)storage%32)); /* align */ frag = (unsigned int)inp_len>>(1+n4x); @@ -221,8 +223,21 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key, last -= x4-1; } + packlen = 5+16+((frag+20+16)&-16); + + /* populate descriptors with pointers and IVs */ hash_d[0].ptr = inp; - for (i=1;imd.data,8); @@ -268,6 +283,39 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key, /* hash 13-byte headers and first 64-13 bytes of inputs */ sha1_multi_block(ctx,edges,n4x); /* hash bulk inputs */ +#define MAXCHUNKSIZE 2048 +#if MAXCHUNKSIZE%64 +#error "MAXCHUNKSIZE is not divisible by 64" +#elif MAXCHUNKSIZE + /* goal is to minimize pressure on L1 cache by moving + * in shorter steps, so that hashed data is still in + * the cache by the time we encrypt it */ + minblocks = ((frag<=last ? frag : last)-(64-13))/64; + if (minblocks>MAXCHUNKSIZE/64) { + for (i=0;iks,n4x); + + for (i=0;iMAXCHUNKSIZE/64); + } +#endif +#undef MAXCHUNKSIZE sha1_multi_block(ctx,hash_d,n4x); memset(blocks,0,sizeof(blocks)); @@ -276,7 +324,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key, off = hash_d[i].blocks*64; const unsigned char *ptr = hash_d[i].ptr+off; - off = len-(64-13)-off; /* remainder actually */ + off = (len-processed)-(64-13)-off; /* remainder actually */ memcpy(blocks[i].c,ptr,off); blocks[i].c[off]=0x80; len += 64+13; /* 64 is HMAC header */ @@ -310,23 +358,14 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key, /* finalize MACs */ sha1_multi_block(ctx,edges,n4x); - packlen = 5+16+((frag+20+16)&-16); - - out += (packlen<<(1+n4x))-packlen; - inp += (frag<<(1+n4x))-frag; - - RAND_bytes((IVs=blocks[0].c),16*x4); /* ask for IVs in bulk */ - - for (i=x4-1;;i--) { + for (i=0;iA[i]); @@ -342,7 +381,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key, for (j=0;j<=pad;j++) *(out++) = pad; len += pad+1; - ciph_d[i].blocks = len/16; + ciph_d[i].blocks = (len-processed)/16; len += 16; /* account for explicit iv */ /* arrange header */ @@ -352,17 +391,8 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key, out0[3] = (u8)(len>>8); out0[4] = (u8)(len); - /* explicit iv */ - memcpy(ciph_d[i].iv, IVs, 16); - memcpy(&out0[5], IVs, 16); - ret += len+5; - - if (i==0) break; - - out = out0-packlen; - inp -= frag; - IVs += 16; + inp += frag; } aesni_multi_cbc_encrypt(ciph_d,&key->ks,n4x); diff --git a/crypto/evp/e_aes_cbc_hmac_sha256.c b/crypto/evp/e_aes_cbc_hmac_sha256.c index 95bdd42b13..c2c48f045c 100644 --- a/crypto/evp/e_aes_cbc_hmac_sha256.c +++ b/crypto/evp/e_aes_cbc_hmac_sha256.c @@ -201,13 +201,15 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key, u32 d[32]; u8 c[128]; } blocks[8]; SHA256_MB_CTX *ctx; - unsigned int frag, last, packlen, i, x4=4*n4x; + unsigned int frag, last, packlen, i, x4=4*n4x, minblocks, processed=0; size_t ret = 0; u8 *IVs; #if defined(BSWAP8) u64 seqnum; #endif + RAND_bytes((IVs=blocks[0].c),16*x4); /* ask for IVs in bulk */ + ctx = (SHA256_MB_CTX *)(storage+32-((size_t)storage%32)); /* align */ frag = (unsigned int)inp_len>>(1+n4x); @@ -217,8 +219,21 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key, last -= x4-1; } + packlen = 5+16+((frag+32+16)&-16); + + /* populate descriptors with pointers and IVs */ hash_d[0].ptr = inp; - for (i=1;imd.data,8); @@ -267,6 +282,39 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key, /* hash 13-byte headers and first 64-13 bytes of inputs */ sha256_multi_block(ctx,edges,n4x); /* hash bulk inputs */ +#define MAXCHUNKSIZE 2048 +#if MAXCHUNKSIZE%64 +#error "MAXCHUNKSIZE is not divisible by 64" +#elif MAXCHUNKSIZE + /* goal is to minimize pressure on L1 cache by moving + * in shorter steps, so that hashed data is still in + * the cache by the time we encrypt it */ + minblocks = ((frag<=last ? frag : last)-(64-13))/64; + if (minblocks>MAXCHUNKSIZE/64) { + for (i=0;iks,n4x); + + for (i=0;iMAXCHUNKSIZE/64); + } +#endif +#undef MAXCHUNKSIZE sha256_multi_block(ctx,hash_d,n4x); memset(blocks,0,sizeof(blocks)); @@ -275,7 +323,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key, off = hash_d[i].blocks*64; const unsigned char *ptr = hash_d[i].ptr+off; - off = len-(64-13)-off; /* remainder actually */ + off = (len-processed)-(64-13)-off; /* remainder actually */ memcpy(blocks[i].c,ptr,off); blocks[i].c[off]=0x80; len += 64+13; /* 64 is HMAC header */ @@ -312,23 +360,14 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key, /* finalize MACs */ sha256_multi_block(ctx,edges,n4x); - packlen = 5+16+((frag+32+16)&-16); - - out += (packlen<<(1+n4x))-packlen; - inp += (frag<<(1+n4x))-frag; - - RAND_bytes((IVs=blocks[0].c),16*x4); /* ask for IVs in bulk */ - - for (i=x4-1;;i--) { + for (i=0;iA[i]); @@ -347,7 +386,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key, for (j=0;j<=pad;j++) *(out++) = pad; len += pad+1; - ciph_d[i].blocks = len/16; + ciph_d[i].blocks = (len-processed)/16; len += 16; /* account for explicit iv */ /* arrange header */ @@ -357,17 +396,8 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key, out0[3] = (u8)(len>>8); out0[4] = (u8)(len); - /* explicit iv */ - memcpy(ciph_d[i].iv, IVs, 16); - memcpy(&out0[5], IVs, 16); - ret += len+5; - - if (i==0) break; - - out = out0-packlen; - inp -= frag; - IVs += 16; + inp += frag; } aesni_multi_cbc_encrypt(ciph_d,&key->ks,n4x);