
/* This is an independent implementation of the HPC algorithm		*/
/* designed by Richard Schroeppel as a candidate for the NIST		*/
/* AES activity.													*/
/*																	*/
/* Copyright in this implementation is held by Dr B R Gladman but	*/
/* I hereby give permission for its free direct or derivative use	*/
/* subject to acknowledgment of its origin and compliance with any	*/
/* constraints that are placed on the exploitation of RC6 by its	*/
/* designers.														*/
/*																	*/
/* Dr Brian Gladman (gladman@seven77.demon.co.uk) 20th Nov 1998		*/
/*

Timing data:
 
Algorithm: hpc (hpc0.c)

128 bit key:
Key Setup:  119709 cycles
Encrypt:      1532 cycles =    16.7 mbits/sec
Decrypt:      1709 cycles =    15.0 mbits/sec
Mean:         1620 cycles =    15.8 mbits/sec
192 bit key:
Key Setup:  119012 cycles
Encrypt:      1536 cycles =    16.7 mbits/sec
Decrypt:      1695 cycles =    15.1 mbits/sec
Mean:         1615 cycles =    15.8 mbits/sec
256 bit key:
Key Setup:  119026 cycles
Encrypt:      1561 cycles =    16.4 mbits/sec
Decrypt:      1715 cycles =    14.9 mbits/sec
Mean:         1638 cycles =    15.6 mbits/sec

*/

#define	BYTE_SWAP

#include "../std_defs.h"

/* identification strings: [0] algorithm name, [1] implementing source file */
static char *alg_name[] = { "hpc", "hpc0.c" };

/*
 * Return the identification strings for this cipher.
 *
 * The result points at a two element array with static storage duration:
 * element 0 is the algorithm name and element 1 the source file name.
 * Takes no arguments; declared with (void) so that the definition also
 * serves as a prototype.
 */
char **cipher_name(void)
{
	return alg_name;
}

/* a 64-bit quantity held as two u4byte words: [0] is the low half, [1] the high half */
/* (u4byte comes from std_defs.h and is presumably a 32-bit unsigned type)             */
typedef	u4byte	u8byte[2];

/* key schedule: set_key() fills words 0..255 and copies words 0..29 to 256..285 */
u8byte	l_key[286];

/* eight 64-bit spice words read by every round of encrypt()/decrypt(); all zero here */
u8byte	spice[8] =
{
	{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
	{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }
};

/* 64-bit constants, low word first; the decimal values are the leading	*/
/* digits of pi, e and sqrt(2) respectively								*/
u8byte	p119 = { 0xa23249d6, 0x2b992ddf };	/* 0x2b992ddfa23249d6 =  3141592653589793238 */
u8byte	e19  = { 0xc0b36173, 0x25b946eb };	/* 0x25b946ebc0b36173 =  2718281828459045235 */
u8byte	r220 = { 0xe9e17158, 0xc442f56b };	/* 0xc442f56be9e17158 = 14142135623730950488 */

/*
 * 64-bit arithmetic on two-word values: x[0] is the low word and x[1] the
 * high word.  Every macro updates x in place and is used as a statement
 * followed by a semicolon.  The multi-statement forms are wrapped in
 * do { } while(0) so that they stay a single statement under an unbraced
 * if/else.
 *
 * Arguments are evaluated more than once, so they must be free of side
 * effects.  Note that <iso646.h> defines and_eq, or_eq and xor_eq as
 * operator spellings; that header must not be included in this file.
 */
#define	xor_eq(x,y)		do { (x)[0] ^= (y)[0]; (x)[1] ^= (y)[1]; } while(0)
#define	and_eq(x,y)		do { (x)[0] &= (y)[0]; (x)[1] &= (y)[1]; } while(0)
#define	 or_eq(x,y)		do { (x)[0] |= (y)[0]; (x)[1] |= (y)[1]; } while(0)

/* x += y: a carry out of the low word is detected by the low sum wrapping below y[0] */
#define add_eq(x,y)		((x)[1] += (y)[1] + (((x)[0] += (y)[0]) < (y)[0] ? 1 : 0))

/* x -= y: a borrow is detected by the low difference wrapping above the old x[0].	*/
/* The old low word is saved in 'xs', which the caller must declare (u4byte xs).	*/
#define sub_eq(x,y)											\
	do														\
	{	xs = (x)[0];										\
		(x)[1] -= (y)[1] + (((x)[0] -= (y)[0]) > xs ? 1 : 0);	\
	}														\
	while(0)

/* x <<= n for n in 0..63; n == 0 leaves x unchanged, n >= 32 moves the	*/
/* low word into the high word and clears the low word					*/
#define	lsh_eq(x,n)											\
	do														\
	{	if((n) > 31)										\
		{	(x)[1] = (x)[0] << ((n) & 31); (x)[0] = 0;		\
		}													\
		else if((n) > 0)									\
		{	(x)[1] = ((x)[1] << (n)) | ((x)[0] >> (-(n) & 31));	\
			(x)[0] = (x)[0] << (n);							\
		}													\
	}														\
	while(0)

/* x >>= n (logical) for n in 0..63; n == 0 leaves x unchanged, n >= 32	*/
/* moves the high word into the low word and clears the high word		*/
#define	rsh_eq(x,n)											\
	do														\
	{	if((n) > 31)										\
		{	(x)[0] = (x)[1] >> ((n) & 31); (x)[1] = 0;		\
		}													\
		else if((n) > 0)									\
		{	(x)[0] = ((x)[0] >> (n)) | ((x)[1] << (-(n) & 31));	\
			(x)[1] = (x)[1] >> (n);							\
		}													\
	}														\
	while(0)

/* low 16 bits of a word, and the word shifted right by 16 (its high	*/
/* 16 bits when u4byte is a 32-bit type)								*/
#define lo(x)	((x) & 0x0000ffff)
#define hi(x)	((x) >> 16)

/*
 * Multiply two 64-bit values and keep the low 64 bits of the product.
 *
 * r    - receives the result, low word in r[0], high word in r[1]
 * x, y - the operands, in the same two-word layout
 *
 * The operands are split into 16-bit limbs and multiplied schoolbook
 * fashion, so every partial product fits in one u4byte word.  Only the
 * limbs that contribute to bits 0..63 are formed; the carry out of bit 63
 * is dropped.  x and y are read completely before r is written.
 */
void mult_64(u8byte r, const u8byte x, const u8byte y)
{	u4byte	x0, x1, x2, x3, y0, y1, y2, y3, t0, t1, t2, t3, c;

	/* 16-bit limbs, least significant first */
	x0 = lo(x[0]); x1 = hi(x[0]); x2 = lo(x[1]); x3 = hi(x[1]);
	y0 = lo(y[0]); y1 = hi(y[0]); y2 = lo(y[1]); y3 = hi(y[1]);

	/* result bits 0..15 */
	t0 = x0 * y0;
	r[0] = lo(t0);
	c = hi(t0);

	/* result bits 16..31: (c << 16) keeps only the low 16 bits of the column sum */
	t0 = x0 * y1; t1 = x1 * y0;
	c += lo(t0) + lo(t1);
	r[0] += (c << 16);
	c = hi(c) + hi(t0) + hi(t1);

	/* result bits 32..47 */
	t0 = x0 * y2; t1 = x1 * y1; t2 = x2 * y0;
	c += lo(t0) + lo(t1) + lo(t2);
	r[1] = lo(c);
	c = hi(c) + hi(t0) + hi(t1) + hi(t2);

	/* result bits 48..63; anything carried beyond bit 63 is discarded */
	t0 = x0 * y3; t1 = x1 * y2; t2 = x2 * y1; t3 = x3 * y0;
	c += lo(t0) + lo(t1) + lo(t2) + lo(t3);
	r[1] += (c << 16);
}

u8byte	l_key[286];	/* key schedule: 256 mixed words followed by a copy of the first 30 */

/*
 * Build the key schedule in l_key from the user supplied key.
 *
 * in_key  - key material as u4byte words; words 0..3 are always read,
 *           words 4..5 only when key_len > 128 and words 6..7 only when
 *           key_len > 192
 * key_len - key length in bits
 *
 * Returns l_key viewed as an array of u4byte words.
 */
u4byte *set_key(const u4byte in_key[], const u4byte key_len)
{	u8byte	mix[8], tmp;
	u4byte	w, pass, xs;	/* xs is the scratch word that sub_eq() assigns to */

	/* seed words: p119 + 3, e19 * key_len and r220 rotated left by 3 bits */
	l_key[0][0] = p119[0] + 3;
	l_key[0][1] = p119[1];

	tmp[0] = key_len;
	tmp[1] = 0;
	mult_64(l_key[1], e19, tmp);

	l_key[2][0] = (r220[0] << 3) | (r220[1] >> 29);
	l_key[2][1] = (r220[1] << 3) | (r220[0] >> 29);

	/* l_key[w] = l_key[w-1] + (l_key[w-2] ^ ((l_key[w-3] >> 23) | (l_key[w-3] << 41))) */
	for(w = 3; w < 256; ++w)
	{
		tmp[0] = l_key[w - 3][0];
		tmp[1] = l_key[w - 3][1];
		lsh_eq(tmp, 41);

		l_key[w][0] = l_key[w - 3][0];
		l_key[w][1] = l_key[w - 3][1];
		rsh_eq(l_key[w], 23);

		or_eq(l_key[w], tmp);
		xor_eq(l_key[w], l_key[w - 2]);
		add_eq(l_key[w], l_key[w - 1]);
	}

	/* fold the user key into the leading words; each pair of key words	*/
	/* goes into the high half first, then the low half					*/
	l_key[0][1] ^= io_swap(in_key[0]);
	l_key[0][0] ^= io_swap(in_key[1]);
	l_key[1][1] ^= io_swap(in_key[2]);
	l_key[1][0] ^= io_swap(in_key[3]);

	if(key_len > 128)
	{
		l_key[2][1] ^= io_swap(in_key[4]);
		l_key[2][0] ^= io_swap(in_key[5]);
	}

	if(key_len > 192)
	{
		l_key[3][1] ^= io_swap(in_key[6]);
		l_key[3][0] ^= io_swap(in_key[7]);
	}

	/* the mixing state starts as the last eight schedule words */
	for(w = 0; w < 8; ++w)
	{
		mix[w][0] = l_key[248 + w][0];
		mix[w][1] = l_key[248 + w][1];
	}

	/* three mixing passes, each rewriting all 256 schedule words in place */
	for(pass = 0; pass < 3; ++pass)
	{
		for(w = 0; w < 256; ++w)
		{
			/* mix[0] ^= (l_key[w] ^ l_key[(w + 83) & 255]) + l_key[mix[0] & 255] */
			tmp[0] = l_key[w][0];
			tmp[1] = l_key[w][1];
			xor_eq(tmp, l_key[(w + 83) & 255]);
			add_eq(tmp, l_key[mix[0][0] & 255]);
			xor_eq(mix[0], tmp);

			add_eq(mix[1], mix[0]);
			xor_eq(mix[3], mix[2]);
			sub_eq(mix[5], mix[4]);
			xor_eq(mix[7], mix[6]);

			/* mix[3] += mix[0] >> 13 */
			tmp[0] = mix[0][0];
			tmp[1] = mix[0][1];
			rsh_eq(tmp, 13);
			add_eq(mix[3], tmp);

			/* mix[4] ^= mix[1] << 11 */
			tmp[0] = mix[1][0];
			tmp[1] = mix[1][1];
			lsh_eq(tmp, 11);
			xor_eq(mix[4], tmp);

			/* mix[5] ^= mix[3] << (mix[1] & 31) */
			tmp[0] = mix[3][0];
			tmp[1] = mix[3][1];
			lsh_eq(tmp, mix[1][0] & 31);
			xor_eq(mix[5], tmp);

			/* mix[6] += mix[2] >> 17 */
			tmp[0] = mix[2][0];
			tmp[1] = mix[2][1];
			rsh_eq(tmp, 17);
			add_eq(mix[6], tmp);

			/* mix[7] |= mix[3] + mix[4] */
			tmp[0] = mix[3][0];
			tmp[1] = mix[3][1];
			add_eq(tmp, mix[4]);
			or_eq(mix[7], tmp);

			/* mix[2] -= mix[5] */
			sub_eq(mix[2], mix[5]);

			/* mix[0] -= mix[6] ^ w */
			tmp[0] = mix[6][0] ^ w;
			tmp[1] = mix[6][1];
			sub_eq(mix[0], tmp);

			/* mix[1] ^= mix[5] + p119 */
			tmp[0] = mix[5][0];
			tmp[1] = mix[5][1];
			add_eq(tmp, p119);
			xor_eq(mix[1], tmp);

			/* mix[2] += mix[7] >> pass */
			tmp[0] = mix[7][0];
			tmp[1] = mix[7][1];
			rsh_eq(tmp, pass);
			add_eq(mix[2], tmp);

			xor_eq(mix[2], mix[1]);
			sub_eq(mix[4], mix[3]);
			xor_eq(mix[6], mix[5]);
			add_eq(mix[0], mix[7]);

			/* l_key[w] = mix[2] + mix[6] */
			l_key[w][0] = mix[2][0];
			l_key[w][1] = mix[2][1];
			add_eq(l_key[w], mix[6]);
		}
	}

	/* repeat the first 30 words after the table so that indices of up	*/
	/* to 285 can be used without wrapping								*/
	for(w = 0; w < 30; ++w)
	{
		l_key[256 + w][0] = l_key[w][0];
		l_key[256 + w][1] = l_key[w][1];
	}

	return (u4byte*)l_key;
}

/*
 * Encrypt one block of four u4byte words.
 *
 * in_blk  - input block; words 0,1 form the 64-bit value s0 and words 2,3
 *           form s1, each pair taken high word first after io_swap()
 * out_blk - receives the result in the same layout
 *
 * Reads the global key schedule l_key (filled in by set_key()) and the
 * global spice array.  The comments below write each step as the 64-bit
 * operation it performs; all arithmetic is modulo 2^64.
 */

void encrypt(const u4byte in_blk[4], u4byte out_blk[4])
{	u8byte	s0, s1, k, kk, t;
	u4byte	tt, xs;		/* xs is the scratch word that sub_eq() assigns to */
	s4byte	i;

	s0[1] = io_swap(in_blk[0]); s0[0] = io_swap(in_blk[1]);
	s1[1] = io_swap(in_blk[2]); s1[0] = io_swap(in_blk[3]);

	/* s0 += l_key[128]; s1 += l_key[129] */
	add_eq(s0, l_key[128]); add_eq(s1, l_key[129]);

	/* eight rounds; the round number i selects spice words and key words */
	for(i = 0; i < 8; ++i)
	{
		/* k = l_key[s0 & 255] */
		tt = s0[0] & 255; k[0] = l_key[tt][0]; k[1] = l_key[tt][1]; 
		/* s1 += k; s0 ^= k << 8; s1 ^= s0 */
		add_eq(s1, k); 	lsh_eq(k, 8); xor_eq(s0, k); xor_eq(s1, s0);
		/* s0 -= s1 >> 11 */
		t[0] = s1[0]; t[1] = s1[1]; rsh_eq(t, 11); sub_eq(s0, t);
		/* s0 ^= s1 << 2 */
		t[0] = s1[0]; t[1] = s1[1]; lsh_eq(t, 2); xor_eq(s0, t);
		/* s0 -= spice[i ^ 4] */
		sub_eq(s0, spice[i ^ 4]);
		/* s0 += (s0 << 32) ^ (p119 + 128) */
		t[0] = s0[0]; t[1] = s0[1]; lsh_eq(t, 32);  
		kk[0] = p119[0] + 128; kk[1] = p119[1]; xor_eq(t, kk); add_eq(s0, t);
		/* s0 ^= s0 >> 17 */
		t[0] = s0[0]; t[1] = s0[1]; rsh_eq(t, 17); xor_eq(s0, t);
		/* s0 ^= s0 >> 34 */
		t[0] = s0[0]; t[1] = s0[1]; rsh_eq(t, 34); xor_eq(s0, t);		
		/* s0 ^= spice[i] */
		t[0] = spice[i][0]; t[1] = spice[i][1]; xor_eq(s0, t);
		/* s0 += spice[i] << 5 */
		lsh_eq(t, 5); add_eq(s0, t); 
		/* t = spice[i] >> 4; s1 += t; s0 ^= t */
		t[0] = spice[i][0]; t[1] = spice[i][1]; rsh_eq(t, 4);
		add_eq(s1, t); xor_eq(s0, t);
		/* s0 += s0 << (22 + (s0 & 31)): data dependent shift of 22..53 bits */
		t[0] = s0[0]; t[1] = s0[1]; lsh_eq(t, 22 + (s0[0] & 31)); add_eq(s0, t);
		/* s0 ^= s0 >> 23 */
		t[0] = s0[0]; t[1] = s0[1]; rsh_eq(t, 23); xor_eq(s0, t);
		/* s0 -= spice[i ^ 7] */
		sub_eq(s0, spice[i ^ 7]);
		/* k = l_key[s0 & 255] */
		tt = s0[0] & 255; k[0] = l_key[tt][0]; k[1] = l_key[tt][1]; 
		/* kk = l_key[(s0 & 255) + 3 * i + 1]; the index can reach 277,	*/
		/* which lies in the copied words past l_key[255]				*/
		tt += 3 * i + 1; kk[0] = l_key[tt][0]; kk[1] = l_key[tt][1]; 
		/* s1 ^= k; s0 ^= kk << 8; kk ^= k */
		xor_eq(s1, k); t[0] = kk[0]; t[1] = kk[1]; lsh_eq(t, 8); 
		xor_eq(s0, t); xor_eq(kk, k); 
		/* s1 += kk >> 5 */
		t[0] = kk[0]; t[1] = kk[1]; rsh_eq(t, 5); add_eq(s1, t);
		/* s0 -= kk << 12 */
		t[0] = kk[0]; t[1] = kk[1]; lsh_eq(t, 12); sub_eq(s0, t);
		/* s0 ^= kk with its low 8 bits cleared; s1 += s0 */
		kk[0] &= ~255; xor_eq(s0, kk); 	add_eq(s1, s0);
		/* s0 += s1 << 3 */
		t[0] = s1[0]; t[1] = s1[1]; lsh_eq(t, 3); add_eq(s0, t);
		/* s0 ^= spice[i ^ 2]; s0 += l_key[144 + i] */
		xor_eq(s0, spice[i ^ 2]); add_eq(s0, l_key[144 + i]);
		/* s0 += s0 << 22 */
		t[0] = s0[0]; t[1] = s0[1]; lsh_eq(t, 22); add_eq(s0, t);
		/* s0 ^= s1 >> 4 */
		t[0] = s1[0]; t[1] = s1[1]; rsh_eq(t, 4); xor_eq(s0, t);
		/* s0 += spice[i ^ 1] */
		add_eq(s0, spice[i ^ 1]);
		/* s0 ^= s0 >> (33 + i) */
		t[0] = s0[0]; t[1] = s0[1]; rsh_eq(t, 33 + i); xor_eq(s0, t);
	}

	/* s0 += l_key[136]; s1 += l_key[137] */
	add_eq(s0, l_key[136]); add_eq(s1, l_key[137]);

	out_blk[0] = io_swap(s0[1]); out_blk[1] = io_swap(s0[0]);
	out_blk[2] = io_swap(s1[1]); out_blk[3] = io_swap(s1[0]);
};

/*
 * Decrypt one block of four u4byte words.
 *
 * in_blk  - input block; words 0,1 form the 64-bit value s0 and words 2,3
 *           form s1, each pair taken high word first after io_swap()
 * out_blk - receives the result in the same layout
 *
 * Reads the global key schedule l_key (filled in by set_key()) and the
 * global spice array.  The rounds run from 7 down to 0 and each step
 * reverses one step of encrypt(), in the opposite order.  The comments
 * write each step as a 64-bit operation; all arithmetic is modulo 2^64.
 */

void decrypt(const u4byte in_blk[4], u4byte out_blk[4])
{	u8byte	s0, s1, k, kk, t;
	u4byte	tt, xs;		/* xs is the scratch word that sub_eq() assigns to */
	s4byte	i;			/* signed so that the i >= 0 loop test can fail */

	s0[1] = io_swap(in_blk[0]); s0[0] = io_swap(in_blk[1]);
	s1[1] = io_swap(in_blk[2]); s1[0] = io_swap(in_blk[3]);

	/* s0 -= l_key[136]; s1 -= l_key[137] */
	sub_eq(s0, l_key[136]); sub_eq(s1, l_key[137]);

	for(i = 7; i >= 0; --i)
	{
		/* s0 ^= s0 >> (33 + i); its own inverse because the shift is >= 32 */
		t[0] = s0[0]; t[1] = s0[1]; rsh_eq(t, 33 + i); xor_eq(s0, t);
		/* s0 -= spice[i ^ 1] */
		sub_eq(s0, spice[i ^ 1]);
		/* s0 ^= s1 >> 4 */
		t[0] = s1[0]; t[1] = s1[1]; rsh_eq(t, 4); xor_eq(s0, t);
		/* s0 -= (s0 - (s0 << 22)) << 22, which reverses s0 += s0 << 22;	*/
		/* then s0 -= l_key[144 + i]										*/
		k[0] = s0[0]; k[1] = s0[1]; lsh_eq(k, 22);  
		t[0] = s0[0]; t[1] = s0[1]; sub_eq(t, k); 
		lsh_eq(t, 22); sub_eq(s0, t); sub_eq(s0, l_key[144 + i]);
		/* s0 ^= spice[i ^ 2]; s0 -= s1 << 3; s1 -= s0 */
		xor_eq(s0, spice[i ^ 2]); t[0] = s1[0]; t[1] = s1[1]; lsh_eq(t, 3); 
		sub_eq(s0, t); sub_eq(s1, s0); 
		/* k = l_key[s0 & 255]; the low 8 bits of s0 are not changed by	*/
		/* the key steps below, so this is the index encrypt() used		*/
		tt = s0[0] & 255; k[0] = l_key[tt][0]; k[1] = l_key[tt][1];
		/* kk = l_key[(s0 & 255) + 3 * i + 1] ^ k */
		tt += 3 * i + 1; kk[0] = l_key[tt][0]; kk[1] = l_key[tt][1]; xor_eq(kk, k); 
		/* s0 ^= kk with its low 8 bits cleared */
		t[0] = kk[0] & ~255; t[1] = kk[1]; xor_eq(s0, t);
		/* s0 += kk << 12 */
		t[0] = kk[0]; t[1] = kk[1]; lsh_eq(t, 12); add_eq(s0, t);
		/* s1 -= kk >> 5 */
		t[0] = kk[0]; t[1] = kk[1]; rsh_eq(t, 5); sub_eq(s1, t);
		/* kk = l_key[tt] << 8 (tt still holds the second index);	*/
		/* s0 ^= kk; s1 ^= k; s0 += spice[i ^ 7]					*/
		kk[0] = l_key[tt][0]; kk[1] = l_key[tt][1]; lsh_eq(kk, 8); 
		xor_eq(s0, kk); xor_eq(s1, k); add_eq(s0, spice[i ^ 7]); 
		/* s0 ^= s0 >> 23; s0 ^= s0 >> 46: together these reverse a	*/
		/* single s0 ^= s0 >> 23										*/
		t[0] = s0[0]; t[1] = s0[1]; rsh_eq(t, 23); xor_eq(s0, t);
		t[0] = s0[0]; t[1] = s0[1]; rsh_eq(t, 46); xor_eq(s0, t);
		/* tt = 22 + (s0 & 31); s0 -= (s0 - (s0 << tt)) << tt, which	*/
		/* reverses s0 += s0 << tt (the low 5 bits of s0 are unchanged	*/
		/* by that step, so tt is the shift encrypt() used)				*/
		tt = 22 + (s0[0] & 31); t[0] = s0[0]; t[1] = s0[1]; lsh_eq(t, tt);
		kk[0] = s0[0]; kk[1] = s0[1]; sub_eq(kk, t); lsh_eq(kk, tt); sub_eq(s0, kk);
		/* t = spice[i]; kk = spice[i] >> 4;					*/
		/* s0 ^= kk; s1 -= kk; s0 -= t << 5; s0 ^= t			*/
		t[0] = kk[0] = spice[i][0]; t[1] = kk[1] = spice[i][1]; rsh_eq(kk, 4);
		xor_eq(s0, kk); sub_eq(s1, kk); k[0] = t[0]; k[1] = t[1]; lsh_eq(k, 5); 
		sub_eq(s0, k); xor_eq(s0, t); 
		/* s0 ^= s0 >> 17: reverses the pair s0 ^= s0 >> 17; s0 ^= s0 >> 34	*/
		/* (that pair xors in s0 >> 17, >> 34 and >> 51)					*/
		t[0] = s0[0]; t[1] = s0[1]; rsh_eq(t, 17); xor_eq(s0, t);
		/* t = p119 + 128; k = (s0 - t) << 32; s0 -= t ^ k, which	*/
		/* reverses s0 += (s0 << 32) ^ (p119 + 128)					*/
		t[0] = p119[0] + 128; t[1] = p119[1]; k[0] = s0[0]; k[1] = s0[1];
		sub_eq(k, t); lsh_eq(k, 32); xor_eq(t, k); sub_eq(s0, t);
		/* s0 += spice[i ^ 4]; s0 ^= s1 << 2; s0 += s1 >> 11 */
		add_eq(s0, spice[i ^ 4]); t[0] = s1[0]; t[1] = s1[1]; lsh_eq(t, 2); 
		xor_eq(s0, t); t[0] = s1[0]; t[1] = s1[1]; rsh_eq(t, 11); add_eq(s0, t);
		/* s1 ^= s0; k = l_key[s0 & 255]; s0 ^= k << 8; s1 -= k */
		xor_eq(s1, s0); tt = s0[0] & 255; k[0] = l_key[tt][0]; k[1] = l_key[tt][1];
		t[0] = k[0]; t[1] = k[1]; lsh_eq(t, 8); xor_eq(s0, t); sub_eq(s1, k);
	}

	/* s0 -= l_key[128]; s1 -= l_key[129] */
	sub_eq(s0, l_key[128]); sub_eq(s1, l_key[129]);

	out_blk[0] = io_swap(s0[1]); out_blk[1] = io_swap(s0[0]);
	out_blk[2] = io_swap(s1[1]); out_blk[3] = io_swap(s1[0]);
};
