
	;; Rijndael Encryption Algorithm, in 80186/286 assembler, version 1.2
	;; Copyright (C) 2000 Rafael R. Sevilla
	;;
	;; This library of encryption routines is free software; you can
	;; redistribute it and/or modify it under the terms of the GNU Lesser
	;; General Public License as published by the Free Software Foundation;
	;; either version 2 of the License, or (at your option) any later
	;; version.
	;;
	;; This Rijndael Encryption code is distributed in the hope it will
	;; be useful, but WITHOUT ANY WARRANTY; without even the implied
	;; warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
	;; See the GNU Lesser General Public License for more details.
	;;
	;; You should have received a copy of the GNU Lesser General Public
	;; License along with this Rijndael Encryption code; see the file
	;; COPYING.LIB.  If not, write to the Free Software Foundation, Inc.,
	;; 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
	;;
	;; Note that the only 80186 instructions here are shr/shl instructions
	;; with multibit counts, and these only appear in the key expansion
	;; function.
	;;
	;; The modification here has incorporated a few changes suggested by
	;; Robert G. Durnal (afn21533@afn.org), and a major bugfix in the
	;; key generation code.
	;; 
	;; Rijndael was developed by Joan Daemen and Vincent Rijmen
	;;

SECTION	.data

	;; S-box for Rijndael: 256 one-byte entries, indexed by the byte to
	;; substitute.  Read with xlatb (BX = sbox, AL = index) by the byte
	;; substitution step of _rijndael_encrypt and by _rijndael_keygen.
sbox:	db	 99, 124, 119, 123, 242, 107, 111, 197,  48,   1, 103,  43
	db	254, 215, 171, 118, 202, 130, 201, 125, 250,  89,  71, 240
	db	173, 212, 162, 175, 156, 164, 114, 192, 183, 253, 147,  38
	db	 54,  63, 247, 204,  52, 165, 229, 241, 113, 216,  49,  21
	db	  4, 199,  35, 195,  24, 150,   5, 154,   7,  18, 128, 226
	db	235,  39, 178, 117,   9, 131,  44,  26,  27, 110,  90, 160
	db	 82,  59, 214, 179,  41, 227,  47, 132,  83, 209,   0, 237
	db	 32, 252, 177,  91, 106, 203, 190,  57,  74,  76,  88, 207
	db	208, 239, 170, 251,  67,  77,  51, 133,  69, 249,   2, 127
	db	 80,  60, 159, 168,  81, 163,  64, 143, 146, 157,  56, 245
	db	188, 182, 218,  33,  16, 255, 243, 210, 205,  12,  19, 236
	db	 95, 151,  68,  23, 196, 167, 126,  61, 100,  93,  25, 115
	db	 96, 129,  79, 220,  34,  42, 144, 136,  70, 238, 184,  20
	db	222,  94,  11, 219, 224,  50,  58,  10,  73,   6,  36,  92
	db	194, 211, 172,  98, 145, 149, 228, 121, 231, 200,  55, 109
	db	141, 213,  78, 169, 108,  86, 244, 234, 101, 122, 174,   8
	db	186, 120,  37,  46,  28, 166, 180, 198, 232, 221, 116,  31
	db	 75, 189, 139, 138, 112,  62, 181, 102,  72,   3, 246,  14
	db	 97,  53,  87, 185, 134, 193,  29, 158, 225, 248, 152,  17
	db	105, 217, 142, 148, 155,  30, 135, 233, 206,  85,  40, 223
	db	140, 161, 137,  13, 191, 230,  66, 104,  65, 153,  45,  15
	db	176,  84, 187,  22

	;; Inverse S-box for Rijndael: 256 one-byte entries, indexed by the
	;; substituted byte (e.g. sbox[0] = 99 and isbox[99] = 0).  Read with
	;; xlatb (BX = isbox, AL = index) by _rijndael_decrypt.
isbox:	db	 82,   9, 106, 213,  48,  54, 165,  56, 191,  64, 163, 158
	db	129, 243, 215, 251, 124, 227,  57, 130, 155,  47, 255, 135
	db	 52, 142,  67,  68, 196, 222, 233, 203,  84, 123, 148,  50
	db	166, 194,  35,  61, 238,  76, 149,  11,  66, 250, 195,  78
	db	  8,  46, 161, 102,  40, 217,  36, 178, 118,  91, 162,  73
	db	109, 139, 209,  37, 114, 248, 246, 100, 134, 104, 152,  22
	db	212, 164,  92, 204,  93, 101, 182, 146, 108, 112,  72,  80
	db	253, 237, 185, 218,  94,  21,  70,  87, 167, 141, 157, 132
	db	144, 216, 171,   0, 140, 188, 211,  10, 247, 228,  88,   5
	db	184, 179,  69,   6, 208,  44,  30, 143, 202,  63,  15,   2
	db	193, 175, 189,   3,   1,  19, 138, 107,  58, 145,  17,  65
	db	 79, 103, 220, 234, 151, 242, 207, 206, 240, 180, 230, 115
	db	150, 172, 116,  34, 231, 173,  53, 133, 226, 249,  55, 232
	db	 28, 117, 223, 110,  71, 241,  26, 113,  29,  41, 197, 137
	db	111, 183,  98,  14, 170,  24, 190,  27, 252,  86,  62,  75
	db	198, 210, 121,  32, 154, 219, 192, 254, 120, 205,  90, 244
	db	 31, 221, 168,  51, 136,   7, 199,  49, 177,  18,  16,  89
	db	 39, 128, 236,  95,  96,  81, 127, 169,  25, 181,  74,  13
	db	 45, 229, 122, 159, 147, 201, 156, 239, 160, 224,  59,  77
	db	174,  42, 245, 176, 200, 235, 187,  60, 131,  83, 153,  97
	db	 23,  43,   4, 126, 186, 119, 214,  38, 225, 105,  20,  99
	db	 85,  33,  12, 125

	;; xtime lookup table: 256 one-byte entries, xtime[b] = b*02 in
	;; GF(2^8).  For b < 0x80 the entry is simply 2*b; for b >= 0x80 it
	;; is the low byte of 2*b XORed with 0x1b (e.g. xtime[0x80] = 0x1b).
	;; Read with xlatb (BX = xtime) by the column mixing code in
	;; _rijndael_encrypt and _rijndael_decrypt; repeated lookups give
	;; the multiples by 04 and 08.
xtime:	db	0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e
	db	0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e
	db	0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e
	db	0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e
	db	0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e
	db	0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e
	db	0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e
	db	0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e
	db	0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e
	db	0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e
	db	0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae
	db	0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe
	db	0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce
	db	0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde
	db	0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee
	db	0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe
	db	0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15
	db	0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05
	db	0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35
	db	0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25
	db	0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55
	db	0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45
	db	0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75
	db	0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65
	db	0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95
	db	0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85
	db	0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5
	db	0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5
	db	0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5
	db	0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5
	db	0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5
	db	0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5

	;; Round constants for key schedule (30 one-byte entries).
	;; _rijndael_keygen XORs rcon[i/8 - 1] into the first byte of the
	;; transformed word whenever the word index i is a multiple of 8;
	;; with i running from 8 to 59 only the first seven entries are read.
rcon:	db	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
	db	0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6
	db	0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91

SECTION	.text

	;; Small procedure to mix (XOR) one 16-byte round key into the state.
	;;
	;; In:   SI = address of the round key to mix in
	;;       DI = address of the 16-byte state (updated in place)
	;;       Both addresses are DS-relative.
	;; Out:  state ^= key; SI and DI have each advanced by 16.
	;; Destroys AX, DX, SI, DI and the flags.  BX and CX are not touched
	;; (the callers keep the round-key offset in CX across this call).
	;;
	;; Plain moves are used instead of lodsw/stosw so that the result
	;; depends neither on the direction flag nor on ES: the callers do
	;; not always establish DF=0 before calling here.
mixkey:	mov	dx,8		; 8 words = 16 bytes
.mix_word:
	mov	ax,[si]		; next word of the round key
	xor	[di],ax		; fold it into the state in place
	add	si,2
	add	di,2
	dec	dx
	jnz	.mix_word
	ret

	global	_rijndael_encrypt
	;; Encrypt one 16-byte block in place (14 rounds).
	;;
	;; Stack arguments (near call, read after `push bp'):
	;;   [bp+4]  address of the 16-byte cipher state; overwritten with
	;;           the result
	;;   [bp+6]  base address of the round keys, 16 bytes each; the keys
	;;           at offsets 0, 16, ..., 224 are used
	;;
	;; State layout as accessed here: four rows of four bytes, row r at
	;; offset 4*r, so the bytes of column j live at j, j+4, j+8, j+12.
	;;
	;; Preserves SI, DI and BP; destroys AX, BX, CX, DX and the flags.
	;; The direction flag is left clear.
	;; NOTE(review): stosb/stosw store through ES while every other
	;; access here uses DS, so this presumably expects ES == DS --
	;; confirm against the caller's memory model.
_rijndael_encrypt:
	push	bp
	mov	bp,sp
	push	si
	push	di
	cld			; string ops must auto-increment; this has to
				; come before the first mixkey call
	mov	di,[bp+4]	; Cipher state (plaintext)
	mov	si,[bp+6]	; first round key
	call	mixkey		; initial key addition (key at offset 0)
	mov	cx,16		; round key offset (16*round_number)
.round_top:
	mov	ah,16		; apply the s-box to the 16 bytes
	mov	si,[bp+4]	; load cipher state address
	mov	di,si
	mov	bx,sbox		; get address of s-box
.sbox_loop:
	lodsb			; load a byte from the state
	xlatb			; translate using the s-box
	stosb			; store back to the state
	dec	ah		; loop until done
	jnz	.sbox_loop
	;; Rotate (cyclically shift) row 1 by one, row 2 by two and row 3
	;; by three.  Row 0 is left alone.
	mov	si,[bp+4]
	add	si,4		; skip row 0, point to row 1
	mov	di,si
	mov	bl,1		; row number (also shift quantity)
.rotate_rows:
	;; load entire row into ax:dx
	lodsw			; first half of row
	xchg	dx,ax		; lives in DX
	lodsw			; second half of row lives in AX
	;; Now dl = original first, dh = original second, al = original third
	;; ah = original fourth.  We thus rotate ax:dx a number of bytes
	;; equal to the row number and then store back.
	mov	bh,bl		; make copy of row num. for looping
	;; A single rotation is (0 3 2 1) expressible as (0,1)(0,2)(0,3) as a
	;; product of transpositions.  One pass turns the row (a,b,c,d) into
	;; (b,c,d,a); seen as the 32-bit value ax:dx this is a right shift.
.do_rotate:
	xchg	ah,dl		; (0,3)
	xchg	al,dl		; (0,2)
	xchg	dh,dl		; (0,1)
	dec	bh
	jnz	.do_rotate
	;; store the row back
	xchg	ax,dx		; make ax the first half of row first
	stosw			; store first half of row
	xchg	ax,dx		; make ax the second
	stosw			; store second half of row
	inc	bl
	cmp	bl,3		; have we gotten to row 3?
	jbe	.rotate_rows	; if not, keep rotating
	cmp	cx,224		; are we at round 14 (16*14)?
	jne	.mixcolumn_begin ; if not, do a mixcolumn
	jmp	.finalize	; else finalize (last round has no mixcolumn)
.mixcolumn_begin:
	;; Do the MixColumn transformation.  For each column with bytes
	;; a0..a3 and tmp = a0^a1^a2^a3, byte k becomes
	;; a[k] ^ tmp ^ xtime[a[k] ^ a[k+1 mod 4]].
	mov	si,[bp+4]	; load the address of the state
	mov	di,4		; column loop counter
	mov	bx,xtime	; load xtime table in bx
	push	cx		; CL/CH are needed as scratch below
.do_mixcolumn:
	mov	cl,[si]		; zeroth byte of column
	mov	ch,[si+4]	; first byte of column
	mov	dl,[si+8]	; second byte of column
	mov	dh,[si+12]	; third byte of column
	mov	ah,cl		; let ah = tmp
	xor	ah,ch
	xor	ah,dl
	xor	ah,dh
	mov	al,cl		; a[j]
	xor	al,ch		; a[j]^a[j+4]
	xlatb			; xtimetbl[a[j]^a[j+4]]
	xor	al,ah		; xtimetbl[a[j]^a[j+4]] ^ tmp
	xor	[si],al		; store back to state by xoring with orig
	mov	al,ch		; a[j+4]
	xor	al,dl		; a[j+4]^a[j+8]
	xlatb			; xtimetbl[a[j+4]^a[j+8]]
	xor	al,ah		; xtimetbl[a[j+4]^a[j+8]] ^ tmp
	xor	[si+4],al	; store back to state by xoring with orig
	mov	al,dl		; a[j+8]
	xor	al,dh		; a[j+8]^a[j+12]
	xlatb			; xtimetbl[a[j+8]^a[j+12]]
	xor	al,ah		; xtimetbl[a[j+8]^a[j+12]] ^ tmp
	xor	[si+8],al	; store back to state by xoring with orig
	mov	al,dh		; a[j+12]
	xor	al,cl		; a[j+12]^a[j]
	xlatb			; xtimetbl[a[j+12]^a[j]]
	xor	al,ah		; xtimetbl[a[j+12]^a[j]] ^ tmp
	xor	[si+12],al	; store back to state by xoring with orig
	inc	si		; point to next column
	dec	di
	jnz	.do_mixcolumn
	pop	cx		; restore round key offset
	mov	di,[bp+4]	; Cipher state
	mov	si,[bp+6]	; base address of round keys
	add	si,cx		; make si address of current round key
	call	mixkey		; mix the key
	add	cx,16		; increment the loop counter
	jmp	.round_top
.finalize:
	;; Perform a final key mixing before finishing up
	mov	di,[bp+4]	; address of state
	mov	si,[bp+6]	; base address of key schedule
	add	si,224		; last key in key schedule
	call	mixkey
	pop	di
	pop	si
	pop	bp
	ret

	global	_rijndael_decrypt
	;; Decrypt one 16-byte block in place (inverse of _rijndael_encrypt).
	;;
	;; Stack arguments (near call, read after `push bp'):
	;;   [bp+4]  address of the 16-byte cipher state; overwritten with
	;;           the result
	;;   [bp+6]  base address of the round keys, 16 bytes each; the keys
	;;           at offsets 224, 208, ..., 16, 0 are applied in that order
	;;
	;; State layout as accessed here: four rows of four bytes, row r at
	;; offset 4*r, so the bytes of column j live at j, j+4, j+8, j+12.
	;;
	;; Preserves SI, DI and BP; destroys AX, BX, CX, DX and the flags.
	;; The direction flag is left clear.
	;; NOTE(review): stosb/stosw store through ES while every other
	;; access here uses DS, so this presumably expects ES == DS --
	;; confirm against the caller's memory model.
_rijndael_decrypt:
	push	bp
	mov	bp,sp
	push	si
	push	di
	cld			; string ops below must auto-increment
	mov	cx,224		; round key offset (14*16)
	;; Perform the initial key mixing operation
	mov	di,[bp+4]	; address of state
	mov	si,[bp+6]	; base address of key schedule
	add	si,224		; last key in key schedule
	call	mixkey
	;; The first round doesn't perform the inverse column mixing.
	jmp	.start_isbox
.round_top:
	mov	di,[bp+4]	; Cipher state
	mov	si,[bp+6]	; base address of round keys
	add	si,cx		; make si address of current round key
	call	mixkey
	;; The inverse column mixing is much more ticklish than the straight
	;; mix...  Each output byte is a0*0e ^ a1*0b ^ a2*0d ^ a3*09, where
	;; a0..a3 is the column rotated so that a0 is the byte being replaced.
	mov	si,[bp+4]	; load the address of the state
	mov	di,4		; column loop counter
	mov	bx,xtime	; load xtime table in bx
	push	cx		; CL/CH hold column bytes below
	push	bp		; BP is borrowed as the offset within the column
.do_invmixcolumn:
	mov	cl,[si]		; zeroth byte of column
	mov	ch,[si+4]	; first byte of column
	mov	dl,[si+8]	; second byte of column
	mov	dh,[si+12]	; third byte of column
	xor	bp,bp		; bp is the offset in column
	;; Multiply by 0x0e in GF(2^8) (`*' denotes multiplication in GF(2^8))
.invmix_onecolumn:
	mov	al,cl
	xlatb			; a0*02
	mov	ah,al
	xlatb			; a0*04
	xor	ah,al		; (a0*02)^(a0*04) = ah
	xlatb			; a0*08
	xor	ah,al		; (a0*02)^(a0*04)^(a0*08) = a0*0e
	mov	al,ch		; a1
	xor	ah,al
	xlatb			; a1*02
	xor	ah,al
	xlatb			; a1*04
	xlatb			; a1*08
	xor	ah,al		; a1^(a1*02)^(a1*08) = a1*0b
	mov	al,dl		; a2
	xor	ah,al
	xlatb			; a2*02
	xlatb			; a2*04
	xor	ah,al
	xlatb			; a2*08
	xor	ah,al		; a2^(a2*04)^(a2*08) = a2*0d
	mov	al,dh		; a3
	xor	ah,al
	xlatb			; a3*02
	xlatb			; a3*04
	xlatb			; a3*08
	xor	ah,al		; a3^(a3*08) = a3*09
	mov	[ds:bp+si],ah	; store to state (DS override: BP defaults to SS)
	add	bp,4
	cmp	bp,12		; if we are more than 12
	ja	.end_col_invmix	; stop
	;; Now we rotate cl, ch, dl, and dh one byte to the right so that
	;; when we go back up to .invmix_onecolumn the coefficients will line
	;; up, since the matrix we multiply with is a circulant matrix.  This
	;; rotation can be expressed as a product of transpositions, just as
	;; above: (0,1)(0,2)(0,3).  (a0,a1,a2,a3) becomes (a1,a2,a3,a0).
	xchg	dh,cl		; (0,3)
	xchg	dl,cl		; (0,2)
	xchg	ch,cl		; (0,1)
	jmp	.invmix_onecolumn
.end_col_invmix:
	inc	si		; point to next column
	dec	di
	jnz	.do_invmixcolumn
	pop	bp		; restore saved registers
	pop	cx
.start_isbox:
	mov	ah,16		; apply the inverse s-box to the 16 bytes
	mov	si,[bp+4]	; load cipher state address
	mov	di,si
	mov	bx,isbox	; get address of inverse s-box for xlat
.sbox_loop:
	lodsb			; load a byte from the state
	xlatb			; translate using the inverse s-box
	stosb			; store back to the state
	dec	ah		; loop until done
	jnz	.sbox_loop
	;; Rotate (cyclically shift) row 1 by one, row 2 by two and row 3
	;; by three, as before, but in the opposite direction, inverting
	;; the previous shifts.
	mov	si,[bp+4]
	add	si,4		; skip row 0, point to row 1
	mov	di,si
	mov	bl,1		; shift count
.rotate_rows:
	;; load entire row into ax:dx
	lodsw			; first half of row
	xchg	dx,ax		; lives in DX
	lodsw			; second half of row lives in AX
	;; Now dl = original first, dh = original second, al = original third
	;; ah = original fourth.  We thus rotate ax:dx a number of bytes
	;; equal to the row number and then store back.
	mov	bh,bl		; make copy of shift count for looping
	;; A single inverse rotation is (0 1 2 3), expressible as
	;; (0,3)(0,2)(0,1) as a product of transpositions.  One pass turns
	;; the row (a,b,c,d) into (d,a,b,c).  (left shift of ax:dx)
.do_rotate:
	xchg	dh,dl		; (0,1)
	xchg	al,dl		; (0,2)
	xchg	ah,dl		; (0,3)
	dec	bh
	jnz	.do_rotate
	;; store the row back
	xchg	ax,dx		; make ax the first half of row first
	stosw			; store first half of row
	xchg	ax,dx		; make ax the second
	stosw			; store second half of row
	inc	bl
	cmp	bl,3		; have we gotten to row 3?
	jbe	.rotate_rows	; if not, keep rotating
	sub	cx,16		; step down to the previous round key
	jz	.finalize	; offset 0 left: only the first key remains
	jmp	.round_top
.finalize:
	mov	di,[bp+4]	; Cipher state (almost plaintext)
	mov	si,[bp+6]	; first round key
	call	mixkey
	pop	di
	pop	si
	pop	bp
	ret

	global	_rijndael_keygen
	;; Expand a 32-byte key into the round keys used by
	;; _rijndael_encrypt and _rijndael_decrypt.
	;;
	;; Stack arguments (near call, read after `push bp'):
	;;   [bp+4]  address of the 32-byte user key (read only)
	;;   [bp+6]  address of the output key schedule; 60 four-byte words
	;;           W[0..59] are written, i.e. 240 bytes
	;;
	;; Output layout: the schedule is a sequence of 16-byte blocks and
	;; word W[i] is stored as a column of block i/4, so byte r of W[i]
	;; is at offset 16*(i/4) + 4*r + (i mod 4).
	;;
	;; Preserves SI, DI and BP; destroys AX, BX, CX, DX and the flags.
	;; The direction flag is left clear.
_rijndael_keygen:
	push	bp
	mov	bp,sp
	push	si
	push	di
	cld			; lodsw below must walk the key upwards
	;; Copy the original key data into the W[] array.  This generates
	;; the keys used in the first two rounds.
	mov	si,[bp+4]
	mov	di,[bp+6]
	mov	cx,4
.copy_key1:
	lodsw			; get two bytes of key information
	mov	[di],al
	mov	[di+4],ah
	lodsw			; get another two bytes of key info
	mov	[di+8],al
	mov	[di+12],ah
	inc	di		; point to next column
	dec	cx
	jnz	.copy_key1
	mov	di,[bp+6]
	add	di,16		; point di to next block
	mov	cx,4
.copy_key2:
	lodsw			; get two bytes of key information
	mov	[di],al
	mov	[di+4],ah
	lodsw			; get another two bytes of key info
	mov	[di+8],al
	mov	[di+12],ah
	inc	di		; point to next column
	dec	cx
	jnz	.copy_key2
	mov	cx,8		; index i of the next word W[i] to generate
.keygen_loop:
	;; Fetch the previous word W[i-1].
	mov	si,[bp+6]	; base address of generated keys
	mov	ax,cx
	dec	ax
	mov	dx,ax
	and	ax,0xfffc	; clear low two bits: 4*((i-1)/4)
	shl	ax,2		; 80186 instr.; now 16*((i-1)/4)
	and	dx,3		; keep low two bits: (i-1) mod 4
	add	ax,dx
	add	si,ax		; SI points to column to get
	mov	dl,[si]
	mov	dh,[si+4]
	mov	al,[si+8]
	mov	ah,[si+12]
	;; Now dl = first byte, dh = second, al = third, ah=fourth
	test	cx,00000011b	; is word index div. by 4 or 8?
	jnz	.not_div4	; no...
	;; Apply the S-boxes if word index is divisible by 4 or 8
	mov	bx,sbox		; load address of S-box for xlat
	xlatb			; al = sbox[#3]
	xchg	ah,al		; ah = sbox[#3], al=#4
	xlatb			; ah=sbox[#3], al=sbox[#4]
	xchg	ax,dx		; ah=#2, al=#1, dh=sbox[#3], dl=sbox[#4]
	xlatb			; ah=#2, al=sbox[#1]
	xchg	ah,al		; ah=sbox[#1], al=#2, dh=sbox[#3], dl=sbox[#4]
	xlatb			; al=sbox[#2]
	;; Now, ah=sbox[#1], al=sbox[#2], dh=sbox[#3], dl=sbox[#4].
	;; Test whether word index is divisible by 8.  If so, further
	;; transforms are needed.
	test	cx,00000111b	; is word index div by 8?
	jnz	.not_div8	; no...we're done
	;; What we want is to rotate the data so that ah=sbox[#2], al=sbox[#3],
	;; dh=sbox[#4], and dl=sbox[#1].  This is the cycle (0 1 2 3) which
	;; can be turned into the product of transpositions (impl. by XCHG
	;; instructions) (0 3)(0 2)(0 1)
	xchg	ah,dl		; (0 3)
	xchg	ah,dh		; (0 2)
	xchg	ah,al		; (0,1)
	mov	bx,cx		; get word index
	shr	bx,3		; divide by 8 (80186 instr)
	dec	bx		; rcon is 0-based: i=8 uses rcon[0]
	xor	ah,[rcon+bx]	; xor with the round constant...finished
.not_div8:
	;; Now we have to rearrange the order of the registers again so that
	;; from ah=#1, al=#2, dh=#3, dl=#4 => dl=#1, dh=#2, al=#3, ah=#4, so
	;; that the following code gets the registers where they are expected.
	xchg	ax,dx		; ah=#3, al=#4, dh=#1, dl=#2
	xchg	ah,al		; ah=#4, al=#3
	xchg	dh,dl		; dh=#2, dl=#1
.not_div4:
	;; When we enter here, dl=first byte, dh=second, al=third, ah=fourth
	;; Perform indexing again, this time for W[i-8], and XOR it in.
	mov	bx,cx
	sub	bx,8
	mov	di,bx
	and	bx,0xfffc	; 4*((i-8)/4)
	shl	bx,2		; 80186 instr; 16*((i-8)/4)
	and	di,3		; (i-8) mod 4
	add	bx,di		; offset into keys for first byte of old key
	add	bx,[bp+6]	; add base address of subkeys
	xor	dl,[bx]		; xor with current
	xor	dh,[bx+4]
	xor	al,[bx+8]
	xor	ah,[bx+12]
	;; Perform indexing again for new key W[i] and store it
	mov	bx,cx
	mov	di,bx
	and	bx,0xfffc	; 4*(i/4)
	shl	bx,2		; 80186 instr; 16*(i/4)
	and	di,3		; i mod 4
	add	bx,di
	add	bx,[bp+6]
	mov	[bx],dl
	mov	[bx+4],dh
	mov	[bx+8],al
	mov	[bx+12],ah
	inc	cx
	cmp	cx,60		; all 60 words generated?
	je	.done
	jmp	.keygen_loop
.done:	pop	di
	pop	si
	pop	bp
	ret
