The last time Hackerfall tried to access this page, it returned a not found error. A cached version of the page is below, or clickhereto continue anyway

Asmcodes: RC4 |

Introduction

RC4 is an infamous stream cipher designed in 1987 by Ron Rivest. It was an RSA trade secret up until September 1994 when an anonymous user posted source code to the cypherpunks mailing list

Already used extensively in commercial applications and protocols, it gained notoriety among the security community as a result of a research paper published in 2001 that helped develop the first WEP key recovery tools.

Weaknesses in the Key Scheduling Algorithm of RC4 by Fluhrer, Mantin and Shamir quickly saw the demise of RC4 as a secure cipher. Its reputation was damaged again with the publication of Efficient Reconstruction of RC4 Keys from Internal States by Eli Biham and and Yaniv Carmeli in 2008 but by this time, most people knew it no longer provided adequate protection of data.

Matthew Green from John Hopkins University has a great post called What’s the deal with RC4? if you want more background into the cipher itself.

What follows is a tiny implementation in x86 assembler, optimized for size, so if you require a version optimized for speed, consider using a library such as PolarSSL or OpenSSL

Key-scheduling algorithm (KSA)

The KSA is very simple and consists of mixing key bytes with 256 8-bit element array we call sbox (substitution box). In order to optimize the KSA, some implementations use 32-bit elements (OpenSSL for example) but of course, this requires more code and my goal is to optimize for size.


for i from 0 to 255
    S[i] := i
endfor
j := 0
for i from 0 to 255
    j := (j + S[i] + key[i mod keylength]) mod 256
    swap values of S[i] and S[j]
endfor

The structure I’m using for key context is similar to those you see in many open source libraries. x and y are 32-bit integers because when loading/storing we only use 1 byte each which also happens to zero out the 32-bit register. You could load a byte but then have to zero the upper 24-bits for indexing the sbox array.


#define RC4_BUFSIZ 256 // do not change

#pragma pack(push, 1)
typedef struct _RC4_CTX {
  uint32_t x, y;
  uint8_t s[256];
  uint8_t buf[RC4_BUFSIZ];
} RC4_CTX;
#pragma pack(pop)

Written in C, the above pseudocode looks like the following which is essentially the code posted to cipherpunks mailing list with some minor modifications.


void rc4_setkey (uint32_t keylen, void* key, RC4_CTX* ctx)
{
  int i, j, t;

  for (i=0; i<256; i++) {
    ctx->s[i] = (uint8_t)i;
  }

  ctx->x = 0;
  ctx->y = 0;

  for (i=0, j=0; i<256; i++) {
    j=(j + ctx->s[i] + ((uint8_t*)key)[i % keylen]) % 256;
    t=ctx->s[i];
    ctx->s[i]=ctx->s[j];
    ctx->s[j]=t;
  }
}

And the x86 assembler version where we attempt to reduce space.

The size of this function is 53 bytes


;void rc4_set_key (uint32_t key_len, void *key, RC4_CTX *ctx);
    public _rc4_set_key
    public rc4_set_key
_rc4_set_key:
rc4_set_key proc
    mov    eax, esp
    pushad
    mov    edi, [eax+12]          ; ctx
    mov    ebp, [eax+4]           ; key_len
    mov    esi, [eax+8]           ; key
    xor    eax, eax               ; i=0
    xor    ecx, ecx
    cdq                           ; j=0
    stosd                         ; x=0
    stosd                         ; y=0
init_sbox:
    mov    byte ptr[edi+eax], al  ; s[i] = i
    inc    al                     ; i++
    jnz    init_sbox
init_key:
    cmp    ecx, ebp               ; if (key_idx == key_len)
    jne    $+4
    xor    ecx, ecx               ; key_idx = 0
    mov    bl, [edi+eax]          ; s[i]
    add    dl, bl                 ; j += s[i]
    add    dl, [esi+ecx]          ; j += key[key_idx]
    xchg   bl, [edi+edx]          ; s[j] = s[i]
    mov    [edi+eax], bl          ; s[i] = s[j]
    inc    ecx                    ; key_idx++
    inc    al                     ; i++
    jnz    init_key               ; 
    popad
    ret
rc4_set_key endp

Pseudo-random generation algorithm (PRGA)

A PRNG originally part of OpenBSD until version 5.5 (released in may 2014) used RC4. It now uses ChaCha20 instead.


i := 0
j := 0
while GeneratingOutput:
    i := (i + 1) mod 256
    j := (j + S[i]) mod 256
    swap values of S[i] and S[j]
    K := S[(S[i] + S[j]) mod 256]
    output K
endwhile

The implementation in C is similar to the KSA.


void rc4 (uint32_t data_len, void* data, RC4_CTX* ctx)
{
  uint32_t i;
  uint8_t x=(uint8_t)ctx->x, y=(uint8_t)ctx->y, j=0, t;
  
  for (i=0; i<data_len; i++) {
    // Update x and y
    x=(x + 1) % 256;
    y=(y + ctx->s[x]) % 256;

    t=ctx->s[x];
    ctx->s[x]=ctx->s[y];
    ctx->s[y]=t;

    j=(ctx->s[x] + ctx->s[y]) % 256;
    ((uint8_t*)data)[i] ^= ctx->s[j];
  }
  ctx->x=x;
  ctx->y=y;
}

The size of assembler function is 48 bytes


; void rc4 (uint32_t data_len, void *data, RC4_CTX *ctx);
    public _rc4
    public rc4
_rc4:
rc4 proc
    mov    eax, esp
    pushad
    mov    esi, [eax+12]
    mov    edi, [eax+ 8]
    mov    ecx, [eax+ 4]
    push   esi               ; save pointer to ctx
    lodsd                    ; eax = x
    xchg   eax, ebx      
    lodsd                    ; ebx = y
    xchg   eax, ebx
    cdq
crypt_loop:
    inc    al                ; x++
    mov    dl, [esi+eax]     ; dl = s[x]
    add    bl, dl            ; y += dl
    xchg   dl, [esi+ebx]     ; swap s[y], s[x]
    mov    [esi+eax], dl     ; s[x] = s[y]
    add    dl, [esi+ebx]     ; dl = s[x] + s[y]
    mov    dl, [esi+edx]     ; dl = s[ dl ]
    xor    byte ptr[edi], dl ; *p ^= (s[ s[x] + s[y] ])
    inc    edi               ; p++    
    loop   crypt_loop
    pop    edi               ; edi = rc4key
    stosd                    ; save x
    xchg   eax, ebx 
    stosd                    ; save y
    popad
    ret
rc4 endp

Total size of x86 implementation is 101 bytes but we can integrate both functions into one and reduce the space further. We can also set a limit on the key length if required, here it will be 128-bits.

Together, the whole function is 77 bytes using fastcall convention or 82 using cdecl



; RC4 in x86 assembly (tiny version for 128-bit keys)
; Odzhan

.686
.model flat, C

option prologue:none
option epilogue:none

include rc4.inc

.code

; -----------------------------------------------
; RC4 
; 77 bytes using fastcall
;   ecx=0, edx=ctx (to set key)
;   ecx=len, edx=ctx (to encrypt/decrypt buffer)
;
; 82 bytes using stdcall
;
; set the key          : rc4x (0, ctx)
; encrypt/decrypt data : rc4x (data_len, ctx)
; -----------------------------------------------
rc4x proc
    pushad
ifdef FC ; fastcall
    mov    esi, edx
else
    mov    esi, [esp+32+8]        ; esi = ctx
    mov    ecx, [esp+32+4]        ; len
endif
    ; eax = x
    lodsd
    xchg   eax, ebx
    ; ebx = y
    lodsd
    xchg   eax, ebx
    push   esi
    pop    edi
    cdq                           ; edx = 0
    jecxz  init_sbox
    dec    dl                     ; 256
    add    edi, edx               ; edi = buf
; -----------------------------------------------
; PRNG. edi=buf, eax=x, ebx=y, ecx=data_len 
; -----------------------------------------------
crypt_data:
    inc    al                     ; x++
    
    mov    dl, [esi+eax]          ; dl = s[x]
    add    bl, dl                 ; y += dl
    xchg   dl, [esi+ebx]          ; swap s[y], s[x]
    mov    [esi+eax], dl          ; s[x] = s[y]
    
    add    dl, [esi+ebx]          ; dl = s[x] + s[y]
    mov    dl, [esi+edx]          ; dl = s[ dl ]
    xor    byte ptr[edi], dl      ; *p ^= (s[ s[x] + s[y] ])
    inc    edi                    ; p++     
    loop   crypt_data
    mov    [esi-8], eax           ; store x
    mov    [esi-4], ebx           ; store y
    popad
    ret
; -----------------------------------------------
; KSA. edi=s, eax=0, ebx=j=0, ecx=i=0, edx=0
; -----------------------------------------------
init_sbox:
    stosb                         ; s[i] = i
    inc    al                     ; i++
    jnz    init_sbox
; now edi points to buf with 128-bit key
init_key:
    mov    al, cl                 ; key_idx
    and    al, 15                 ; keys should be 128-bits
    
    add    bl, [edi+eax]          ; j += key[key_idx % 16]
    mov    dl, [esi+ecx]          ; s[i]
    
    add    bl, dl                 ; j += s[i]
    xchg   dl, [esi+ebx]          ; s[j] = s[i]
    mov    [esi+ecx], dl          ; s[i] = s[j]
    
    inc    cl                     ; i++
    jnz    init_key               ; 
    popad
    ret
rc4x endp

Demonstration

Using the tiny version is much different from calling 2 separate functions. Here is how the context is initialized.


    memset (&ctx, 0, sizeof (ctx));
    // set the key first
    memcpy (ctx.buf, k, klen);
    rc4x (0, &ctx);
    // now encrypt the plaintext
    memcpy (ctx.buf, p1, plen); // must not exceed RC4_BUFSIZ
    rc4x (plen, &ctx);
    memcpy (c2, ctx.buf, clen);

Update

I revisted an initial idea before I wrote the 1st tiny version and I suppose the reason i never implemented it was because you couldn’t save x+y. The version written by Peter Ferrie doesn’t save x+y itself, you call it just once which is fine if that’s all you need so I decided to revisit and here it is at 66 bytes. Hope I haven’t made mistake now


; RC4 in x86 assembly (for 128-bit keys)
; Odzhan
;
; 66 bytes
;
; To assemble on windows
;
; yasm -fwin32 rc4.asm -o rc4.obj
; nasm -fwin32 rc4.asm -o rc4.obj
;
; haven't tried linux

;ctx struct
;  sbox db 256 dup (?) ; uint8_t
;  key  db 16 dup (?)  ; uint8_t
;  len  dd ?           ; uint32_t
;  buf  dd ?           ; void*
;ctx ends
    
    bits 32
    
%ifndef BIN
    global rc4x
    global _rc4x
%endif

_rc4x:
rc4x:
    pushad
    mov    edi, [esp+32+4]   ; ctx
    mov    esi, edi
    ; x = 0, i = 0
    xor    eax, eax
    ; y = 0
    cdq
    xor    ebx, ebx
    xor    ecx, ecx
; --------------------------------------------
; KSA. edi=s + key, esi=s, eax=i, edx=j
; --------------------------------------------
init_sbox:
    stosb                    ; s[i] = i
    inc    al                ; i++
    jnz    init_sbox
    ; edi now points to 128-bit key
init_key:
    mov    bl, al            ; bl = key_idx
    and    bl, 15            ; %= 16
    
    add    dl, [edi+ebx]     ; j += key[key_idx % 16]
    jmp    swap
upd_idx:
    inc    al
    jnz    init_key
    cdq
    mov    ecx, [edi+16]     ; ecx = len
; -----------------------------------------------
; PRNG. edi=buf, eax=x, edx=y, ecx=len 
; -----------------------------------------------
crypt_data:
    inc    al                ; x++
swap:
    mov    bl, [esi+eax]
    add    dl, bl
    xchg   bl, [esi+edx]
    mov    [esi+eax], bl
    jecxz  upd_idx
    
    add    bl, [esi+edx]     ; bl = s[x] + s[y]
    mov    bl, [esi+ebx]     ; bl = s[ bl ]
    xor    [edi+16+4], bl    ; *p ^= (s[ s[x] + s[y] ])
    inc    edi               ; p++     
    loop   crypt_data
    popad
    ret

Below is just a C string with the RC4 assembly code which uses cdecl calling convention. It’s only for x86 architecture. If you wanted to change buf to void* and use malloc() instead, it would require changes to the code.



#define RC4_BUFSIZ 1024

#pragma pack (push, 1)
typedef struct _rc4_ctx_t {
  uint8_t sbox[256];
  uint8_t key[16];
  uint32_t len;
  uint8_t buf[RC4_BUFSIZ+32];
} rc4_ctx;
#pragma pack (pop)

void rc4 (rc4_ctx*);

#define RC4_SIZE 66

char RC4[]= {
  /* 0000 */ "\x60"             /* pushad              */
  /* 0001 */ "\x8b\x7c\x24\x24" /* mov edi, [esp+0x24] */
  /* 0005 */ "\x89\xfe"         /* mov esi, edi        */
  /* 0007 */ "\x31\xc0"         /* xor eax, eax        */
  /* 0009 */ "\x99"             /* cdq                 */
  /* 000A */ "\x31\xdb"         /* xor ebx, ebx        */
  /* 000C */ "\x31\xc9"         /* xor ecx, ecx        */
  /* 000E */ "\xaa"             /* stosb               */
  /* 000F */ "\xfe\xc0"         /* inc al              */
  /* 0011 */ "\x75\xfb"         /* jnz 0xe             */
  /* 0013 */ "\x88\xc3"         /* mov bl, al          */
  /* 0015 */ "\x80\xe3\x0f"     /* and bl, 0xf         */
  /* 0018 */ "\x02\x14\x1f"     /* add dl, [edi+ebx]   */
  /* 001B */ "\xeb\x0a"         /* jmp 0x27            */
  /* 001D */ "\xfe\xc0"         /* inc al              */
  /* 001F */ "\x75\xf2"         /* jnz 0x13            */
  /* 0021 */ "\x99"             /* cdq                 */
  /* 0022 */ "\x8b\x4f\x10"     /* mov ecx, [edi+0x10] */
  /* 0025 */ "\xfe\xc0"         /* inc al              */
  /* 0027 */ "\x8a\x1c\x06"     /* mov bl, [esi+eax]   */
  /* 002A */ "\x00\xda"         /* add dl, bl          */
  /* 002C */ "\x86\x1c\x16"     /* xchg [esi+edx], bl  */
  /* 002F */ "\x88\x1c\x06"     /* mov [esi+eax], bl   */
  /* 0032 */ "\xe3\xe9"         /* jecxz 0x1d          */
  /* 0034 */ "\x02\x1c\x16"     /* add bl, [esi+edx]   */
  /* 0037 */ "\x8a\x1c\x1e"     /* mov bl, [esi+ebx]   */
  /* 003A */ "\x30\x5f\x14"     /* xor [edi+0x14], bl  */
  /* 003D */ "\x47"             /* inc edi             */
  /* 003E */ "\xe2\xe5"         /* loop 0x25           */
  /* 0040 */ "\x61"             /* popad               */
  /* 0041 */ "\xc3"             /* ret                 */
};

Results

The x64 version was surprisingly smaller than the 32-bit version, mostly because it uses the fastcall convention. The 32-bit could be modified to use fastcall instead of cdecl which would save more space but only the tiny version has this option.

architecture size x86 (1st tiny version) 77 x86 (2nd tiny version) 66 x86 101 x64 100

Source code for x86 and x86-64 here

Other work

Peter Ferrie, a talented programmer and anti-virus researcher wrote an implementation which is worth a look. It’s only 69 bytes and takes advantage of the carry flag. Very impressive work.

Like this:

Like Loading...

Related

Continue reading on odzhan.wordpress.com