; An example of some work I have done in assembly language.


.global _BigCopy
.global _BigMemcpy

.section .text
.align 16

;
; void
; BigMemcpy(
; OUT uint32* dest,
; IN const uint32* src,
; IN uint32 numBytes
; );
;
; Routine Description:
;
; This entry point is used to make BigCopy behave like memcpy. It adjusts
; the numBytes parameter on the stack by dividing it by 4. Once that's
; done, it falls through to the BigCopy routine.
;
; Arguments:
;
; dest - Same as BigCopy.
;
; src - Same as BigCopy.
;
; numBytes - Number of bytes to copy.
;
; Return Value:
;
; Same as BigCopy.
;

_BigMemcpy:

;
; Convert the byte count argument into a long count, in place, then fall
; into BigCopy. No stack frame has been built yet, so the arguments are
; addressed relative to sp: 0(sp) is the return address, 4(sp) is dest,
; 8(sp) is src and 12(sp) is numBytes. (BigCopy reads the same slot as
; 16(a6) once its link instruction has pushed a6.)
;
; The shift discards the low two bits of numBytes, so when numBytes is not
; a multiple of 4 the trailing 1-3 bytes are NOT copied.
;
; d0 is used as scratch, and the caller's argument slot on the stack is
; overwritten with the converted count.
;

move.l 12(sp), d0 ; Load numBytes parameter.
lsr.l #2, d0 ; numBytes /= 4 (unsigned; remainder is dropped).
move.l d0, 12(sp) ; Write numLongs back into the argument slot.
; Fall through to BigCopy. Nothing may be placed between here and _BigCopy.

;
; void
; BigCopy(
; OUT uint32* dest,
; IN const uint32* src,
; IN uint32 numLongs
; );
;
; Routine Description:
;
; This is an optimized memcpy routine that is used to copy large blocks of
; data. Its buffer addresses must be aligned on 4 byte boundaries and the
; amount of memory to copy is specified in units of 4 bytes instead of
; single bytes.
;
; Arguments:
;
; dest - Base address of the destination buffer where the copied data is to
; be placed. It is assumed that this address is 4 byte aligned.
;
; src - Base address of the source buffer where data is to be copied from.
; It is assumed that this address is 4 byte aligned.
;
; numLongs - Number of longs (4 byte blocks) to copy. The number of bytes
; that are actually copied is numLongs*4. If 0 is specified, then no
; copy is performed.
;
; Return Value:
;
; None.
;

_BigCopy:
BigCopy:

;
; Main entry point. This code handles keeping the stack in good shape
; and saves off most registers to ensure the registers can be used. The
; code should not be modified.
;
; The frame reserves 12 longs (48 bytes): 7 data registers (d1-d7) and
; 5 address registers (a1-a5). d0 and a0 are NOT saved or restored, so
; they are clobbered by this routine. a4 and a5 are saved and restored
; but are not otherwise used by the code below.
;

link a6, #-(7*4 + 5*4) ; Setup the stack frame with space
; to store registers.
movem.l d1-d7/a1-a5, (sp) ; Save off registers into the frame.

;
; Register Usage
;
; d0-d7 - Available.
; a0-a5 - Available.
; a6 - Stack frame. DO NOT WRITE.
; a7 - Stack pointer. DO NOT WRITE.
;

;
; Load parameters from stack. After the link, 0(a6) holds the saved a6
; and 4(a6) the return address, so the arguments start at 8(a6).
;
; Register Usage
;
; d0 - numLongs.
; d1-d7 - Available.
; a0 - dest.
; a1 - src.
; a2-a5 - Available.
;

movea.l 8(a6), a0 ; a0 = dest.
movea.l 12(a6), a1 ; a1 = src.
move.l 16(a6), d0 ; d0 = numLongs.

;
; The copy is done in three phases by breaking up the copy amount into
; three distinct individual copy "types."
;
; 1) Copies in multiples of 64 longs.
; 2) Copies in multiples of 8 longs, but the total is less than 64.
; 3) Copies in longs, but the total is less than 8.
;
; The numLongs parameter can be broken up into parts as follows.
;
; 31 6 5 3 2 0
; +---------------------+-------------------+-------------------+
; | # 64 long copies | # 8 long copies | # 1 long copies |
; +---------------------+-------------------+-------------------+
;
; The number of 64 long copies is given by numLongs >> 6.
; The number of 8 long copies is given by (numLongs & 0x38) >> 3.
; The number of 1 long copies is given by numLongs & 0x7.
;
; The 8 and 64 long copies are handled by the same block of code. This
; is done by creating a loop that is broken into 8 sections which each
; copy 8 longs. If we jump into the middle of the loop, we will copy
; some multiple of 8 longs, but not the whole 64.
;
; At the bottom of the loop is a test that decrements a loop counter and
; tests for zero. If the counter is non-zero, the loop iterates from the
; top. Note that it doesn't matter where the loop is entered the first
; time as control will eventually fall through to the decrement and test
; code.
;
; In this way, copy types of 1 and 2 are handled by the same block of
; code. That block is labeled Copy64, below. By setting the loop counter
; to 1 (or incrementing it by 1), we can iterate partly through the loop
; and fall through (or iterate again to handle the proper number of full
; iterations).
;
; We need to break up the numLongs parameter to determine the number of
; each type of copy.
;

;
; Calculate the number of loop iterations. Each full loop is a copy of
; 64 longs. Note that we don't add one here, but use a special test at
; the bottom of the loop to handle the "partial" iteration.
;

move.l d0, d1 ; d1 = numLongs.
lsr.l #6, d0 ; d0 = numLongs / 64 (at most 2^26 - 1).

;
; Extract the low 6 bits from numLongs (numLongs % 64). This number
; contains both the number of 8 long copies and the number of 1 long
; copies. A copy is taken so that we can work on it while maintaining
; the d1 value which we'll need later.
;

andi.l #0x3F, d1 ; d1 = numLongs % 64.
move.l d1, d2 ; Get a copy of numLongs%64.

;
; Extract numLongs[5:3], the portion of the long that tells us the number
; of 8 long copies that need to happen. Since this value has 3 bits,
; there are 8 possible values (0, 1, 2, ..., 7). When the value is 0, we
; don't have any 8 long copies to perform. When the value is 1, we want
; to copy 8 longs. When the value is 2, we want to copy 16 longs. When
; the value is 3, we want to copy 24 longs, and so on.
;
; We do this by using a jump table that branches into the middle of the
; 8 long copy loop. The jump table consists of branch instructions that
; each take up 4 bytes. This means that we can use numLongs[5:3]
; as an index into the jump table, but we need to first shift the bits
; into the right position.
;
; The masked value is index*8 (the field sits at bits 5:3), so a single
; right shift by 1 turns it into index*4, the byte offset of the entry.
;

andi.l #0x38, d2 ; Get numLongs[5:3] (still in place: index * 8).
lsr.l #1, d2 ; index * 8 -> index * 4 = byte offset.
lea BigCopyJumpTable, a2 ; Address of jump table base.
adda.l d2, a2 ; Add offset using index.
jmp (a2) ; Jump to table entry.

;
; This is the main copy loop that copies multiples of 8 longs. It has
; 8 sections, so for a full iteration, it copies 64 longs. Labels are
; placed between the sections so that the loop may be entered in the
; middle via the jump table.
;
; The copy loop code is only entered via the jump table (first pass) or
; via the backward branch at CopyBottom (subsequent full passes).
;
; Each section loads 8 longs from (a1) into d2-d7/a2-a3, advances a1 by
; 32 bytes, stores the same 8 registers to (a0), and advances a0 by 32
; bytes. a2 is free to be overwritten here because the jump through it
; has already been taken.
;
; NOTE(review): movem is used without postincrement and the pointers are
; advanced with a separate adda; presumably the target does not support
; the (An)+ mode for movem - confirm before "simplifying" this.
;
; Register Usage
;
; d0 - Number of loop iterations.
; d1 - (d1 & 0x7) = remainder longs to copy.
; d2-d7 - Temporaries for movem.
; a0 - Current destination pointer.
; a1 - Current source pointer.
; a2-a3 - Temporaries for movem.
;

Copy64:
movem.l (a1), d2-d7/a2-a3 ; Read 8 longs from source.
adda.l #32, a1 ; Source pointer += 32 bytes.
movem.l d2-d7/a2-a3, (a0) ; Write 8 longs to destination.
adda.l #32, a0 ; Destination pointer += 32 bytes.
Copy56:
movem.l (a1), d2-d7/a2-a3 ; Read 8 longs from source.
adda.l #32, a1 ; Source pointer += 32 bytes.
movem.l d2-d7/a2-a3, (a0) ; Write 8 longs to destination.
adda.l #32, a0 ; Destination pointer += 32 bytes.
Copy48:
movem.l (a1), d2-d7/a2-a3 ; Read 8 longs from source.
adda.l #32, a1 ; Source pointer += 32 bytes.
movem.l d2-d7/a2-a3, (a0) ; Write 8 longs to destination.
adda.l #32, a0 ; Destination pointer += 32 bytes.
Copy40:
movem.l (a1), d2-d7/a2-a3 ; Read 8 longs from source.
adda.l #32, a1 ; Source pointer += 32 bytes.
movem.l d2-d7/a2-a3, (a0) ; Write 8 longs to destination.
adda.l #32, a0 ; Destination pointer += 32 bytes.
Copy32:
movem.l (a1), d2-d7/a2-a3 ; Read 8 longs from source.
adda.l #32, a1 ; Source pointer += 32 bytes.
movem.l d2-d7/a2-a3, (a0) ; Write 8 longs to destination.
adda.l #32, a0 ; Destination pointer += 32 bytes.
Copy24:
movem.l (a1), d2-d7/a2-a3 ; Read 8 longs from source.
adda.l #32, a1 ; Source pointer += 32 bytes.
movem.l d2-d7/a2-a3, (a0) ; Write 8 longs to destination.
adda.l #32, a0 ; Destination pointer += 32 bytes.
Copy16:
movem.l (a1), d2-d7/a2-a3 ; Read 8 longs from source.
adda.l #32, a1 ; Source pointer += 32 bytes.
movem.l d2-d7/a2-a3, (a0) ; Write 8 longs to destination.
adda.l #32, a0 ; Destination pointer += 32 bytes.
Copy8:
movem.l (a1), d2-d7/a2-a3 ; Read 8 longs from source.
adda.l #32, a1 ; Source pointer += 32 bytes.
movem.l d2-d7/a2-a3, (a0) ; Write 8 longs to destination.
adda.l #32, a0 ; Destination pointer += 32 bytes.

;
; Bottom of the 64 long copy loop. We need to decrement the loop
; counter and test if we need to iterate again. Here we use a bpl, so
; we will break out if the counter goes negative. This accounts for the
; "partial" iteration.
;
; d0 starts as numLongs / 64, which is never negative as a signed value
; (the shift by 6 clears the top bits), so the signed test is safe. With
; d0 == 0 on arrival the counter becomes -1 and no full pass is made.
;

CopyBottom:
subq.l #1, d0 ; i--.
bpl Copy64 ; if (i >= 0) goto Copy64.

;
; We are done copying the 64 and 8 long blocks. Now we need to handle
; any left over longs; there are anywhere between 0 and 7 left, inclusive.
;
; We will again use a jump table, using numLongs[2:0] as an offset into
; the jump table. This allows us to enter the copy code sequence at any
; point and fall through to the rest.
;
; Register Usage
;
; d0 - Unused.
; d1 - numLongs % 64. Used to compute index into jump table.
; d2-d7 - Unused.
; a0 - Current destination pointer.
; a1 - Current source pointer.
; a2 - Used as temporary for jump table.
; a3-a5 - Unused.
;

CopyRemaining:

andi.l #0x07, d1 ; Extract numLongs[2:0].
lsl.l #2, d1 ; index -> index * 4 = byte offset.
lea SmallCopyJumpTable, a2 ; Address of jump table base.
adda.l d1, a2 ; Compute jump table entry.
jmp (a2) ; Jump to table entry.

;
; This code performs the last few long copies. It uses the regular
; postincrement addressing mode on the source and destination, performing
; a single long copy with each move. Each instruction simply falls
; through to the next, so to transfer the correct number of longs, we must
; branch into the correct spot in the instruction sequence.
;
; This block of code is only entered from the jump table.
;
; Register Usage
;
; d0-d7 - Unused.
; a0 - Current destination pointer.
; a1 - Current source pointer.
; a2-a5 - Unused.
;

Copy7: ; Copy 7 longs.
move.l (a1)+, (a0)+ ; Copy and fall through. 6 left.
Copy6: ; Copy 6 longs.
move.l (a1)+, (a0)+ ; Copy and fall through. 5 left.
Copy5: ; Copy 5 longs.
move.l (a1)+, (a0)+ ; Copy and fall through. 4 left.
Copy4: ; Copy 4 longs.
move.l (a1)+, (a0)+ ; Copy and fall through. 3 left.
Copy3: ; Copy 3 longs.
move.l (a1)+, (a0)+ ; Copy and fall through. 2 left.
Copy2: ; Copy 2 longs.
move.l (a1)+, (a0)+ ; Copy and fall through. 1 left.
Copy1: ; Copy 1 long.
move.l (a1)+, (a0)+ ; Copy and fall through. 0 left.
Copy0: ; Copy 0 longs. All done.

;
; BigCopy exit point. Do not modify code below this line.
;
; sp has not moved since the prologue's movem, so it still points at the
; register save area.
;

BigCopyExit:

movem.l (sp), d1-d7/a1-a5 ; Restore saved registers.
unlk a6 ; Restore SP, tear down frame.
rts ; Return.

;
; Jump tables for big and small copies.
;

.align 4

;
; Jump table for the big copy loop. It is full of unconditional branches
; with word width. This means that each instruction is 4 bytes long, so
; we index the table as longs.
;

;
; Indexed by numLongs[5:3] * 4. Entry N enters the copy loop so that N
; sections (N * 8 longs) run before CopyBottom is reached. The entries
; must stay in this order and must each remain exactly 4 bytes (bra.w),
; because BigCopy computes the target address arithmetically.
;
BigCopyJumpTable:
bra.w CopyBottom ; Index 0: no 8 long blocks, skip the loop body.
bra.w Copy8 ; Index 1: 1 section, 8 longs.
bra.w Copy16 ; Index 2: 2 sections, 16 longs.
bra.w Copy24 ; Index 3: 3 sections, 24 longs.
bra.w Copy32 ; Index 4: 4 sections, 32 longs.
bra.w Copy40 ; Index 5: 5 sections, 40 longs.
bra.w Copy48 ; Index 6: 6 sections, 48 longs.
bra.w Copy56 ; Index 7: 7 sections, 56 longs.
nop ; NOPs to make the disassembly
nop ; more readable.

.align 4

;
; Jump table for the small copy sequence. It is full of unconditional
; branches with word width. This means that each instruction is 4 bytes
; long, so we index the table as longs.
;

;
; Indexed by numLongs[2:0] * 4. Entry N enters the fall-through sequence
; of single long moves so that exactly N longs are copied. The entries
; must stay in this order and must each remain exactly 4 bytes (bra.w),
; because BigCopy computes the target address arithmetically.
;
SmallCopyJumpTable:
bra.w Copy0 ; Index 0: nothing left to copy.
bra.w Copy1 ; Index 1: copy 1 long.
bra.w Copy2 ; Index 2: copy 2 longs.
bra.w Copy3 ; Index 3: copy 3 longs.
bra.w Copy4 ; Index 4: copy 4 longs.
bra.w Copy5 ; Index 5: copy 5 longs.
bra.w Copy6 ; Index 6: copy 6 longs.
bra.w Copy7 ; Index 7: copy 7 longs.
nop ; NOPs to make the disassembly
nop ; more readable.

.end