;===================================================================
; Very Fast 8-bit Multiplication Library
; written by Kirk Meyer <kirkmeyer@bigfoot.com>
;
; You may use this library in your programs in unmodified form as
; long as you give me credit somewhere in your documentation. If
; you must modify the code, please ask first. The multiplication
; routine is extremely optimized and an innocent-looking change
; might cause mass havoc on the functionality of the routine. To use
; this library, simply copy it to the directory where you put your
; source code and then #include it in the source file. Also, as I
; mention later, you must give the term "MultiplyTable" a value.
;
; If you would rather, you can statically include the multiplication
; table. It would take 256 bytes, and is just a 16x16 multiplication
; table of 0-15 times 0-15, stored as sixteen 16-byte rows:
; (0,0,..,0; 0,1,..,15; 0,2,..,30; 0,3,..,45; etc.)
;
; Routines included in library:
;  Multiply, MultiplyInitTable

;===================================================================
;Very Fast 8-bit Multiplication Routine
;
;Inputs:
; MultiplyInitTable must have been called before using this routine.
; D = number to multiply
; E = number to multiply
;
;Outputs:
; HL = result of unsigned multiplication, D * E
;
;Destroys:
; AF, BC, HL
;
;Impacts:
; 38 bytes code (not counting the final ret)
; 204 cycles (the sum of the per-instruction counts listed below)
;
;Notes:
; The number of cycles does not take into account cycles incurred
; while calling and returning from the routine. If you intend to
; use this function as a call and not inline, add 27 to the number
; of cycles. For the best performance, cut and paste this function
; inline in your code when you call it only once or twice.
;
;How it works:
; This function uses nibbles and a lookup table to attempt to make
; multiplication very fast. The input is split up as follows:
;       D = (w << 4) + x              E = (y << 4) + z
; This is a simple operation for the Z80 to do since it has some
; commands designed to deal with nibbles (4-bit groups). Then, by
; definition, D * E is simply the following:
;       D * E = ((w * y) << 8) + (((w * z) + (x * y)) << 4) + (x * z)
; The middle calculation (the << 4 one) is performed first because
; it was easiest that way. Then the outer calculations are performed
; and they are added to the inner calculation result.
Multiply:
        ; Computes HL = D * E (unsigned) from three lookups in the 16x16
        ; nibble-product table built by MultiplyInitTable, where the byte
        ; at offset (i << 4) + j of the table page holds i * j.
        ; With D = (w << 4) + x and E = (y << 4) + z the result is
        ;   ((w*y) << 8) + (((w*z) + (x*y)) << 4) + (x*z).
        ; The operand bytes of the two "ld l,0" instructions (labelled
        ; MultiplyModify1 and MultiplyModify2) double as scratch storage
        ; and are rewritten on every call, so this code has to execute
        ; from writable memory. D and E are only read, never written.
        ; The trailing numbers on each line are cycle counts.
        ;
        ; Step 1: A = low 8 bits of (w*z + x*y), carry = bit 8 of that sum.
        ld   hl,MultiplyModify1        ;10 HL -> operand byte of the "ld l,0" below
        ld   (hl),e                    ;7  operand = E = (y << 4) + z
        ld   a,d                       ;4  A = D = (w << 4) + x
        rrd                            ;18 nibble rotate: A = (w << 4) + z, operand = (x << 4) + y
        ld   h,MultiplyTable           ;7  H = table page
        ld   l,a                       ;4  HL -> table entry for w*z
        ld   a,(hl)                    ;7  A = w*z
        ld   l,0                       ;7  operand was patched above: L = (x << 4) + y
MultiplyModify1 = $ - 1
        add  a,(hl)                    ;7  A += x*y; the sum can reach 450, so carry matters

        ; Step 2: BC = (w*z + x*y) << 4. The carry from the add survives
        ; the next two loads because they do not touch the flags.
        ld   hl,MultiplyModify2        ;10 HL -> operand byte of the second "ld l,0"
        ld   (hl),a                    ;7  park the low 8 bits of the sum there
        sbc  a,a                       ;4  A = $FF if the add carried, else $00
        and  $10                       ;7  A = $10 or $00: bit 8 of the sum, already shifted

        rld                            ;18 A = carry bit + high nibble of sum; (HL) = low nibble << 4
        ld   b,a                       ;4  B = high byte of the shifted sum
        ld   c,(hl)                    ;7  C = low byte of the shifted sum

        ; Step 3: HL = ((w*y) << 8) + (x*z), reusing the same scratch byte.
        ld   (hl),e                    ;7  operand = E = (y << 4) + z
        ld   a,d                       ;4  A = D = (w << 4) + x
        rld                            ;18 nibble rotate: A = (w << 4) + y, operand = (z << 4) + x
        ld   h,MultiplyTable           ;7  H = table page
        ld   l,a                       ;4  HL -> table entry for w*y
        ld   a,(hl)                    ;7  A = w*y
        ld   l,0                       ;7  operand was patched above: L = (z << 4) + x
MultiplyModify2 = $ - 1
        ld   l,(hl)                    ;7  L = z*x (low byte of the outer terms)
        ld   h,a                       ;4  H = w*y (high byte of the outer terms)

        add  hl,bc                     ;11 add the middle term: HL = D * E
        ret

;===================================================================
;8-bit Multiplication Table Generator
;
;Inputs:
; Make sure that (MultiplyTable << 8) points to a 256-byte block
; of memory that is okay for the routine to use. For example, if
; MultiplyTable equals $90, the routine will put its 256-byte
; table at $9000 through $90FF.
;
;Outputs:
; HL = (MultiplyTable << 8) + 256
; BC = $0010
;
;Destroys:
; AF, BC, HL
; 256 bytes at (MultiplyTable << 8)
;
;Impacts:
; 19 bytes code
; 8221 cycles (18 setup + 16 rows of 501 + 187 for the outer jr)
;
;Notes:
; The number of cycles does not take into account cycles incurred
; while calling and returning from the routine. If you intend to
; use this function as a call and not inline, add 27 to the number
; of cycles. This routine must be called before you can use the
; included Multiply routine.
;
;How it works:
; This is simply a brute force multiplication routine. It
; successively multiplies 0-15 by 0-15 and stores each result in
; a large table. I did not bother optimizing this since it will be
; run only once.
MultiplyInitTable:
        ; Fills the 256-byte page at (MultiplyTable << 8) with the 16x16
        ; product table: the byte at offset (row << 4) + column is
        ; row * column. HL walks the page, C is the current row number.
        ld   h,MultiplyTable           ; high byte of the table address
        ld   l,$00                     ; begin at offset 0 of the page
        ld   c,l                       ; row 0 comes first
MultiplyInitTableRow:
        ; Build one row by repeated addition: 0, C, 2*C, .., 15*C.
        xor  a                         ; running product restarts at 0
        ld   b,$10                     ; 16 entries per row
MultiplyInitTableCell:
        ld   (hl),a                    ; store row * column
        inc  hl                        ; next entry; ends at table + 256
        add  a,c                       ; product for the next column
        djnz MultiplyInitTableCell     ; B reaches 0 when the row is done
        inc  c                         ; advance to the next row
        ld   a,c
        cp   $10                       ; only rows 0..15 exist
        jr   c,MultiplyInitTableRow    ; repeat while C < 16; exit with BC = $0010
        ret