Click here to Skip to main content
Click here to Skip to main content
Add your own
alternative version

Generating Fractals with SSE/SSE2

Peter Kankowski, 29 Nov 2005 CPOL
An article on generating Mandelbrot and Julia sets using Intel's Streaming SIMD Extensions (SSE, SSE2).
fractalssse_src.zip
zoom.cur
bitmap1.bmp
DEFAULT1.BIN
fractals.exe
fractals.ICO
SSEroutines_test.exe
format PE console

include '%fasminc%\win32axp.inc'

ITER equ 64 ; The number of iterations
Julia equ 0
Mandel equ 1

; copyscr reg, from
; Loads the single-precision float stored at [from] and replicates it into
; all four lanes of the XMM register `reg`.
macro copyscr reg, from
{
	MOVSS	 reg, dword[from]	; lane 0 = [from]
	SHUFPS	 reg, reg, 0		; selector 0: every lane takes lane 0
}
; startm
; Starts a timing measurement: stores the low dword of the time-stamp
; counter in [b]. CPUID/RDTSC overwrite eax, ebx, ecx and edx.
macro startm
{
	xor eax, eax
	cpuid			; executed right before the counter is read
	rdtsc			; edx:eax = time-stamp counter
	mov [b], eax		; only the low 32 bits are kept
}
; endm
; Ends a timing measurement begun with startm: leaves in eax the low dword
; of the time-stamp counter minus the value saved in [b].
; CPUID/RDTSC overwrite eax, ebx, ecx and edx.
macro endm
{
	xor eax, eax
	cpuid			; executed right before the counter is read
	rdtsc			; edx:eax = time-stamp counter
	sub eax, [b]		; eax = elapsed ticks (low 32 bits only)
}

; MUL
; JuliaMandelPaintMUL color, type
; Renders the fractal four pixels at a time with packed single-precision SSE.
;   color - nonzero: each pixel is fetched from the colour table at [a] using
;           the byte offset 4.0 * (iterations the point stayed inside radius);
;           zero: black/grey pixels are built from the final compare mask
;   type  - Julia (c = cx2, cy2) or Mandel (c = the pixel's own coordinate)
; Reads w, h, LEFT, TOP, dx2, dy2, cx2, cy2, a, bits; uses cx1, cy1, dx1, dy1
; and left1 as 16-byte vectors; writes w*h dwords starting at [bits].
; w is consumed four pixels per step, so it must be a multiple of 4.
; Clobbers eax, ecx, edx, xmm0-xmm7 and flags (ebx, esi, edi are saved).
; Fix: the black/white store used xmm2 (2*zx*zy) instead of the compare mask;
; the mask is now kept in xmm3 for that mode and used for the store.
macro JuliaMandelPaintMUL color, type
{
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
	push	ebx esi edi
	mov edx, [h]
	mov esi, [a]
	mov edi, [bits]
     if type = Julia
	copyscr xmm0, cx2
	copyscr xmm1, cy2
     end if
	copyscr xmm2, dx2
	copyscr xmm3, dy2
     if type = Julia
	MOVAPS dqword[cx1], xmm0
	MOVAPS dqword[cy1], xmm1
     end if
	MOVAPS dqword[dy1], xmm3
	copyscr xmm4, LEFT
	copyscr xmm5, TOP
	MOVAPS	xmm0, xmm2	      ; build xmm0 = 0 | dx2 | dx2 * 2 | dx2 * 3
	ANDPS	xmm0, dqword[mask1]
	MOVAPS	xmm1, xmm2
	ANDPS	xmm1, dqword[mask2]
	ADDPS	xmm0, xmm1
	ADDPS	xmm0, xmm2
	ANDPS	xmm0, dqword[mask]
	ADDPS	xmm4, xmm0	      ; xmm4 = x coordinates of the first four pixels
	MOVAPS	dqword[left1], xmm4   ; every row restarts from this vector
	ADDPS	xmm2, xmm2
	ADDPS	xmm2, xmm2
	MOVAPS	dqword[dx1], xmm2     ; dx1 = 4 * dx2, the step between pixel groups
	MOVAPS	xmm7, dqword[radius]
	JMP	BSTART
	; xmm0 = zx    xmm1 = zy    xmm2 = tmp    xmm3 = tmp    xmm4 = pixel x (4 lanes)    xmm5 = pixel y    xmm6 = result   xmm7 = 4.0
	; eax = tmp    ebx = x      ecx = i counter      edx = y      edi = bits pointer    esi = color table
YLOOP:
	MOVAPS xmm4, dqword[left1]
BSTART:
	mov ebx, [w]
XLOOP:
	MOVAPS xmm0,xmm4
	XORPS  xmm6,xmm6
	MOVAPS xmm1,xmm5
	mov ecx, ITER
ILOOP:
	; xmm0 = zx             xmm1 = zy
	MOVAPS xmm2, xmm0
	MULPS  xmm0, xmm0	; zx^2
	MULPS  xmm2, xmm1	; zx*zy
	MULPS  xmm1, xmm1	; zy^2
	MOVAPS xmm3, xmm1
	ADDPS  xmm1, xmm0	; zx^2 + zy^2
	CMPLEPS  xmm1, xmm7	; xmm1 = per-lane mask: still inside the radius
	SUBPS  xmm0, xmm3	; zx^2 - zy^2
	ADDPS  xmm2, xmm2	; 2*zx*zy
	MOVMSKPS eax, xmm1
     if color = 0
	MOVAPS xmm3, xmm1	; black/white mode: keep the mask, xmm1 is reused for zy below
     end if
	test eax, eax
	jz EXIT			; all four points have escaped
     if color
	ANDPS  xmm1, xmm7 ; xmm6 += (zx^2 + zy^2 <= radius) ? 4.0 : 0.0;
	ADDPS  xmm6, xmm1
     end if
	MOVAPS xmm1, xmm2
     if type = Julia
	ADDPS  xmm1, dqword[cy1]
	ADDPS  xmm0, dqword[cx1]
     else if type = Mandel
	ADDPS  xmm1, xmm5
	ADDPS  xmm0, xmm4
     end if
	sub ecx, 1
	jnz ILOOP
EXIT:
     if color
	;21-22 cycles
	; each lane of xmm6 is a byte offset into the colour table
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi], eax
	SHUFPS xmm6, xmm6, 0E5h	; lane 1 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+4], eax
	SHUFPS xmm6, xmm6, 0E6h	; lane 2 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+8], eax
	SHUFPS xmm6, xmm6, 0E7h	; lane 3 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+12], eax
     else
	ANDNPS	xmm3, dqword[maskbw]	; escaped lanes -> maskbw, inside lanes -> 0
	MOVAPS	[edi], xmm3		; aligned store: [bits] must be 16-byte aligned
     end if
	add edi, 16
	ADDPS  xmm4, dqword[dx1]
	sub ebx, 4
	jnz XLOOP

	ADDPS  xmm5, dqword[dy1]
	sub edx, 1
	jnz YLOOP

	pop	edi esi ebx

}

; FFFF
; JuliaMandelPaintFFFF color, type
; Renders the fractal four pixels at a time with packed single-precision SSE.
;   color - nonzero: each pixel is fetched from the colour table at [a] using
;           the byte offset 4.0 * (iterations the point stayed inside radius);
;           zero: black/grey pixels are built from the final compare mask
;   type  - Julia (c = cx2, cy2) or Mandel (c = the pixel's own coordinate)
; Reads w, h, LEFT, TOP, dx2, dy2, cx2, cy2, a, bits; uses cx1, cy1, dx1, dy1
; and left1 as 16-byte vectors; writes w*h dwords starting at [bits].
; w is consumed four pixels per step, so it must be a multiple of 4.
; Clobbers eax, ecx, edx, xmm0-xmm7 and flags (ebx, esi, edi are saved).
; Unlike the other variants this one tests with a strict "<" (cmpltps).
; Fix: the black/white store used xmm2 (2*zx*zy); the compare mask of this
; variant lives in xmm3, which is what the store uses now.
macro JuliaMandelPaintFFFF color, type
{
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
	push	ebx esi edi
	mov edx, [h]
	mov esi, [a]
	mov edi, [bits]
     if type = Julia
	copyscr xmm0, cx2
	copyscr xmm1, cy2
     end if
	copyscr xmm2, dx2
	copyscr xmm3, dy2
     if type = Julia
	MOVAPS dqword[cx1], xmm0
	MOVAPS dqword[cy1], xmm1
     end if
	MOVAPS dqword[dy1], xmm3
	copyscr xmm4, LEFT
	copyscr xmm5, TOP
	MOVAPS	xmm0, xmm2	      ; build xmm0 = 0 | dx2 | dx2 * 2 | dx2 * 3
	ANDPS	xmm0, dqword[mask1]
	MOVAPS	xmm1, xmm2
	ANDPS	xmm1, dqword[mask2]
	ADDPS	xmm0, xmm1
	ADDPS	xmm0, xmm2
	ANDPS	xmm0, dqword[mask]
	ADDPS	xmm4, xmm0	      ; xmm4 = x coordinates of the first four pixels
	MOVAPS	dqword[left1], xmm4   ; every row restarts from this vector
	ADDPS	xmm2, xmm2
	ADDPS	xmm2, xmm2
	MOVAPS	dqword[dx1], xmm2     ; dx1 = 4 * dx2, the step between pixel groups
	MOVAPS	xmm7, dqword[radius]
	JMP	BSTART
	; xmm0 = zx    xmm1 = zy    xmm2 = tmp    xmm3 = tmp    xmm4 = pixel x (4 lanes)    xmm5 = pixel y    xmm6 = result   xmm7 = 4.0
	; eax = tmp    ebx = x      ecx = i counter      edx = y      edi = bits pointer    esi = color table
YLOOP:
	MOVAPS xmm4, dqword[left1]
BSTART:
	mov ebx, [w]
XLOOP:
	MOVAPS xmm0,xmm4
	XORPS  xmm6,xmm6
	MOVAPS xmm1,xmm5
	mov ecx, ITER
ILOOP:
	; xmm0 = zx             xmm1 = zy
	MOVAPS xmm2, xmm0
	MULPS  xmm2, xmm1	; zx*zy
	MULPS  xmm0, xmm0	; zx^2
	MULPS  xmm1, xmm1	; zy^2
	addps xmm2, xmm2	; 2*zx*zy
	movaps xmm3, xmm0
	addps xmm3, xmm1	; zx^2 + zy^2
	cmpltps xmm3, xmm7	; xmm3 = per-lane mask: still inside the radius
	movmskps eax, xmm3
	test eax, eax
	jz EXIT			; all four points have escaped
	subps xmm0, xmm1	; zx^2 - zy^2
	movaps xmm1, xmm2
     if color
	ANDPS  xmm3, xmm7 ; xmm6 += (zx^2 + zy^2 < radius) ? 4.0 : 0.0;
	ADDPS  xmm6, xmm3
     end if
     if type = Julia
	ADDPS  xmm0, dqword[cx1]
	ADDPS  xmm1, dqword[cy1]
     else if type = Mandel
	ADDPS  xmm0, xmm4
	ADDPS  xmm1, xmm5
     end if
	dec ecx
	jnz ILOOP
EXIT:
     if color
	;21-22 cycles
	; each lane of xmm6 is a byte offset into the colour table
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi], eax
	SHUFPS xmm6, xmm6, 0E5h	; lane 1 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+4], eax
	SHUFPS xmm6, xmm6, 0E6h	; lane 2 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+8], eax
	SHUFPS xmm6, xmm6, 0E7h	; lane 3 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+12], eax
     else
	ANDNPS	xmm3, dqword[maskbw]	; escaped lanes -> maskbw, inside lanes -> 0
	MOVAPS	[edi], xmm3		; aligned store: [bits] must be 16-byte aligned
     end if
	add edi, 16
	ADDPS  xmm4, dqword[dx1]
	sub ebx, 4
	jnz XLOOP

	ADDPS  xmm5, dqword[dy1]
	sub edx, 1
	jnz YLOOP

	pop	edi esi ebx

}

; MOVADD
; JuliaMandelPaintMOVADD color, type
; Renders the fractal four pixels at a time with packed single-precision SSE.
; This variant doubles zy with an ADDPS before multiplying it by zx.
;   color - nonzero: each pixel is fetched from the colour table at [a] using
;           the byte offset 4.0 * (iterations the point stayed inside radius);
;           zero: black/grey pixels are built from the final compare mask
;   type  - Julia (c = cx2, cy2) or Mandel (c = the pixel's own coordinate)
; Reads w, h, LEFT, TOP, dx2, dy2, cx2, cy2, a, bits; uses cx1, cy1, dx1, dy1
; and left1 as 16-byte vectors; writes w*h dwords starting at [bits].
; w is consumed four pixels per step, so it must be a multiple of 4.
; Clobbers eax, ecx, edx, xmm0-xmm7 and flags (ebx, esi, edi are saved).
macro JuliaMandelPaintMOVADD color, type
{
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
	push	ebx esi edi
	mov edx, [h]
	mov esi, [a]
	mov edi, [bits]
     if type = Julia
	copyscr xmm0, cx2
	copyscr xmm1, cy2
     end if
	copyscr xmm2, dx2
	copyscr xmm3, dy2
     if type = Julia
	MOVAPS dqword[cx1], xmm0
	MOVAPS dqword[cy1], xmm1
     end if
	MOVAPS dqword[dy1], xmm3
	copyscr xmm4, LEFT
	copyscr xmm5, TOP
	MOVAPS	xmm0, xmm2	      ; build xmm0 = 0 | dx2 | dx2 * 2 | dx2 * 3
	ANDPS	xmm0, dqword[mask1]
	MOVAPS	xmm1, xmm2
	ANDPS	xmm1, dqword[mask2]
	ADDPS	xmm0, xmm1
	ADDPS	xmm0, xmm2
	ANDPS	xmm0, dqword[mask]
	ADDPS	xmm4, xmm0	      ; xmm4 = x coordinates of the first four pixels
	MOVAPS	dqword[left1], xmm4   ; every row restarts from this vector
	ADDPS	xmm2, xmm2
	ADDPS	xmm2, xmm2
	MOVAPS	dqword[dx1], xmm2     ; dx1 = 4 * dx2, the step between pixel groups
	MOVAPS	xmm7, dqword[radius]
	JMP	BSTART
	; xmm0 = zx    xmm1 = zy    xmm2 = tmp    xmm3 = tmp    xmm4 = pixel x (4 lanes)    xmm5 = pixel y    xmm6 = result   xmm7 = 4.0
	; eax = tmp    ebx = x      ecx = i counter      edx = y      edi = bits pointer    esi = color table
YLOOP:
	MOVAPS xmm4, dqword[left1]
BSTART:
	mov ebx, [w]
XLOOP:
	MOVAPS xmm0,xmm4
	XORPS  xmm6,xmm6
	MOVAPS xmm1,xmm5
	mov ecx, ITER
ILOOP:
	; xmm0 = zx             xmm1 = zy
	MOVAPS xmm2, xmm0
	MULPS  xmm0, xmm0
	MOVAPS xmm3, xmm1
	ADDPS  xmm1, xmm1
	MULPS  xmm3, xmm3
	; xmm0 = zx^2           xmm1 = 2 * zy      xmm2 = zx           xmm3 = zy^2
	MULPS  xmm1, xmm2
	MOVAPS xmm2, xmm0
	ADDPS  xmm2, xmm3
	; xmm0 = zx^2 (zy^2 is subtracted below)    xmm1 = 2*zx*zy     xmm2 = zx^2 + zy^2  xmm3 = zy^2
	CMPLEPS  xmm2, xmm7	; xmm2 = per-lane mask: still inside the radius
	SUBPS  xmm0, xmm3
	MOVMSKPS eax, xmm2
	test eax, eax
	jz EXIT			; all four points have escaped
     if color
	ANDPS  xmm2, xmm7 ; xmm6 += (zx^2 + zy^2 <= radius) ? 4.0 : 0.0;
	ADDPS  xmm6, xmm2
     end if
     if type = Julia
	ADDPS  xmm1, dqword[cy1]
	ADDPS  xmm0, dqword[cx1]
     else if type = Mandel
	ADDPS  xmm1, xmm5
	ADDPS  xmm0, xmm4
     end if
	sub ecx, 1
	jnz ILOOP
EXIT:
     if color
	;21-22 cycles
	; each lane of xmm6 is a byte offset into the colour table
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi], eax
	SHUFPS xmm6, xmm6, 0E5h	; lane 1 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+4], eax
	SHUFPS xmm6, xmm6, 0E6h	; lane 2 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+8], eax
	SHUFPS xmm6, xmm6, 0E7h	; lane 3 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+12], eax
     else
	; escaped lanes -> maskbw, inside lanes -> 0; aligned store to [edi]
	ANDNPS	xmm2, dqword[maskbw]
	MOVAPS	[edi], xmm2
     end if
	add edi, 16
	ADDPS  xmm4, dqword[dx1]
	sub ebx, 4
	jnz XLOOP

	ADDPS  xmm5, dqword[dy1]
	sub edx, 1
	jnz YLOOP

	pop	edi esi ebx

}


;MOVADD2
; JuliaMandelPaintMOVADD2 color, type
; Renders the fractal four pixels at a time with packed single-precision SSE.
; Same arithmetic as MOVADD, but c is added before the exit test and the
; instructions are interleaved differently.
;   color - nonzero: each pixel is fetched from the colour table at [a] using
;           the byte offset 4.0 * (iterations the point stayed inside radius);
;           zero: black/grey pixels are built from the final compare mask
;   type  - Julia (c = cx2, cy2) or Mandel (c = the pixel's own coordinate)
; Reads w, h, LEFT, TOP, dx2, dy2, cx2, cy2, a, bits; uses cx1, cy1, dx1, dy1
; and left1 as 16-byte vectors; writes w*h dwords starting at [bits].
; w is consumed four pixels per step, so it must be a multiple of 4.
; Clobbers eax, ecx, edx, xmm0-xmm7 and flags (ebx, esi, edi are saved).
; NOTE(review): the trailing "a - b" numbers look like the author's estimated
; start/finish clocks for each instruction - not verified here.
macro JuliaMandelPaintMOVADD2 color, type
{
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
	push	ebx esi edi
	mov edx, [h]
	mov esi, [a]
	mov edi, [bits]
     if type = Julia
	copyscr xmm0, cx2
	copyscr xmm1, cy2
     end if
	copyscr xmm2, dx2
	copyscr xmm3, dy2
     if type = Julia
	MOVAPS dqword[cx1], xmm0
	MOVAPS dqword[cy1], xmm1
     end if
	MOVAPS dqword[dy1], xmm3
	copyscr xmm4, LEFT
	copyscr xmm5, TOP
	MOVAPS	xmm0, xmm2	      ; build xmm0 = 0 | dx2 | dx2 * 2 | dx2 * 3
	ANDPS	xmm0, dqword[mask1]
	MOVAPS	xmm1, xmm2
	ANDPS	xmm1, dqword[mask2]
	ADDPS	xmm0, xmm1
	ADDPS	xmm0, xmm2
	ANDPS	xmm0, dqword[mask]
	ADDPS	xmm4, xmm0	      ; xmm4 = x coordinates of the first four pixels
	MOVAPS	dqword[left1], xmm4   ; every row restarts from this vector
	ADDPS	xmm2, xmm2
	ADDPS	xmm2, xmm2
	MOVAPS	dqword[dx1], xmm2     ; dx1 = 4 * dx2, the step between pixel groups
	MOVAPS	xmm7, dqword[radius]
	JMP	BSTART
	; xmm0 = zx    xmm1 = zy    xmm2 = tmp    xmm3 = tmp    xmm4 = pixel x (4 lanes)    xmm5 = pixel y    xmm6 = result   xmm7 = 4.0
	; eax = tmp    ebx = x      ecx = i counter      edx = y      edi = bits pointer    esi = color table
YLOOP:
	MOVAPS xmm4, dqword[left1]
BSTART:
	mov ebx, [w]
XLOOP:
	MOVAPS xmm0,xmm4
	XORPS  xmm6,xmm6
	MOVAPS xmm1,xmm5
	mov ecx, ITER
ILOOP:
	; xmm0 = zx             xmm1 = zy
	MOVAPS xmm2, xmm0	;  0 -  6
	MULPS  xmm0, xmm0	;  0 -  6
	MOVAPS xmm3, xmm1	;  1 -  7
	ADDPS  xmm1, xmm1	;  1 -  5
	MULPS  xmm1, xmm2	;  6 - 12
	MOVAPS xmm2, xmm0	;  7 - 13
	MULPS  xmm3, xmm3	;  8 - 14
	; xmm0 = zx^2           xmm1 = 2*zx*zy     xmm2 = zx^2         xmm3 = zy^2
     if type = Julia
	ADDPS  xmm1, dqword[cy1]
     else if type = Mandel
	ADDPS  xmm1, xmm5	; 12 - 16
     end if
	ADDPS  xmm2, xmm3	; 14 - 18
	SUBPS  xmm0, xmm3	; 16 - 20
	; xmm0 = zx^2 - zy^2    xmm1 = 2*zx*zy + cy  xmm2 = zx^2 + zy^2  xmm3 = zy^2
	CMPLEPS  xmm2, xmm7	; 18 - 22
     if type = Julia
	ADDPS  xmm0, dqword[cx1]
     else if type = Mandel
	ADDPS  xmm0, xmm4	; 20 - 24
     end if
	MOVMSKPS eax, xmm2	; 22 - 28
	test eax, eax
	jz EXIT			; all four points have escaped
     if color
	ANDPS  xmm2, xmm7	; 23 - 25 ; xmm6 += (zx^2 + zy^2 <= radius) ? 4.0 : 0.0;
	ADDPS  xmm6, xmm2	; 26 - 30
     end if
	sub ecx, 1
	jnz ILOOP
EXIT:
     if color
	;21-22 cycles
	; each lane of xmm6 is a byte offset into the colour table
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi], eax
	SHUFPS xmm6, xmm6, 0E5h	; lane 1 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+4], eax
	SHUFPS xmm6, xmm6, 0E6h	; lane 2 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+8], eax
	SHUFPS xmm6, xmm6, 0E7h	; lane 3 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+12], eax
     else
	; escaped lanes -> maskbw, inside lanes -> 0; aligned store to [edi]
	ANDNPS	xmm2, dqword[maskbw]
	MOVAPS	[edi], xmm2
     end if
	add edi, 16
	ADDPS  xmm4, dqword[dx1]
	sub ebx, 4
	jnz XLOOP

	ADDPS  xmm5, dqword[dy1]
	sub edx, 1
	jnz YLOOP

	pop	edi esi ebx

}

;MOVADD2ps2dq
; JuliaMandelPaintMOVADD2ps2dq color, type
; Renders the fractal four pixels at a time with packed single-precision SSE.
; Same iteration loop as MOVADD2; the colour lookup converts all four
; counters at once with CVTPS2DQ (an SSE2 instruction) through [scratch].
;   color - nonzero: each pixel is fetched from the colour table at [a] using
;           the byte offset 4.0 * (iterations the point stayed inside radius);
;           zero: black/grey pixels are built from the final compare mask
;   type  - Julia (c = cx2, cy2) or Mandel (c = the pixel's own coordinate)
; Reads w, h, LEFT, TOP, dx2, dy2, cx2, cy2, a, bits; uses cx1, cy1, dx1, dy1,
; left1 and scratch as 16-byte vectors; writes w*h dwords starting at [bits].
; w is consumed four pixels per step, so it must be a multiple of 4.
; Clobbers eax, ecx, edx, xmm0-xmm7 and flags (ebx, esi, edi are saved).
; NOTE(review): the trailing "a - b" numbers look like the author's estimated
; start/finish clocks for each instruction - not verified here.
macro JuliaMandelPaintMOVADD2ps2dq color, type
{
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
	push	ebx esi edi
	mov edx, [h]
	mov esi, [a]
	mov edi, [bits]
     if type = Julia
	copyscr xmm0, cx2
	copyscr xmm1, cy2
     end if
	copyscr xmm2, dx2
	copyscr xmm3, dy2
     if type = Julia
	MOVAPS dqword[cx1], xmm0
	MOVAPS dqword[cy1], xmm1
     end if
	MOVAPS dqword[dy1], xmm3
	copyscr xmm4, LEFT
	copyscr xmm5, TOP
	MOVAPS	xmm0, xmm2	      ; build xmm0 = 0 | dx2 | dx2 * 2 | dx2 * 3
	ANDPS	xmm0, dqword[mask1]
	MOVAPS	xmm1, xmm2
	ANDPS	xmm1, dqword[mask2]
	ADDPS	xmm0, xmm1
	ADDPS	xmm0, xmm2
	ANDPS	xmm0, dqword[mask]
	ADDPS	xmm4, xmm0	      ; xmm4 = x coordinates of the first four pixels
	MOVAPS	dqword[left1], xmm4   ; every row restarts from this vector
	ADDPS	xmm2, xmm2
	ADDPS	xmm2, xmm2
	MOVAPS	dqword[dx1], xmm2     ; dx1 = 4 * dx2, the step between pixel groups
	MOVAPS	xmm7, dqword[radius]
	JMP	BSTART
	; xmm0 = zx    xmm1 = zy    xmm2 = tmp    xmm3 = tmp    xmm4 = pixel x (4 lanes)    xmm5 = pixel y    xmm6 = result   xmm7 = 4.0
	; eax = tmp    ebx = x      ecx = i counter      edx = y      edi = bits pointer    esi = color table
YLOOP:
	MOVAPS xmm4, dqword[left1]
BSTART:
	mov ebx, [w]
XLOOP:
	MOVAPS xmm0,xmm4
	XORPS  xmm6,xmm6
	MOVAPS xmm1,xmm5
	mov ecx, ITER
ILOOP:
	; xmm0 = zx             xmm1 = zy
	MOVAPS xmm2, xmm0	;  0 -  6
	MULPS  xmm0, xmm0	;  0 -  6
	MOVAPS xmm3, xmm1	;  1 -  7
	ADDPS  xmm1, xmm1	;  1 -  5
	MULPS  xmm1, xmm2	;  6 - 12
	MOVAPS xmm2, xmm0	;  7 - 13
	MULPS  xmm3, xmm3	;  8 - 14
	; xmm0 = zx^2           xmm1 = 2*zx*zy     xmm2 = zx^2         xmm3 = zy^2
     if type = Julia
	ADDPS  xmm1, dqword[cy1]
     else if type = Mandel
	ADDPS  xmm1, xmm5	; 12 - 16
     end if
	ADDPS  xmm2, xmm3	; 14 - 18
	SUBPS  xmm0, xmm3	; 16 - 20
	; xmm0 = zx^2 - zy^2    xmm1 = 2*zx*zy + cy  xmm2 = zx^2 + zy^2  xmm3 = zy^2
	CMPLEPS  xmm2, xmm7	; 18 - 22
     if type = Julia
	ADDPS  xmm0, dqword[cx1]
     else if type = Mandel
	ADDPS  xmm0, xmm4	; 20 - 24
     end if
	MOVMSKPS eax, xmm2	; 22 - 28
	test eax, eax
	jz EXIT			; all four points have escaped
     if color
	ANDPS  xmm2, xmm7	; 23 - 25 ; xmm6 += (zx^2 + zy^2 <= radius) ? 4.0 : 0.0;
	ADDPS  xmm6, xmm2	; 26 - 30
     end if
	sub ecx, 1
	jnz ILOOP
EXIT:
     if color
	; convert the four float byte offsets to integers in one step and
	; read them back from memory; ecx is free here, XLOOP reloads it
	CVTPS2DQ xmm6, xmm6
	MOVAPS dqword[scratch], xmm6
	mov eax, [scratch]
	mov ecx, [scratch+4]
	mov eax, [esi + eax]
	mov ecx, [esi + ecx]
	mov [edi], eax
	mov [edi+4], ecx
	mov eax, [scratch+8]
	mov ecx, [scratch+12]
	mov eax, [esi + eax]
	mov ecx, [esi + ecx]
	mov [edi+8], eax
	mov [edi+12], ecx
     else
	; escaped lanes -> maskbw, inside lanes -> 0; aligned store to [edi]
	ANDNPS	xmm2, dqword[maskbw]
	MOVAPS	[edi], xmm2
     end if
	add edi, 16
	ADDPS  xmm4, dqword[dx1]
	sub ebx, 4
	jnz XLOOP

	ADDPS  xmm5, dqword[dy1]
	sub edx, 1
	jnz YLOOP

	pop	edi esi ebx

}


;MOVADD3
; JuliaMandelPaintMOVADD3 color, type
; Renders the fractal four pixels at a time with packed single-precision SSE.
; Same as MOVADD2 except that SUBPS (zx^2 - zy^2) is issued before the
; ADDPS that forms zx^2 + zy^2.
;   color - nonzero: each pixel is fetched from the colour table at [a] using
;           the byte offset 4.0 * (iterations the point stayed inside radius);
;           zero: black/grey pixels are built from the final compare mask
;   type  - Julia (c = cx2, cy2) or Mandel (c = the pixel's own coordinate)
; Reads w, h, LEFT, TOP, dx2, dy2, cx2, cy2, a, bits; uses cx1, cy1, dx1, dy1
; and left1 as 16-byte vectors; writes w*h dwords starting at [bits].
; w is consumed four pixels per step, so it must be a multiple of 4.
; Clobbers eax, ecx, edx, xmm0-xmm7 and flags (ebx, esi, edi are saved).
; NOTE(review): the trailing "a - b" numbers look like the author's estimated
; start/finish clocks for each instruction - not verified here.
macro JuliaMandelPaintMOVADD3 color, type
{
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
	push	ebx esi edi
	mov edx, [h]
	mov esi, [a]
	mov edi, [bits]
     if type = Julia
	copyscr xmm0, cx2
	copyscr xmm1, cy2
     end if
	copyscr xmm2, dx2
	copyscr xmm3, dy2
     if type = Julia
	MOVAPS dqword[cx1], xmm0
	MOVAPS dqword[cy1], xmm1
     end if
	MOVAPS dqword[dy1], xmm3
	copyscr xmm4, LEFT
	copyscr xmm5, TOP
	MOVAPS	xmm0, xmm2	      ; build xmm0 = 0 | dx2 | dx2 * 2 | dx2 * 3
	ANDPS	xmm0, dqword[mask1]
	MOVAPS	xmm1, xmm2
	ANDPS	xmm1, dqword[mask2]
	ADDPS	xmm0, xmm1
	ADDPS	xmm0, xmm2
	ANDPS	xmm0, dqword[mask]
	ADDPS	xmm4, xmm0	      ; xmm4 = x coordinates of the first four pixels
	MOVAPS	dqword[left1], xmm4   ; every row restarts from this vector
	ADDPS	xmm2, xmm2
	ADDPS	xmm2, xmm2
	MOVAPS	dqword[dx1], xmm2     ; dx1 = 4 * dx2, the step between pixel groups
	MOVAPS	xmm7, dqword[radius]
	JMP	BSTART
	; xmm0 = zx    xmm1 = zy    xmm2 = tmp    xmm3 = tmp    xmm4 = pixel x (4 lanes)    xmm5 = pixel y    xmm6 = result   xmm7 = 4.0
	; eax = tmp    ebx = x      ecx = i counter      edx = y      edi = bits pointer    esi = color table
YLOOP:
	MOVAPS xmm4, dqword[left1]
BSTART:
	mov ebx, [w]
XLOOP:
	MOVAPS xmm0,xmm4
	XORPS  xmm6,xmm6
	MOVAPS xmm1,xmm5
	mov ecx, ITER
ILOOP:
	; xmm0 = zx             xmm1 = zy
	MOVAPS xmm2, xmm0	;  0 -  6
	MULPS  xmm0, xmm0	;  0 -  6
	MOVAPS xmm3, xmm1	;  1 -  7
	ADDPS  xmm1, xmm1	;  1 -  5
	MULPS  xmm1, xmm2	;  6 - 12
	MOVAPS xmm2, xmm0	;  7 - 13
	MULPS  xmm3, xmm3	;  8 - 14
	; xmm0 = zx^2           xmm1 = 2*zx*zy     xmm2 = zx^2         xmm3 = zy^2
     if type = Julia
	ADDPS  xmm1, dqword[cy1]
     else if type = Mandel
	ADDPS  xmm1, xmm5	; 12 - 16
     end if
	SUBPS  xmm0, xmm3	; 14 - 18
	ADDPS  xmm2, xmm3	; 16 - 20
	; xmm0 = zx^2 - zy^2    xmm1 = 2*zx*zy + cy  xmm2 = zx^2 + zy^2  xmm3 = zy^2
	CMPLEPS  xmm2, xmm7	; 20 - 24
     if type = Julia
	ADDPS  xmm0, dqword[cx1]
     else if type = Mandel
	ADDPS  xmm0, xmm4	; 18 - 22
     end if
	MOVMSKPS eax, xmm2	; 24 - 30
	test eax, eax
	jz EXIT			; all four points have escaped
     if color
	ANDPS  xmm2, xmm7	; 25 - 27 ; xmm6 += (zx^2 + zy^2 <= radius) ? 4.0 : 0.0;
	ADDPS  xmm6, xmm2	; 28 - 32
     end if
	sub ecx, 1
	jnz ILOOP
EXIT:
     if color
	;21-22 cycles
	; each lane of xmm6 is a byte offset into the colour table
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi], eax
	SHUFPS xmm6, xmm6, 0E5h	; lane 1 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+4], eax
	SHUFPS xmm6, xmm6, 0E6h	; lane 2 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+8], eax
	SHUFPS xmm6, xmm6, 0E7h	; lane 3 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+12], eax
     else
	; escaped lanes -> maskbw, inside lanes -> 0; aligned store to [edi]
	ANDNPS	xmm2, dqword[maskbw]
	MOVAPS	[edi], xmm2
     end if
	add edi, 16
	ADDPS  xmm4, dqword[dx1]
	sub ebx, 4
	jnz XLOOP

	ADDPS  xmm5, dqword[dy1]
	sub edx, 1
	jnz YLOOP

	pop	edi esi ebx

}

;MOVADD4
; JuliaMandelPaintMOVADD4 color, type
; Renders the fractal four pixels at a time with packed single-precision SSE.
; Same as MOVADD3 except that cx is added to the new zx before CMPLEPS
; instead of after it.
;   color - nonzero: each pixel is fetched from the colour table at [a] using
;           the byte offset 4.0 * (iterations the point stayed inside radius);
;           zero: black/grey pixels are built from the final compare mask
;   type  - Julia (c = cx2, cy2) or Mandel (c = the pixel's own coordinate)
; Reads w, h, LEFT, TOP, dx2, dy2, cx2, cy2, a, bits; uses cx1, cy1, dx1, dy1
; and left1 as 16-byte vectors; writes w*h dwords starting at [bits].
; w is consumed four pixels per step, so it must be a multiple of 4.
; Clobbers eax, ecx, edx, xmm0-xmm7 and flags (ebx, esi, edi are saved).
; NOTE(review): the trailing "a - b" numbers look like the author's estimated
; start/finish clocks for each instruction - not verified here.
macro JuliaMandelPaintMOVADD4 color, type
{
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
	push	ebx esi edi
	mov edx, [h]
	mov esi, [a]
	mov edi, [bits]
     if type = Julia
	copyscr xmm0, cx2
	copyscr xmm1, cy2
     end if
	copyscr xmm2, dx2
	copyscr xmm3, dy2
     if type = Julia
	MOVAPS dqword[cx1], xmm0
	MOVAPS dqword[cy1], xmm1
     end if
	MOVAPS dqword[dy1], xmm3
	copyscr xmm4, LEFT
	copyscr xmm5, TOP
	MOVAPS	xmm0, xmm2	      ; build xmm0 = 0 | dx2 | dx2 * 2 | dx2 * 3
	ANDPS	xmm0, dqword[mask1]
	MOVAPS	xmm1, xmm2
	ANDPS	xmm1, dqword[mask2]
	ADDPS	xmm0, xmm1
	ADDPS	xmm0, xmm2
	ANDPS	xmm0, dqword[mask]
	ADDPS	xmm4, xmm0	      ; xmm4 = x coordinates of the first four pixels
	MOVAPS	dqword[left1], xmm4   ; every row restarts from this vector
	ADDPS	xmm2, xmm2
	ADDPS	xmm2, xmm2
	MOVAPS	dqword[dx1], xmm2     ; dx1 = 4 * dx2, the step between pixel groups
	MOVAPS	xmm7, dqword[radius]
	JMP	BSTART
	; xmm0 = zx    xmm1 = zy    xmm2 = tmp    xmm3 = tmp    xmm4 = pixel x (4 lanes)    xmm5 = pixel y    xmm6 = result   xmm7 = 4.0
	; eax = tmp    ebx = x      ecx = i counter      edx = y      edi = bits pointer    esi = color table
YLOOP:
	MOVAPS xmm4, dqword[left1]
BSTART:
	mov ebx, [w]
XLOOP:
	MOVAPS xmm0,xmm4
	XORPS  xmm6,xmm6
	MOVAPS xmm1,xmm5
	mov ecx, ITER
ILOOP:
	; xmm0 = zx             xmm1 = zy
	MOVAPS xmm2, xmm0	;  0 -  6
	MULPS  xmm0, xmm0	;  0 -  6
	MOVAPS xmm3, xmm1	;  1 -  7
	ADDPS  xmm1, xmm1	;  1 -  5
	MULPS  xmm1, xmm2	;  6 - 12
	MOVAPS xmm2, xmm0	;  7 - 13
	MULPS  xmm3, xmm3	;  8 - 14
	; xmm0 = zx^2           xmm1 = 2*zx*zy     xmm2 = zx^2         xmm3 = zy^2
     if type = Julia
	ADDPS  xmm1, dqword[cy1]
     else if type = Mandel
	ADDPS  xmm1, xmm5	; 12 - 16
     end if
	SUBPS  xmm0, xmm3	; 14 - 18
	ADDPS  xmm2, xmm3	; 16 - 20
	; xmm0 = zx^2 - zy^2    xmm1 = 2*zx*zy + cy  xmm2 = zx^2 + zy^2  xmm3 = zy^2
     if type = Julia
	ADDPS  xmm0, dqword[cx1]
     else if type = Mandel
	ADDPS  xmm0, xmm4	; 18 - 22
     end if
	CMPLEPS  xmm2, xmm7	; 20 - 24
	MOVMSKPS eax, xmm2	; 24 - 30
	test eax, eax
	jz EXIT			; all four points have escaped
     if color
	ANDPS  xmm2, xmm7	; 25 - 27 ; xmm6 += (zx^2 + zy^2 <= radius) ? 4.0 : 0.0;
	ADDPS  xmm6, xmm2	; 28 - 32
     end if
	sub ecx, 1
	jnz ILOOP
EXIT:
     if color
	;21-22 cycles
	; each lane of xmm6 is a byte offset into the colour table
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi], eax
	SHUFPS xmm6, xmm6, 0E5h	; lane 1 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+4], eax
	SHUFPS xmm6, xmm6, 0E6h	; lane 2 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+8], eax
	SHUFPS xmm6, xmm6, 0E7h	; lane 3 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+12], eax
     else
	; escaped lanes -> maskbw, inside lanes -> 0; aligned store to [edi]
	ANDNPS	xmm2, dqword[maskbw]
	MOVAPS	[edi], xmm2
     end if
	add edi, 16
	ADDPS  xmm4, dqword[dx1]
	sub ebx, 4
	jnz XLOOP

	ADDPS  xmm5, dqword[dy1]
	sub edx, 1
	jnz YLOOP

	pop	edi esi ebx

}

;vod
; JuliaMandelPaintvod color, type
; Renders the fractal four pixels at a time with packed single-precision SSE.
; This variant squares zy first and multiplies zx*zy before doubling it.
;   color - nonzero: each pixel is fetched from the colour table at [a] using
;           the byte offset 4.0 * (iterations the point stayed inside radius);
;           zero: black/grey pixels are built from the final compare mask
;   type  - Julia (c = cx2, cy2) or Mandel (c = the pixel's own coordinate)
; Reads w, h, LEFT, TOP, dx2, dy2, cx2, cy2, a, bits; uses cx1, cy1, dx1, dy1
; and left1 as 16-byte vectors; writes w*h dwords starting at [bits].
; w is consumed four pixels per step, so it must be a multiple of 4.
; Clobbers eax, ecx, edx, xmm0-xmm7 and flags (ebx, esi, edi are saved).
; NOTE(review): the trailing "a - b  unit" notes look like the author's
; estimated clocks and execution unit per instruction - not verified here.
macro JuliaMandelPaintvod color, type
{
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
	push	ebx esi edi
	mov edx, [h]
	mov esi, [a]
	mov edi, [bits]
     if type = Julia
	copyscr xmm0, cx2
	copyscr xmm1, cy2
     end if
	copyscr xmm2, dx2
	copyscr xmm3, dy2
     if type = Julia
	MOVAPS dqword[cx1], xmm0
	MOVAPS dqword[cy1], xmm1
     end if
	MOVAPS dqword[dy1], xmm3
	copyscr xmm4, LEFT
	copyscr xmm5, TOP
	MOVAPS	xmm0, xmm2	      ; build xmm0 = 0 | dx2 | dx2 * 2 | dx2 * 3
	ANDPS	xmm0, dqword[mask1]
	MOVAPS	xmm1, xmm2
	ANDPS	xmm1, dqword[mask2]
	ADDPS	xmm0, xmm1
	ADDPS	xmm0, xmm2
	ANDPS	xmm0, dqword[mask]
	ADDPS	xmm4, xmm0	      ; xmm4 = x coordinates of the first four pixels
	MOVAPS	dqword[left1], xmm4   ; every row restarts from this vector
	ADDPS	xmm2, xmm2
	ADDPS	xmm2, xmm2
	MOVAPS	dqword[dx1], xmm2     ; dx1 = 4 * dx2, the step between pixel groups
	MOVAPS	xmm7, dqword[radius]
	JMP	BSTART
	; xmm0 = zx    xmm1 = zy    xmm2 = tmp    xmm3 = tmp    xmm4 = pixel x (4 lanes)    xmm5 = pixel y    xmm6 = result   xmm7 = 4.0
	; eax = tmp    ebx = x      ecx = i counter      edx = y      edi = bits pointer    esi = color table
YLOOP:
	MOVAPS xmm4, dqword[left1]
BSTART:
	mov ebx, [w]
XLOOP:
	MOVAPS xmm0,xmm4
	XORPS  xmm6,xmm6
	MOVAPS xmm1,xmm5
	mov ecx, ITER
ILOOP:
	; xmm0 = zx             xmm1 = zy
	MOVAPS xmm2, xmm1	  ; 0  - 6       mov
	MULPS  xmm2, xmm2	  ; 6  - 12      fp:mul
	MULPS  xmm1, xmm0	  ; 0  - 6       fp:mul

	; xmm1 = zx*zy          xmm2 = zy^2
	MOVAPS xmm3, xmm2	  ; 13 - 19      mov
	MULPS  xmm0, xmm0	  ; 2  - 8       fp:mul
	ADDPS  xmm2, xmm0	  ; 12 - 16      fp:add

	; xmm0 = zx^2           xmm2 = zx^2 + zy^2  xmm3 = zy^2
	CMPLEPS  xmm2, xmm7	  ; 16 - 20      fp:add
	ADDPS  xmm1, xmm1	  ; 6  - 10      fp:add
	SUBPS  xmm0, xmm3	  ; 19 - 23      fp:add
     if type = Julia
	ADDPS  xmm1, dqword[cy1]  ; 10 - 14      fp:add
	ADDPS  xmm0, dqword[cx1]  ; 23 - 27      fp:add
     else if type = Mandel
	ADDPS  xmm1, xmm5
	ADDPS  xmm0, xmm4
     end if
	MOVMSKPS eax, xmm2	  ; 20 - 26      fp
	test eax, eax		  ; 26 - 27      alu0/1
	jz EXIT 		  ; 26 - 27      alu0/1
     if color
	; xmm6 += (zx^2 + zy^2 <= radius) ? 4.0 : 0.0
	ANDPS  xmm2, xmm7	  ; 21 - 23      mmx:alu
	ADDPS  xmm6, xmm2	  ; 24 - 28      fp:add
     end if
	sub ecx, 1
	jnz ILOOP
EXIT:
     if color
	; each lane of xmm6 is a byte offset into the colour table
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi], eax
	SHUFPS xmm6, xmm6, 0E5h	; lane 1 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+4], eax
	SHUFPS xmm6, xmm6, 0E6h	; lane 2 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+8], eax
	SHUFPS xmm6, xmm6, 0E7h	; lane 3 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+12], eax
	;CVTPS2DQ xmm6, xmm6
	;MOVAPS  [edi], xmm6
     else
	; escaped lanes -> maskbw, inside lanes -> 0; aligned store to [edi]
	ANDNPS	xmm2, dqword[maskbw]
	MOVAPS	[edi], xmm2
     end if
	add edi, 16
	ADDPS  xmm4, dqword[dx1]
	sub ebx, 4
	jnz XLOOP

	ADDPS  xmm5, dqword[dy1]
	sub edx, 1
	jnz YLOOP

	pop	edi esi ebx

}

;vod2
; JuliaMandelPaintvod2 color, type
; Renders the fractal four pixels at a time with packed single-precision SSE.
; Same as vod except that the doubling ADDPS xmm1, xmm1 is issued before
; CMPLEPS instead of after it.
;   color - nonzero: each pixel is fetched from the colour table at [a] using
;           the byte offset 4.0 * (iterations the point stayed inside radius);
;           zero: black/grey pixels are built from the final compare mask
;   type  - Julia (c = cx2, cy2) or Mandel (c = the pixel's own coordinate)
; Reads w, h, LEFT, TOP, dx2, dy2, cx2, cy2, a, bits; uses cx1, cy1, dx1, dy1
; and left1 as 16-byte vectors; writes w*h dwords starting at [bits].
; w is consumed four pixels per step, so it must be a multiple of 4.
; Clobbers eax, ecx, edx, xmm0-xmm7 and flags (ebx, esi, edi are saved).
; NOTE(review): the trailing "a - b  unit" notes were copied from vod and
; were apparently not updated for the reordered instructions - unverified.
macro JuliaMandelPaintvod2 color, type
{
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
	push	ebx esi edi
	mov edx, [h]
	mov esi, [a]
	mov edi, [bits]
     if type = Julia
	copyscr xmm0, cx2
	copyscr xmm1, cy2
     end if
	copyscr xmm2, dx2
	copyscr xmm3, dy2
     if type = Julia
	MOVAPS dqword[cx1], xmm0
	MOVAPS dqword[cy1], xmm1
     end if
	MOVAPS dqword[dy1], xmm3
	copyscr xmm4, LEFT
	copyscr xmm5, TOP
	MOVAPS	xmm0, xmm2	      ; build xmm0 = 0 | dx2 | dx2 * 2 | dx2 * 3
	ANDPS	xmm0, dqword[mask1]
	MOVAPS	xmm1, xmm2
	ANDPS	xmm1, dqword[mask2]
	ADDPS	xmm0, xmm1
	ADDPS	xmm0, xmm2
	ANDPS	xmm0, dqword[mask]
	ADDPS	xmm4, xmm0	      ; xmm4 = x coordinates of the first four pixels
	MOVAPS	dqword[left1], xmm4   ; every row restarts from this vector
	ADDPS	xmm2, xmm2
	ADDPS	xmm2, xmm2
	MOVAPS	dqword[dx1], xmm2     ; dx1 = 4 * dx2, the step between pixel groups
	MOVAPS	xmm7, dqword[radius]
	JMP	BSTART
	; xmm0 = zx    xmm1 = zy    xmm2 = tmp    xmm3 = tmp    xmm4 = pixel x (4 lanes)    xmm5 = pixel y    xmm6 = result   xmm7 = 4.0
	; eax = tmp    ebx = x      ecx = i counter      edx = y      edi = bits pointer    esi = color table
YLOOP:
	MOVAPS xmm4, dqword[left1]
BSTART:
	mov ebx, [w]
XLOOP:
	MOVAPS xmm0,xmm4
	XORPS  xmm6,xmm6
	MOVAPS xmm1,xmm5
	mov ecx, ITER
ILOOP:
	; xmm0 = zx             xmm1 = zy
	MOVAPS xmm2, xmm1	  ; 0  - 6       mov
	MULPS  xmm2, xmm2	  ; 6  - 12      fp:mul
	MULPS  xmm1, xmm0	  ; 0  - 6       fp:mul

	; xmm1 = zx*zy          xmm2 = zy^2
	MOVAPS xmm3, xmm2	  ; 13 - 19      mov
	MULPS  xmm0, xmm0	  ; 2  - 8       fp:mul
	ADDPS  xmm2, xmm0	  ; 12 - 16      fp:add

	; xmm0 = zx^2           xmm2 = zx^2 + zy^2  xmm3 = zy^2
	ADDPS  xmm1, xmm1	  ; 6  - 10      fp:add
	CMPLEPS  xmm2, xmm7	  ; 16 - 20      fp:add
	SUBPS  xmm0, xmm3	  ; 19 - 23      fp:add
     if type = Julia
	ADDPS  xmm1, dqword[cy1]  ; 10 - 14      fp:add
	ADDPS  xmm0, dqword[cx1]  ; 23 - 27      fp:add
     else if type = Mandel
	ADDPS  xmm1, xmm5
	ADDPS  xmm0, xmm4
     end if
	MOVMSKPS eax, xmm2	  ; 20 - 26      fp
	test eax, eax		  ; 26 - 27      alu0/1
	jz EXIT 		  ; 26 - 27      alu0/1
     if color
	; xmm6 += (zx^2 + zy^2 <= radius) ? 4.0 : 0.0
	ANDPS  xmm2, xmm7	  ; 21 - 23      mmx:alu
	ADDPS  xmm6, xmm2	  ; 24 - 28      fp:add
     end if
	sub ecx, 1
	jnz ILOOP
EXIT:
     if color
	; each lane of xmm6 is a byte offset into the colour table
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi], eax
	SHUFPS xmm6, xmm6, 0E5h	; lane 1 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+4], eax
	SHUFPS xmm6, xmm6, 0E6h	; lane 2 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+8], eax
	SHUFPS xmm6, xmm6, 0E7h	; lane 3 -> lane 0
	CVTSS2SI eax, xmm6
	mov eax, [esi + eax]
	mov [edi+12], eax
	;CVTPS2DQ xmm6, xmm6
	;MOVAPS  [edi], xmm6
     else
	; escaped lanes -> maskbw, inside lanes -> 0; aligned store to [edi]
	ANDNPS	xmm2, dqword[maskbw]
	MOVAPS	[edi], xmm2
     end if
	add edi, 16
	ADDPS  xmm4, dqword[dx1]
	sub ebx, 4
	jnz XLOOP

	ADDPS  xmm5, dqword[dy1]
	sub edx, 1
	jnz YLOOP

	pop	edi esi ebx

}

;vodps2dq
; JuliaMandelPaintvodps2dq color, type
; Renders the fractal four pixels at a time with packed single-precision SSE.
; Same iteration loop as vod; the colour lookup converts all four counters
; at once with CVTPS2DQ (an SSE2 instruction) through [scratch].
;   color - nonzero: each pixel is fetched from the colour table at [a] using
;           the byte offset 4.0 * (iterations the point stayed inside radius);
;           zero: black/grey pixels are built from the final compare mask
;   type  - Julia (c = cx2, cy2) or Mandel (c = the pixel's own coordinate)
; Reads w, h, LEFT, TOP, dx2, dy2, cx2, cy2, a, bits; uses cx1, cy1, dx1, dy1,
; left1 and scratch as 16-byte vectors; writes w*h dwords starting at [bits].
; w is consumed four pixels per step, so it must be a multiple of 4.
; Clobbers eax, ecx, edx, xmm0-xmm7 and flags (ebx, esi, edi are saved).
; NOTE(review): the trailing "a - b  unit" notes look like the author's
; estimated clocks and execution unit per instruction - not verified here.
macro JuliaMandelPaintvodps2dq color, type
{
local YLOOP, XLOOP, ILOOP, EXIT, BSTART
	push	ebx esi edi
	mov edx, [h]
	mov esi, [a]
	mov edi, [bits]
     if type = Julia
	copyscr xmm0, cx2
	copyscr xmm1, cy2
     end if
	copyscr xmm2, dx2
	copyscr xmm3, dy2
     if type = Julia
	MOVAPS dqword[cx1], xmm0
	MOVAPS dqword[cy1], xmm1
     end if
	MOVAPS dqword[dy1], xmm3
	copyscr xmm4, LEFT
	copyscr xmm5, TOP
	MOVAPS	xmm0, xmm2	      ; build xmm0 = 0 | dx2 | dx2 * 2 | dx2 * 3
	ANDPS	xmm0, dqword[mask1]
	MOVAPS	xmm1, xmm2
	ANDPS	xmm1, dqword[mask2]
	ADDPS	xmm0, xmm1
	ADDPS	xmm0, xmm2
	ANDPS	xmm0, dqword[mask]
	ADDPS	xmm4, xmm0	      ; xmm4 = x coordinates of the first four pixels
	MOVAPS	dqword[left1], xmm4   ; every row restarts from this vector
	ADDPS	xmm2, xmm2
	ADDPS	xmm2, xmm2
	MOVAPS	dqword[dx1], xmm2     ; dx1 = 4 * dx2, the step between pixel groups
	MOVAPS	xmm7, dqword[radius]
	JMP	BSTART
	; xmm0 = zx    xmm1 = zy    xmm2 = tmp    xmm3 = tmp    xmm4 = pixel x (4 lanes)    xmm5 = pixel y    xmm6 = result   xmm7 = 4.0
	; eax = tmp    ebx = x      ecx = i counter      edx = y      edi = bits pointer    esi = color table
YLOOP:
	MOVAPS xmm4, dqword[left1]
BSTART:
	mov ebx, [w]
XLOOP:
	MOVAPS xmm0,xmm4
	XORPS  xmm6,xmm6
	MOVAPS xmm1,xmm5
	mov ecx, ITER
ILOOP:
	; xmm0 = zx             xmm1 = zy
	MOVAPS xmm2, xmm1	  ; 0  - 6       mov
	MULPS  xmm2, xmm2	  ; 6  - 12      fp:mul
	MULPS  xmm1, xmm0	  ; 0  - 6       fp:mul

	; xmm1 = zx*zy          xmm2 = zy^2
	MOVAPS xmm3, xmm2	  ; 13 - 19      mov
	MULPS  xmm0, xmm0	  ; 2  - 8       fp:mul
	ADDPS  xmm2, xmm0	  ; 12 - 16      fp:add

	; xmm0 = zx^2           xmm2 = zx^2 + zy^2  xmm3 = zy^2
	CMPLEPS  xmm2, xmm7	  ; 16 - 20      fp:add
	ADDPS  xmm1, xmm1	  ; 6  - 10      fp:add
	SUBPS  xmm0, xmm3	  ; 19 - 23      fp:add
     if type = Julia
	ADDPS  xmm1, dqword[cy1]  ; 10 - 14      fp:add
	ADDPS  xmm0, dqword[cx1]  ; 23 - 27      fp:add
     else if type = Mandel
	ADDPS  xmm1, xmm5
	ADDPS  xmm0, xmm4
     end if
	MOVMSKPS eax, xmm2	  ; 20 - 26      fp
	test eax, eax		  ; 26 - 27      alu0/1
	jz EXIT 		  ; 26 - 27      alu0/1
     if color
	; xmm6 += (zx^2 + zy^2 <= radius) ? 4.0 : 0.0
	ANDPS  xmm2, xmm7	  ; 21 - 23      mmx:alu
	ADDPS  xmm6, xmm2	  ; 24 - 28      fp:add
     end if
	sub ecx, 1
	jnz ILOOP
EXIT:
     if color
	; convert the four float byte offsets to integers in one step and
	; read them back from memory; ecx is free here, XLOOP reloads it
	CVTPS2DQ xmm6, xmm6
	MOVAPS dqword[scratch], xmm6
	mov eax, [scratch]
	mov ecx, [scratch+4]
	mov eax, [esi + eax]
	mov ecx, [esi + ecx]
	mov [edi], eax
	mov [edi+4], ecx
	mov eax, [scratch+8]
	mov ecx, [scratch+12]
	mov eax, [esi + eax]
	mov ecx, [esi + ecx]
	mov [edi+8], eax
	mov [edi+12], ecx
	;CVTPS2DQ xmm6, xmm6
	;MOVAPS  [edi], xmm6
     else
	; escaped lanes -> maskbw, inside lanes -> 0; aligned store to [edi]
	ANDNPS	xmm2, dqword[maskbw]
	MOVAPS	[edi], xmm2
     end if
	add edi, 16
	ADDPS  xmm4, dqword[dx1]
	sub ebx, 4
	jnz XLOOP

	ADDPS  xmm5, dqword[dy1]
	sub edx, 1
	jnz YLOOP

	pop	edi esi ebx

}
; testone name
; Times JuliaMandelPaint<name> once as a Julia set and once as a Mandelbrot
; set (both in colour mode), formats the two tick counts with the m_<name>
; template and writes the line to the console and to the log file.
; Fixes:
;  - wsprintf is a C-convention (caller cleans the stack) varargs function, so
;    it is called with cinvoke; with invoke every call left 16 bytes on the stack.
;  - the length returned by wsprintf is used for the log write as well, so the
;    log no longer depends on WriteConsole having filled [read].
macro testone name
{
	startm
	JuliaMandelPaint#name 1, Julia
	endm
	mov [jt],eax			; ticks for the Julia run
	startm
	JuliaMandelPaint#name 1, Mandel
	endm				; eax = ticks for the Mandelbrot run
	cinvoke wsprintf,buff,m_#name,[jt],eax
	push eax			; keep the formatted length across WriteConsole
	invoke WriteConsole,[hOut],buff,eax,read,0
	pop eax
	invoke WriteFile,[hFile],buff,eax,read,0
}

; Program entry: builds "<directory of the executable>\tst.log" in buff, opens
; it for appending, prints the CPU description, checks for SSE/SSE2, then runs
; every benchmark variant [i] times at raised priority.
invoke GetStdHandle, STD_OUTPUT_HANDLE
mov [hOut], eax

    ;Parse command line
    ; Copies the first command-line token (the program path) into buff.
    ; No length check is made against the 256-byte buffer.
    invoke GetCommandLine
    mov edx, buff
    mov ebx, ' '
    cmp byte[eax], '"' ; if commandline[0] == '"' then search for '"' else search for ' '
    jnz @F
    inc eax
    mov ebx, '"'
@@:		       ; search loop
    movzx ecx, byte[eax]
    mov [edx], cl
    add eax, 1
    add edx, 1
    or ecx, ecx        ; check for a null char
    jz @F
    cmp ecx, ebx
    jnz @B
@@:
    lea eax, [edx-1]   ; search backward for '\' or '/'
    mov edx, buff
@@:
    sub eax, 1
    cmp byte[eax],'\'
    je slashfound
    cmp byte[eax],'/'
    je slashfound
    cmp eax, edx
    ja @B
    ; no path in the command line
    ; fall back to the current directory; eax = number of characters returned
    invoke GetCurrentDirectory,256,buff
    add eax, buff
    cmp byte[eax-1],'\'   ; directory already ends with a backslash?
    jz @F
    mov byte[eax],'\'
slashfound:
    inc eax               ; eax -> first byte after the last path separator
@@:
    ; append "tst.log" plus the terminating zero (high byte of the 'log' dword)
    mov dword[eax], 'tst.'
    mov dword[eax+4], 'log'
invoke CreateFile,buff,GENERIC_WRITE,0,NULL,OPEN_ALWAYS,FILE_ATTRIBUTE_NORMAL,0
cmp eax,INVALID_HANDLE_VALUE
jz errnoinfo
mov [hFile],eax
invoke SetFilePointer,eax,0,0,FILE_END   ; append to an existing log

; PrintSysInfo leaves the text in buff and its length in eax
call PrintSysInfo
invoke WriteConsole,[hOut],buff,eax,read,0
; the log write uses the character count WriteConsole stored in [read]
invoke WriteFile,[hFile],buff,[read],read,0

call IsSSE
or eax, eax
jz errnosse
call IsSSE2
mov [sse2flag], eax

invoke GetCurrentProcess
invoke SetPriorityClass, eax, REALTIME_PRIORITY_CLASS
invoke GetCurrentThread
invoke SetThreadPriority, eax, THREAD_PRIORITY_TIME_CRITICAL
mainloop:
   testone MUL
   testone FFFF
   testone MOVADD
   testone MOVADD2
; the *ps2dq variants are skipped when IsSSE2 returned 0
mov eax, [sse2flag]
or eax, eax
jz @F
   testone MOVADD2ps2dq
@@:
   testone MOVADD3
   testone MOVADD4
   testone vod
mov eax, [sse2flag]
or eax, eax
jz @F
   testone vodps2dq
@@:
   testone vod2
dec [i]                  ; repeat the whole series until the counter reaches 0
jnz mainloop

invoke GetCurrentProcess
invoke SetPriorityClass, eax, NORMAL_PRIORITY_CLASS
invoke GetCurrentThread
invoke SetThreadPriority, eax, THREAD_PRIORITY_NORMAL
invoke Sleep, 500
invoke CloseHandle,[hFile]
invoke ExitProcess,0

; the log file could not be opened: report it and quit
errnoinfo:
invoke WriteConsole,[hOut],msgnoinfo,msgnosse-msgnoinfo,read,0
invoke ExitProcess,0

; IsSSE returned 0: report it and quit. The message length is measured up to
; the `bits` label, so `bits` must directly follow msgnosse in the data.
errnosse:
invoke WriteConsole,[hOut],msgnosse,bits-msgnosse,read,0
invoke CloseHandle,[hFile]
invoke ExitProcess,0

; tohex reg
; Replaces `reg` with the ASCII code of the upper-case hexadecimal digit
; for its low four bits ('0'..'9', 'A'..'F'). Changes the flags.
macro tohex reg
{
local decimal
	and reg, 0xF
	cmp reg, 10
	jb decimal			; 0..9 only need the '0' bias
	add reg, 'A' - '0' - 10		; 10..15: extra bias so they land on 'A'..'F'
decimal:
	add reg, '0'
}

; PrintSysInfo
; Writes a short description of the processor into buff and returns its
; length in eax (the text is not zero-terminated):
;   - CPUID unavailable: "386 or 486"
;   - unknown vendor:    "Unknown CPU:" followed by the 12-byte vendor id
;   - Intel / AMD:       vendor, family/model/stepping digits, brand index
;                        and, when supported, the 48-byte brand string
; Clobbers eax, ebx, ecx, edx and flags; esi is preserved.
; Fix: the low digit of the brand index was stored from al (repeating the
; high digit); it is taken from bl, where tohex ebx leaves it.
PrintSysInfo:
	push esi
	mov esi, buff
	; CPUID is available only if EFLAGS bit 21 (ID) can be toggled
	pushfd
	pop eax
	mov edx, eax
	xor eax, 00200000h
	push eax
	popfd
	pushfd
	pop eax
	xor eax, edx
	jz oldcpu ; 386 or 486, can't use CPUID
	xor eax, eax
	cpuid
	; Find vendor
	cmp ebx, 'Genu'
	jnz nonintel
	cmp edx, 'ineI'
	jnz nonintel
	cmp ecx, 'ntel'
	jnz nonintel
	mov dword[esi], 'Inte'
	mov dword[esi+4], 'l 0x'
getver:
	; Get version information
	xor eax, eax
	inc eax
	cpuid
	mov ecx, eax
	shr ecx, 8   ; Get family ID
	tohex ecx
	mov byte[esi+8],cl
	mov ecx, eax
	shr ecx, 4   ; Get model
	tohex ecx
	mov byte[esi+9],cl
	and eax, 0xF ; Get stepping
	tohex eax
	mov byte[esi+10],al
	mov byte[esi+11],' '
	mov dword[esi+12],'bran'
	mov dword[esi+16],'d in'
	mov dword[esi+20],'dex='
	; brand index = low byte of ebx from CPUID leaf 1, printed as two hex digits
	mov eax, ebx
	shr eax, 4
	tohex eax
	mov byte[esi+24],al	; high digit
	tohex ebx
	mov byte[esi+25],bl	; low digit
	mov byte[esi+26],10
	;Get brand string
	mov eax, 0x80000000
	cpuid
	cmp eax, 0x80000004
	jb nobrandstr ; Brand string not supported
	; leaves 80000002h..80000004h each return 16 characters in eax, ebx, ecx, edx
	mov eax, 0x80000002
	cpuid
	mov dword[esi+27],eax
	mov dword[esi+31],ebx
	mov dword[esi+35],ecx
	mov dword[esi+39],edx
	mov eax, 0x80000003
	cpuid
	mov dword[esi+43],eax
	mov dword[esi+47],ebx
	mov dword[esi+51],ecx
	mov dword[esi+55],edx
	mov eax, 0x80000004
	cpuid
	mov dword[esi+59],eax
	mov dword[esi+63],ebx
	mov dword[esi+67],ecx
	mov dword[esi+71],edx
	mov byte[esi+75],10
	mov eax,76
	pop esi
	ret

nobrandstr:
	mov eax,27		; vendor/version line only
	pop esi
	ret

nonintel:
	cmp ebx, 'Auth'
	jnz nonamd
	cmp edx, 'enti'
	jnz nonamd
	cmp ecx, 'cAMD'
	jnz nonamd
	mov dword[esi], 'AMD '
	mov dword[esi+4], '  0x'
	jmp getver

nonamd:
	; unknown vendor: print the raw vendor id (ebx, edx, ecx from leaf 0)
	mov dword[esi], 'Unkn'
	mov dword[esi+4], 'own '
	mov dword[esi+8], 'CPU:'
	mov dword[esi+12], ebx
	mov dword[esi+16], edx
	mov dword[esi+20], ecx
	mov dword[esi+24], 10
	mov eax, 25
	pop esi
	ret
oldcpu:
	mov dword[esi], '386 '
	mov dword[esi+4], 'or 4'
	mov dword[esi+8], 0x000A3638	; '8', '6', line feed, 0
	mov eax, 11
	pop esi
	ret

; IsSSE
; Returns eax = 1 when SSE instructions can be executed, 0 otherwise.
; Checks that CPUID exists, that CPUID leaf 1 reports EDX bit 25 (02000000h),
; and finally executes XORPS under an exception handler (SEHhandler), which
; resumes at `nosse` if the instruction raises an illegal-instruction fault.
; Clobbers ebx, ecx, edx, xmm0 and flags.
; The `nosse2` exit is also used by IsSSE2.
IsSSE:
	; CPUID is available only if EFLAGS bit 21 (ID) can be toggled
	pushfd
	pop eax
	mov edx, eax
	xor eax, 00200000h
	push eax
	popfd
	pushfd
	pop eax
	xor eax, edx
	jz nosse2 ; 386 or 486, can't use CPUID
	xor eax, eax
	inc eax
	cpuid
	xor eax, eax	; result = 0 until proven otherwise
	test edx, 02000000h
	jz nosse2 ; SSE not supported

	; install an exception registration record {previous, handler} on the stack
	push SEHhandler  ; try to execute an SSE instruction
	push dword[FS:0]
	mov [FS:0], esp
	XORPS xmm0, xmm0
	pop dword[FS:0]	; no fault: unlink the record
	pop edx
	inc eax		; eax = 1
	ret
nosse:
	; SEHhandler redirects execution here; eax is still 0
	pop dword[FS:0]
	pop edx
nosse2:
	ret

; IsSSE2
; Returns eax = 1 when CPUID leaf 1 reports SSE2 (EDX bit 26, mask
; 04000000h), otherwise eax = 0. Assumes CPUID is available (IsSSE has
; already succeeded). Clobbers ebx, ecx, edx and flags.
IsSSE2:
	mov eax, 1		; CPUID leaf 1: feature flags
	cpuid
	xor eax, eax
	bt edx, 26		; CF = SSE2 feature bit
	setc al			; eax = 0 or 1
	ret


; Field offsets of the exception record handed to an exception handler.
; Declared inside `virtual at eax`, so no bytes are emitted: each label is
; simply an offset from eax, which must point at the record when used.
virtual at eax
EXCEPTION_RECORD:
  .ExceptionCode dd ?
  .ExceptionFlag dd ?
  .NestedExceptionRecord dd ?
  .ExceptionAddress dd ?
  .NumberParameters dd ?
  .AdditionalData dd ?
end virtual

; Field offsets of the thread context passed to an exception handler.
; Declared inside `virtual at eax`, so no bytes are emitted: each label is
; simply an offset from eax, which must point at the context when used.
; Only .GPReip is referenced by the code in this file (SEHhandler).
virtual at eax
CONTEXT:
  .ContextFlags dd ?
  ;DEBUG REGISTERS
  .Dr dd 6 dup ?
  ;FLOATING POINT
  .ControlWord dd ?
  .StatusWord dd ?
  .TagWord dd ?
  .ErrorOffset dd ?
  .ErrorSelector dd ?
  .DataOffset dd ?
  .DataSelector dd ?
  .FPURegs db 80 dup ?
  .Cr0NpxState dd ?
  ;SEGMENT REGISTERS
  .SegGs dd ?
  .SegFs dd ?
  .SegEs dd ?
  .SegDs dd ?
  ;GENERAL-PURPOSE REGISTERS
  .GPRedi dd ?
  .GPResi dd ?
  .GPRebx dd ?
  .GPRedx dd ?
  .GPRecx dd ?
  .GPReax dd ?
  .GPRebp dd ?
  .GPReip dd ?		; instruction pointer execution resumes at
  .SegCs dd ?
  .GRPflags dd ?	; flags register ("GRP" spelling kept: the label name is part of the interface)
  .GRPesp dd ?		; stack pointer
  .SegSS dd ?
end virtual

; SEHhandler
; Exception handler registered by IsSSE around its XORPS probe.
; Stack arguments: [esp+4] = exception record, [esp+12] = thread context.
; For an illegal-instruction exception whose flags field is zero, it points
; the saved instruction pointer at `nosse` and returns 0 so execution resumes
; there; every other exception is passed on by returning 1.
SEHhandler:
	mov eax, [esp+4]		; eax -> EXCEPTION_RECORD
	cmp [EXCEPTION_RECORD.ExceptionCode], STATUS_ILLEGAL_INSTRUCTION
	jne nexthandler
	cmp [EXCEPTION_RECORD.ExceptionFlag], 0
	jne nexthandler
	mov eax, [esp+12]		; eax -> CONTEXT
	mov [CONTEXT.GPReip], nosse	; resume in IsSSE's failure path
	xor eax, eax			; 0 = continue execution
	ret
nexthandler:				; let the next (system) handler report the error
	mov eax, 1			; 1 = continue the handler search
	ret


data import
  library kernel32,'KERNEL32.DLL',\
	  user32,'USER32.DLL'
  include '%fasminc%\apia\kernel32.inc'
  include '%fasminc%\apia\user32.inc'

end data

; 16-byte aligned vectors used by the JuliaMandelPaint* macros (all of them
; are accessed with MOVAPS or as aligned memory operands).
align 16
radius dd 4.0, 4.0, 4.0, 4.0	; limit compared with zx^2 + zy^2; also the per-iteration colour step
cx1 dd 4 dup ?			; cx2 replicated to four lanes (Julia)
cy1 dd 4 dup ?			; cy2 replicated to four lanes (Julia)
dx1 dd 4 dup ?			; 4 * dx2: x step between groups of four pixels
dy1 dd 4 dup ?			; dy2 replicated: y step between rows
left1 dd 4 dup ?		; x coordinates of the first four pixels of a row
cc    dd 1.0, 1.0, 1.0, 1.0	; not referenced by the code visible in this file
maskbw dd 0x808080, 0x808080, 0x808080, 0x808080	; pixel value for escaped points in black/white mode
; lane-select masks used to build the vector 0 | dx2 | 2*dx2 | 3*dx2
mask   dd 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
mask1  dd 0, 0, 0, 0xFFFFFFFF
mask2  dd 0, 0, 0xFFFFFFFF, 0xFFFFFFFF
scratch dd 0,0,0,0		; spill area for the CVTPS2DQ colour lookup
; Colour table: 65 dword pixel values. The paint macros index it with a byte
; offset of 4 * (iterations survived), i.e. 0..4*ITER; the last entry (0) is
; used for points that never left the radius within ITER iterations.
reala dd 0x00323232, 0x00353630, 0x00383a2e, 0x003b3e2c, 0x003e422a,\
     0x00414628, 0x00444a26, 0x00474e24, 0x004a5222, 0x004d5620,\
     0x00505a1e, 0x00535e1c, 0x0056621a, 0x00596618, 0x005c6a16,\
     0x005f6e14, 0x00627212, 0x00657610, 0x00687a0e, 0x006b7e0c,\
     0x006e820a, 0x00718608, 0x00748a06, 0x00778e04, 0x007a9202,\
     0x007d9600, 0x00809a02, 0x00839e04, 0x0086a206, 0x0089a608,\
     0x008caa0a, 0x008fae0c, 0x0092b20e, 0x0095b610, 0x0098ba12,\
     0x009bbe14, 0x009ec216, 0x00a1c618, 0x00a4ca1a, 0x00a7ce1c,\
     0x00aad21e, 0x00add620, 0x00b0da22, 0x00b3de24, 0x00b6e226,\
     0x00b9e628, 0x00bcea2a, 0x00bfee2c, 0x00c2ee2e, 0x00c5ea30,\
     0x00c8e632, 0x00cbe234, 0x00cede36, 0x00d1da38, 0x00d4d63a,\
     0x00d7d23c, 0x00dace3e, 0x00ddca40, 0x00e0c642, 0x00e3c244,\
     0x00e6be46, 0x00e9ba48, 0x00ecb64a, 0x00efb24c, 0x00000000
; View and image parameters read by the JuliaMandelPaint* macros.
LEFT   dd -2.0			; x coordinate of the leftmost pixel
TOP    dd -1.0			; y coordinate of the first row
cx2	dd -0.12		; Julia constant c = cx2, cy2
cy2	dd 0.74
w dd 1024			; image width in pixels (must be a multiple of 4)
h dd 719			; image height in pixels
dx2 dd 0.0029296875000000000	; x step per pixel (3 / 1024)
dy2 dd 0.0027816411682892906	; y step per row   (2 / 719)
; wsprintf templates used by testone: Julia ticks, Mandelbrot ticks
m_MUL	       db "MUL          %10d %10d",10,0
m_FFFF	       db "FFFF         %10d %10d",10,0
m_MOVADD       db "MOVADD       %10d %10d",10,0
m_MOVADD2      db "MOVADD2      %10d %10d",10,0
m_MOVADD2ps2dq db "MOVADD2ps2dq %10d %10d",10,0
m_MOVADD3      db "MOVADD3      %10d %10d",10,0
m_MOVADD4      db "MOVADD4      %10d %10d",10,0
m_vod	       db "vod          %10d %10d",10,0
m_vodps2dq     db "vodps2dq     %10d %10d",10,0
m_vod2	       db "vod2         %10d %10d",10,0
; The error messages are not zero-terminated; their lengths are computed as
; msgnosse-msgnoinfo and bits-msgnosse, so nothing that emits bytes may be
; inserted between msgnoinfo, msgnosse and bits.
msgnoinfo      db "Can't write log file. Copy the program to writable disk.",10
msgnosse       db "Your processor doesn't support SSE. The test can't continue, sorry.",10
bits dd realbits		; pointer to the output pixel buffer
a    dd reala			; pointer to the colour table
align 4
b dd ?				; time-stamp value saved by startm
jt dd ?				; ticks of the Julia run (testone)
hOut dd ?			; console output handle
hFile dd ?			; log file handle
read dd ?			; byte/character count returned by WriteConsole / WriteFile
sse2flag dd ?			; result of IsSSE2
i dd 5				; number of passes over the whole benchmark series
buff db 256 dup ?		; text buffer: log path, then output lines
; Fix: the black/white paint path stores pixels with MOVAPS, which faults on
; an address that is not a multiple of 16; the buffer is now aligned explicitly.
align 16
realbits dd 1024*768 dup ?	; pixel buffer (w*h dwords are written)

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

Share

About the Author

Peter Kankowski
Software Developer
Russian Federation
Peter lives in Siberia, the land of sleeping sun, beautiful mountains, and infinitely deep snow. He recently started a wiki about algorithms and code optimization, where people could share their ideas, learn, and teach others.

| Advertise | Privacy | Terms of Use | Mobile
Web04 | 2.8.1411019.1 | Last Updated 29 Nov 2005
Article Copyright 2005 by Peter Kankowski
Everything else Copyright © CodeProject, 1999-2014
Layout: fixed | fluid