Click here to Skip to main content
15,888,610 members
Articles / Desktop Programming / Win32

Using Assembler and SSE2/SSE3 instructions for drawing optimization

Rate me:
Please Sign up or sign in to vote.
4.92/5 (24 votes)
29 Jul 2009CPOL10 min read 52.2K   1.2K   64  
Introduces an algorithm and technique for speeding up the pre-drawing process.
.686
.MODEL flat,stdcall
option casemap: none
option scoped
.xmm
.code

;---------------------------------------------------------------------
; TransfromClipReducePoints
;
; Main entry point. In one pass over the input it
;   1. applies an affine transformation to every point,
;   2. clips the resulting polyline against a rectangle, and
;   3. drops points that round to the same integer coordinates as the
;      point emitted just before them.
; The helper procedures exchange data through registers and through the
; output counter kept on the stack (see "Register roles" below).
;
; Calling convention: stdcall; the callee removes 5Ch bytes of arguments.
;
; Parameters:
;   _values - pointer to an array of doubles laid out as
;             x0,y0,x1,y1,...,xn,yn (read with unaligned loads)
;   _count  - number of points, i.e. number of X,Y pairs - NOT the number
;             of doubles in _values
;   _res    - pointer to the output buffer of 32-bit integer X,Y pairs
;             NOTE(review): the buffer size is not checked here; the
;             caller must make it large enough.
;   _m00.._m21 - doubles, elements of the transformation matrix;
;             x' = m00*x + m10*y + m20,  y' = m01*x + m11*y + m21
;   _xMin,_yMin,_xMax,_yMax - doubles describing the clipping rectangle
;
; Returns:
;   eax = number of 8-byte entries written to _res. Discontinuity marks
;         are counted as entries too.
;
; Notes:
;   * The curve may leave the clipping area and come back later. Every
;     such break is announced by an entry whose X and Y are both
;     80000000h (32-bit integer minimum) placed before the first point
;     of the next visible run.
;   * _count = 0 returns 0 without touching _values or _res.
;   * The main loop emits the *previous* point of each segment, so the
;     very last point is emitted after the loop when it lies inside the
;     clipping area (this also covers _count = 1).
;
; Register roles while the loop runs:
;   esi  - read cursor in _values       edi - write cursor in _res
;   ecx  - points still to be processed ebx - discontinuity flag (0/1)
;   dh   - clip code of previous point  dl  - clip code of current point
;   xmm0 - m00 | m10    xmm1 - m01 | m11    xmm2 - m20 | m21
;   xmm3 - current point, stored as Y | X after CalculateBitCode
;   xmm5 - previous point, stored as Y | X
;   xmm4 - point handed to CalcClip / FilterPoint / FilterPointDiscont
;   xmm6 - xMin | yMin  xmm7 - xMax | yMax
;   mm7  - discontinuity mark           mm0 - last emitted point
;   [esp]- output entry counter. The helpers address it as [esp+4] and
;          [esp+8], so nothing else may be pushed while they are called.
;---------------------------------------------------------------------
DISCONTINUITY_MARK EQU 80000000h        ; X and Y value of a break entry

TransfromClipReducePoints PROC PUBLIC _values :DWORD, 
_count :DWORD,
_res :DWORD,
_m00 :QWORD,
_m01 :QWORD,
_m10 :QWORD,
_m11 :QWORD,
_m20 :QWORD,
_m21 :QWORD,
_xMin :QWORD, 
_yMin :QWORD,
_xMax :QWORD,
_yMax :QWORD

                push    ebx
                push    esi
                push    edi

                ; load parameters into their working registers
                mov     esi, dword ptr[_values]     ; input data
                mov     ecx, dword ptr[_count]      ; input points count
                mov     edi, dword ptr[_res]        ; destination pointer

                ; Empty input: return 0 before any memory is read and
                ; before any MMX register is touched.
                xor     eax, eax
                test    ecx, ecx
                jz      ExitProc

                ; transformation matrix:
                ; register     Low  | High
                ;-------------------------
                ; xmm0         m00  | m10
                ; xmm1         m01  | m11
                ; xmm2         m20  | m21
                movsd   xmm0, _m00
                movhpd  xmm0, _m10
                movsd   xmm1, _m01
                movhpd  xmm1, _m11
                movsd   xmm2, _m20
                movhpd  xmm2, _m21

                ; clipping bounds:
                ; register     Low  | High
                ;-------------------------
                ; xmm6         xMin | yMin
                ; xmm7         xMax | yMax
                movsd   xmm6, _xMin
                movhpd  xmm6, _yMin
                movsd   xmm7, _xMax
                movhpd  xmm7, _yMax

                ; build the discontinuity mark (two identical dwords) in mm7
                mov     eax, DISCONTINUITY_MARK
                push    eax
                push    eax
                movq    mm7, [esp]
                add     esp, 8h

                xor     eax, eax
                push    eax                 ; [esp] = output entry counter
                xor     ebx, ebx            ; no discontinuity pending

                ; The first point only seeds the "previous point" state.
                call    TransformPoint
                call    CalculateBitCode    ; al = clip code, xmm3 = Y | X
                movapd  xmm5, xmm3          ; previous point
                mov     dh, al              ; previous clip code
                dec     ecx                 ; first point is consumed
                jz      FlushLastPoint      ; single point: no segment to walk

StartMainLoop:
                call    TransformPoint
                call    CalculateBitCode
                mov     dl, al              ; clip code of the current point

                ; Trivial reject: both end points are outside on the same
                ; side (they share a set bit), nothing of the segment is
                ; visible.
                test    dl, dh
                jnz     Continue

                movapd  xmm4, xmm5          ; helpers take their point in xmm4

                ; Trivial accept: both end points are inside.
                mov     al, dh
                or      al, dl
                jnz     ClipPrevPoint
                call    FilterPoint         ; emit previous point
                jmp     Continue

ClipPrevPoint:
                cmp     dh, 0h
                jnz     ClipPrev
                call    FilterPoint         ; previous point is inside: emit as is
                jmp     ClipCurrentPoint
ClipPrev:
                mov     al, dh
                call    CalcClip            ; xmm4 = previous point moved onto the border
                call    FilterPoint
ClipCurrentPoint:
                movapd  xmm4, xmm3
                cmp     dl, 0h
                jz      Continue            ; current point is inside: emitted later as "previous"
                mov     al, dl
                call    CalcClip            ; xmm4 = current point moved onto the border
                call    FilterPointDiscont  ; emit exit point and raise the break flag

Continue:
                movapd  xmm5, xmm3          ; current point becomes previous
                mov     dh, dl              ; together with its clip code
                dec     ecx
                jnz     StartMainLoop

FlushLastPoint:
                ; Points are emitted one step late, so the final point is still
                ; pending here. Emit it when it is inside the clipping area.
                test    dh, dh
                jnz     PopCounter
                movapd  xmm4, xmm5
                call    FilterPoint

PopCounter:
                pop     eax                 ; return value = entries written
                emms                        ; leave MMX state clean for x87 users
ExitProc:
                pop     edi
                pop     esi
                pop     ebx
                ret     5ch
TransfromClipReducePoints ENDP

;---------------------------------------------------------------------
; TransformPoint
; Applies the affine transformation to the point at [esi] and moves esi
; on to the next point.
; In:   esi  -> two doubles x, y (unaligned load is used)
;       xmm0 = m00 | m10,  xmm1 = m01 | m11,  xmm2 = m20 | m21
; Out:  xmm3 = x' | y'   (Low = X, High = Y) where
;         x' = m00*x + m10*y + m20
;         y' = m01*x + m11*y + m21
;       esi = esi + 10h
; Clobbers: xmm4, flags
;---------------------------------------------------------------------
TransformPoint PROC PRIVATE
                movupd  xmm4, [esi]         ; xmm4 = x | y
                add     esi, 10h            ; advance to the next point
                movapd  xmm3, xmm4          ; xmm3 = x | y

                mulpd   xmm3, xmm0          ; xmm3 = m00*x | m10*y
                mulpd   xmm4, xmm1          ; xmm4 = m01*x | m11*y
                haddpd  xmm3, xmm4          ; xmm3 = m00*x+m10*y | m01*x+m11*y
                addpd   xmm3, xmm2          ; add the translation m20 | m21
                ret
TransformPoint ENDP

;----------------------------------------------------------------------
; CalculateBitCode
; Computes the clip code of a point against the clipping rectangle.
; In:   xmm3 = X | Y          (Low = X, High = Y)
;       xmm6 = xMin | yMin,  xmm7 = xMax | yMax
; Out:  al (eax) = clip code
;         bit 0 (0001b) - X is below xMin
;         bit 1 (0010b) - X is above xMax
;         bit 2 (0100b) - Y is below yMin
;         bit 3 (1000b) - Y is above yMax
;       xmm3 = Y | X          (halves are left swapped on return)
; Preserves: xmm6, xmm7 (swapped temporarily and swapped back)
; Clobbers: flags
;----------------------------------------------------------------------
CalculateBitCode PROC PRIVATE
                xor     eax, eax            ; start with "inside"

                ; ---- X against xMin / xMax (low halves) ----
                comisd  xmm3, xmm6
                jb      __XBelowMin
                comisd  xmm3, xmm7
                jbe     __SwapToY
                or      al, 0010b
                jmp     __SwapToY
__XBelowMin:
                or      al, 0001b

__SwapToY:
                ; comisd only looks at the low half, so bring Y down
                shufpd  xmm3, xmm3, 01h     ; xmm3 = Y | X
                shufpd  xmm6, xmm6, 01h     ; xmm6 = yMin | xMin
                shufpd  xmm7, xmm7, 01h     ; xmm7 = yMax | xMax

                ; ---- Y against yMin / yMax ----
                comisd  xmm3, xmm6
                jb      __YBelowMin
                comisd  xmm3, xmm7
                jbe     __SwapBack
                or      al, 1000b
                jmp     __SwapBack
__YBelowMin:
                or      al, 0100b

__SwapBack:
                ; bounds go back to their usual layout; xmm3 stays Y | X
                shufpd  xmm7, xmm7, 01h
                shufpd  xmm6, xmm6, 01h
                ret
CalculateBitCode ENDP


;-------------------------------------------------------------------------
; CalcClip
; Moves one end point of the segment (xmm3, xmm5) onto the border of the
; clipping rectangle, along the line through both end points.
; In:   al   = clip code of the end point that has to be clipped
;              (bits as produced by CalculateBitCode)
;       xmm3 = one end point,   stored as Y | X  (called x1,y1 below)
;       xmm5 = other end point, stored as Y | X  (called x2,y2 below)
;       xmm4 = the end point being clipped, stored as Y | X. Its low half
;              is compared with yMin / yMax when no X bit is set in al.
;       xmm6 = xMin | yMin,  xmm7 = xMax | yMax
; Out:  xmm4 = clipped point, stored as Y | X
; Preserves: xmm3, xmm5, xmm6, xmm7 (bit-exact)
; Clobbers:  flags; uses 16 bytes of stack
;
; xmm5 is needed as scratch for the deltas and no other XMM register is
; free, so its value is parked on the stack and reloaded. Adding the
; subtrahend back instead is not exact in floating point and would hand a
; slightly different end point to the next intersection.
;-------------------------------------------------------------------------
CalcClip PROC PRIVATE
                sub     esp, 10h
                movupd  [esp], xmm5         ; exact copy of x2,y2

                test    al, 1h
                jz      IsRight
                movddup xmm4, xmm6          ; xmm4 = Xt | Xt with Xt = xMin
YFromXCalc:
                shufpd  xmm3, xmm3, 1h      ; xmm3 = x1 | y1
                subsd   xmm4, xmm3          ; xmm4 = Xt-x1 | Xt
                shufpd  xmm3, xmm3, 1h      ; xmm3 = y1 | x1
                subpd   xmm5, xmm3          ; xmm5 = y2-y1 | x2-x1
                mulsd   xmm4, xmm5          ; xmm4 = (Xt-x1)*(y2-y1) | Xt
                shufpd  xmm5, xmm5, 1h      ; xmm5 = x2-x1 | y2-y1
                divsd   xmm4, xmm5          ; xmm4 = (Xt-x1)*(y2-y1)/(x2-x1) | Xt
                addsd   xmm4, xmm3          ; xmm4 = Yt | Xt
                movupd  xmm5, [esp]         ; xmm5 = y2 | x2 again, bit-exact
                ; X is done, now see whether Y still sticks out
                jmp     IsBottom
IsRight:
                test    al, 2h
                jz      IsBottom
                movddup xmm4, xmm7          ; xmm4 = Xt | Xt with Xt = xMax
                jmp     YFromXCalc
IsBottom:
                test    al, 04h
                jz      IsTop
                shufpd  xmm6, xmm6, 1h      ; bring yMin into the low half
                comisd  xmm4, xmm6          ; Y of candidate against yMin
                shufpd  xmm6, xmm6, 1h      ; shufpd leaves the flags alone
                jae     IsTop
                ; candidate is still below yMin: intersect with Y = yMin
                movapd  xmm4, xmm6
                shufpd  xmm4, xmm4, 3h      ; xmm4 = Yt | Yt with Yt = yMin
XFromYCalc:
                ; Xt = (Yt-y1)*(x2-x1)/(y2-y1) + x1
                subsd   xmm4, xmm3          ; xmm4 = Yt-y1 | Yt
                subpd   xmm5, xmm3          ; xmm5 = y2-y1 | x2-x1
                shufpd  xmm5, xmm5, 1h      ; xmm5 = x2-x1 | y2-y1
                mulsd   xmm4, xmm5          ; xmm4 = (Yt-y1)*(x2-x1) | Yt
                shufpd  xmm5, xmm5, 1h      ; xmm5 = y2-y1 | x2-x1
                divsd   xmm4, xmm5          ; xmm4 = (Yt-y1)*(x2-x1)/(y2-y1) | Yt
                shufpd  xmm3, xmm3, 1h      ; xmm3 = x1 | y1
                addsd   xmm4, xmm3          ; xmm4 = Xt | Yt
                shufpd  xmm4, xmm4, 1h      ; xmm4 = Yt | Xt
                shufpd  xmm3, xmm3, 1h      ; xmm3 = y1 | x1
                movupd  xmm5, [esp]         ; xmm5 = y2 | x2 again, bit-exact
                jmp     Done
IsTop:
                test    al, 8h
                jz      Done

                shufpd  xmm7, xmm7, 1h      ; bring yMax into the low half
                comisd  xmm4, xmm7          ; Y of candidate against yMax
                shufpd  xmm7, xmm7, 1h
                jbe     Done

                ; candidate is still above yMax: intersect with Y = yMax
                movapd  xmm4, xmm7
                shufpd  xmm4, xmm4, 3h      ; xmm4 = Yt | Yt with Yt = yMax
                jmp     XFromYCalc
Done:
                add     esp, 10h            ; drop the saved copy of xmm5
                ret     0
CalcClip ENDP

;------------------------------------------------------------------------
; FilterPoint
; Rounds a point to integers and emits it through OutputPoint unless it is
; identical to the last emitted point. Afterwards the discontinuity flag
; is cleared, whether or not the point was emitted.
; In:   xmm4    = point stored as Y | X
;       mm0     = last emitted point (only read when the counter is not 0)
;       [esp+4] = output entry counter; 0 means nothing has been emitted
;                 yet, so the duplicate test is skipped
;       ebx, edi, mm7 - passed on to OutputPoint
; Out:  xmm4 = X | Y (halves swapped),  mm1 = rounded X (low), Y (high)
;       ebx  = 0
; Clobbers: eax, mm2, mm3 (only when the duplicate test runs), flags
;------------------------------------------------------------------------
FilterPoint PROC PRIVATE
                shufpd   xmm4, xmm4, 1h         ; xmm4 = X | Y
                cvtpd2pi mm1, xmm4              ; mm1 = int(X), int(Y)

                cmp      dword ptr[esp+4h], 0h
                je       __Emit                 ; first entry is always written

                ; duplicate test against the last emitted point
                movq     mm2, mm0
                pcmpeqd  mm2, mm1               ; per dword: all ones when equal
                movq     mm3, mm2
                psrlq    mm3, 20h               ; Y mask moved onto the X mask
                pand     mm2, mm3               ; low dword = X equal AND Y equal
                movd     eax, mm2               ; FFFFFFFFh - duplicate, 0 - new point
                cmp      eax, 0h
                je       __Emit

                mov      ebx, 0h                ; duplicate dropped, flag cleared anyway
                ret
__Emit:
                call     OutputPoint
                mov      ebx, 0h                ; no break pending before the next point
                ret
FilterPoint ENDP
;------------------------------------------------------------------------
; FilterPointDiscont
; Same filtering as FilterPoint: the point is rounded to integers and
; emitted through OutputPoint unless it is identical to the last emitted
; point. Afterwards the discontinuity flag is SET, whether or not the
; point was emitted, so the next emitted point is preceded by a mark.
; In:   xmm4    = point stored as Y | X
;       mm0     = last emitted point (only read when the counter is not 0)
;       [esp+4] = output entry counter; 0 means nothing has been emitted
;                 yet, so the duplicate test is skipped
;       ebx, edi, mm7 - passed on to OutputPoint
; Out:  xmm4 = X | Y (halves swapped),  mm1 = rounded X (low), Y (high)
;       ebx  = 1
; Clobbers: eax, mm2, mm3 (only when the duplicate test runs), flags
;------------------------------------------------------------------------
FilterPointDiscont PROC PRIVATE
                shufpd   xmm4, xmm4, 1h         ; xmm4 = X | Y
                cvtpd2pi mm1, xmm4              ; mm1 = int(X), int(Y)

                cmp      dword ptr[esp+4h], 0h
                je       __Emit                 ; first entry is always written

                ; duplicate test against the last emitted point
                movq     mm2, mm0
                pcmpeqd  mm2, mm1               ; per dword: all ones when equal
                movq     mm3, mm2
                psrlq    mm3, 20h               ; Y mask moved onto the X mask
                pand     mm2, mm3               ; low dword = X equal AND Y equal
                movd     eax, mm2               ; FFFFFFFFh - duplicate, 0 - new point
                cmp      eax, 0h
                je       __Emit

                mov      ebx, 01b               ; duplicate dropped, break still raised
                ret
__Emit:
                call     OutputPoint
                mov      ebx, 01b               ; next emitted point starts a new run
                ret
FilterPointDiscont ENDP
;--------------------------------------------------------------------
; OutputPoint
; Writes one point to the output buffer, preceded by a discontinuity
; mark when the flag in ebx is set. Every written 8-byte entry (mark or
; point) advances edi and increments the entry counter.
; In:   mm1     = point to write
;       mm7     = discontinuity mark
;       ebx     = discontinuity flag, bit 0: 1 - write the mark first
;       edi     = write position in the output buffer
;       [esp+8] = output entry counter (this procedure is reached through
;                 two nested calls, hence the offset)
; Out:  edi advanced by 8 per written entry
;       mm0 = mm1 (remembered as the last emitted point)
; Clobbers: flags.  ebx is left unchanged.
;--------------------------------------------------------------------
OutputPoint PROC PRIVATE
                test    ebx, 01b
                jnz     WriteMark               ; a break is pending
WritePoint:
                movq    [edi], mm1
                movq    mm0, mm1                ; remember for the duplicate test
                add     edi, 8h
                inc     dword ptr[esp+8h]
                ret

WriteMark:
                movq    [edi], mm7
                add     edi, 8h
                inc     dword ptr[esp+8h]
                jmp     WritePoint
OutputPoint ENDP
end 

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)


Written By
Software Developer Citrix Japan R&D
Japan Japan
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions