Compiler made optimizations

Why a bit of code is VASTLY different on different compilers

Page 1 of 1

2 Replies - 1785 Views - Last Post: 25 September 2009 - 02:08 AM

#1 NickDMax  Icon User is offline

  • Can grep dead trees!
  • member icon

Reputation: 2250
  • View blog
  • Posts: 9,245
  • Joined: 18-February 07

Compiler made optimizations

Posted 25 September 2009 - 12:32 AM

So if you were to look at my post in this thread on ArcTan then you see that I had some fun... But I found the results very surprising. I was rather shocked that the Horner Scheme did so poorly. I was also shocked that the Mul2 version was the fastest on my go-to compiler, bcc32 -- I was even MORE shocked that the straight multiplication version was the fastest under MinGW. (BTW the results are not significantly affected in either.)

However, the hands-down winner of all the versions (even on my laptop) is the Mul2 version. So I would like some help understanding why the Borland code is so fast -- it either has a flaw or it is neatly optimized compared to the others.
(I didn't clean these up so sorry about all the messy symbolic garbage)

Borland Version:
; Borland bcc32 listing for:  double INLINE Mul2ArcTan(double x)
;
; Frame layout (ebp-based, 24 bytes of locals reserved by `add esp,-24`):
;   [ebp+8]  = x      (incoming argument)
;   [ebp-8]  = x2     = x  * x
;   [ebp-16] = x4     = x2 * x2
;   [ebp-24] = x8     = x4 * x4
; The result is left on top of the x87 stack when the function returns.
;
; Note: this listing contains no fdiv. Each "/ 3.0", "/ 5.0", ... in the C
; source is carried out as a multiplication (fld tbyte + fmulp) by a
; precomputed 80-bit reciprocal taken from the table at @10.
; NOTE(review): fmul is normally far cheaper than fdiv on x87 hardware, so
; this is a likely cause of the timing difference -- confirm by measurement.
_TEXT	segment dword public use32 'CODE'
@Mul2ArcTan$qd	segment virtual
@@Mul2ArcTan$qd	proc	near
?live16387@0:
  ;	
  ;	double INLINE Mul2ArcTan(double x) {
  ;	
	push	  ebp
	mov	   ebp,esp
	add	   esp,-24
  ;	
  ;		double x2 = x * x;
  ;	
@9:
	fld	   qword ptr [ebp+8]
	fmul	  qword ptr [ebp+8]
	fstp	  qword ptr [ebp-8]
  ;	
  ;		double x4 = x2 * x2;
  ;	
	fld	   qword ptr [ebp-8]
	fmul	  qword ptr [ebp-8]
	fstp	  qword ptr [ebp-16]
  ;	
  ;		double x8 = x4 * x4;
  ;	
	fld	   qword ptr [ebp-16]
	fmul	  qword ptr [ebp-16]
	fstp	  qword ptr [ebp-24]
  ;	
  ;		return x - x2 * x / 3.0 + x4 * x / 5.0 - x4 * x2 * x / 7.0 + x8 * x / 9.0 - x8 * x2 * x / 11.0;
  ;	
; term 1: st(0) = x - (x2 * x) * (1/3); fsubr computes [ebp+8] - st(0)
	fld	   qword ptr [ebp-8]
	fmul	  qword ptr [ebp+8]
	fld	   tbyte ptr [@10]
	fmulp	  st(1),st
	fsubr	 qword ptr [ebp+8]
; term 2: accumulator += (x4 * x) * (1/5)
	fld	   qword ptr [ebp-16]
	fmul	  qword ptr [ebp+8]
	fld	   tbyte ptr [@10+12]
	fmulp	  st(1),st
	faddp	  st(1),st
; term 3: accumulator -= (x4 * x2 * x) * (1/7)
	fld	   qword ptr [ebp-16]
	fmul	  qword ptr [ebp-8]
	fmul	  qword ptr [ebp+8]
	fld	   tbyte ptr [@10+24]
	fmulp	  st(1),st
	fsubp	  st(1),st
; term 4: accumulator += (x8 * x) * (1/9)
	fld	   qword ptr [ebp-24]
	fmul	  qword ptr [ebp+8]
	fld	   tbyte ptr [@10+36]
	fmulp	  st(1),st
	faddp	  st(1),st
; term 5: accumulator -= (x8 * x2 * x) * (1/11)
	fld	   qword ptr [ebp-24]
	fmul	  qword ptr [ebp-8]
	fmul	  qword ptr [ebp+8]
	fld	   tbyte ptr [@10+48]
	fmulp	  st(1),st
	fsubp	  st(1),st
  ;	
  ;	}
  ;	
@12:
@11:
	mov	   esp,ebp
	pop	   ebp
	ret 
	align 4		
; Reciprocal table, emitted inside the code segment right after the ret.
; Five x87 80-bit extended values (8-byte mantissa + 2-byte exponent, little
; endian), each padded with two zero bytes to a 12-byte stride:
;   @10+0  : mantissa AAAAAAAAAAAAAAABh, exponent 3FFDh  ~ 1/3
;   @10+12 : mantissa CCCCCCCCCCCCCCCDh, exponent 3FFCh  ~ 1/5
;   @10+24 : mantissa 9249249249249249h, exponent 3FFCh  ~ 1/7
;   @10+36 : mantissa E38E38E38E38E38Eh, exponent 3FFBh  ~ 1/9
;   @10+48 : mantissa BA2E8BA2E8BA2E8Ch, exponent 3FFBh  ~ 1/11
@10:
	db		171,170,170,170,170,170,170,170,253,63,0,0,205,204,204,204
	db		204,204,204,204,252,63,0,0,73,146,36,73,146,36,73,146
	db		252,63,0,0,142,227,56,142,227,56,142,227,251,63,0,0
	db		140,46,186,232,162,139,46,186,251,63,0,0
@@Mul2ArcTan$qd	endp
@Mul2ArcTan$qd	ends
_TEXT	ends


VS 2010 32bit:
; MSVC (VS 2010) 32-bit listing of Mul2ArcTan; the compile-flags comment
; below shows /Od, so every intermediate is stored to and reloaded from the
; stack frame.
;
; Computes x - x2*x/3 + x4*x/5 - x4*x2*x/7 + x8*x/9 - x8*x2*x/11 with
; x2 = x*x, x4 = x2*x2, x8 = x4*x4, leaving the result in ST(0).
;
; Each division is a real fdiv against a memory constant. The __real@...
; symbols are defined outside this listing; their names appear to spell the
; IEEE-754 double bit patterns:
;   4008000000000000h = 3.0    4014000000000000h = 5.0
;   401c000000000000h = 7.0    4022000000000000h = 9.0
;   4026000000000000h = 11.0
PUBLIC	?Mul2ArcTan@@YANN@Z			   ; Mul2ArcTan
; Function compile flags: /Odtp
_TEXT	SEGMENT
; ebp-relative offsets of the locals and of the argument x
_x4$ = -24					   ; size = 8
_x2$ = -16					   ; size = 8
_x8$ = -8					   ; size = 8
_x$ = 8						   ; size = 8
?Mul2ArcTan@@YANN@Z PROC			   ; Mul2ArcTan
; Line 19
	push	ebp
	mov	ebp, esp
	sub	esp, 24				   ; 00000018H
; Line 20
; x2 = x * x
	fld	QWORD PTR _x$[ebp]
	fmul	QWORD PTR _x$[ebp]
	fstp	QWORD PTR _x2$[ebp]
; Line 21
; x4 = x2 * x2
	fld	QWORD PTR _x2$[ebp]
	fmul	QWORD PTR _x2$[ebp]
	fstp	QWORD PTR _x4$[ebp]
; Line 22
; x8 = x4 * x4
	fld	QWORD PTR _x4$[ebp]
	fmul	QWORD PTR _x4$[ebp]
	fstp	QWORD PTR _x8$[ebp]
; Line 23
; ST(0) = x - x2*x/3   (fsubr computes memory operand - ST(0))
	fld	QWORD PTR _x2$[ebp]
	fmul	QWORD PTR _x$[ebp]
	fdiv	QWORD PTR __real@4008000000000000
	fsubr	QWORD PTR _x$[ebp]
; + x4*x/5
	fld	QWORD PTR _x4$[ebp]
	fmul	QWORD PTR _x$[ebp]
	fdiv	QWORD PTR __real@4014000000000000
	faddp	ST(1), ST(0)
; - x4*x2*x/7
	fld	QWORD PTR _x4$[ebp]
	fmul	QWORD PTR _x2$[ebp]
	fmul	QWORD PTR _x$[ebp]
	fdiv	QWORD PTR __real@401c000000000000
	fsubp	ST(1), ST(0)
; + x8*x/9
	fld	QWORD PTR _x8$[ebp]
	fmul	QWORD PTR _x$[ebp]
	fdiv	QWORD PTR __real@4022000000000000
	faddp	ST(1), ST(0)
; - x8*x2*x/11
	fld	QWORD PTR _x8$[ebp]
	fmul	QWORD PTR _x2$[ebp]
	fmul	QWORD PTR _x$[ebp]
	fdiv	QWORD PTR __real@4026000000000000
	fsubp	ST(1), ST(0)
; Line 24
	mov	esp, ebp
	pop	ebp
	ret	0
?Mul2ArcTan@@YANN@Z ENDP			   ; Mul2ArcTan
_TEXT	ENDS



The minGW version:
# MinGW (GAS, AT&T syntax) listing of Mul2ArcTan(double).
#
# Frame layout: 8(%ebp) = x, -8(%ebp) = x2, -16(%ebp) = x4, -24(%ebp) = x8.
# The result is left in %st(0).
#
# The divisors are doubles stored in the .rdata section (a separate section
# from .text), each emitted as two 32-bit words, low word first:
#   LC13 = 0x40080000:00000000 = 3.0    LC14 = 0x40140000:00000000 = 5.0
#   LC15 = 0x401C0000:00000000 = 7.0    LC16 = 0x40220000:00000000 = 9.0
#   LC17 = 0x40260000:00000000 = 11.0
#
# Reading note: in AT&T syntax the register-register forms of fsub/fsubr and
# fdiv/fdivr have their meanings reversed relative to the Intel mnemonics
# (see the GNU as i386 documentation). As written here:
#   fdivrp %st, %st(1)  ->  st(1) = st(1) / st(0), pop   (term / constant)
#   fsubp  %st, %st(1)  ->  st(1) = st(0) - st(1), pop   (x - term)
#   fsubrp %st, %st(1)  ->  st(1) = st(1) - st(0), pop   (acc - term)
	.section .rdata,"dr"
	.align 8
LC13:
	.long	0
	.long	1074266112
	.align 8
LC14:
	.long	0
	.long	1075052544
	.align 8
LC15:
	.long	0
	.long	1075576832
	.align 8
LC16:
	.long	0
	.long	1075970048
	.align 8
LC17:
	.long	0
	.long	1076232192
	.text
	.align 2
.globl __Z10Mul2ArcTand
	.def	__Z10Mul2ArcTand;	.scl	2;	.type	32;	.endef
__Z10Mul2ArcTand:
	pushl	%ebp
	movl	 %esp, %ebp
	subl	 $24, %esp
# x2 = x * x
	fldl	 8(%ebp)
	fmull	8(%ebp)
	fstpl	-8(%ebp)
# x4 = x2 * x2
	fldl	 -8(%ebp)
	fmull	-8(%ebp)
	fstpl	-16(%ebp)
# x8 = x4 * x4
	fldl	 -16(%ebp)
	fmull	-16(%ebp)
	fstpl	-24(%ebp)
# acc = x - x2*x/3
	fldl	 -8(%ebp)
	fmull	8(%ebp)
	fldl	 LC13
	fdivrp   %st, %st(1)
	fldl	 8(%ebp)
	fsubp	%st, %st(1)
# acc += x4*x/5
	fldl	 -16(%ebp)
	fmull	8(%ebp)
	fldl	 LC14
	fdivrp   %st, %st(1)
	faddp	%st, %st(1)
# acc -= x4*x2*x/7
	fldl	 -16(%ebp)
	fmull	-8(%ebp)
	fmull	8(%ebp)
	fldl	 LC15
	fdivrp   %st, %st(1)
	fsubrp   %st, %st(1)
# acc += x8*x/9
	fldl	 -24(%ebp)
	fmull	8(%ebp)
	fldl	 LC16
	fdivrp   %st, %st(1)
	faddp	%st, %st(1)
# acc -= x8*x2*x/11
	fldl	 -24(%ebp)
	fmull	-8(%ebp)
	fmull	8(%ebp)
	fldl	LC17
	fdivrp	%st, %st(1)
	fsubrp	%st, %st(1)
	leave
	ret



VS 2010 64bit (not really fair to compare I suppose... it was much slower though...):
; MSVC (VS 2010) x64 listing of Mul2ArcTan, using scalar SSE2 instead of x87.
; Every intermediate is stored to and reloaded from the stack, which matches
; an unoptimized build.
;
; x is first stored from xmm0 to [rsp+8]; after `sub rsp, 40` that same slot
; is addressed as x$[rsp] (48 = 8 + 40). The result is left in xmm0.
;
; The divisors are memory constants defined outside this listing; the
; __real@... names appear to spell IEEE-754 double bit patterns
; (4008...h = 3.0, 4014...h = 5.0, 401c...h = 7.0, 4022...h = 9.0,
; 4026...h = 11.0). Each division is a real divsd.
_TEXT	SEGMENT
; rsp-relative offsets (valid after the `sub rsp, 40` below)
x8$ = 0
x2$ = 8
x4$ = 16
x$ = 48
?Mul2ArcTan@@YANN@Z PROC			   ; Mul2ArcTan
; Line 19
$LN3:
	movsdx	QWORD PTR [rsp+8], xmm0
	sub	rsp, 40				   ; 00000028H
; Line 20
; x2 = x * x
	movsdx	xmm0, QWORD PTR x$[rsp]
	mulsd	xmm0, QWORD PTR x$[rsp]
	movsdx	QWORD PTR x2$[rsp], xmm0
; Line 21
; x4 = x2 * x2
	movsdx	xmm0, QWORD PTR x2$[rsp]
	mulsd	xmm0, QWORD PTR x2$[rsp]
	movsdx	QWORD PTR x4$[rsp], xmm0
; Line 22
; x8 = x4 * x4
	movsdx	xmm0, QWORD PTR x4$[rsp]
	mulsd	xmm0, QWORD PTR x4$[rsp]
	movsdx	QWORD PTR x8$[rsp], xmm0
; Line 23
; xmm0 = x - x2*x/3   (computed in xmm1, then copied to the accumulator xmm0)
	movsdx	xmm0, QWORD PTR x2$[rsp]
	mulsd	xmm0, QWORD PTR x$[rsp]
	divsd	xmm0, QWORD PTR __real@4008000000000000
	movsdx	xmm1, QWORD PTR x$[rsp]
	subsd	xmm1, xmm0
	movapd	xmm0, xmm1
; xmm0 += x4*x/5
	movsdx	xmm1, QWORD PTR x4$[rsp]
	mulsd	xmm1, QWORD PTR x$[rsp]
	divsd	xmm1, QWORD PTR __real@4014000000000000
	addsd	xmm0, xmm1
; xmm0 -= x4*x2*x/7
	movsdx	xmm1, QWORD PTR x4$[rsp]
	mulsd	xmm1, QWORD PTR x2$[rsp]
	mulsd	xmm1, QWORD PTR x$[rsp]
	divsd	xmm1, QWORD PTR __real@401c000000000000
	subsd	xmm0, xmm1
; xmm0 += x8*x/9
	movsdx	xmm1, QWORD PTR x8$[rsp]
	mulsd	xmm1, QWORD PTR x$[rsp]
	divsd	xmm1, QWORD PTR __real@4022000000000000
	addsd	xmm0, xmm1
; xmm0 -= x8*x2*x/11
	movsdx	xmm1, QWORD PTR x8$[rsp]
	mulsd	xmm1, QWORD PTR x2$[rsp]
	mulsd	xmm1, QWORD PTR x$[rsp]
	divsd	xmm1, QWORD PTR __real@4026000000000000
	subsd	xmm0, xmm1
; Line 24
	add	rsp, 40				   ; 00000028H
	ret	0
?Mul2ArcTan@@YANN@Z ENDP			   ; Mul2ArcTan
_TEXT	ENDS


Now, it seems to me that all of the compilers produced pretty much the same code... Now my guess is that the Borland/MinGW get their enhanced performance because they store the constant values close to the code (VS puts these __real@4008000000000000 way off in another segment which does not appear to be adjacent) -- This would cause cache misses and thus slow down the code... What would be the best way to test that?

But that does not explain how the Borland code beats out the MinGW version... Though since they are close maybe the difference is in the loop code... I should actually have run a baseline test where a simple integer calculation is run to get a feel for how they optimize the loop...

Is This A Good Question/Topic? 0
  • +

Replies To: Compiler made optimizations

#2 NickDMax  Icon User is offline

  • Can grep dead trees!
  • member icon

Reputation: 2250
  • View blog
  • Posts: 9,245
  • Joined: 18-February 07

Re: Compiler made optimizations

Posted 25 September 2009 - 01:44 AM

In a 4am effort to get VC++2010 up to the speed of the older compilers I tried to change the routine slightly to get the constants into the cache:
/*
 * Approximates arctan(x) with the alternating odd-power series truncated
 * after the x^11 term:
 *
 *     x - x^3/3 + x^5/5 - x^7/7 + x^9/9 - x^11/11
 *
 * The powers are built from repeated squaring (x2, x4, x8), so each term
 * needs at most two extra multiplications.
 *
 * This variant reads the divisors from a local array rather than writing
 * them as literals; the arithmetic is otherwise the same expression as the
 * literal-constant version.
 */
double Mul2ArcTan(double x) {
	double c[]= {3.0, 5.0, 7.0,  9.0, 11.0}; /* divisors of the x^3 .. x^11 terms */
	double x2 = x * x;
	double x4 = x2 * x2;
	double x8 = x4 * x4;
	return x - x2 * x / c[0] + x4 * x / c[1] - x4 * x2 * x / c[2] + x8 * x / c[3] - x8 * x2 * x / c[4];
}
-- Resulted in EXACTLY the same code -- I had to compile twice just to be sure. Apparently the compiler thinks that this SHOULD BE an optimization. I first tried named variables... then this array. The code is the same either way -- VC++ replaces the values with constants held in another segment. Anyway it is almost 5am and I am not thinking clearly... I am just mad that the new VC compiler is generating the slowest code. -- Compiler choices on Windows are limited...
Was This Post Helpful? 0
  • +
  • -

#3 NickDMax  Icon User is offline

  • Can grep dead trees!
  • member icon

Reputation: 2250
  • View blog
  • Posts: 9,245
  • Joined: 18-February 07

Re: Compiler made optimizations

Posted 25 September 2009 - 02:08 AM

....ok... say what you will about that VC++ compiler... but slap that /O2 option into the command line and POW!

32Bit
> "C:\CProjects\Forum Help\ArcTanTest.exe " 
calculating some values... 
1> 0.785398> 0.744012 : 0.744012 : -0.0413866 : 0
PowArcTan Calculated 50000000 values in :	1.796 seconds.
MulArcTan Calculated 50000000 values in :	0.547 seconds.
Mul2ArcTan Calculated 50000000 values in :	0.532 seconds.
HornArcTan Calculated 50000000 values in :	0.375 seconds.
count* Calculated 50000000 values in :	0.14 seconds.


64bit:
> "C:\CProjects\Forum Help\ArcTanTest.exe " 
calculating some values... 
1> 0.785398> 0.744012 : 0.744012 : -0.0413866 : 0
PowArcTan Calculated 50000000 values in :	1.578 seconds.
MulArcTan Calculated 50000000 values in :	0.078 seconds.
Mul2ArcTan Calculated 50000000 values in :	0.047 seconds.
HornArcTan Calculated 50000000 values in :	0.062 seconds.
count* Calculated 50000000 values in :	0.078 seconds.



Ok... That is pretty darn fast... the change in code is not really all that much, just better use of registers I think:
From 32bit:
; MSVC (VS 2010) 32-bit optimized listing of Mul2ArcTan.
;
; No stack frame is set up: x is read once from the caller's stack
; (_x$[esp-4] with _x$ = 8, i.e. [esp+4]) and x, x2, x4 and x8 then live
; entirely on the x87 register stack. The divisions are still real fdiv
; instructions against the __real@... memory constants (defined outside this
; listing; the names appear to spell the doubles 3.0, 5.0, 7.0, 9.0, 11.0).
;
; The bracketed comments show the x87 stack after each group, ST(0) first;
; "acc" is the running sum. The result is left in ST(0).
PUBLIC	?Mul2ArcTan@@YANN@Z		; Mul2ArcTan
; Function compile flags: /Ogtpy
;	COMDAT ?Mul2ArcTan@@YANN@Z
_TEXT	SEGMENT
_x$ = 8					; size = 8
?Mul2ArcTan@@YANN@Z PROC		; Mul2ArcTan, COMDAT
; Line 21
; x2 = x * x                                   -> [x2, x]
	fld	QWORD PTR _x$[esp-4]
	fld	ST(0)
	fmul	ST(0), ST(1)
; Line 22
; x4 = x2 * x2                                 -> [x4, x2, x]
	fld	ST(0)
	fmul	ST(0), ST(1)
; Line 23
; x8 = x4 * x4                                 -> [x8, x4, x2, x]
	fld	ST(0)
	fmul	ST(0), ST(1)
; Line 24
; acc = x - x2*x/3                             -> [acc, x8, x4, x2, x]
	fld	ST(2)
	fmul	ST(0), ST(4)
	fdiv	QWORD PTR __real@4008000000000000
	fsubr	ST(0), ST(4)
; acc += x4*x/5                                -> [acc, x8, x4, x2, x]
	fld	ST(2)
	fmul	ST(0), ST(5)
	fdiv	QWORD PTR __real@4014000000000000
	faddp	ST(1), ST(0)
; acc -= x4*x2*x/7 (x4 is consumed in place)   -> [x8, acc, x2, x]
	fxch	ST(2)
	fmul	ST(0), ST(3)
	fmul	ST(0), ST(4)
	fdiv	QWORD PTR __real@401c000000000000
	fsubp	ST(2), ST(0)
; acc += x8*x/9 (works on a copy of x8)        -> [x8, acc, x2, x]
	fld	ST(0)
	fmul	ST(0), ST(4)
	fdiv	QWORD PTR __real@4022000000000000
	faddp	ST(2), ST(0)
; acc -= x8*x2*x/11 (consumes x8, x2 and x)    -> [result]
	fmulp	ST(2), ST(0)
	fxch	ST(1)
	fmulp	ST(2), ST(0)
	fxch	ST(1)
	fdiv	QWORD PTR __real@4026000000000000
	fsubp	ST(1), ST(0)
; Line 25
	ret	0
?Mul2ArcTan@@YANN@Z ENDP		; Mul2ArcTan
_TEXT	ENDS


Although the other two compilers did see improvements, NOTHING like VC! Though MinGW did knock the Pow version down to the same range as the other routines, which is interesting.

All I can say is WOW! That 64bit /O2 code screams... though... maybe it is too fast -- it is indistinguishable from just an int-double multiplication.... maybe the compiler optimized the function right out (since I don't do anything with the data).
Was This Post Helpful? 0
  • +
  • -

Page 1 of 1