Monday, December 03, 2007

The power of vectorization

Since I have no new cool visuals to show, I'd thought I should blog about some random tech-topic. So lets talk about vectorization in shaders. Doing this in your HLSL code can improve the compiled code significantly. For example, a naive implementation of my bilateral boxfilter looks something like this:

half d[9];
half3 n[9];
half2 uv[9];

for (int i=0; i<9; ++i)
uv[i] = texCoord + offsets[i];
d[i] = GB_GetDepth(uv[i]);
n[i] = GB_GetNormal(uv[i]);

// calculate wieghts
half w[9];
half wSum = 0;

for (int i=0; i<9; i++)
half wn = pow(dot(n[0], n[i]),32);
half wz = (EPSILON + abs(d[0] - d[i]));

w[i] = wn / wz;
wSum += w[i];

// normalize weights
wSum = 1.0f/wSum;

// sample and apply weights
half3 color = 0;
for (int i=0; i<9; i++)
color += GB_GetDiffuse(uv[i]) * w[i] * wSum;

return float4(color,1);

With the ps2_b profile, this compiles to:

def c9, 2, -1, 32, 100
def c10, 0.00999999978, 0, 0, 0
dcl t0.xy
dcl_2d s0
dcl_2d s1
dcl_2d s2
add r0.xy, t0, c1
add r1.xy, t0, c0
add r2.xy, t0, c2
add r3.xy, t0, c3
add r4.xy, t0, c4
add r5.xy, t0, c5
add r6.xy, t0, c6
add r7.xy, t0, c7
add r8.xy, t0, c8
texld r9, r0, s2
texld r10, r1, s2
texld r11, r0, s1
texld r0, r0, s0
texld r12, r1, s1
texld r1, r1, s0
texld r13, r2, s2
texld r14, r2, s1
texld r2, r2, s0
texld r15, r3, s2
texld r16, r3, s1
texld r3, r3, s0
texld r17, r4, s2
texld r18, r4, s1
texld r4, r4, s0
texld r19, r5, s2
texld r20, r5, s1
texld r5, r5, s0
texld r21, r6, s2
texld r22, r6, s1
texld r6, r6, s0
texld r23, r7, s2
texld r24, r7, s1
texld r7, r7, s0
texld r25, r8, s2
texld r26, r8, s1
texld r8, r8, s0
add_pp r0.w, -r9.x, r10.x
abs_pp r0.w, r0.w
add_pp r0.w, r0.w, c10.x
rcp r0.w, r0.w
mad, r11, c9.x, c9.y
mad, r12, c9.x, c9.y
dp3_pp r1.w, r11, r9
pow_pp r2.w, r1.w, c9.z
mul r0.w, r0.w, r2.w
dp3_pp r1.w, r11, r11
pow_pp r2.w, r1.w, c9.z
mad_pp r1.w, r2.w, c9.w, r0.w
mul, r0, r0.w
mul r0.w, r2.w, c9.w
add_pp r2.w, r10.x, -r13.x
abs_pp r2.w, r2.w
add_pp r2.w, r2.w, c10.x
rcp r2.w, r2.w
mad, r14, c9.x, c9.y
dp3_pp r3.w, r11, r9
pow_pp r4.w, r3.w, c9.z
mad_pp r1.w, r4.w, r2.w, r1.w
mul r2.w, r2.w, r4.w
add_pp r3.w, r10.x, -r15.x
abs_pp r3.w, r3.w
add_pp r3.w, r3.w, c10.x
rcp r3.w, r3.w
mad, r16, c9.x, c9.y
dp3_pp r4.w, r11, r9
pow_pp r5.w, r4.w, c9.z
mad_pp r1.w, r5.w, r3.w, r1.w
mul r3.w, r3.w, r5.w
add_pp r4.w, r10.x, -r17.x
abs_pp r4.w, r4.w
add_pp r4.w, r4.w, c10.x
rcp r4.w, r4.w
mad, r18, c9.x, c9.y
dp3_pp r5.w, r11, r9
pow_pp r6.w, r5.w, c9.z
mad_pp r1.w, r6.w, r4.w, r1.w
mul r4.w, r4.w, r6.w
add_pp r5.w, r10.x, -r19.x
abs_pp r5.w, r5.w
add_pp r5.w, r5.w, c10.x
rcp r5.w, r5.w
mad, r20, c9.x, c9.y
dp3_pp r6.w, r11, r9
pow_pp r7.w, r6.w, c9.z
mad_pp r1.w, r7.w, r5.w, r1.w
mul r5.w, r5.w, r7.w
add_pp r6.w, r10.x, -r21.x
abs_pp r6.w, r6.w
add_pp r6.w, r6.w, c10.x
rcp r6.w, r6.w
mad, r22, c9.x, c9.y
dp3_pp r7.w, r11, r9
pow_pp r8.w, r7.w, c9.z
mad_pp r1.w, r8.w, r6.w, r1.w
mul r6.w, r6.w, r8.w
add_pp r7.w, r10.x, -r23.x
abs_pp r7.w, r7.w
add_pp r7.w, r7.w, c10.x
rcp r7.w, r7.w
mad, r24, c9.x, c9.y
dp3_pp r8.w, r11, r9
pow_pp r11.w, r8.w, c9.z
mad_pp r1.w, r11.w, r7.w, r1.w
mul r7.w, r7.w, r11.w
add_pp r8.w, r10.x, -r25.x
abs_pp r8.w, r8.w
add_pp r8.w, r8.w, c10.x
rcp r8.w, r8.w
mad, r26, c9.x, c9.y
dp3_pp r9.x, r11, r9
pow_pp r10.x, r9.x, c9.z
mad_pp r1.w, r10.x, r8.w, r1.w
mul r8.w, r8.w, r10.x
rcp_pp r1.w, r1.w
mul, r0, r1.w
mul, r1, r0.w
mad_pp, r1, r1.w, r0
mul, r2, r2.w
mad_pp, r1, r1.w, r0
mul, r3, r3.w
mad_pp, r1, r1.w, r0
mul, r4, r4.w
mad_pp, r1, r1.w, r0
mul, r5, r5.w
mad_pp, r1, r1.w, r0
mul, r6, r6.w
mad_pp, r1, r1.w, r0
mul, r7, r7.w
mad_pp, r1, r1.w, r0
mul, r8, r8.w
mad_pp, r1, r1.w, r0
mov_pp r0.w, -c9.y
mov_pp oC0, r0

This is an example of bad vectorization. The shader compiles to 151 instructions of witch 124 is arithmetic. The problem is that many instructions (like pow and abs) are not used with full power. Many instructions are SIMD instructions and can work on multiple data in parallel.

We can rewrite this shader to perform some of the arithmetic's in parallel like this:

half3 n[9];
half3 c[9];

for (int i=0; i<9; ++i)
half2 uv = texCoord + offsets[i];
n[i] = GB_GetNormal(uv);
c[i] = GB_GetDiffuse(uv);

// calculate normal wieghts
half4 wn1;
wn1.x = dot(n[0], n[1]);
wn1.y = dot(n[0], n[2]);
wn1.z = dot(n[0], n[3]);
wn1.w = dot(n[0], n[4]);
wn1 = pow(wn1, 32);

half4 wn2;
wn2.x = dot(n[0], n[5]);
wn2.y = dot(n[0], n[6]);
wn2.z = dot(n[0], n[7]);
wn2.w = dot(n[0], n[8]);
wn2 = pow(wn2, 32);

// calculate depth wieghts
half d = GB_GetDepth(texCoord + offsets[0]);

half4 d1;
d1.x = GB_GetDepth(texCoord + offsets[1]);
d1.y = GB_GetDepth(texCoord + offsets[2]);
d1.z = GB_GetDepth(texCoord + offsets[3]);
d1.w = GB_GetDepth(texCoord + offsets[4]);

half4 d2;
d2.x = GB_GetDepth(texCoord + offsets[5]);
d2.y = GB_GetDepth(texCoord + offsets[6]);
d2.z = GB_GetDepth(texCoord + offsets[7]);
d2.w = GB_GetDepth(texCoord + offsets[8]);

half4 wd1 = EPSILON + abs(d - d1);
half4 wd2 = EPSILON + abs(d - d2);

// divide normal weights with depth ones
half4 w1 = wn1 / wd1;
half4 w2 = wn2 / wd2;

// calc weight 0
half w0 = pow(dot(n[0], n[0]),32) / EPSILON;

// calc sum
half4 wSumTmp = w1 + w2; // perform sumation of weights 1+4,2+5,3+6,4+8
half wSum = w0 + dot(wSumTmp, float4(1,1,1,1)); // sum all weights
wSum = 1.0f/wSum;

// normalize weights
w0 *= wSum;
w1 *= wSum;
w2 *= wSum;

//apply weights
half3 color = c[0] * w0;
color += c[1] * w1.x;
color += c[2] * w1.y;
color += c[3] * w1.z;
color += c[4] * w1.w;
color += c[5] * w2.x;
color += c[6] * w2.y;
color += c[7] * w2.z;
color += c[8] * w2.w;

This compiles to:

def c9, 2, -1, 0.00999999978, 32
def c10, 100, 0, 0, 0
dcl t0.xy
dcl_2d s0
dcl_2d s1
dcl_2d s2
add_pp r0.xy, t0, c5
add_pp r1.xy, t0, c0
add_pp r2.xy, t0, c6
add_pp r3.xy, t0, c7
add_pp r4.xy, t0, c8
add_pp r5.xy, t0, c1
add_pp r6.xy, t0, c2
add_pp r7.xy, t0, c3
add_pp r8.xy, t0, c4
texld r9, r0, s1
texld r10, r1, s1
texld r11, r2, s1
texld r12, r3, s1
texld r13, r4, s1
texld_pp r14, r0, s2
texld r0, r0, s0
texld_pp r15, r2, s2
texld r2, r2, s0
texld_pp r16, r3, s2
texld r3, r3, s0
texld_pp r17, r4, s2
texld r4, r4, s0
texld_pp r18, r1, s2
texld r1, r1, s0
texld r19, r5, s1
texld r20, r6, s1
texld r21, r7, s1
texld r22, r8, s1
texld_pp r23, r5, s2
texld r5, r5, s0
texld_pp r24, r6, s2
texld r6, r6, s0
texld_pp r25, r7, s2
texld r7, r7, s0
texld_pp r26, r8, s2
texld r8, r8, s0
mad, r9, c9.x, c9.y
mad, r10, c9.x, c9.y
dp3_pp r9.x, r10, r9
mad, r11, c9.x, c9.y
dp3_pp r9.y, r10, r11
mad, r12, c9.x, c9.y
dp3_pp r9.z, r10, r11
mad, r13, c9.x, c9.y
dp3_pp r9.w, r10, r11
mul_pp r9, r9, r9
mul_pp r9, r9, r9
mul_pp r9, r9, r9
mul_pp r9, r9, r9
mul_pp r9, r9, r9
mov_pp r14.y, r15.x
mov_pp r14.z, r16.x
mov_pp r14.w, r17.x
add_pp r11, r18.x, -r14
abs_pp r11, r11
add_pp r11, r11, c9.z
rcp r12.x, r11.x
rcp r12.y, r11.y
rcp r12.z, r11.z
rcp r12.w, r11.w
mul_pp r9, r9, r12
mad, r19, c9.x, c9.y
dp3_pp r11.x, r10, r11
mad, r20, c9.x, c9.y
dp3_pp r11.y, r10, r12
mad, r21, c9.x, c9.y
dp3_pp r11.z, r10, r12
mad, r22, c9.x, c9.y
dp3_pp r11.w, r10, r12
dp3_pp r0.w, r10, r10
mul_pp r10, r11, r11
mul_pp r10, r10, r10
mul_pp r10, r10, r10
mul_pp r10, r10, r10
mul_pp r10, r10, r10
mov_pp r23.y, r24.x
mov_pp r23.z, r25.x
mov_pp r23.w, r26.x
add_pp r11, r18.x, -r23
abs_pp r11, r11
add_pp r11, r11, c9.z
rcp r12.x, r11.x
rcp r12.y, r11.y
rcp r12.z, r11.z
rcp r12.w, r11.w
mad_pp r11, r10, r12, r9
mul_pp r10, r10, r12
dp4 r1.w, r11, -c9.y
pow_pp r2.w, r0.w, c9.w
mad_pp r0.w, r2.w, c10.x, r1.w
mul_pp r1.w, r2.w, c10.x
rcp_pp r0.w, r0.w
mul_pp r10, r10, r0.w
mul_pp, r5, r10.x
mul_pp r1.w, r1.w, r0.w
mul_pp r9, r9, r0.w
mad_pp, r1, r1.w, r5
mad_pp, r6, r10.y, r1
mad_pp, r7, r10.z, r1
mad_pp, r8, r10.w, r1
mad_pp, r0, r9.x, r1
mad_pp, r2, r9.y, r0
mad_pp, r3, r9.z, r0
mad_pp, r4, r9.w, r0
mov_pp r0.w, -c9.y
mov_pp oC0, r0

This time there is 108 instructions of witch 81 arithmetic.

In this shader, when calculating the weights, more work is done per instruction. For example, the pow instruction is used only three times instead of eight. The summation of weights are done with two add and one dp4 instead of nine add.

Note that we could have vectorized the texture coordinate calculations as well, but then we would need SM 3.0 (arbitrary swizzles) and change how the offset constant is set up. 

With a small effort we got rid of 43 ALU instructions! It should be noted that this necessarily won't lead to a speed-up since instruction count is just part of the story. ALU/Texture instruction ratio, texture cache etc. comes in to play as well. On my Radeon X1600 this shader is limitied by texture lookups so the win is not that big.

At last I would like to point out that I'm by no means a shader optimization guru, just a guy trying to fill his blog ;)