XNOR-Net on VideoCoreIV
- 16. •
def popcount(rx, tmp_r0, tmp_ra):
    """Emit 15 instructions that replace `rx` with the number of set bits in `rx`.

    Parameters
    ----------
    rx     : register holding the 32-bit input; overwritten with the bit count.
    tmp_r0 : scratch register, clobbered.
    tmp_ra : scratch register, clobbered. It must support
             `.unpack('8a')` .. `.unpack('8d')`.
             NOTE(review): the name suggests a register-file-A register -- confirm.

    The constants are read from CONST_0x55555555_REG, CONST_0x33333333_REG and
    CONST_0x0f0f0f0f_REG, which are presumably preloaded with the values in
    their names.

    NOTE(review): the sequence relies on the chained second operation
    (`.mov(...)`) reading its source before the first operation of the same
    instruction writes its result -- confirm against the VideoCore IV reference.
    """
    # Stage 1: 2-bit fields.  rx = rx - ((rx >> 1) & 0x55555555)
    shr(tmp_r0, rx, 1)
    band(tmp_r0, tmp_r0, CONST_0x55555555_REG)
    isub(rx, rx, tmp_r0)

    # Stage 2: 4-bit fields.  rx = (rx & 0x33333333) + ((rx >> 2) & 0x33333333)
    band(tmp_ra, rx, CONST_0x33333333_REG)
    shr(tmp_r0, rx, 2)
    band(tmp_r0, tmp_r0, CONST_0x33333333_REG)
    # The chained mov clears tmp_ra once its old value has been consumed by the
    # iadd; that zero is picked up again in stage 3 to reset rx.
    iadd(rx, tmp_ra, tmp_r0).mov(tmp_ra, 0)

    # Stage 3: per-byte counts.  tmp_ra = (rx + (rx >> 4)) & 0x0f0f0f0f
    shr(tmp_r0, rx, 4)
    iadd(rx, rx, tmp_r0)
    # The band writes the four byte counts to tmp_ra; the chained mov copies the
    # previous tmp_ra (the 0 stored above) into rx, so rx starts the final sum
    # at zero.
    band(tmp_ra, rx, CONST_0x0f0f0f0f_REG).mov(rx, tmp_ra)

    # NOTE(review): presumably needed because tmp_ra was written by the
    # previous instruction and cannot be read back immediately -- confirm.
    nop()

    # Stage 4: horizontal sum of the four byte counts into rx.
    iadd(rx, rx, tmp_ra.unpack('8a'))
    iadd(rx, rx, tmp_ra.unpack('8b'))
    iadd(rx, rx, tmp_ra.unpack('8c'))
    iadd(rx, rx, tmp_ra.unpack('8d'))
- 18. •
# Inner loop of the floating-point kernel (fragment; the rest is snipped).
# One value at a time is taken from r0 via `rotate(broadcast, r0, -n)` and
# multiplied with r4; the products are accumulated into rb[0..6]/rb7 and
# ra[0..6]/ra7, alternating between the two.
# NOTE(review): the comments below assume r4 holds the data delivered by a
# 'load tmuN' signal and r5 reads back the value last written to `broadcast`
# -- confirm against the VideoCore IV reference.
L.bk_loop
mov(r3, A_CUR_REG, sig='load tmu0')
mov(r1, A_STRIDE_REGS[2]).mov(tmu0_s, r3)               # request at address A_CUR_REG
mov(r0, r4, sig='load tmu0').imul24(r1, r1, SY_REG)     # r0 = r4; r1 = A_STRIDE_REGS[2] * SY_REG
iadd(tmu0_s, r1, r3)                                    # request at A_CUR_REG + r1
mov(broadcast, r0, sig='load tmu1').mov(r1, r4)         # start with r0 unrotated; keep r4 in r1
iadd(A_CUR_REG, r3, DX_REG).fmul(r3, r4, r5)            # advance A_CUR_REG by DX_REG; first product

# Software-pipelined: every fadd accumulates the product computed by the
# previous instruction's fmul while the chained fmul produces the next one.
for i in range(0, 7):
    rotate(broadcast, r0, -(2*i+1))
    fadd(rb[i], rb[i], r3).fmul(r3, r4, r5)
    rotate(broadcast, r0, -(2*i+2))
    fadd(ra[i], ra[i], r3).fmul(r3, r4, r5)

# Last pair (rotations 14 and 15) is written out explicitly so the broadcast
# source can switch from r0 to r1 before the final fmul.
rotate(broadcast, r0, -15)
fadd(rb7, rb7, r3).fmul(r3, r4, r5)
mov(broadcast, r1)
fadd(ra7, ra7, r3).fmul(r3, r4, r5)
(snip.)
- 19. •
# Inner loop of the binarized (XNOR) kernel (fragment; the rest is snipped).
# Each step XORs r4 with one value taken from the inverted word in r1, counts
# the set bits with popcount(), and adds the count to rb[i] / ra[i].
# NOTE(review): the comments below assume r4 holds the data delivered by a
# 'load tmuN' signal and r5 reads back the value last written to `broadcast`
# -- confirm against the VideoCore IV reference.
bxor(r3, r3, r3).mov(r2, KW_REG, cond='zs', sig='load tmu0')   # r3 = 0 before entering the loop
L.bk_loop
bnot(r1, r4, sig='load tmu1')  # VC4 has no XNOR instruction: invert one operand here, XOR below
# Adds the r3 left over from before this point (0 on first entry) to ra[7+8].
iadd(ra[7+8], ra[7+8], r3).mov(tmu1_s, B_CUR_REG)

# i = 0 is peeled because it needs the extra un-rotated broadcast first.
rotate(broadcast, r1, -(2*0+0))
bxor(r3, r4, r5).rotate(broadcast, r1, -(2*0+1))    # XOR current value, queue the next one
popcount(r3, r0, POPCNT_TMP_REG)  # 15 instructions
iadd(rb[0], rb[0], r3)
bxor(r3, r4, r5).rotate(broadcast, r1, -(2*1+0))
popcount(r3, r0, POPCNT_TMP_REG)  # 15 instructions
iadd(ra[0], ra[0], r3)

# Remaining pairs: each bxor uses the value queued by the previous rotate and
# queues the following one.
for i in range(1, 7):
    bxor(r3, r4, r5).rotate(broadcast, r1, -(2*i+1))
    popcount(r3, r0, POPCNT_TMP_REG)  # 15 instructions
    iadd(rb[i], rb[i], r3)
    bxor(r3, r4, r5).rotate(broadcast, r1, -(2*(i+1)+0))
    popcount(r3, r0, POPCNT_TMP_REG)  # 15 instructions
    iadd(ra[i], ra[i], r3)
(snip.)
- 29. L.bi_loop
# Start of the bi_loop body (fragment; the rest is snipped).
# Writing x_n for the value selected by `rotate(broadcast, ..., -n)`, the
# visible instructions accumulate x_(k+t) XOR W_REG[0][t] into rs[2][k]
# (t = 0..2), i.e. a 3-tap sliding window over consecutive x values.
# NOTE(review): assumes r5 reads back the value last written to `broadcast`,
# and that a chained `.v8adds(..., r3)` sees the r3 produced by the PREVIOUS
# instruction, not the bxor issued alongside it -- confirm against the
# VideoCore IV reference.

# x_0: issue the next request with address r2 and select the first value.
mov(tmu0_s, r2).rotate(broadcast, r4, -0)
bxor(r3, r5, W_REG[0][0])
# x_1: keep a copy of r4 in r1; later rotates read r1 instead of r4.
mov(r1, r4).rotate(broadcast, r4, -1)
bxor(r3, r5, W_REG[0][0]).v8adds(rs[2][0], rs[2][0], r3, sig='load tmu1')
bxor(r3, r5, W_REG[0][1]).v8adds(rs[2][1], rs[2][1], r3)
# x_2: issue a request with address r0 on the other unit.
mov(tmu1_s, r0).rotate(broadcast, r1, -2)
bxor(r3, r5, W_REG[0][0]).v8adds(rs[2][0], rs[2][0], r3)
bxor(r3, r5, W_REG[0][1]).v8adds(rs[2][2], rs[2][2], r3)
bxor(r3, r5, W_REG[0][2]).v8adds(rs[2][1], rs[2][1], r3)
# x_3: advance r2 by B_STRIDE_REGS[2] for the following request.
iadd(r2, r2, B_STRIDE_REGS[2]).rotate(broadcast, r1, -3)
bxor(r3, r5, W_REG[0][0]).v8adds(rs[2][0], rs[2][0], r3)
bxor(r3, r5, W_REG[0][1]).v8adds(rs[2][3], rs[2][3], r3)
bxor(r3, r5, W_REG[0][2]).v8adds(rs[2][2], rs[2][2], r3)
# x_4: store the inverted r4 into W_REG[1][1].
bnot(W_REG[1][1], r4).rotate(broadcast, r1, -4)
(snip.)
- 30. L.bi_loop
# unrolling kernel loop
(snip.)
# Fold the eight partial accumulators rs[2][j] into rs[0][j] and clear them.
# Each 32-bit word is treated as eight 4-bit fields, which are summed pairwise
# until two sums remain (bits 7:0 and bits 23:16).
# NOTE(review): assumes the CONST_*_REG registers hold the values in their
# names, and that a chained `.mov` reads its source before the first operation
# of the same instruction writes it -- confirm.
for j in range(8):
    # rs[2][j] keeps only its low nibbles; r3 gets the unmasked original.
    band(rs[0+2][j], rs[0+2][j], CONST_0x0f0f0f0f_REG).mov(r3, rs[0+2][j])
    # r1 = high nibbles moved down into the low-nibble positions.
    shr(r1, r3, 4)
    band(r1, r1, CONST_0x0f0f0f0f_REG)
    # r3 = per-byte sum (low nibble + high nibble).
    iadd(r3, rs[0+2][j], r1)
    # Add each byte to the byte below it; reset the partial accumulator.
    shr(r1, r3, 8)
    iadd(r3, r3, r1).mov(rs[0+2][j], 0)
    # Keep the two byte-pair sums and drop the leftover odd bytes.
    band(r3, r3, CONST_0x00ff00ff_REG)
    iadd(rs[0][j], rs[0][j], r3)
(snip.)
# Conditional branch back to the L.bi_loop label.
# NOTE(review): `jzc` is presumably "jump if zero flag clear"; the instruction
# that sets the flag is in the snipped code -- confirm the loop condition and
# that any required branch delay slots follow this line.
jzc(L.bi_loop)