Commit a76b409d authored by Martin Storsjö's avatar Martin Storsjö

aarch64: Reindent all assembly to 8/24 column indentation

libavcodec/aarch64/vc1dsp_neon.S is skipped here, as it intentionally
uses a layered indentation style to visually show how different
unrolled/interleaved phases fit together.
Signed-off-by: 's avatarMartin Storsjö <martin@martin.st>
parent cada4597
......@@ -19,130 +19,130 @@
#include "libavutil/aarch64/asm.S"
function ff_ps_add_squares_neon, export=1
1: ld1 {v0.4s,v1.4s}, [x1], #32
fmul v0.4s, v0.4s, v0.4s
fmul v1.4s, v1.4s, v1.4s
faddp v2.4s, v0.4s, v1.4s
ld1 {v3.4s}, [x0]
fadd v3.4s, v3.4s, v2.4s
st1 {v3.4s}, [x0], #16
subs w2, w2, #4
b.gt 1b
1: ld1 {v0.4s,v1.4s}, [x1], #32
fmul v0.4s, v0.4s, v0.4s
fmul v1.4s, v1.4s, v1.4s
faddp v2.4s, v0.4s, v1.4s
ld1 {v3.4s}, [x0]
fadd v3.4s, v3.4s, v2.4s
st1 {v3.4s}, [x0], #16
subs w2, w2, #4
b.gt 1b
ret
endfunc
function ff_ps_mul_pair_single_neon, export=1
1: ld1 {v0.4s,v1.4s}, [x1], #32
ld1 {v2.4s}, [x2], #16
zip1 v3.4s, v2.4s, v2.4s
zip2 v4.4s, v2.4s, v2.4s
fmul v0.4s, v0.4s, v3.4s
fmul v1.4s, v1.4s, v4.4s
st1 {v0.4s,v1.4s}, [x0], #32
subs w3, w3, #4
b.gt 1b
1: ld1 {v0.4s,v1.4s}, [x1], #32
ld1 {v2.4s}, [x2], #16
zip1 v3.4s, v2.4s, v2.4s
zip2 v4.4s, v2.4s, v2.4s
fmul v0.4s, v0.4s, v3.4s
fmul v1.4s, v1.4s, v4.4s
st1 {v0.4s,v1.4s}, [x0], #32
subs w3, w3, #4
b.gt 1b
ret
endfunc
function ff_ps_stereo_interpolate_neon, export=1
ld1 {v0.4s}, [x2]
ld1 {v1.4s}, [x3]
zip1 v4.4s, v0.4s, v0.4s
zip2 v5.4s, v0.4s, v0.4s
zip1 v6.4s, v1.4s, v1.4s
zip2 v7.4s, v1.4s, v1.4s
1: ld1 {v2.2s}, [x0]
ld1 {v3.2s}, [x1]
fadd v4.4s, v4.4s, v6.4s
fadd v5.4s, v5.4s, v7.4s
mov v2.d[1], v2.d[0]
mov v3.d[1], v3.d[0]
fmul v2.4s, v2.4s, v4.4s
fmla v2.4s, v3.4s, v5.4s
st1 {v2.d}[0], [x0], #8
st1 {v2.d}[1], [x1], #8
subs w4, w4, #1
b.gt 1b
ld1 {v0.4s}, [x2]
ld1 {v1.4s}, [x3]
zip1 v4.4s, v0.4s, v0.4s
zip2 v5.4s, v0.4s, v0.4s
zip1 v6.4s, v1.4s, v1.4s
zip2 v7.4s, v1.4s, v1.4s
1: ld1 {v2.2s}, [x0]
ld1 {v3.2s}, [x1]
fadd v4.4s, v4.4s, v6.4s
fadd v5.4s, v5.4s, v7.4s
mov v2.d[1], v2.d[0]
mov v3.d[1], v3.d[0]
fmul v2.4s, v2.4s, v4.4s
fmla v2.4s, v3.4s, v5.4s
st1 {v2.d}[0], [x0], #8
st1 {v2.d}[1], [x1], #8
subs w4, w4, #1
b.gt 1b
ret
endfunc
function ff_ps_stereo_interpolate_ipdopd_neon, export=1
ld1 {v0.4s,v1.4s}, [x2]
ld1 {v6.4s,v7.4s}, [x3]
fneg v2.4s, v1.4s
fneg v3.4s, v7.4s
zip1 v16.4s, v0.4s, v0.4s
zip2 v17.4s, v0.4s, v0.4s
zip1 v18.4s, v2.4s, v1.4s
zip2 v19.4s, v2.4s, v1.4s
zip1 v20.4s, v6.4s, v6.4s
zip2 v21.4s, v6.4s, v6.4s
zip1 v22.4s, v3.4s, v7.4s
zip2 v23.4s, v3.4s, v7.4s
1: ld1 {v2.2s}, [x0]
ld1 {v3.2s}, [x1]
fadd v16.4s, v16.4s, v20.4s
fadd v17.4s, v17.4s, v21.4s
mov v2.d[1], v2.d[0]
mov v3.d[1], v3.d[0]
fmul v4.4s, v2.4s, v16.4s
fmla v4.4s, v3.4s, v17.4s
fadd v18.4s, v18.4s, v22.4s
fadd v19.4s, v19.4s, v23.4s
ext v2.16b, v2.16b, v2.16b, #4
ext v3.16b, v3.16b, v3.16b, #4
fmla v4.4s, v2.4s, v18.4s
fmla v4.4s, v3.4s, v19.4s
st1 {v4.d}[0], [x0], #8
st1 {v4.d}[1], [x1], #8
subs w4, w4, #1
b.gt 1b
ld1 {v0.4s,v1.4s}, [x2]
ld1 {v6.4s,v7.4s}, [x3]
fneg v2.4s, v1.4s
fneg v3.4s, v7.4s
zip1 v16.4s, v0.4s, v0.4s
zip2 v17.4s, v0.4s, v0.4s
zip1 v18.4s, v2.4s, v1.4s
zip2 v19.4s, v2.4s, v1.4s
zip1 v20.4s, v6.4s, v6.4s
zip2 v21.4s, v6.4s, v6.4s
zip1 v22.4s, v3.4s, v7.4s
zip2 v23.4s, v3.4s, v7.4s
1: ld1 {v2.2s}, [x0]
ld1 {v3.2s}, [x1]
fadd v16.4s, v16.4s, v20.4s
fadd v17.4s, v17.4s, v21.4s
mov v2.d[1], v2.d[0]
mov v3.d[1], v3.d[0]
fmul v4.4s, v2.4s, v16.4s
fmla v4.4s, v3.4s, v17.4s
fadd v18.4s, v18.4s, v22.4s
fadd v19.4s, v19.4s, v23.4s
ext v2.16b, v2.16b, v2.16b, #4
ext v3.16b, v3.16b, v3.16b, #4
fmla v4.4s, v2.4s, v18.4s
fmla v4.4s, v3.4s, v19.4s
st1 {v4.d}[0], [x0], #8
st1 {v4.d}[1], [x1], #8
subs w4, w4, #1
b.gt 1b
ret
endfunc
function ff_ps_hybrid_analysis_neon, export=1
lsl x3, x3, #3
ld2 {v0.4s,v1.4s}, [x1], #32
ld2 {v2.2s,v3.2s}, [x1], #16
ld1 {v24.2s}, [x1], #8
ld2 {v4.2s,v5.2s}, [x1], #16
ld2 {v6.4s,v7.4s}, [x1]
rev64 v6.4s, v6.4s
rev64 v7.4s, v7.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v7.16b, v7.16b, v7.16b, #8
rev64 v4.2s, v4.2s
rev64 v5.2s, v5.2s
mov v2.d[1], v3.d[0]
mov v4.d[1], v5.d[0]
mov v5.d[1], v2.d[0]
mov v3.d[1], v4.d[0]
fadd v16.4s, v0.4s, v6.4s
fadd v17.4s, v1.4s, v7.4s
fsub v18.4s, v1.4s, v7.4s
fsub v19.4s, v0.4s, v6.4s
fadd v22.4s, v2.4s, v4.4s
fsub v23.4s, v5.4s, v3.4s
trn1 v20.2d, v22.2d, v23.2d // {re4+re8, re5+re7, im8-im4, im7-im5}
trn2 v21.2d, v22.2d, v23.2d // {im4+im8, im5+im7, re4-re8, re5-re7}
1: ld2 {v2.4s,v3.4s}, [x2], #32
ld2 {v4.2s,v5.2s}, [x2], #16
ld1 {v6.2s}, [x2], #8
add x2, x2, #8
mov v4.d[1], v5.d[0]
mov v6.s[1], v6.s[0]
fmul v6.2s, v6.2s, v24.2s
fmul v0.4s, v2.4s, v16.4s
fmul v1.4s, v2.4s, v17.4s
fmls v0.4s, v3.4s, v18.4s
fmla v1.4s, v3.4s, v19.4s
fmla v0.4s, v4.4s, v20.4s
fmla v1.4s, v4.4s, v21.4s
faddp v0.4s, v0.4s, v1.4s
faddp v0.4s, v0.4s, v0.4s
fadd v0.2s, v0.2s, v6.2s
st1 {v0.2s}, [x0], x3
subs w4, w4, #1
b.gt 1b
lsl x3, x3, #3
ld2 {v0.4s,v1.4s}, [x1], #32
ld2 {v2.2s,v3.2s}, [x1], #16
ld1 {v24.2s}, [x1], #8
ld2 {v4.2s,v5.2s}, [x1], #16
ld2 {v6.4s,v7.4s}, [x1]
rev64 v6.4s, v6.4s
rev64 v7.4s, v7.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v7.16b, v7.16b, v7.16b, #8
rev64 v4.2s, v4.2s
rev64 v5.2s, v5.2s
mov v2.d[1], v3.d[0]
mov v4.d[1], v5.d[0]
mov v5.d[1], v2.d[0]
mov v3.d[1], v4.d[0]
fadd v16.4s, v0.4s, v6.4s
fadd v17.4s, v1.4s, v7.4s
fsub v18.4s, v1.4s, v7.4s
fsub v19.4s, v0.4s, v6.4s
fadd v22.4s, v2.4s, v4.4s
fsub v23.4s, v5.4s, v3.4s
trn1 v20.2d, v22.2d, v23.2d // {re4+re8, re5+re7, im8-im4, im7-im5}
trn2 v21.2d, v22.2d, v23.2d // {im4+im8, im5+im7, re4-re8, re5-re7}
1: ld2 {v2.4s,v3.4s}, [x2], #32
ld2 {v4.2s,v5.2s}, [x2], #16
ld1 {v6.2s}, [x2], #8
add x2, x2, #8
mov v4.d[1], v5.d[0]
mov v6.s[1], v6.s[0]
fmul v6.2s, v6.2s, v24.2s
fmul v0.4s, v2.4s, v16.4s
fmul v1.4s, v2.4s, v17.4s
fmls v0.4s, v3.4s, v18.4s
fmla v1.4s, v3.4s, v19.4s
fmla v0.4s, v4.4s, v20.4s
fmla v1.4s, v4.4s, v21.4s
faddp v0.4s, v0.4s, v1.4s
faddp v0.4s, v0.4s, v0.4s
fadd v0.2s, v0.2s, v6.2s
st1 {v0.2s}, [x0], x3
subs w4, w4, #1
b.gt 1b
ret
endfunc
......@@ -33,81 +33,81 @@ const tab_x2, align=4
endconst
function ff_opus_deemphasis_neon, export=1
movrel x4, tab_st
ld1 {v4.4s}, [x4]
movrel x4, tab_x0
ld1 {v5.4s}, [x4]
movrel x4, tab_x1
ld1 {v6.4s}, [x4]
movrel x4, tab_x2
ld1 {v7.4s}, [x4]
movrel x4, tab_st
ld1 {v4.4s}, [x4]
movrel x4, tab_x0
ld1 {v5.4s}, [x4]
movrel x4, tab_x1
ld1 {v6.4s}, [x4]
movrel x4, tab_x2
ld1 {v7.4s}, [x4]
fmul v0.4s, v4.4s, v0.s[0]
fmul v0.4s, v4.4s, v0.s[0]
1: ld1 {v1.4s, v2.4s}, [x1], #32
1: ld1 {v1.4s, v2.4s}, [x1], #32
fmla v0.4s, v5.4s, v1.s[0]
fmul v3.4s, v7.4s, v2.s[2]
fmla v0.4s, v5.4s, v1.s[0]
fmul v3.4s, v7.4s, v2.s[2]
fmla v0.4s, v6.4s, v1.s[1]
fmla v3.4s, v6.4s, v2.s[1]
fmla v0.4s, v6.4s, v1.s[1]
fmla v3.4s, v6.4s, v2.s[1]
fmla v0.4s, v7.4s, v1.s[2]
fmla v3.4s, v5.4s, v2.s[0]
fmla v0.4s, v7.4s, v1.s[2]
fmla v3.4s, v5.4s, v2.s[0]
fadd v1.4s, v1.4s, v0.4s
fadd v2.4s, v2.4s, v3.4s
fadd v1.4s, v1.4s, v0.4s
fadd v2.4s, v2.4s, v3.4s
fmla v2.4s, v4.4s, v1.s[3]
fmla v2.4s, v4.4s, v1.s[3]
st1 {v1.4s, v2.4s}, [x0], #32
fmul v0.4s, v4.4s, v2.s[3]
st1 {v1.4s, v2.4s}, [x0], #32
fmul v0.4s, v4.4s, v2.s[3]
subs w2, w2, #8
b.gt 1b
subs w2, w2, #8
b.gt 1b
mov s0, v2.s[3]
mov s0, v2.s[3]
ret
endfunc
function ff_opus_postfilter_neon, export=1
ld1 {v0.4s}, [x2]
dup v1.4s, v0.s[1]
dup v2.4s, v0.s[2]
dup v0.4s, v0.s[0]
ld1 {v0.4s}, [x2]
dup v1.4s, v0.s[1]
dup v2.4s, v0.s[2]
dup v0.4s, v0.s[0]
add w1, w1, #2
sub x1, x0, x1, lsl #2
add w1, w1, #2
sub x1, x0, x1, lsl #2
ld1 {v3.4s}, [x1]
fmul v3.4s, v3.4s, v2.4s
ld1 {v3.4s}, [x1]
fmul v3.4s, v3.4s, v2.4s
1: add x1, x1, #4
ld1 {v4.4s}, [x1]
add x1, x1, #4
ld1 {v5.4s}, [x1]
add x1, x1, #4
ld1 {v6.4s}, [x1]
add x1, x1, #4
ld1 {v7.4s}, [x1]
1: add x1, x1, #4
ld1 {v4.4s}, [x1]
add x1, x1, #4
ld1 {v5.4s}, [x1]
add x1, x1, #4
ld1 {v6.4s}, [x1]
add x1, x1, #4
ld1 {v7.4s}, [x1]
fmla v3.4s, v7.4s, v2.4s
fadd v6.4s, v6.4s, v4.4s
fmla v3.4s, v7.4s, v2.4s
fadd v6.4s, v6.4s, v4.4s
ld1 {v4.4s}, [x0]
fmla v4.4s, v5.4s, v0.4s
ld1 {v4.4s}, [x0]
fmla v4.4s, v5.4s, v0.4s
fmul v6.4s, v6.4s, v1.4s
fadd v6.4s, v6.4s, v3.4s
fmul v6.4s, v6.4s, v1.4s
fadd v6.4s, v6.4s, v3.4s
fadd v4.4s, v4.4s, v6.4s
fmul v3.4s, v7.4s, v2.4s
fadd v4.4s, v4.4s, v6.4s
fmul v3.4s, v7.4s, v2.4s
st1 {v4.4s}, [x0], #16
st1 {v4.4s}, [x0], #16
subs w3, w3, #4
b.gt 1b
subs w3, w3, #4
b.gt 1b
ret
endfunc
......@@ -21,57 +21,57 @@
#include "libavutil/aarch64/asm.S"
function ff_resample_common_apply_filter_x4_float_neon, export=1
movi v0.4s, #0 // accumulator
1: ld1 {v1.4s}, [x1], #16 // src[0..3]
ld1 {v2.4s}, [x2], #16 // filter[0..3]
fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
subs w3, w3, #4 // filter_length -= 4
b.gt 1b // loop until filter_length
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
movi v0.4s, #0 // accumulator
1: ld1 {v1.4s}, [x1], #16 // src[0..3]
ld1 {v2.4s}, [x2], #16 // filter[0..3]
fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
subs w3, w3, #4 // filter_length -= 4
b.gt 1b // loop until filter_length
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc
function ff_resample_common_apply_filter_x8_float_neon, export=1
movi v0.4s, #0 // accumulator
1: ld1 {v1.4s}, [x1], #16 // src[0..3]
ld1 {v2.4s}, [x2], #16 // filter[0..3]
ld1 {v3.4s}, [x1], #16 // src[4..7]
ld1 {v4.4s}, [x2], #16 // filter[4..7]
fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
fmla v0.4s, v3.4s, v4.4s // accumulator += src[4..7] * filter[4..7]
subs w3, w3, #8 // filter_length -= 8
b.gt 1b // loop until filter_length
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
movi v0.4s, #0 // accumulator
1: ld1 {v1.4s}, [x1], #16 // src[0..3]
ld1 {v2.4s}, [x2], #16 // filter[0..3]
ld1 {v3.4s}, [x1], #16 // src[4..7]
ld1 {v4.4s}, [x2], #16 // filter[4..7]
fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
fmla v0.4s, v3.4s, v4.4s // accumulator += src[4..7] * filter[4..7]
subs w3, w3, #8 // filter_length -= 8
b.gt 1b // loop until filter_length
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc
function ff_resample_common_apply_filter_x4_s16_neon, export=1
movi v0.4s, #0 // accumulator
1: ld1 {v1.4h}, [x1], #8 // src[0..3]
ld1 {v2.4h}, [x2], #8 // filter[0..3]
smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
subs w3, w3, #4 // filter_length -= 4
b.gt 1b // loop until filter_length
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
movi v0.4s, #0 // accumulator
1: ld1 {v1.4h}, [x1], #8 // src[0..3]
ld1 {v2.4h}, [x2], #8 // filter[0..3]
smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
subs w3, w3, #4 // filter_length -= 4
b.gt 1b // loop until filter_length
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc
function ff_resample_common_apply_filter_x8_s16_neon, export=1
movi v0.4s, #0 // accumulator
1: ld1 {v1.8h}, [x1], #16 // src[0..7]
ld1 {v2.8h}, [x2], #16 // filter[0..7]
smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
smlal2 v0.4s, v1.8h, v2.8h // accumulator += src[4..7] * filter[4..7]
subs w3, w3, #8 // filter_length -= 8
b.gt 1b // loop until filter_length
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
movi v0.4s, #0 // accumulator
1: ld1 {v1.8h}, [x1], #16 // src[0..7]
ld1 {v2.8h}, [x2], #16 // filter[0..7]
smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
smlal2 v0.4s, v1.8h, v2.8h // accumulator += src[4..7] * filter[4..7]
subs w3, w3, #8 // filter_length -= 8
b.gt 1b // loop until filter_length
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.s}[0], [x0], #4 // write accumulator
ret
endfunc
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment