Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Stefan Westerfeld
ffmpeg
Commits
a1444086
Commit
a1444086
authored
May 31, 2024
by
James Almer
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
x86/float_dsp: add SSE2 and AVX versions of scalarproduct_double
Signed-off-by:
James Almer
<
jamrial@gmail.com
>
parent
7413b81e
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
57 additions
and
0 deletions
+57
-0
float_dsp.asm
libavutil/x86/float_dsp.asm
+52
-0
float_dsp_init.c
libavutil/x86/float_dsp_init.c
+5
-0
No files found.
libavutil/x86/float_dsp.asm
View file @
a1444086
...
...
@@ -567,6 +567,58 @@ cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
%endif
RET
;---------------------------------------------------------------------------------
; double scalarproduct_double(const double *v1, const double *v2, size_t len)
;
; Returns the sum of v1[i] * v2[i] over len doubles.
;
; The macro is written against mmsize, so the same body serves every vector
; width it is instantiated for. Each loop iteration consumes mmsize*4 bytes
; from both inputs, i.e. (mmsize*4)/8 doubles.
;
; NOTE(review): the loop is bottom-tested and has no tail handling, so len is
; presumably required to be a non-zero multiple of (mmsize*4)/8 elements --
; confirm against the C-side documentation of scalarproduct_double.
; NOTE(review): v1 is read with movapd (aligned move), and in the non-VEX SSE2
; build the memory operand of mulpd (v2) must be 16-byte aligned as well --
; confirm the alignment contract with the callers.
;---------------------------------------------------------------------------------
%macro SCALARPRODUCT_DOUBLE 0
; x86inc cglobal: 3 arguments, 3 general-purpose registers, 8 vector registers.
; The third argument (len) is named "offset" because it is turned into a
; running byte offset below.
cglobal scalarproduct_double, 3,3,8, v1, v2, offset
    shl      offsetq, 3                     ; element count -> byte count (8 bytes per double)
    add      v1q, offsetq                   ; v1 now points one past the end of its data
    add      v2q, offsetq                   ; v2 likewise
    neg      offsetq                        ; negative offset that counts up towards zero
    xorpd    m0, m0                         ; accumulator 0 = 0.0
    xorpd    m1, m1                         ; accumulator 1 = 0.0
    movapd   m2, m0                         ; accumulator 2 = 0.0 (copy of zeroed m0)
    movapd   m3, m1                         ; accumulator 3 = 0.0 (copy of zeroed m1)

align 16
.loop:
    ; Load four consecutive vectors from v1.
    movapd   m4, [v1q+offsetq+mmsize*0]
    movapd   m5, [v1q+offsetq+mmsize*1]
    movapd   m6, [v1q+offsetq+mmsize*2]
    movapd   m7, [v1q+offsetq+mmsize*3]
    ; Multiply element-wise by the matching vectors from v2.
    mulpd    m4, [v2q+offsetq+mmsize*0]
    mulpd    m5, [v2q+offsetq+mmsize*1]
    mulpd    m6, [v2q+offsetq+mmsize*2]
    mulpd    m7, [v2q+offsetq+mmsize*3]
    ; Accumulate into four independent partial sums.
    addpd    m0, m4
    addpd    m1, m5
    addpd    m2, m6
    addpd    m3, m7
    add      offsetq, mmsize*4              ; advance by the four vectors just consumed
    jl .loop                                ; keep going while the offset is still negative

    ; Fold the four partial-sum vectors into m0.
    addpd    m0, m1
    addpd    m2, m3
    addpd    m0, m2
%if mmsize == 32
    ; 256-bit build: add the upper 128-bit half onto the lower half.
    vextractf128 xm1, m0, 1
    addpd    xm0, xm1
%endif
    ; Horizontal add of the two remaining doubles; result ends up in the low
    ; lane of xm0.
    movhlps  xm1, xm0
    addsd    xm0, xm1
%if ARCH_X86_64 == 0
    ; 32-bit x86 returns a double on the x87 stack, so bounce the result
    ; through the first argument's stack slot (r0m) into st0.
    movsd    r0m, xm0
    fld      qword r0m
%endif
    RET
%endmacro
; Emit the SSE2 variant: INIT_XMM selects the XMM register set for the macro body.
INIT_XMM sse2
SCALARPRODUCT_DOUBLE
; Emit the AVX variant (YMM registers) only when HAVE_AVX_EXTERNAL is set.
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
SCALARPRODUCT_DOUBLE
%endif
;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
...
...
libavutil/x86/float_dsp_init.c
View file @
a1444086
...
...
@@ -73,6 +73,9 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
/* Single-precision scalar (dot) product, one prototype per instruction-set
 * variant. NOTE(review): presumably backed by the scalarproduct_float routine
 * in libavutil/x86/float_dsp.asm -- confirm. */
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);

/* Double-precision scalar (dot) product of v1 and v2, in SSE2 and AVX
 * variants; ff_float_dsp_init_x86() installs these into
 * fdsp->scalarproduct_double.
 * NOTE(review): the third parameter is the element count. It is called
 * `order` here but `len` in the header comment of the assembly
 * implementation -- consider unifying the name. */
double ff_scalarproduct_double_sse2(const double *v1, const double *v2, size_t order);
double ff_scalarproduct_double_avx(const double *v1, const double *v2, size_t order);

/* SSE variant of butterflies_float. src0 and src1 are restrict-qualified, so
 * the two buffers must not overlap. */
void ff_butterflies_float_sse(float *restrict src0, float *restrict src1, int len);
av_cold
void
ff_float_dsp_init_x86
(
AVFloatDSPContext
*
fdsp
)
...
...
@@ -93,6 +96,7 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
fdsp
->
vector_dmul
=
ff_vector_dmul_sse2
;
fdsp
->
vector_dmac_scalar
=
ff_vector_dmac_scalar_sse2
;
fdsp
->
vector_dmul_scalar
=
ff_vector_dmul_scalar_sse2
;
fdsp
->
scalarproduct_double
=
ff_scalarproduct_double_sse2
;
}
if
(
EXTERNAL_AVX_FAST
(
cpu_flags
))
{
fdsp
->
vector_fmul
=
ff_vector_fmul_avx
;
...
...
@@ -102,6 +106,7 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
fdsp
->
vector_dmac_scalar
=
ff_vector_dmac_scalar_avx
;
fdsp
->
vector_fmul_add
=
ff_vector_fmul_add_avx
;
fdsp
->
vector_fmul_reverse
=
ff_vector_fmul_reverse_avx
;
fdsp
->
scalarproduct_double
=
ff_scalarproduct_double_avx
;
}
if
(
EXTERNAL_AVX2_FAST
(
cpu_flags
))
{
fdsp
->
vector_fmul_reverse
=
ff_vector_fmul_reverse_avx2
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment