Commit 4041c102, authored by Wu Jianhua, committed by Paul B Mahol

libavfilter/x86/vf_gblur: add localbuf and ff_horiz_slice_avx2/512()

This commit introduces ff_horiz_slice_avx2() and ff_horiz_slice_avx512(),
both implemented with a new algorithm. In a nutshell, the new algorithm does
three things: it gathers data from 8/16 rows, blurs the data, and scatters
the data back to the image buffer. A customized 8x8/16x16 transpose is used
to avoid the huge overhead of gather and scatter instructions; this approach
relies on a newly added temporary buffer called localbuf.

Performance data:
ff_horiz_slice_avx2(old): 109.89
ff_horiz_slice_avx2(new): 666.67
ff_horiz_slice_avx512: 1000
Co-authored-by: Cheng Yanfei <yanfei.cheng@intel.com>
Co-authored-by: Jin Jun <jun.i.jin@intel.com>
Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
parent 0c54ab20
...@@ -39,9 +39,11 @@ typedef struct GBlurContext { ...@@ -39,9 +39,11 @@ typedef struct GBlurContext {
int flt; int flt;
int depth; int depth;
int stride;
int planewidth[4]; int planewidth[4];
int planeheight[4]; int planeheight[4];
float *buffer; float *buffer;
float *localbuf; ///< temporary buffer for horiz_slice. NULL if not used
float boundaryscale; float boundaryscale;
float boundaryscaleV; float boundaryscaleV;
float postscale; float postscale;
...@@ -49,7 +51,7 @@ typedef struct GBlurContext { ...@@ -49,7 +51,7 @@ typedef struct GBlurContext {
float nu; float nu;
float nuV; float nuV;
int nb_planes; int nb_planes;
void (*horiz_slice)(float *buffer, int width, int height, int steps, float nu, float bscale); void (*horiz_slice)(float *buffer, int width, int height, int steps, float nu, float bscale, float *localbuf);
void (*verti_slice)(float *buffer, int width, int height, int slice_start, int slice_end, int steps, void (*verti_slice)(float *buffer, int width, int height, int slice_start, int slice_end, int steps,
float nu, float bscale); float nu, float bscale);
void (*postscale_slice)(float *buffer, int length, float postscale, float min, float max); void (*postscale_slice)(float *buffer, int length, float postscale, float min, float max);
......
...@@ -64,7 +64,7 @@ static void postscale_c(float *buffer, int length, ...@@ -64,7 +64,7 @@ static void postscale_c(float *buffer, int length,
} }
static void horiz_slice_c(float *buffer, int width, int height, int steps, static void horiz_slice_c(float *buffer, int width, int height, int steps,
float nu, float bscale) float nu, float bscale, float *localbuf)
{ {
int step, x, y; int step, x, y;
float *ptr; float *ptr;
...@@ -97,9 +97,13 @@ static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, int n ...@@ -97,9 +97,13 @@ static int filter_horizontally(AVFilterContext *ctx, void *arg, int jobnr, int n
const int steps = s->steps; const int steps = s->steps;
const float nu = s->nu; const float nu = s->nu;
float *buffer = s->buffer; float *buffer = s->buffer;
float *localbuf = NULL;
if (s->localbuf)
localbuf = s->localbuf + s->stride * width * slice_start;
s->horiz_slice(buffer + width * slice_start, width, slice_end - slice_start, s->horiz_slice(buffer + width * slice_start, width, slice_end - slice_start,
steps, nu, boundaryscale); steps, nu, boundaryscale, localbuf);
emms_c(); emms_c();
return 0; return 0;
} }
...@@ -242,6 +246,7 @@ static int query_formats(AVFilterContext *ctx) ...@@ -242,6 +246,7 @@ static int query_formats(AVFilterContext *ctx)
void ff_gblur_init(GBlurContext *s) void ff_gblur_init(GBlurContext *s)
{ {
s->localbuf = NULL;
s->horiz_slice = horiz_slice_c; s->horiz_slice = horiz_slice_c;
s->verti_slice = verti_slice_c; s->verti_slice = verti_slice_c;
s->postscale_slice = postscale_c; s->postscale_slice = postscale_c;
...@@ -384,6 +389,8 @@ static av_cold void uninit(AVFilterContext *ctx) ...@@ -384,6 +389,8 @@ static av_cold void uninit(AVFilterContext *ctx)
GBlurContext *s = ctx->priv; GBlurContext *s = ctx->priv;
av_freep(&s->buffer); av_freep(&s->buffer);
if (s->localbuf)
av_free(s->localbuf);
} }
static const AVFilterPad gblur_inputs[] = { static const AVFilterPad gblur_inputs[] = {
......
This diff is collapsed.
...@@ -24,8 +24,9 @@ ...@@ -24,8 +24,9 @@
#include "libavutil/x86/cpu.h" #include "libavutil/x86/cpu.h"
#include "libavfilter/gblur.h" #include "libavfilter/gblur.h"
void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale); void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale, float *localbuf);
void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale); void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale, float *localbuf);
void ff_horiz_slice_avx512(float *ptr, int width, int height, int steps, float nu, float bscale, float *localbuf);
void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, float max); void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, float max);
void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max); void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
...@@ -51,12 +52,22 @@ av_cold void ff_gblur_init_x86(GBlurContext *s) ...@@ -51,12 +52,22 @@ av_cold void ff_gblur_init_x86(GBlurContext *s)
s->horiz_slice = ff_horiz_slice_sse4; s->horiz_slice = ff_horiz_slice_sse4;
} }
if (EXTERNAL_AVX2(cpu_flags)) { if (EXTERNAL_AVX2(cpu_flags)) {
s->horiz_slice = ff_horiz_slice_avx2;
s->verti_slice = ff_verti_slice_avx2; s->verti_slice = ff_verti_slice_avx2;
} }
if (EXTERNAL_AVX512(cpu_flags)) { if (EXTERNAL_AVX512(cpu_flags)) {
s->postscale_slice = ff_postscale_slice_avx512; s->postscale_slice = ff_postscale_slice_avx512;
s->verti_slice = ff_verti_slice_avx512; s->verti_slice = ff_verti_slice_avx512;
} }
if (EXTERNAL_AVX2(cpu_flags)) {
s->stride = EXTERNAL_AVX512(cpu_flags) ? 16 : 8;
s->localbuf = av_malloc(s->stride * sizeof(float) * s->planewidth[0] * s->planeheight[0]);
if (!s->localbuf)
return;
s->horiz_slice = ff_horiz_slice_avx2;
if (EXTERNAL_AVX512(cpu_flags)) {
s->horiz_slice = ff_horiz_slice_avx512;
}
}
#endif #endif
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment