Commit 716b3967 authored by Mark Reid, committed by Paul B Mahol

avfilter/vf_lut3d: add x86-optimized tetrahedral interpolation

I spotted a pattern that I had not noticed before, and exploiting it makes the implementation faster.
The bit-shifting table I was using before is no longer needed, so I was able to remove quite a few lines.
I also added use of FMA in the AVX2 version.

f32 1920x1080 1 thread with prelut
c impl
1434012700 UNITS in lut3d->interp,       1 runs,      0 skips
1434035335 UNITS in lut3d->interp,       2 runs,      0 skips
1423615347 UNITS in lut3d->interp,       4 runs,      0 skips
1426268863 UNITS in lut3d->interp,       8 runs,      0 skips

sse2
905484420 UNITS in lut3d->interp,       1 runs,      0 skips
905659010 UNITS in lut3d->interp,       2 runs,      0 skips
915167140 UNITS in lut3d->interp,       4 runs,      0 skips
915834222 UNITS in lut3d->interp,       8 runs,      0 skips

avx
574794860 UNITS in lut3d->interp,       1 runs,      0 skips
581035090 UNITS in lut3d->interp,       2 runs,      0 skips
584116720 UNITS in lut3d->interp,       4 runs,      0 skips
581460290 UNITS in lut3d->interp,       8 runs,      0 skips

avx2
301698880 UNITS in lut3d->interp,       1 runs,      0 skips
301982880 UNITS in lut3d->interp,       2 runs,      0 skips
306962430 UNITS in lut3d->interp,       4 runs,      0 skips
305472025 UNITS in lut3d->interp,       8 runs,      0 skips

gbrap16 1920x1080 1 thread with prelut
c impl
1480894840 UNITS in lut3d->interp,       1 runs,      0 skips
1502922990 UNITS in lut3d->interp,       2 runs,      0 skips
1496114307 UNITS in lut3d->interp,       4 runs,      0 skips
1492554551 UNITS in lut3d->interp,       8 runs,      0 skips

sse2
980777180 UNITS in lut3d->interp,       1 runs,      0 skips
986121520 UNITS in lut3d->interp,       2 runs,      0 skips
986489840 UNITS in lut3d->interp,       4 runs,      0 skips
998832248 UNITS in lut3d->interp,       8 runs,      0 skips

avx
622212360 UNITS in lut3d->interp,       1 runs,      0 skips
622981160 UNITS in lut3d->interp,       2 runs,      0 skips
645396315 UNITS in lut3d->interp,       4 runs,      0 skips
641057075 UNITS in lut3d->interp,       8 runs,      0 skips

avx2
321336400 UNITS in lut3d->interp,       1 runs,      0 skips
321268920 UNITS in lut3d->interp,       2 runs,      0 skips
323459895 UNITS in lut3d->interp,       4 runs,      0 skips
324949967 UNITS in lut3d->interp,       8 runs,      0 skips
parent 5133f4c2
/*
* Copyright (c) 2013 Clément Bœsch
* Copyright (c) 2018 Paul B Mahol
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVFILTER_LUT3D_H
#define AVFILTER_LUT3D_H
#include "libavutil/pixdesc.h"
#include "framesync.h"
#include "avfilter.h"
/* Interpolation methods; one of these is stored in LUT3DContext.interpolation. */
enum interp_mode {
    INTERPOLATE_NEAREST     = 0,
    INTERPOLATE_TRILINEAR   = 1,
    INTERPOLATE_TETRAHEDRAL = 2,
    INTERPOLATE_PYRAMID     = 3,
    INTERPOLATE_PRISM       = 4,
    NB_INTERP_MODE          = 5  /* count of the modes above; must stay last */
};
/* Triplet of single-precision red, green and blue components. */
struct rgbvec {
    float r;
    float g;
    float b;
};
/* 3D LUTs don't often go up to level 32, but it is common to have a Hald CLUT
* of 512x512 (64x64x64) */
#define MAX_LEVEL 256
#define PRELUT_SIZE 65536
/*
 * Optional 1D pre-LUT description.
 * A size of 0 (or less) means no pre-LUT is in use.
 * NOTE(review): the three-element arrays are presumably indexed per R/G/B
 * channel - confirm against the code that fills them.
 */
typedef struct Lut3DPreLut {
    int    size;
    float  min[3];
    float  max[3];
    float  scale[3];
    float *lut[3];
} Lut3DPreLut;
/*
 * Private context of the 3D LUT filter.
 *
 * NOTE(review): a pointer to this struct is handed directly to the external
 * ff_interp_* routines. If those read members at fixed offsets, reordering the
 * leading fields would break them - confirm before changing the layout.
 */
typedef struct LUT3DContext {
const AVClass *class;
/* 3D LUT entries and their dimensions.
 * NOTE(review): lutsize2 is presumably lutsize squared - confirm. */
struct rgbvec *lut;
int lutsize;
int lutsize2;
struct rgbvec scale;
int interpolation; ///<interp_mode
char *file;
uint8_t rgba_map[4];
int step;
/* Per-slice worker; ff_lut3d_init_x86() may replace it with a SIMD version. */
avfilter_action_func *interp;
/* Optional pre-LUT; treated as absent when prelut.size is not positive. */
Lut3DPreLut prelut;
/* The members below exist only when the haldclut filter is compiled in. */
#if CONFIG_HALDCLUT_FILTER
uint8_t clut_rgba_map[4];
int clut_step;
int clut_bits;
int clut_planar;
int clut_float;
int clut_width;
FFFrameSync fs;
#endif
} LUT3DContext;
/* Input/output frame pair passed to each slice job through its `arg` pointer. */
typedef struct ThreadData {
    AVFrame *in;
    AVFrame *out;
} ThreadData;
/**
 * Install an x86 SIMD version of the interpolation callback when one applies.
 *
 * @param s    filter context; s->interp may be overwritten
 * @param desc descriptor of the pixel format being processed
 */
void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc);
#endif /* AVFILTER_LUT3D_H */
......@@ -31,73 +31,18 @@
#include "libavutil/intreadwrite.h"
#include "libavutil/intfloat.h"
#include "libavutil/avassert.h"
#include "libavutil/pixdesc.h"
#include "libavutil/avstring.h"
#include "avfilter.h"
#include "drawutils.h"
#include "formats.h"
#include "framesync.h"
#include "internal.h"
#include "video.h"
#include "lut3d.h"
#define R 0
#define G 1
#define B 2
#define A 3
/* Interpolation methods; one of these is stored in LUT3DContext.interpolation. */
enum interp_mode {
    INTERPOLATE_NEAREST     = 0,
    INTERPOLATE_TRILINEAR   = 1,
    INTERPOLATE_TETRAHEDRAL = 2,
    INTERPOLATE_PYRAMID     = 3,
    INTERPOLATE_PRISM       = 4,
    NB_INTERP_MODE          = 5  /* count of the modes above; must stay last */
};
/* Triplet of single-precision red, green and blue components. */
struct rgbvec {
    float r;
    float g;
    float b;
};
/* 3D LUTs don't often go up to level 32, but it is common to have a Hald CLUT
* of 512x512 (64x64x64) */
#define MAX_LEVEL 256
#define PRELUT_SIZE 65536
/*
 * Optional 1D pre-LUT description.
 * NOTE(review): the three-element arrays are presumably indexed per R/G/B
 * channel - confirm against the code that fills them.
 */
typedef struct Lut3DPreLut {
    int    size;
    float  min[3];
    float  max[3];
    float  scale[3];
    float *lut[3];
} Lut3DPreLut;
/* Private context of the 3D LUT filter. */
typedef struct LUT3DContext {
const AVClass *class;
int interpolation; ///<interp_mode
char *file;
uint8_t rgba_map[4];
int step;
/* Per-slice worker callback. */
avfilter_action_func *interp;
struct rgbvec scale;
/* 3D LUT entries and their dimensions.
 * NOTE(review): lutsize2 is presumably lutsize squared - confirm. */
struct rgbvec *lut;
int lutsize;
int lutsize2;
Lut3DPreLut prelut;
/* The members below exist only when the haldclut filter is compiled in. */
#if CONFIG_HALDCLUT_FILTER
uint8_t clut_rgba_map[4];
int clut_step;
int clut_bits;
int clut_planar;
int clut_float;
int clut_width;
FFFrameSync fs;
#endif
} LUT3DContext;
/* Input/output frame pair passed to each slice job. */
typedef struct ThreadData {
    AVFrame *in;
    AVFrame *out;
} ThreadData;
#define OFFSET(x) offsetof(LUT3DContext, x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
#define TFLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
......@@ -1203,6 +1148,10 @@ static int config_input(AVFilterLink *inlink)
av_assert0(0);
}
if (ARCH_X86) {
ff_lut3d_init_x86(lut3d, desc);
}
return 0;
}
......
......@@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o
OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o
OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o
OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o
OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp_init.o
OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o
OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
......@@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o
X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o
X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o
X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o
X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o
......
This diff is collapsed.
/*
* Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavfilter/lut3d.h"
/*
 * Declare the external routine ff_interp_<name>_<format>_<opt> and define a
 * static interp_<name>_<format>_<opt> wrapper with the slice-job signature
 * (ctx, arg, jobnr, nb_jobs).
 *
 * The wrapper unpacks the ThreadData frame pair from arg, works out the row
 * range [slice_start, slice_end) that belongs to job jobnr out of nb_jobs,
 * and forwards everything to the external routine. It always returns 0.
 */
#define DEFINE_INTERP_FUNC(name, format, opt)                                  \
void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d,                  \
                                         Lut3DPreLut *prelut,                  \
                                         AVFrame *src, AVFrame *dst,           \
                                         int slice_start, int slice_end,       \
                                         int has_alpha);                       \
static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void *arg,   \
                                            int jobnr, int nb_jobs)            \
{                                                                              \
    LUT3DContext *filter = ctx->priv;                                          \
    ThreadData   *frames = arg;                                                \
    AVFrame      *in     = frames->in;                                         \
    AVFrame      *out    = frames->out;                                        \
    Lut3DPreLut  *pre    = NULL;                                               \
    /* Rows [first_row, end_row) are this job's share of the frame. */         \
    int first_row = (in->height *  jobnr     ) / nb_jobs;                      \
    int end_row   = (in->height * (jobnr + 1)) / nb_jobs;                      \
    /* Set when the input has a fourth plane and out is a distinct frame. */   \
    int has_alpha = in->linesize[3] && out != in;                              \
                                                                               \
    /* Pass NULL instead of an empty pre-LUT. */                               \
    if (filter->prelut.size > 0)                                               \
        pre = &filter->prelut;                                                 \
                                                                               \
    ff_interp_##name##_##format##_##opt(filter, pre, in, out,                  \
                                        first_row, end_row, has_alpha);        \
    return 0;                                                                  \
}
/*
 * Instantiate the wrappers for planar float (pf32) and planar 16-bit (p16)
 * tetrahedral interpolation. Nothing is generated outside x86-64 builds.
 */
#if ARCH_X86_64
/* AVX2 variants: only when HAVE_AVX2_EXTERNAL is set. */
#if HAVE_AVX2_EXTERNAL
DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2)
DEFINE_INTERP_FUNC(tetrahedral, p16, avx2)
#endif
/* AVX variants: only when HAVE_AVX_EXTERNAL is set. */
#if HAVE_AVX_EXTERNAL
DEFINE_INTERP_FUNC(tetrahedral, pf32, avx)
DEFINE_INTERP_FUNC(tetrahedral, p16, avx)
#endif
/* SSE2 variants: generated for every x86-64 build. */
DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2)
DEFINE_INTERP_FUNC(tetrahedral, p16, sse2)
#endif
/**
 * Pick an x86 SIMD interpolation callback for the given pixel format.
 *
 * A callback is installed only on x86-64, only for tetrahedral interpolation
 * and only for planar formats that are either float or have 16-bit components.
 * The first matching CPU tier wins, checked in the order AVX2+FMA3, AVX, SSE2.
 * In every other case s->interp is left as the caller set it.
 *
 * @param s    filter context; s->interpolation is read, s->interp may be written
 * @param desc descriptor of the pixel format being processed
 */
av_cold void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc)
{
    int cpu_flags = av_get_cpu_flags();
    int planar    = desc->flags & AV_PIX_FMT_FLAG_PLANAR;
    int isfloat   = desc->flags & AV_PIX_FMT_FLAG_FLOAT;
    int depth     = desc->comp[0].depth;

#if ARCH_X86_64
    /* Every tier below requires tetrahedral interpolation on a planar format. */
    if (s->interpolation != INTERPOLATE_TETRAHEDRAL || !planar) {
        return;
    }

    if (EXTERNAL_AVX2_FAST(cpu_flags) && EXTERNAL_FMA3(cpu_flags)) {
#if HAVE_AVX2_EXTERNAL
        if (isfloat) {
            s->interp = interp_tetrahedral_pf32_avx2;
        } else if (depth == 16) {
            s->interp = interp_tetrahedral_p16_avx2;
        }
#endif
    } else if (EXTERNAL_AVX_FAST(cpu_flags)) {
#if HAVE_AVX_EXTERNAL
        if (isfloat) {
            s->interp = interp_tetrahedral_pf32_avx;
        } else if (depth == 16) {
            s->interp = interp_tetrahedral_p16_avx;
        }
#endif
    } else if (EXTERNAL_SSE2(cpu_flags)) {
        if (isfloat) {
            s->interp = interp_tetrahedral_pf32_sse2;
        } else if (depth == 16) {
            s->interp = interp_tetrahedral_p16_sse2;
        }
    }
#endif
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment