Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
F
ffmpeg
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Stefan Westerfeld
ffmpeg
Commits
f1d0f837
Commit
f1d0f837
authored
Oct 31, 2020
by
Timo Rothenpieler
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
avfilter/scale_cuda: add bicubic interpolation
parent
d5763eda
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
429 additions
and
43 deletions
+429
-43
cuda_runtime.h
compat/cuda/cuda_runtime.h
+61
-7
Makefile
libavfilter/Makefile
+2
-1
vector_helpers.cuh
libavfilter/cuda/vector_helpers.cuh
+112
-0
version.h
libavfilter/version.h
+1
-1
vf_scale_cuda.c
libavfilter/vf_scale_cuda.c
+79
-34
vf_scale_cuda_bicubic.cu
libavfilter/vf_scale_cuda_bicubic.cu
+174
-0
No files found.
compat/cuda/cuda_runtime.h
View file @
f1d0f837
...
...
@@ -49,18 +49,23 @@ typedef struct __device_builtin__ __align__(4) ushort2
unsigned
short
x
,
y
;
}
ushort2
;
typedef
struct
__device_builtin__
uint3
typedef
struct
__device_builtin__
__align__
(
8
)
float2
{
unsigned
int
x
,
y
,
z
;
}
uint3
;
typedef
struct
uint3
dim3
;
float
x
,
y
;
}
float2
;
typedef
struct
__device_builtin__
__align__
(
8
)
int2
{
int
x
,
y
;
}
int2
;
typedef
struct
__device_builtin__
uint3
{
unsigned
int
x
,
y
,
z
;
}
uint3
;
typedef
struct
uint3
dim3
;
typedef
struct
__device_builtin__
__align__
(
4
)
uchar4
{
unsigned
char
x
,
y
,
z
,
w
;
...
...
@@ -76,6 +81,11 @@ typedef struct __device_builtin__ __align__(16) int4
int
x
,
y
,
z
,
w
;
}
int4
;
typedef
struct
__device_builtin__
__align__
(
16
)
float4
{
float
x
,
y
,
z
,
w
;
}
float4
;
// Accessors for special registers
#define GETCOMP(reg, comp) \
asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \
...
...
@@ -100,24 +110,31 @@ GET(getThreadIdx, tid)
#define threadIdx (getThreadIdx())
// Basic initializers (simple macros rather than inline functions)
#define make_int2(a, b) ((int2){.x = a, .y = b})
#define make_uchar2(a, b) ((uchar2){.x = a, .y = b})
#define make_ushort2(a, b) ((ushort2){.x = a, .y = b})
#define make_float2(a, b) ((float2){.x = a, .y = b})
#define make_int4(a, b, c, d) ((int4){.x = a, .y = b, .z = c, .w = d})
#define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})
#define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})
#define make_float4(a, b, c, d) ((float4){.x = a, .y = b, .z = c, .w = d})
// Conversions from the tex instruction's 4-register output to various types
#define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}
TEX2D
(
unsigned
char
,
a
&
0xFF
)
TEX2D
(
unsigned
short
,
a
&
0xFFFF
)
TEX2D
(
float
,
a
)
TEX2D
(
uchar2
,
make_uchar2
(
a
&
0xFF
,
b
&
0xFF
))
TEX2D
(
ushort2
,
make_ushort2
(
a
&
0xFFFF
,
b
&
0xFFFF
))
TEX2D
(
float2
,
make_float2
(
a
,
b
))
TEX2D
(
uchar4
,
make_uchar4
(
a
&
0xFF
,
b
&
0xFF
,
c
&
0xFF
,
d
&
0xFF
))
TEX2D
(
ushort4
,
make_ushort4
(
a
&
0xFFFF
,
b
&
0xFFFF
,
c
&
0xFFFF
,
d
&
0xFFFF
))
TEX2D
(
float4
,
make_float4
(
a
,
b
,
c
,
d
))
// Template calling tex instruction and converting the output to the selected type
template
<
class
T
>
static
inline
__device__
T
tex2D
(
cudaTextureObject_t
texObject
,
float
x
,
float
y
)
template
<
typename
T
>
inline
__device__
T
tex2D
(
cudaTextureObject_t
texObject
,
float
x
,
float
y
)
{
T
ret
;
unsigned
ret1
,
ret2
,
ret3
,
ret4
;
...
...
@@ -128,4 +145,41 @@ static inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y
return
ret
;
}
template
<>
inline
__device__
float4
tex2D
<
float4
>
(
cudaTextureObject_t
texObject
,
float
x
,
float
y
)
{
float4
ret
;
asm
(
"tex.2d.v4.f32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];"
:
"=r"
(
ret
.
x
),
"=r"
(
ret
.
y
),
"=r"
(
ret
.
z
),
"=r"
(
ret
.
w
)
:
"l"
(
texObject
),
"f"
(
x
),
"f"
(
y
));
return
ret
;
}
template
<>
inline
__device__
float
tex2D
<
float
>
(
cudaTextureObject_t
texObject
,
float
x
,
float
y
)
{
return
tex2D
<
float4
>
(
texObject
,
x
,
y
).
x
;
}
template
<>
inline
__device__
float2
tex2D
<
float2
>
(
cudaTextureObject_t
texObject
,
float
x
,
float
y
)
{
float4
ret
=
tex2D
<
float4
>
(
texObject
,
x
,
y
);
return
make_float2
(
ret
.
x
,
ret
.
y
);
}
// Math helper functions
static
inline
__device__
float
floorf
(
float
a
)
{
return
__builtin_floorf
(
a
);
}
static
inline
__device__
float
floor
(
float
a
)
{
return
__builtin_floorf
(
a
);
}
static
inline
__device__
double
floor
(
double
a
)
{
return
__builtin_floor
(
a
);
}
static
inline
__device__
float
ceilf
(
float
a
)
{
return
__builtin_ceilf
(
a
);
}
static
inline
__device__
float
ceil
(
float
a
)
{
return
__builtin_ceilf
(
a
);
}
static
inline
__device__
double
ceil
(
double
a
)
{
return
__builtin_ceil
(
a
);
}
static
inline
__device__
float
truncf
(
float
a
)
{
return
__builtin_truncf
(
a
);
}
static
inline
__device__
float
trunc
(
float
a
)
{
return
__builtin_truncf
(
a
);
}
static
inline
__device__
double
trunc
(
double
a
)
{
return
__builtin_trunc
(
a
);
}
static
inline
__device__
float
fabsf
(
float
a
)
{
return
__builtin_fabsf
(
a
);
}
static
inline
__device__
float
fabs
(
float
a
)
{
return
__builtin_fabsf
(
a
);
}
static
inline
__device__
double
fabs
(
double
a
)
{
return
__builtin_fabs
(
a
);
}
#endif
/* COMPAT_CUDA_CUDA_RUNTIME_H */
libavfilter/Makefile
View file @
f1d0f837
...
...
@@ -374,7 +374,8 @@ OBJS-$(CONFIG_ROBERTS_OPENCL_FILTER) += vf_convolution_opencl.o opencl.o
OBJS-$(CONFIG_ROTATE_FILTER)
+=
vf_rotate.o
OBJS-$(CONFIG_SAB_FILTER)
+=
vf_sab.o
OBJS-$(CONFIG_SCALE_FILTER)
+=
vf_scale.o
scale_eval.o
OBJS-$(CONFIG_SCALE_CUDA_FILTER)
+=
vf_scale_cuda.o
vf_scale_cuda.ptx.o
scale_eval.o
OBJS-$(CONFIG_SCALE_CUDA_FILTER)
+=
vf_scale_cuda.o
scale_eval.o
\
vf_scale_cuda.ptx.o
vf_scale_cuda_bicubic.ptx.o
OBJS-$(CONFIG_SCALE_NPP_FILTER)
+=
vf_scale_npp.o
scale_eval.o
OBJS-$(CONFIG_SCALE_QSV_FILTER)
+=
vf_scale_qsv.o
OBJS-$(CONFIG_SCALE_VAAPI_FILTER)
+=
vf_scale_vaapi.o
scale_eval.o
vaapi_vpp.o
...
...
libavfilter/cuda/vector_helpers.cuh
0 → 100644
View file @
f1d0f837
/*
* This file is part of FFmpeg.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef AVFILTER_CUDA_VECTORHELPERS_H
#define AVFILTER_CUDA_VECTORHELPERS_H
typedef unsigned char uchar;
typedef unsigned short ushort;
template<typename T> struct vector_helper { };
template<> struct vector_helper<uchar> { typedef float ftype; typedef int itype; };
template<> struct vector_helper<uchar2> { typedef float2 ftype; typedef int2 itype; };
template<> struct vector_helper<uchar4> { typedef float4 ftype; typedef int4 itype; };
template<> struct vector_helper<ushort> { typedef float ftype; typedef int itype; };
template<> struct vector_helper<ushort2> { typedef float2 ftype; typedef int2 itype; };
template<> struct vector_helper<ushort4> { typedef float4 ftype; typedef int4 itype; };
template<> struct vector_helper<int> { typedef float ftype; typedef int itype; };
template<> struct vector_helper<int2> { typedef float2 ftype; typedef int2 itype; };
template<> struct vector_helper<int4> { typedef float4 ftype; typedef int4 itype; };
#define floatT typename vector_helper<T>::ftype
#define intT typename vector_helper<T>::itype
template<typename T, typename V> inline __device__ V to_floatN(const T &a) { return (V)a; }
template<typename T, typename V> inline __device__ T from_floatN(const V &a) { return (T)a; }
#define OPERATORS2(T) \
template<typename V> inline __device__ T operator+(const T &a, const V &b) { return make_ ## T (a.x + b.x, a.y + b.y); } \
template<typename V> inline __device__ T operator-(const T &a, const V &b) { return make_ ## T (a.x - b.x, a.y - b.y); } \
template<typename V> inline __device__ T operator*(const T &a, V b) { return make_ ## T (a.x * b, a.y * b); } \
template<typename V> inline __device__ T operator/(const T &a, V b) { return make_ ## T (a.x / b, a.y / b); } \
template<typename V> inline __device__ T operator>>(const T &a, V b) { return make_ ## T (a.x >> b, a.y >> b); } \
template<typename V> inline __device__ T operator<<(const T &a, V b) { return make_ ## T (a.x << b, a.y << b); } \
template<typename V> inline __device__ T &operator+=(T &a, const V &b) { a.x += b.x; a.y += b.y; return a; } \
template<typename V> inline __device__ void vec_set(T &a, const V &b) { a.x = b.x; a.y = b.y; } \
template<typename V> inline __device__ void vec_set_scalar(T &a, V b) { a.x = b; a.y = b; } \
template<> inline __device__ float2 to_floatN<T, float2>(const T &a) { return make_float2(a.x, a.y); } \
template<> inline __device__ T from_floatN<T, float2>(const float2 &a) { return make_ ## T(a.x, a.y); }
#define OPERATORS4(T) \
template<typename V> inline __device__ T operator+(const T &a, const V &b) { return make_ ## T (a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } \
template<typename V> inline __device__ T operator-(const T &a, const V &b) { return make_ ## T (a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } \
template<typename V> inline __device__ T operator*(const T &a, V b) { return make_ ## T (a.x * b, a.y * b, a.z * b, a.w * b); } \
template<typename V> inline __device__ T operator/(const T &a, V b) { return make_ ## T (a.x / b, a.y / b, a.z / b, a.w / b); } \
template<typename V> inline __device__ T operator>>(const T &a, V b) { return make_ ## T (a.x >> b, a.y >> b, a.z >> b, a.w >> b); } \
template<typename V> inline __device__ T operator<<(const T &a, V b) { return make_ ## T (a.x << b, a.y << b, a.z << b, a.w << b); } \
template<typename V> inline __device__ T &operator+=(T &a, const V &b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } \
template<typename V> inline __device__ void vec_set(T &a, const V &b) { a.x = b.x; a.y = b.y; a.z = b.z; a.w = b.w; } \
template<typename V> inline __device__ void vec_set_scalar(T &a, V b) { a.x = b; a.y = b; a.z = b; a.w = b; } \
template<> inline __device__ float4 to_floatN<T, float4>(const T &a) { return make_float4(a.x, a.y, a.z, a.w); } \
template<> inline __device__ T from_floatN<T, float4>(const float4 &a) { return make_ ## T(a.x, a.y, a.z, a.w); }
OPERATORS2(int2)
OPERATORS2(uchar2)
OPERATORS2(ushort2)
OPERATORS2(float2)
OPERATORS4(int4)
OPERATORS4(uchar4)
OPERATORS4(ushort4)
OPERATORS4(float4)
template<typename V> inline __device__ void vec_set(int &a, V b) { a = b; }
template<typename V> inline __device__ void vec_set(float &a, V b) { a = b; }
template<typename V> inline __device__ void vec_set(uchar &a, V b) { a = b; }
template<typename V> inline __device__ void vec_set(ushort &a, V b) { a = b; }
template<typename V> inline __device__ void vec_set_scalar(int &a, V b) { a = b; }
template<typename V> inline __device__ void vec_set_scalar(float &a, V b) { a = b; }
template<typename V> inline __device__ void vec_set_scalar(uchar &a, V b) { a = b; }
template<typename V> inline __device__ void vec_set_scalar(ushort &a, V b) { a = b; }
template<typename T>
inline __device__ T lerp_scalar(T v0, T v1, float t) {
return t*v1 + (1.0f - t)*v0;
}
template<>
inline __device__ float2 lerp_scalar<float2>(float2 v0, float2 v1, float t) {
return make_float2(
lerp_scalar(v0.x, v1.x, t),
lerp_scalar(v0.y, v1.y, t)
);
}
template<>
inline __device__ float4 lerp_scalar<float4>(float4 v0, float4 v1, float t) {
return make_float4(
lerp_scalar(v0.x, v1.x, t),
lerp_scalar(v0.y, v1.y, t),
lerp_scalar(v0.z, v1.z, t),
lerp_scalar(v0.w, v1.w, t)
);
}
#endif
libavfilter/version.h
View file @
f1d0f837
...
...
@@ -31,7 +31,7 @@
#define LIBAVFILTER_VERSION_MAJOR 7
#define LIBAVFILTER_VERSION_MINOR 88
#define LIBAVFILTER_VERSION_MICRO 10
0
#define LIBAVFILTER_VERSION_MICRO 10
1
#define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
...
...
libavfilter/vf_scale_cuda.c
View file @
f1d0f837
This diff is collapsed.
Click to expand it.
libavfilter/vf_scale_cuda_bicubic.cu
0 → 100644
View file @
f1d0f837
/*
* This file is part of FFmpeg.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "cuda/vector_helpers.cuh"
__device__ inline float4 bicubic_coeffs(float x)
{
const float A = -0.75f;
float4 res;
res.x = ((A * (x + 1) - 5 * A) * (x + 1) + 8 * A) * (x + 1) - 4 * A;
res.y = ((A + 2) * x - (A + 3)) * x * x + 1;
res.z = ((A + 2) * (1 - x) - (A + 3)) * (1 - x) * (1 - x) + 1;
res.w = 1.0f - res.x - res.y - res.z;
return res;
}
__device__ inline void bicubic_fast_coeffs(float x, float *h0, float *h1, float *s)
{
float4 coeffs = bicubic_coeffs(x);
float g0 = coeffs.x + coeffs.y;
float g1 = coeffs.z + coeffs.w;
*h0 = coeffs.y / g0 - 0.5f;
*h1 = coeffs.w / g1 + 1.5f;
*s = g0 / (g0 + g1);
}
template<typename V>
__device__ inline V bicubic_filter(float4 coeffs, V c0, V c1, V c2, V c3)
{
V res = c0 * coeffs.x;
res += c1 * coeffs.y;
res += c2 * coeffs.z;
res += c3 * coeffs.w;
return res;
}
template<typename T>
__device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex,
T *dst,
int dst_width, int dst_height, int dst_pitch,
int src_width, int src_height,
int bit_depth)
{
int xo = blockIdx.x * blockDim.x + threadIdx.x;
int yo = blockIdx.y * blockDim.y + threadIdx.y;
if (yo < dst_height && xo < dst_width)
{
float hscale = (float)src_width / (float)dst_width;
float vscale = (float)src_height / (float)dst_height;
float xi = (xo + 0.5f) * hscale - 0.5f;
float yi = (yo + 0.5f) * vscale - 0.5f;
float px = floor(xi);
float py = floor(yi);
float fx = xi - px;
float fy = yi - py;
float factor = bit_depth > 8 ? 0xFFFF : 0xFF;
float4 coeffsX = bicubic_coeffs(fx);
float4 coeffsY = bicubic_coeffs(fy);
#define PIX(x, y) tex2D<floatT>(src_tex, (x), (y))
dst[yo * dst_pitch + xo] = from_floatN<T, floatT>(
bicubic_filter<floatT>(coeffsY,
bicubic_filter<floatT>(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)),
bicubic_filter<floatT>(coeffsX, PIX(px - 1, py ), PIX(px, py ), PIX(px + 1, py ), PIX(px + 2, py )),
bicubic_filter<floatT>(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)),
bicubic_filter<floatT>(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2))
) * factor
);
#undef PIX
}
}
/* This does not yield correct results. Most likely because of low internal precision in tex2D linear interpolation */
template<typename T>
__device__ inline void Subsample_FastBicubic(cudaTextureObject_t src_tex,
T *dst,
int dst_width, int dst_height, int dst_pitch,
int src_width, int src_height,
int bit_depth)
{
int xo = blockIdx.x * blockDim.x + threadIdx.x;
int yo = blockIdx.y * blockDim.y + threadIdx.y;
if (yo < dst_height && xo < dst_width)
{
float hscale = (float)src_width / (float)dst_width;
float vscale = (float)src_height / (float)dst_height;
float xi = (xo + 0.5f) * hscale - 0.5f;
float yi = (yo + 0.5f) * vscale - 0.5f;
float px = floor(xi);
float py = floor(yi);
float fx = xi - px;
float fy = yi - py;
float factor = bit_depth > 8 ? 0xFFFF : 0xFF;
float h0x, h1x, sx;
float h0y, h1y, sy;
bicubic_fast_coeffs(fx, &h0x, &h1x, &sx);
bicubic_fast_coeffs(fy, &h0y, &h1y, &sy);
#define PIX(x, y) tex2D<floatT>(src_tex, (x), (y))
floatT pix[4] = {
PIX(px + h0x, py + h0y),
PIX(px + h1x, py + h0y),
PIX(px + h0x, py + h1y),
PIX(px + h1x, py + h1y)
};
#undef PIX
dst[yo * dst_pitch + xo] = from_floatN<T, floatT>(
lerp_scalar(
lerp_scalar(pix[3], pix[2], sx),
lerp_scalar(pix[1], pix[0], sx),
sy) * factor
);
}
}
extern "C" {
#define BICUBIC_KERNEL(T) \
__global__ void Subsample_Bicubic_ ## T(cudaTextureObject_t src_tex, \
T *dst, \
int dst_width, int dst_height, int dst_pitch, \
int src_width, int src_height, \
int bit_depth) \
{ \
Subsample_Bicubic<T>(src_tex, dst, \
dst_width, dst_height, dst_pitch, \
src_width, src_height, \
bit_depth); \
}
BICUBIC_KERNEL(uchar)
BICUBIC_KERNEL(uchar2)
BICUBIC_KERNEL(uchar4)
BICUBIC_KERNEL(ushort)
BICUBIC_KERNEL(ushort2)
BICUBIC_KERNEL(ushort4)
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment