compat/cuda: switch from powf to __powf intrinsic

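The powf builtin causes crashes on older clang, so manually implement the (faster, approximate) __powf intrinsic instead. The code it generates is identical to what nvcc emits. Note that __powf(a, b) is computed as exp2(b * log2(a)), so unlike powf it yields NaN for a negative base a, even when b is an even integer.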

compat/cuda: switch from powf to __powf intrinsic
The powf builtin causes crashes on older clang, so manually implement the (faster) __powf intrinsic instead. The code it generates is identical to what nvcc emits.
41692334 · Timo Rothenpieler · 73fada02 · 41692334 · 41692334
Commit 41692334 authored Sep 03, 2022 by Timo Rothenpieler
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 5 additions and 5 deletions

cuda_runtime.h compat/cuda/cuda_runtime.h +1 -1

vf_bilateral_cuda.cu libavfilter/vf_bilateral_cuda.cu +4 -4

No files found.
--- a/compat/cuda/cuda_runtime.h
+++ b/compat/cuda/cuda_runtime.h
@@ -182,11 +182,11 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
 static inline __device__ float fabs(float a) { return __builtin_fabsf(a); }
 static inline __device__ double fabs(double a) { return __builtin_fabs(a); }
 static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); }
-static inline __device__ float powf(float a, float  y) { return __builtin_powf(a,y); }

 static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); }
 static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
 static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); }
 static inline __device__ float __expf(float a) { return __nvvm_ex2_approx_f(a * (float)__builtin_log2(__builtin_exp(1))); }
+static inline __device__ float __powf(float a, float b) { return __nvvm_ex2_approx_f(__nvvm_lg2_approx_f(a) * b); }

 #endif /* COMPAT_CUDA_CUDA_RUNTIME_H */
--- a/libavfilter/vf_bilateral_cuda.cu
+++ b/libavfilter/vf_bilateral_cuda.cu
@@ -34,9 +34,9 @@ extern "C"
 __device__ static inline float norm_squared(float4 first_yuv, float4 second_yuv)
 {
    float ans = 0;
-    ans += powf(first_yuv.x - second_yuv.x, 2);
-    ans += powf(first_yuv.y - second_yuv.y, 2);
-    ans += powf(first_yuv.z - second_yuv.z, 2);
+    ans += __powf(first_yuv.x - second_yuv.x, 2);
+    ans += __powf(first_yuv.y - second_yuv.y, 2);
+    ans += __powf(first_yuv.z - second_yuv.z, 2);
    return ans;
 }

@@ -52,7 +52,7 @@ __device__ static inline float calculate_w(int x, int y, int r, int c,
                                           float sigma_space, float sigma_color)
 {
    float first_term, second_term;
-    first_term = (powf(x - r, 2) + powf(y - c, 2)) / (2 * sigma_space * sigma_space);
+    first_term = (__powf(x - r, 2) + __powf(y - c, 2)) / (2 * sigma_space * sigma_space);
    second_term = norm_squared(pixel_value, neighbor_value) / (2 * sigma_color * sigma_color);
    return __expf(-first_term - second_term);
 }