• Mark Reid's avatar
    swscale/x86/input.asm: add x86-optimized planar rgb2yuv functions · 52f70261
    Mark Reid authored
    sse2 only operates on 2 lanes per loop for to_y and to_uv functions, due
    to the lack of a pmulld instruction.  Emulating pmulld with 2 pmuludq and shuffles
    proved too costly and made to_uv functions slower than the c implementation.
    
    For to_y on sse2 only float functions are generated,
    I was not able to outperform the c implementation on the integer pixel formats.
    
    For to_a on sse4 only the float functions are generated.
    sse2 and sse4 generated nearly identically performing code on integer pixel formats,
    so only sse2/avx2 versions are generated.
    
    planar_gbrp_to_y_512_c: 1197.5
    planar_gbrp_to_y_512_sse4: 444.5
    planar_gbrp_to_y_512_avx2: 287.5
    planar_gbrap_to_y_512_c: 1204.5
    planar_gbrap_to_y_512_sse4: 447.5
    planar_gbrap_to_y_512_avx2: 289.5
    planar_gbrp9be_to_y_512_c: 1380.0
    planar_gbrp9be_to_y_512_sse4: 543.5
    planar_gbrp9be_to_y_512_avx2: 340.0
    planar_gbrp9le_to_y_512_c: 1200.5
    planar_gbrp9le_to_y_512_sse4: 442.0
    planar_gbrp9le_to_y_512_avx2: 282.0
    planar_gbrp10be_to_y_512_c: 1378.5
    planar_gbrp10be_to_y_512_sse4: 544.0
    planar_gbrp10be_to_y_512_avx2: 337.5
    planar_gbrp10le_to_y_512_c: 1200.0
    planar_gbrp10le_to_y_512_sse4: 448.0
    planar_gbrp10le_to_y_512_avx2: 285.5
    planar_gbrap10be_to_y_512_c: 1380.0
    planar_gbrap10be_to_y_512_sse4: 542.0
    planar_gbrap10be_to_y_512_avx2: 340.5
    planar_gbrap10le_to_y_512_c: 1199.0
    planar_gbrap10le_to_y_512_sse4: 446.0
    planar_gbrap10le_to_y_512_avx2: 289.5
    planar_gbrp12be_to_y_512_c: 10563.0
    planar_gbrp12be_to_y_512_sse4: 542.5
    planar_gbrp12be_to_y_512_avx2: 339.0
    planar_gbrp12le_to_y_512_c: 1201.0
    planar_gbrp12le_to_y_512_sse4: 440.5
    planar_gbrp12le_to_y_512_avx2: 286.0
    planar_gbrap12be_to_y_512_c: 1701.5
    planar_gbrap12be_to_y_512_sse4: 917.0
    planar_gbrap12be_to_y_512_avx2: 338.5
    planar_gbrap12le_to_y_512_c: 1201.0
    planar_gbrap12le_to_y_512_sse4: 444.5
    planar_gbrap12le_to_y_512_avx2: 288.0
    planar_gbrp14be_to_y_512_c: 1370.5
    planar_gbrp14be_to_y_512_sse4: 545.0
    planar_gbrp14be_to_y_512_avx2: 338.5
    planar_gbrp14le_to_y_512_c: 1199.0
    planar_gbrp14le_to_y_512_sse4: 444.0
    planar_gbrp14le_to_y_512_avx2: 279.5
    planar_gbrp16be_to_y_512_c: 1364.0
    planar_gbrp16be_to_y_512_sse4: 544.5
    planar_gbrp16be_to_y_512_avx2: 339.5
    planar_gbrp16le_to_y_512_c: 1201.0
    planar_gbrp16le_to_y_512_sse4: 445.5
    planar_gbrp16le_to_y_512_avx2: 280.5
    planar_gbrap16be_to_y_512_c: 1377.0
    planar_gbrap16be_to_y_512_sse4: 545.0
    planar_gbrap16be_to_y_512_avx2: 338.5
    planar_gbrap16le_to_y_512_c: 1201.0
    planar_gbrap16le_to_y_512_sse4: 442.0
    planar_gbrap16le_to_y_512_avx2: 279.0
    planar_gbrpf32be_to_y_512_c: 4113.0
    planar_gbrpf32be_to_y_512_sse2: 2438.0
    planar_gbrpf32be_to_y_512_sse4: 1068.0
    planar_gbrpf32be_to_y_512_avx2: 904.5
    planar_gbrpf32le_to_y_512_c: 3818.5
    planar_gbrpf32le_to_y_512_sse2: 2024.5
    planar_gbrpf32le_to_y_512_sse4: 1241.5
    planar_gbrpf32le_to_y_512_avx2: 657.0
    planar_gbrapf32be_to_y_512_c: 3707.0
    planar_gbrapf32be_to_y_512_sse2: 2444.0
    planar_gbrapf32be_to_y_512_sse4: 1077.0
    planar_gbrapf32be_to_y_512_avx2: 909.0
    planar_gbrapf32le_to_y_512_c: 3822.0
    planar_gbrapf32le_to_y_512_sse2: 2024.5
    planar_gbrapf32le_to_y_512_sse4: 1176.0
    planar_gbrapf32le_to_y_512_avx2: 658.5
    
    planar_gbrp_to_uv_512_c: 2325.8
    planar_gbrp_to_uv_512_sse2: 1726.8
    planar_gbrp_to_uv_512_sse4: 771.8
    planar_gbrp_to_uv_512_avx2: 506.8
    planar_gbrap_to_uv_512_c: 2281.8
    planar_gbrap_to_uv_512_sse2: 1726.3
    planar_gbrap_to_uv_512_sse4: 768.3
    planar_gbrap_to_uv_512_avx2: 496.3
    planar_gbrp9be_to_uv_512_c: 2336.8
    planar_gbrp9be_to_uv_512_sse2: 1924.8
    planar_gbrp9be_to_uv_512_sse4: 852.3
    planar_gbrp9be_to_uv_512_avx2: 552.8
    planar_gbrp9le_to_uv_512_c: 2270.3
    planar_gbrp9le_to_uv_512_sse2: 1512.3
    planar_gbrp9le_to_uv_512_sse4: 764.3
    planar_gbrp9le_to_uv_512_avx2: 491.3
    planar_gbrp10be_to_uv_512_c: 2281.8
    planar_gbrp10be_to_uv_512_sse2: 1917.8
    planar_gbrp10be_to_uv_512_sse4: 855.3
    planar_gbrp10be_to_uv_512_avx2: 541.3
    planar_gbrp10le_to_uv_512_c: 2269.8
    planar_gbrp10le_to_uv_512_sse2: 1515.3
    planar_gbrp10le_to_uv_512_sse4: 759.8
    planar_gbrp10le_to_uv_512_avx2: 487.8
    planar_gbrap10be_to_uv_512_c: 2382.3
    planar_gbrap10be_to_uv_512_sse2: 1924.8
    planar_gbrap10be_to_uv_512_sse4: 855.3
    planar_gbrap10be_to_uv_512_avx2: 540.8
    planar_gbrap10le_to_uv_512_c: 2382.3
    planar_gbrap10le_to_uv_512_sse2: 1512.3
    planar_gbrap10le_to_uv_512_sse4: 759.3
    planar_gbrap10le_to_uv_512_avx2: 484.8
    planar_gbrp12be_to_uv_512_c: 2283.8
    planar_gbrp12be_to_uv_512_sse2: 1936.8
    planar_gbrp12be_to_uv_512_sse4: 858.3
    planar_gbrp12be_to_uv_512_avx2: 541.3
    planar_gbrp12le_to_uv_512_c: 2278.8
    planar_gbrp12le_to_uv_512_sse2: 1507.3
    planar_gbrp12le_to_uv_512_sse4: 760.3
    planar_gbrp12le_to_uv_512_avx2: 485.8
    planar_gbrap12be_to_uv_512_c: 2385.3
    planar_gbrap12be_to_uv_512_sse2: 1927.8
    planar_gbrap12be_to_uv_512_sse4: 855.3
    planar_gbrap12be_to_uv_512_avx2: 539.8
    planar_gbrap12le_to_uv_512_c: 2377.3
    planar_gbrap12le_to_uv_512_sse2: 1516.3
    planar_gbrap12le_to_uv_512_sse4: 759.3
    planar_gbrap12le_to_uv_512_avx2: 484.8
    planar_gbrp14be_to_uv_512_c: 2283.8
    planar_gbrp14be_to_uv_512_sse2: 1935.3
    planar_gbrp14be_to_uv_512_sse4: 852.3
    planar_gbrp14be_to_uv_512_avx2: 540.3
    planar_gbrp14le_to_uv_512_c: 2276.8
    planar_gbrp14le_to_uv_512_sse2: 1514.8
    planar_gbrp14le_to_uv_512_sse4: 762.3
    planar_gbrp14le_to_uv_512_avx2: 484.8
    planar_gbrp16be_to_uv_512_c: 2383.3
    planar_gbrp16be_to_uv_512_sse2: 1881.8
    planar_gbrp16be_to_uv_512_sse4: 852.3
    planar_gbrp16be_to_uv_512_avx2: 541.8
    planar_gbrp16le_to_uv_512_c: 2378.3
    planar_gbrp16le_to_uv_512_sse2: 1476.8
    planar_gbrp16le_to_uv_512_sse4: 765.3
    planar_gbrp16le_to_uv_512_avx2: 485.8
    planar_gbrap16be_to_uv_512_c: 2382.3
    planar_gbrap16be_to_uv_512_sse2: 1886.3
    planar_gbrap16be_to_uv_512_sse4: 853.8
    planar_gbrap16be_to_uv_512_avx2: 550.8
    planar_gbrap16le_to_uv_512_c: 2381.8
    planar_gbrap16le_to_uv_512_sse2: 1488.3
    planar_gbrap16le_to_uv_512_sse4: 765.3
    planar_gbrap16le_to_uv_512_avx2: 491.8
    planar_gbrpf32be_to_uv_512_c: 4863.0
    planar_gbrpf32be_to_uv_512_sse2: 3347.5
    planar_gbrpf32be_to_uv_512_sse4: 1800.0
    planar_gbrpf32be_to_uv_512_avx2: 1199.0
    planar_gbrpf32le_to_uv_512_c: 4725.0
    planar_gbrpf32le_to_uv_512_sse2: 2753.0
    planar_gbrpf32le_to_uv_512_sse4: 1474.5
    planar_gbrpf32le_to_uv_512_avx2: 927.5
    planar_gbrapf32be_to_uv_512_c: 4859.0
    planar_gbrapf32be_to_uv_512_sse2: 3269.0
    planar_gbrapf32be_to_uv_512_sse4: 1802.0
    planar_gbrapf32be_to_uv_512_avx2: 1201.5
    planar_gbrapf32le_to_uv_512_c: 6338.0
    planar_gbrapf32le_to_uv_512_sse2: 2756.5
    planar_gbrapf32le_to_uv_512_sse4: 1476.0
    planar_gbrapf32le_to_uv_512_avx2: 908.5
    
    planar_gbrap_to_a_512_c: 383.3
    planar_gbrap_to_a_512_sse2: 66.8
    planar_gbrap_to_a_512_avx2: 43.8
    planar_gbrap10be_to_a_512_c: 601.8
    planar_gbrap10be_to_a_512_sse2: 86.3
    planar_gbrap10be_to_a_512_avx2: 34.8
    planar_gbrap10le_to_a_512_c: 602.3
    planar_gbrap10le_to_a_512_sse2: 48.8
    planar_gbrap10le_to_a_512_avx2: 31.3
    planar_gbrap12be_to_a_512_c: 601.8
    planar_gbrap12be_to_a_512_sse2: 111.8
    planar_gbrap12be_to_a_512_avx2: 41.3
    planar_gbrap12le_to_a_512_c: 385.8
    planar_gbrap12le_to_a_512_sse2: 75.3
    planar_gbrap12le_to_a_512_avx2: 39.8
    planar_gbrap16be_to_a_512_c: 386.8
    planar_gbrap16be_to_a_512_sse2: 79.8
    planar_gbrap16be_to_a_512_avx2: 31.3
    planar_gbrap16le_to_a_512_c: 600.3
    planar_gbrap16le_to_a_512_sse2: 40.3
    planar_gbrap16le_to_a_512_avx2: 30.3
    planar_gbrapf32be_to_a_512_c: 1148.8
    planar_gbrapf32be_to_a_512_sse2: 611.3
    planar_gbrapf32be_to_a_512_sse4: 234.8
    planar_gbrapf32be_to_a_512_avx2: 183.3
    planar_gbrapf32le_to_a_512_c: 851.3
    planar_gbrapf32le_to_a_512_sse2: 263.3
    planar_gbrapf32le_to_a_512_sse4: 199.3
    planar_gbrapf32le_to_a_512_avx2: 156.8
    Reviewed-by: Paul B Mahol <onemda@gmail.com>
    Signed-off-by: James Almer <jamrial@gmail.com>
    52f70261
sw_gbrp.c 14.9 KB