ffmpeg-devel
Threads by month
- ----- 2026 -----
- May
- April
- March
- February
- January
- ----- 2025 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2018 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2017 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2016 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2015 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2014 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2013 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2012 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2011 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2010 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2009 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2008 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2007 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2006 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2005 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- 27 participants
- 62015 discussions
[PR] swscale/swscale_unscaled: fix packed30togbra10() and gbr16ptopacked30() for GBRP 10 and 12 bit MSB formats (PR #23108)
by Ramiro Polla 15 May '26
PR #23108 opened by Ramiro Polla (ramiro)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23108
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23108.patch
The formats added in e93de9948d keep the values in the most significant
bits of the uint16_t, and packed30togbra10() and gbr16ptopacked30()
weren't taking into consideration the shift field from AVComponentDescriptor.
Reproducible with:
$ ./libswscale/tests/swscale -unscaled 1 -src gbrp10msbbe -dst x2rgb10le
$ ./libswscale/tests/swscale -unscaled 1 -src x2rgb10le -dst gbrp10msbbe
From 7f952b2d3b06d60dd3d50e25207a83fe33618d4b Mon Sep 17 00:00:00 2001
From: Ramiro Polla <ramiro.polla(a)gmail.com>
Date: Fri, 15 May 2026 22:44:13 +0200
Subject: [PATCH] swscale/swscale_unscaled: fix packed30togbra10() and
gbr16ptopacked30() for GBRP 10 and 12 bit MSB formats
The formats added in e93de9948d keep the values in the most significant
bits of the uint16_t, and packed30togbra10() and gbr16ptopacked30()
weren't taking into consideration the shift field from AVComponentDescriptor.
Reproducible with:
$ ./libswscale/tests/swscale -unscaled 1 -src gbrp10msbbe -dst x2rgb10le
$ ./libswscale/tests/swscale -unscaled 1 -src x2rgb10le -dst gbrp10msbbe
---
libswscale/swscale_unscaled.c | 45 ++++++++++++++++++-----------------
1 file changed, 23 insertions(+), 22 deletions(-)
diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
index 0ecef7d44a..7de47dfbb7 100644
--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -818,7 +818,7 @@ static void packed16togbra16(const uint8_t *src, int srcStride,
static void packed30togbra10(const uint8_t *src, int srcStride,
uint16_t *dst[], const int dstStride[], int srcSliceH,
- int swap, int bpc, int width)
+ int swap, int bpc, int shift, int width)
{
int x, h, i;
int dst_alpha = dst[3] != NULL;
@@ -835,23 +835,23 @@ static void packed30togbra10(const uint8_t *src, int srcStride,
for (x = 0; x < width; x++) {
unsigned p = AV_RL32(src_line);
component = (p >> 20) & 0x3FF;
- dst[0][x] = av_bswap16(component << scale_high | component >> scale_low);
+ dst[0][x] = av_bswap16((component << scale_high | component >> scale_low) << shift);
component = (p >> 10) & 0x3FF;
- dst[1][x] = av_bswap16(component << scale_high | component >> scale_low);
+ dst[1][x] = av_bswap16((component << scale_high | component >> scale_low) << shift);
component = p & 0x3FF;
- dst[2][x] = av_bswap16(component << scale_high | component >> scale_low);
- dst[3][x] = av_bswap16(alpha_val);
+ dst[2][x] = av_bswap16((component << scale_high | component >> scale_low) << shift);
+ dst[3][x] = av_bswap16(alpha_val) << shift;
src_line++;
}
} else {
for (x = 0; x < width; x++) {
unsigned p = AV_RL32(src_line);
component = (p >> 20) & 0x3FF;
- dst[0][x] = av_bswap16(component << scale_high | component >> scale_low);
+ dst[0][x] = av_bswap16((component << scale_high | component >> scale_low) << shift);
component = (p >> 10) & 0x3FF;
- dst[1][x] = av_bswap16(component << scale_high | component >> scale_low);
+ dst[1][x] = av_bswap16((component << scale_high | component >> scale_low) << shift);
component = p & 0x3FF;
- dst[2][x] = av_bswap16(component << scale_high | component >> scale_low);
+ dst[2][x] = av_bswap16((component << scale_high | component >> scale_low) << shift);
src_line++;
}
}
@@ -861,23 +861,23 @@ static void packed30togbra10(const uint8_t *src, int srcStride,
for (x = 0; x < width; x++) {
unsigned p = AV_RL32(src_line);
component = (p >> 20) & 0x3FF;
- dst[0][x] = component << scale_high | component >> scale_low;
+ dst[0][x] = (component << scale_high | component >> scale_low) << shift;
component = (p >> 10) & 0x3FF;
- dst[1][x] = component << scale_high | component >> scale_low;
+ dst[1][x] = (component << scale_high | component >> scale_low) << shift;
component = p & 0x3FF;
- dst[2][x] = component << scale_high | component >> scale_low;
- dst[3][x] = alpha_val;
+ dst[2][x] = (component << scale_high | component >> scale_low) << shift;
+ dst[3][x] = alpha_val << shift;
src_line++;
}
} else {
for (x = 0; x < width; x++) {
unsigned p = AV_RL32(src_line);
component = (p >> 20) & 0x3FF;
- dst[0][x] = component << scale_high | component >> scale_low;
+ dst[0][x] = (component << scale_high | component >> scale_low) << shift;
component = (p >> 10) & 0x3FF;
- dst[1][x] = component << scale_high | component >> scale_low;
+ dst[1][x] = (component << scale_high | component >> scale_low) << shift;
component = p & 0x3FF;
- dst[2][x] = component << scale_high | component >> scale_low;
+ dst[2][x] = (component << scale_high | component >> scale_low) << shift;
src_line++;
}
}
@@ -899,6 +899,7 @@ static int Rgb16ToPlanarRgb16Wrapper(SwsInternal *c, const uint8_t *const src[],
const AVPixFmtDescriptor *src_format = av_pix_fmt_desc_get(c->opts.src_format);
const AVPixFmtDescriptor *dst_format = av_pix_fmt_desc_get(c->opts.dst_format);
int bpc = dst_format->comp[0].depth;
+ int shift = dst_format->comp[0].shift;
int alpha = src_format->flags & AV_PIX_FMT_FLAG_ALPHA;
int swap = 0;
int i;
@@ -935,7 +936,7 @@ static int Rgb16ToPlanarRgb16Wrapper(SwsInternal *c, const uint8_t *const src[],
av_assert0(bpc >= 10);
packed30togbra10(src[0], srcStride[0],
dst2013, stride2013, srcSliceH, swap,
- bpc, c->opts.src_w);
+ bpc, shift, c->opts.src_w);
break;
case AV_PIX_FMT_BGR48LE:
case AV_PIX_FMT_BGR48BE:
@@ -949,7 +950,7 @@ static int Rgb16ToPlanarRgb16Wrapper(SwsInternal *c, const uint8_t *const src[],
av_assert0(bpc >= 10);
packed30togbra10(src[0], srcStride[0],
dst1023, stride1023, srcSliceH, swap,
- bpc, c->opts.src_w);
+ bpc, shift, c->opts.src_w);
break;
default:
av_log(c, AV_LOG_ERROR,
@@ -1081,11 +1082,10 @@ static void gbr16ptopacked16(const uint16_t *src[], const int srcStride[],
static void gbr16ptopacked30(const uint16_t *src[], const int srcStride[],
uint8_t *dst, int dstStride, int srcSliceH,
- int swap, int bpp, int width)
+ int swap, int shift, int width)
{
int x, h, i;
- int shift = bpp - 10;
- av_assert0(bpp >= 0);
+ av_assert0(shift >= 0);
for (h = 0; h < srcSliceH; h++) {
uint8_t *dest = dst + dstStride * h;
@@ -1125,6 +1125,7 @@ static int planarRgb16ToRgb16Wrapper(SwsInternal *c, const uint8_t *const src[],
const AVPixFmtDescriptor *src_format = av_pix_fmt_desc_get(c->opts.src_format);
const AVPixFmtDescriptor *dst_format = av_pix_fmt_desc_get(c->opts.dst_format);
int bits_per_sample = src_format->comp[0].depth;
+ int shift = src_format->comp[0].shift;
int swap = 0;
if ( HAVE_BIGENDIAN && !(src_format->flags & AV_PIX_FMT_FLAG_BE) ||
!HAVE_BIGENDIAN && src_format->flags & AV_PIX_FMT_FLAG_BE)
@@ -1168,12 +1169,12 @@ static int planarRgb16ToRgb16Wrapper(SwsInternal *c, const uint8_t *const src[],
case AV_PIX_FMT_X2RGB10LE:
gbr16ptopacked30(src201, stride201,
dst[0] + srcSliceY * dstStride[0], dstStride[0],
- srcSliceH, swap, bits_per_sample, c->opts.src_w);
+ srcSliceH, swap, bits_per_sample + shift - 10, c->opts.src_w);
break;
case AV_PIX_FMT_X2BGR10LE:
gbr16ptopacked30(src102, stride102,
dst[0] + srcSliceY * dstStride[0], dstStride[0],
- srcSliceH, swap, bits_per_sample, c->opts.src_w);
+ srcSliceH, swap, bits_per_sample + shift - 10, c->opts.src_w);
break;
default:
av_log(c, AV_LOG_ERROR,
--
2.52.0
1
0
PR #23107 opened by Zhao Zhili (quink)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23107
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23107.patch
vc1_loop_filter() is only reached through the six C wrappers. Clang 14
keeps it out of line with plain static inline, adding a 224-byte stack
frame before the tiny bestcase path on rpi 5. gcc 12 already inlines
it.
rpi 5 clang 14:
before after
vc1_v_loop_filter4_bestcase_c 27.2 8.3 (3.3x)
vc1_h_loop_filter4_bestcase_c 26.4 10.2 (2.6x)
vc1_v_loop_filter8_bestcase_c 32.5 20.3 (1.6x)
vc1_h_loop_filter8_bestcase_c 31.7 19.5 (1.6x)
vc1_v_loop_filter16_bestcase_c 42.1 33.2 (1.3x)
vc1_h_loop_filter16_bestcase_c 41.6 25.3 (1.6x)
Signed-off-by: Zhao Zhili <zhilizhao(a)tencent.com>
From 529f28fe00eef5d3eafe63a9ea5b1c4cac4c6362 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <zhilizhao(a)tencent.com>
Date: Fri, 15 May 2026 21:56:53 +0800
Subject: [PATCH] avcodec/vc1dsp: always inline vc1_loop_filter
vc1_loop_filter() is only reached through the six C wrappers. Clang 14
keeps it out of line with plain static inline, adding a 224-byte stack
frame before the tiny bestcase path on rpi 5. gcc 12 already inlines
it.
rpi 5 clang 14:
before after
vc1_v_loop_filter4_bestcase_c 27.2 8.3 (3.3x)
vc1_h_loop_filter4_bestcase_c 26.4 10.2 (2.6x)
vc1_v_loop_filter8_bestcase_c 32.5 20.3 (1.6x)
vc1_h_loop_filter8_bestcase_c 31.7 19.5 (1.6x)
vc1_v_loop_filter16_bestcase_c 42.1 33.2 (1.3x)
vc1_h_loop_filter16_bestcase_c 41.6 25.3 (1.6x)
Signed-off-by: Zhao Zhili <zhilizhao(a)tencent.com>
---
libavcodec/vc1dsp.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index 86b81dfc04..dd143b114c 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -196,8 +196,8 @@ static av_always_inline int vc1_filter_line(uint8_t *src, ptrdiff_t stride, int
* @param pq block quantizer
* @see 8.6
*/
-static inline void vc1_loop_filter(uint8_t *src, ptrdiff_t step, ptrdiff_t stride,
- int len, int pq)
+static av_always_inline void vc1_loop_filter(uint8_t *src, ptrdiff_t step,
+ ptrdiff_t stride, int len, int pq)
{
int i;
int filt3;
--
2.52.0
1
0
PR #23105 opened by mengzhuo
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23105
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23105.patch
RISC-V Vector 1.0 MPEG audio synthesis window (float + fixed).
Float (zve64f): LMUL=8 memcpy, batch 16 outputs×2 with
unit-stride vle32.v on per-sub-band contiguous window blocks.
The float path uses no vlse32.v — all of its vector loads are unit-stride.
Fixed (zve64x,b): LMUL=8 memcpy, LMUL=4 vwmul.vv dot products.
vsetivli throughout. Constants hoisted. Caller-saved regs; v0 free.
Single init: F32 + I64 + RVB.
Benchmark (X100, 5M calls): RVV 550ns C 589ns speedup 1.07x
Tested SpacemiT X100/X60 (VLEN=256). VisionFive 2 (no V, C fallback).
Signed-off-by: Meng Zhuo <mengzhuo(a)iscas.ac.cn>
From e44ee669a3a2281b61729802a5fe8dfcb53b5385 Mon Sep 17 00:00:00 2001
From: Meng Zhuo <mengzhuo(a)iscas.ac.cn>
Date: Wed, 13 May 2026 18:12:56 +0800
Subject: [PATCH] lavc/mpegaudiodsp: R-V V apply_window
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RISC-V Vector 1.0 MPEG audio synthesis window (float + fixed).
Float (zve64f): LMUL=8 memcpy, batch 16 outputs×2 with
unit-stride vle32.v on per-sub-band contiguous window blocks.
Zero vlse32.v — all memory access is contiguous.
Fixed (zve64x,b): LMUL=8 memcpy, LMUL=4 vwmul.vv dot products.
vsetivli throughout. Constants hoisted. Caller-saved regs; v0 free.
Single init: F32 + I64 + RVB.
Benchmark (X100, 5M calls): RVV 550ns C 589ns speedup 1.07x
Tested SpacemiT X100/X60 (VLEN=256). VisionFive 2 (no V, C fallback).
Signed-off-by: Meng Zhuo <mengzhuo(a)iscas.ac.cn>
---
libavcodec/mpegaudiodsp.c | 2 +
libavcodec/mpegaudiodsp.h | 1 +
libavcodec/riscv/Makefile | 2 +
libavcodec/riscv/mpegaudiodsp_init.c | 46 ++++++
libavcodec/riscv/mpegaudiodsp_rvv.S | 207 +++++++++++++++++++++++++++
5 files changed, 258 insertions(+)
create mode 100644 libavcodec/riscv/mpegaudiodsp_init.c
create mode 100644 libavcodec/riscv/mpegaudiodsp_rvv.S
diff --git a/libavcodec/mpegaudiodsp.c b/libavcodec/mpegaudiodsp.c
index 0971c28734..947b0df88a 100644
--- a/libavcodec/mpegaudiodsp.c
+++ b/libavcodec/mpegaudiodsp.c
@@ -97,6 +97,8 @@ av_cold void ff_mpadsp_init(MPADSPContext *s)
ff_mpadsp_init_arm(s);
#elif ARCH_PPC
ff_mpadsp_init_ppc(s);
+#elif ARCH_RISCV
+ ff_mpadsp_init_riscv(s);
#elif ARCH_X86
ff_mpadsp_init_x86(s);
#endif
diff --git a/libavcodec/mpegaudiodsp.h b/libavcodec/mpegaudiodsp.h
index 5e47a263bb..4c22f4a465 100644
--- a/libavcodec/mpegaudiodsp.h
+++ b/libavcodec/mpegaudiodsp.h
@@ -65,6 +65,7 @@ void ff_mpadsp_init_aarch64(MPADSPContext *s);
void ff_mpadsp_init_arm(MPADSPContext *s);
void ff_mpadsp_init_ppc(MPADSPContext *s);
void ff_mpadsp_init_x86(MPADSPContext *s);
+void ff_mpadsp_init_riscv(MPADSPContext *s);
void ff_mpadsp_init_x86_tabs(void);
void ff_mpadsp_init_mipsfpu(MPADSPContext *s);
void ff_mpadsp_init_mipsdsp(MPADSPContext *s);
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 2c53334923..5cd63ab3f8 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -52,6 +52,8 @@ OBJS-$(CONFIG_LPC) += riscv/lpc_init.o
RVV-OBJS-$(CONFIG_LPC) += riscv/lpc_rvv.o
OBJS-$(CONFIG_ME_CMP) += riscv/me_cmp_init.o
RVV-OBJS-$(CONFIG_ME_CMP) += riscv/me_cmp_rvv.o
+OBJS-$(CONFIG_MPEGAUDIODSP) += riscv/mpegaudiodsp_init.o
+RVV-OBJS-$(CONFIG_MPEGAUDIODSP) += riscv/mpegaudiodsp_rvv.o
OBJS-$(CONFIG_MPEGVIDEO) += riscv/mpegvideo_init.o
RVV-OBJS-$(CONFIG_MPEGVIDEO) += riscv/mpegvideo_rvv.o
OBJS-$(CONFIG_MPEGVIDEOENCDSP) += riscv/mpegvideoencdsp_init.o
diff --git a/libavcodec/riscv/mpegaudiodsp_init.c b/libavcodec/riscv/mpegaudiodsp_init.c
new file mode 100644
index 0000000000..5067be3748
--- /dev/null
+++ b/libavcodec/riscv/mpegaudiodsp_init.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2026 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/mpegaudiodsp.h"
+
+void ff_mpadsp_apply_window_float_rvv(float *synth_buf, float *window,
+ int *dither_state, float *samples,
+ ptrdiff_t incr);
+void ff_mpadsp_apply_window_fixed_rvv(int32_t *synth_buf, int32_t *window,
+ int *dither_state, int16_t *samples,
+ ptrdiff_t incr);
+
+av_cold void ff_mpadsp_init_riscv(MPADSPContext *s)
+{
+#if HAVE_RVV
+ int flags = av_get_cpu_flags();
+
+ if ((flags & AV_CPU_FLAG_RVV_F32) && (flags & AV_CPU_FLAG_RVV_I64)
+ && (flags & AV_CPU_FLAG_RVB)) {
+ s->apply_window_float = ff_mpadsp_apply_window_float_rvv;
+ s->apply_window_fixed = ff_mpadsp_apply_window_fixed_rvv;
+ }
+#endif
+}
diff --git a/libavcodec/riscv/mpegaudiodsp_rvv.S b/libavcodec/riscv/mpegaudiodsp_rvv.S
new file mode 100644
index 0000000000..5f742bb1a0
--- /dev/null
+++ b/libavcodec/riscv/mpegaudiodsp_rvv.S
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2026 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+/*
+ * MPEG audio synthesis window — float, RISC-V Vector 1.0
+ *
+ * LMUL=8 (v8) memcpy, LMUL=4 batch 16 outputs.
+ * All unit-stride vle32.v on per-sub-band window blocks.
+ */
+func ff_mpadsp_apply_window_float_rvv, zve64f
+ lpad 0
+ addi sp, sp, -16
+ sd ra, 8(sp)
+
+ vsetivli zero, 16, e64, m8, ta, ma
+ li t1, 512*4
+ add t1, a0, t1
+ vle64.v v8, (a0)
+ vse64.v v8, (t1)
+
+ flw ft0, (a2)
+
+ /* LMUL=4, VL=16. v24=reverse[15..0], v28=zero. */
+ vsetivli zero, 16, e32, m4, ta, ma
+ vid.v v24
+ vrsub.vi v24, v24, 15
+ vxor.vv v28, v28, v28
+
+ vmv.v.i v4, 0
+ vmv.v.i v20, 0
+ addi t0, a0, 16*4
+ addi t1, a0, 33*4
+ mv t2, a1
+ li t3, 8
+0: vle32.v v8, (t2)
+ vle32.v v12, (t0)
+ vfmacc.vv v4, v8, v12
+ addi t4, t2, 32*4
+ vle32.v v8, (t4)
+ vle32.v v12, (t1)
+ vrgather.vv v16, v12, v24
+ vfmacc.vv v20, v8, v16
+ addi t0, t0, 256
+ addi t1, t1, 256
+ addi t2, t2, 256
+ addi t3, t3, -1
+ bnez t3, 0b
+
+ vfsub.vv v4, v4, v20
+ vfmv.s.f v28, ft0
+ vsetivli zero, 1, e32, m1, ta, ma
+ vfadd.vv v4, v4, v28
+ vsetivli zero, 16, e32, m4, ta, ma
+ li t0, 0
+1: vslidedown.vx v8, v4, t0
+ vfmv.f.s ft1, v8
+ mul t1, t0, a4
+ slli t1, t1, 2
+ add t1, a3, t1
+ fsw ft1, (t1)
+ addi t0, t0, 1
+ li t1, 16
+ blt t0, t1, 1b
+
+ vfsub.vv v4, v4, v20
+ li t0, 0
+ vmv.v.i v4, 0
+ vmv.v.i v20, 0
+ addi t0, a0, 32*4
+ addi t1, a0, 17*4
+ mv t2, a1
+ li t3, 8
+2: addi t4, t2, 16*4
+ vle32.v v8, (t4)
+ vle32.v v12, (t0)
+ vfmacc.vv v4, v8, v12
+ addi t4, t2, 48*4
+ vle32.v v8, (t4)
+ vle32.v v12, (t1)
+ vrgather.vv v16, v12, v24
+ vfmacc.vv v20, v8, v16
+ addi t0, t0, 256
+ addi t1, t1, 256
+ addi t2, t2, 256
+ addi t3, t3, -1
+ bnez t3, 2b
+
+ vfsub.vv v4, v4, v20
+ li t0, 0
+3: vslidedown.vx v8, v4, t0
+ vfmv.f.s ft1, v8
+ addi t1, t0, 16
+ mul t1, t1, a4
+ slli t1, t1, 2
+ add t1, a3, t1
+ fsw ft1, (t1)
+ addi t0, t0, 1
+ li t1, 16
+ blt t0, t1, 3b
+
+ sw zero, (a2)
+ ld ra, 8(sp)
+ addi sp, sp, 16
+ ret
+endfunc
+
+/*
+ * MPEG audio synthesis window — fixed-point, RISC-V Vector 1.0
+ *
+ * LMUL=8 (v8) memcpy, LMUL=4 vwmul.vv dot products.
+ * Constants hoisted; explicit e32↔e64 vsetivli after vwmul.
+ */
+func ff_mpadsp_apply_window_fixed_rvv, zve64x, b
+ lpad 0
+ addi sp, sp, -16
+ sd ra, 8(sp)
+
+ vsetivli zero, 16, e64, m8, ta, ma
+ li t1, 512*4
+ add t1, a0, t1
+ vle64.v v8, (a0)
+ vse64.v v8, (t1)
+
+ lw a5, (a2)
+
+ vsetivli zero, 8, e32, m4, ta, ma
+ li t2, 256
+ li t3, 0
+ li t4, 32767
+ li t5, -32768
+ li t6, 48
+ li a6, 32
+ vsetivli zero, 8, e64, m4, ta, ma
+ vxor.vv v28, v28, v28
+ vxor.vv v20, v20, v20
+ vxor.vv v24, v24, v24
+
+.Lloop_fixed:
+ vsetivli zero, 8, e32, m4, ta, ma
+ addi t0, t3, 16
+ slli t0, t0, 2
+ add t0, a0, t0
+ slli t1, t3, 2
+ add t1, a1, t1
+ vlse32.v v8, (t0), t2
+ vlse32.v v12, (t1), t2
+ vwmul.vv v16, v8, v12
+ vsetivli zero, 8, e64, m4, ta, ma
+ vadd.vv v20, v20, v16
+
+ vsetivli zero, 8, e32, m4, ta, ma
+ sub t0, t6, t3
+ slli t0, t0, 2
+ add t0, a0, t0
+ addi t1, t3, 32
+ slli t1, t1, 2
+ add t1, a1, t1
+ vlse32.v v8, (t0), t2
+ vlse32.v v12, (t1), t2
+ vwmul.vv v16, v8, v12
+ vsetivli zero, 8, e64, m4, ta, ma
+ vadd.vv v24, v24, v16
+
+ vsub.vv v20, v20, v24
+ vredsum.vs v20, v20, v28
+ vmv.x.s t0, v20
+ bnez t3, .Lskip_dither_fixed
+ add t0, t0, a5
+.Lskip_dither_fixed:
+ srai t0, t0, 24
+ min t0, t0, t4
+ max t0, t0, t5
+ mul t1, t3, a4
+ slli t1, t1, 1
+ add t1, a3, t1
+ sh t0, (t1)
+
+ vsetivli zero, 8, e64, m4, ta, ma
+ vxor.vv v20, v20, v20
+ vxor.vv v24, v24, v24
+ addi t3, t3, 1
+ blt t3, a6, .Lloop_fixed
+
+ sw zero, (a2)
+ ld ra, 8(sp)
+ addi sp, sp, 16
+ ret
+endfunc
--
2.52.0
1
0
[PR] lavfi/vulkan: convert nlmeans and blackdetect to compile-time GLSL (PR #23104)
by Lynne 15 May '26
PR #23104 opened by Lynne
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23104
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23104.patch
This completes the conversion to compile-time GLSL for every single filter.
scale_vulkan's GLSL will be fully replaced by swscale, and swscale will soon drop its glslang dependency.
From fe16edf5aafd6fe59e56d812d4cd690aa9918cd7 Mon Sep 17 00:00:00 2001
From: Lynne <dev(a)lynne.ee>
Date: Thu, 14 May 2026 17:09:50 +0900
Subject: [PATCH 1/2] vf_nlmeans_vulkan: port to compile-time SPIR-V generation
---
configure | 2 +-
libavfilter/vf_nlmeans_vulkan.c | 534 +++---------------
libavfilter/vulkan/Makefile | 4 +
libavfilter/vulkan/nlmeans_denoise.comp.glsl | 86 +++
.../vulkan/nlmeans_horizontal.comp.glsl | 104 ++++
libavfilter/vulkan/nlmeans_vertical.comp.glsl | 122 ++++
libavfilter/vulkan/nlmeans_weights.comp.glsl | 144 +++++
7 files changed, 550 insertions(+), 446 deletions(-)
create mode 100644 libavfilter/vulkan/nlmeans_denoise.comp.glsl
create mode 100644 libavfilter/vulkan/nlmeans_horizontal.comp.glsl
create mode 100644 libavfilter/vulkan/nlmeans_vertical.comp.glsl
create mode 100644 libavfilter/vulkan/nlmeans_weights.comp.glsl
diff --git a/configure b/configure
index 39a522e7e8..d953074c89 100755
--- a/configure
+++ b/configure
@@ -4222,7 +4222,7 @@ mptestsrc_filter_deps="gpl"
msad_filter_select="scene_sad"
negate_filter_deps="lut_filter"
nlmeans_opencl_filter_deps="opencl"
-nlmeans_vulkan_filter_deps="vulkan spirv_library"
+nlmeans_vulkan_filter_deps="vulkan spirv_compiler"
nnedi_filter_deps="gpl"
ocr_filter_deps="libtesseract"
ocv_filter_deps="libopencv"
diff --git a/libavfilter/vf_nlmeans_vulkan.c b/libavfilter/vf_nlmeans_vulkan.c
index c1430707b7..902c072669 100644
--- a/libavfilter/vf_nlmeans_vulkan.c
+++ b/libavfilter/vf_nlmeans_vulkan.c
@@ -19,19 +19,24 @@
*/
#include "libavutil/mem.h"
-#include "libavutil/random_seed.h"
-#include "libavutil/vulkan_spirv.h"
#include "libavutil/opt.h"
#include "vulkan_filter.h"
#include "filters.h"
#include "video.h"
-#define TYPE_NAME "vec4"
+extern const unsigned char ff_nlmeans_horizontal_comp_spv_data[];
+extern const unsigned int ff_nlmeans_horizontal_comp_spv_len;
+extern const unsigned char ff_nlmeans_vertical_comp_spv_data[];
+extern const unsigned int ff_nlmeans_vertical_comp_spv_len;
+extern const unsigned char ff_nlmeans_weights_comp_spv_data[];
+extern const unsigned int ff_nlmeans_weights_comp_spv_len;
+extern const unsigned char ff_nlmeans_denoise_comp_spv_data[];
+extern const unsigned int ff_nlmeans_denoise_comp_spv_len;
+
+/* Must be kept in sync with the definitions in the nlmeans_* shaders */
#define TYPE_ELEMS 4
#define TYPE_SIZE (TYPE_ELEMS*4)
-#define TYPE_BLOCK_ELEMS 16
-#define TYPE_BLOCK_SIZE (TYPE_SIZE * TYPE_BLOCK_ELEMS)
#define WG_SIZE 32
typedef struct NLMeansVulkanContext {
@@ -80,210 +85,60 @@ typedef struct IntegralPushData {
uint32_t nb_components;
} IntegralPushData;
-static void shared_shd_def(FFVulkanShader *shd) {
- GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
- GLSLC(0, );
- GLSLF(0, #define DTYPE %s ,TYPE_NAME);
- GLSLF(0, #define T_ALIGN %i ,TYPE_SIZE);
- GLSLF(0, #define T_BLOCK_ELEMS %i ,TYPE_BLOCK_ELEMS);
- GLSLF(0, #define T_BLOCK_ALIGN %i ,TYPE_BLOCK_SIZE);
- GLSLC(0, );
- GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
- GLSLC(1, DTYPE v[]; );
- GLSLC(0, }; );
- GLSLC(0, struct Block { );
- GLSLC(1, DTYPE data[T_BLOCK_ELEMS]; );
- GLSLC(0, }; );
- GLSLC(0, layout(buffer_reference, buffer_reference_align = T_BLOCK_ALIGN) buffer BlockBuffer { );
- GLSLC(1, Block v[]; );
- GLSLC(0, }; );
- GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
- GLSLC(1, uvec4 width; );
- GLSLC(1, uvec4 height; );
- GLSLC(1, vec4 strength; );
- GLSLC(1, uvec4 comp_off; );
- GLSLC(1, uvec4 comp_plane; );
- GLSLC(1, DataBuffer integral_base; );
- GLSLC(1, uint64_t integral_size; );
- GLSLC(1, uint64_t int_stride; );
- GLSLC(1, uint xyoffs_start; );
- GLSLC(1, uint nb_components; );
- GLSLC(0, }; );
- GLSLC(0, );
-
- ff_vk_shader_add_push_const(shd, 0, sizeof(IntegralPushData),
- VK_SHADER_STAGE_COMPUTE_BIT);
-}
-
static av_cold int init_integral_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
FFVulkanShader *shd_horizontal,
FFVulkanShader *shd_vertical,
- FFVkSPIRVCompiler *spv,
- const AVPixFmtDescriptor *desc, int planes)
+ int planes)
{
int err;
- uint8_t *spv_data;
- size_t spv_len;
- void *spv_opaque = NULL;
FFVulkanShader *shd;
- FFVulkanDescriptorSetBinding *desc_set;
+ /* Horizontal pass */
shd = shd_horizontal;
- RET(ff_vk_shader_init(vkctx, shd, "nlmeans_horizontal",
- VK_SHADER_STAGE_COMPUTE_BIT,
- (const char *[]) { "GL_EXT_buffer_reference",
- "GL_EXT_buffer_reference2" }, 2,
- WG_SIZE, 1, 1,
- 0));
- shared_shd_def(shd);
+ ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+ (uint32_t []) { WG_SIZE, 1, 1 }, 0);
- GLSLC(0, );
- GLSLC(0, void main() );
- GLSLC(0, { );
- GLSLC(1, uint64_t offset; );
- GLSLC(1, DataBuffer dst; );
- GLSLC(1, BlockBuffer b_dst; );
- GLSLC(1, Block block; );
- GLSLC(1, DTYPE s2; );
- GLSLC(1, DTYPE prefix_sum; );
- GLSLC(1, ivec2 pos; );
- GLSLC(1, int k; );
- GLSLC(1, int o; );
- GLSLC(0, );
- GLSLC(1, DataBuffer integral_data; );
- GLSLC(0, );
- GLSLC(1, uint c_plane; );
- GLSLC(0, );
- GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.y); );
- GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z); );
- GLSLC(0, );
- GLSLC(1, if (strength[comp_idx] == 0.0) );
- GLSLC(2, return; );
- GLSLC(0, );
- GLSLC(1, offset = integral_size * (invoc_idx * nb_components + comp_idx); );
- GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
- GLSLC(0, );
- GLSLC(1, c_plane = comp_plane[comp_idx]; );
- GLSLC(0, );
- GLSLC(1, pos.y = int(gl_GlobalInvocationID.x); );
- GLSLC(1, if (pos.y < height[c_plane]) { );
- GLSLC(2, prefix_sum = DTYPE(0); );
- GLSLC(2, offset = int_stride * uint64_t(pos.y); );
- GLSLC(2, b_dst = BlockBuffer(uint64_t(integral_data) + offset); );
- GLSLC(0, );
- GLSLC(2, for (k = 0; k * T_BLOCK_ELEMS < width[c_plane]; k++) { );
- GLSLC(3, block = b_dst.v[k]; );
- GLSLC(3, for (o = 0; o < T_BLOCK_ELEMS; o++) { );
- GLSLC(4, s2 = block.data[o]; );
- GLSLC(4, block.data[o] = s2 + prefix_sum; );
- GLSLC(4, prefix_sum += s2; );
- GLSLC(3, } );
- GLSLC(3, b_dst.v[k] = block; );
- GLSLC(2, } );
- GLSLC(1, } );
- GLSLC(0, } );
+ ff_vk_shader_add_push_const(shd, 0, sizeof(IntegralPushData),
+ VK_SHADER_STAGE_COMPUTE_BIT);
- RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
- RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+ RET(ff_vk_shader_link(vkctx, shd,
+ ff_nlmeans_horizontal_comp_spv_data,
+ ff_nlmeans_horizontal_comp_spv_len, "main"));
RET(ff_vk_shader_register_exec(vkctx, exec, shd));
+ /* Vertical pass */
shd = shd_vertical;
- RET(ff_vk_shader_init(vkctx, shd, "nlmeans_vertical",
- VK_SHADER_STAGE_COMPUTE_BIT,
- (const char *[]) { "GL_EXT_buffer_reference",
- "GL_EXT_buffer_reference2" }, 2,
- WG_SIZE, 1, 1,
- 0));
- shared_shd_def(shd);
+ ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+ (uint32_t []) { WG_SIZE, 1, 1 }, 0);
- desc_set = (FFVulkanDescriptorSetBinding []) {
- {
- .name = "input_img",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
- .mem_layout = ff_vk_shader_rep_fmt(vkctx->input_format, FF_VK_REP_FLOAT),
- .mem_quali = "readonly",
- .dimensions = 2,
- .elems = planes,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ ff_vk_shader_add_push_const(shd, 0, sizeof(IntegralPushData),
+ VK_SHADER_STAGE_COMPUTE_BIT);
+
+ const FFVulkanDescriptorSetBinding desc_set_img[] = {
+ { /* input_img */
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .elems = planes,
},
};
- RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 0, 0));
+ ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_img, 1, 0, 0);
- desc_set = (FFVulkanDescriptorSetBinding []) {
- {
- .name = "xyoffsets_buffer",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .mem_quali = "readonly",
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "ivec2 xyoffsets[];",
+ const FFVulkanDescriptorSetBinding desc_set_xyoffsets[] = {
+ { /* xyoffsets_buffer */
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
};
- RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 1, 0));
+ ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_xyoffsets, 1, 1, 0);
- GLSLC(0, );
- GLSLC(0, void main() );
- GLSLC(0, { );
- GLSLC(1, uint64_t offset; );
- GLSLC(1, DataBuffer dst; );
- GLSLC(1, float s1; );
- GLSLC(1, DTYPE s2; );
- GLSLC(1, DTYPE prefix_sum; );
- GLSLC(1, uvec2 size; );
- GLSLC(1, ivec2 pos; );
- GLSLC(1, ivec2 pos_off; );
- GLSLC(0, );
- GLSLC(1, DataBuffer integral_data; );
- GLSLF(1, ivec2 offs[%i]; ,TYPE_ELEMS);
- GLSLC(0, );
- GLSLC(1, uint c_off; );
- GLSLC(1, uint c_plane; );
- GLSLC(0, );
- GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.y); );
- GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z); );
- GLSLC(0, );
- GLSLC(1, if (strength[comp_idx] == 0.0) );
- GLSLC(2, return; );
- GLSLC(0, );
- GLSLC(1, offset = integral_size * (invoc_idx * nb_components + comp_idx); );
- GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
- for (int i = 0; i < TYPE_ELEMS; i++)
- GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i]; ,i,TYPE_ELEMS,i);
- GLSLC(0, );
- GLSLC(1, c_off = comp_off[comp_idx]; );
- GLSLC(1, c_plane = comp_plane[comp_idx]; );
- GLSLC(1, size = imageSize(input_img[c_plane]); );
- GLSLC(0, );
- GLSLC(1, pos.x = int(gl_GlobalInvocationID.x); );
- GLSLC(1, if (pos.x < width[c_plane]) { );
- GLSLC(2, prefix_sum = DTYPE(0); );
- GLSLC(2, for (pos.y = 0; pos.y < height[c_plane]; pos.y++) { );
- GLSLC(3, offset = int_stride * uint64_t(pos.y); );
- GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
- GLSLC(4, s1 = imageLoad(input_img[c_plane], pos)[c_off]; );
- for (int i = 0; i < TYPE_ELEMS; i++) {
- GLSLF(4, pos_off = pos + offs[%i]; ,i);
- GLSLC(4, if (!IS_WITHIN(uvec2(pos_off), size)) );
- GLSLF(5, s2[%i] = s1; ,i);
- GLSLC(4, else );
- GLSLF(5, s2[%i] = imageLoad(input_img[c_plane], pos_off)[c_off]; ,i);
- }
- GLSLC(4, s2 = (s1 - s2) * (s1 - s2); );
- GLSLC(3, dst.v[pos.x] = s2 + prefix_sum; );
- GLSLC(3, prefix_sum += s2; );
- GLSLC(2, } );
- GLSLC(1, } );
- GLSLC(0, } );
-
- RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
- RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+ RET(ff_vk_shader_link(vkctx, shd,
+ ff_nlmeans_vertical_comp_spv_data,
+ ff_nlmeans_vertical_comp_spv_len, "main"));
RET(ff_vk_shader_register_exec(vkctx, exec, shd));
fail:
- if (spv_opaque)
- spv->free_shader(spv, &spv_opaque);
-
return err;
}
@@ -305,172 +160,48 @@ typedef struct WeightsPushData {
} WeightsPushData;
static av_cold int init_weights_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
- FFVulkanShader *shd,
- FFVkSPIRVCompiler *spv,
- const AVPixFmtDescriptor *desc,
- int planes)
+ FFVulkanShader *shd, int planes)
{
int err;
- uint8_t *spv_data;
- size_t spv_len;
- void *spv_opaque = NULL;
- FFVulkanDescriptorSetBinding *desc_set;
- RET(ff_vk_shader_init(vkctx, shd, "nlmeans_weights",
- VK_SHADER_STAGE_COMPUTE_BIT,
- (const char *[]) { "GL_EXT_buffer_reference",
- "GL_EXT_buffer_reference2" }, 2,
- WG_SIZE, WG_SIZE, 1,
- 0));
-
- GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
- GLSLC(0, );
- GLSLF(0, #define DTYPE %s ,TYPE_NAME);
- GLSLF(0, #define T_ALIGN %i ,TYPE_SIZE);
- GLSLC(0, );
- GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
- GLSLC(1, DTYPE v[]; );
- GLSLC(0, }; );
- GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
- GLSLC(1, uvec4 width; );
- GLSLC(1, uvec4 height; );
- GLSLC(1, uvec4 ws_offset; );
- GLSLC(1, uvec4 ws_stride; );
- GLSLC(1, ivec4 patch_size; );
- GLSLC(1, vec4 strength; );
- GLSLC(1, uvec4 comp_off; );
- GLSLC(1, uvec4 comp_plane; );
- GLSLC(1, DataBuffer integral_base; );
- GLSLC(1, uint64_t integral_size; );
- GLSLC(1, uint64_t int_stride; );
- GLSLC(1, uint xyoffs_start; );
- GLSLC(1, uint ws_count; );
- GLSLC(1, uint nb_components; );
- GLSLC(0, }; );
- GLSLC(0, );
+ ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+ (uint32_t []) { WG_SIZE, WG_SIZE, 1 }, 0);
ff_vk_shader_add_push_const(shd, 0, sizeof(WeightsPushData),
VK_SHADER_STAGE_COMPUTE_BIT);
- desc_set = (FFVulkanDescriptorSetBinding []) {
- {
- .name = "input_img",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
- .mem_layout = ff_vk_shader_rep_fmt(vkctx->input_format, FF_VK_REP_FLOAT),
- .mem_quali = "readonly",
- .dimensions = 2,
- .elems = planes,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ const FFVulkanDescriptorSetBinding desc_set[] = {
+ { /* input_img */
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .elems = planes,
},
- {
- .name = "weights_buffer",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float weights[];",
+ { /* weights_buffer */
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
- {
- .name = "sums_buffer",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float sums[];",
+ { /* sums_buffer */
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
};
- RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 3, 0, 0));
+ ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 3, 0, 0);
- desc_set = (FFVulkanDescriptorSetBinding []) {
- {
- .name = "xyoffsets_buffer",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .mem_quali = "readonly",
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "ivec2 xyoffsets[];",
+ const FFVulkanDescriptorSetBinding desc_set_xyoffsets[] = {
+ { /* xyoffsets_buffer */
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
};
- RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 1, 1, 0));
+ ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_xyoffsets, 1, 1, 0);
- GLSLC(0, );
- GLSLC(0, void main() );
- GLSLC(0, { );
- GLSLC(1, uint64_t offset; );
- GLSLC(1, DataBuffer dst; );
- GLSLC(1, uvec2 size; );
- GLSLC(1, ivec2 pos; );
- GLSLC(1, ivec2 pos_off; );
- GLSLC(1, int p; );
- GLSLC(1, float s; );
- GLSLC(0, );
- GLSLC(1, DataBuffer integral_data; );
- GLSLF(1, ivec2 offs[%i]; ,TYPE_ELEMS);
- GLSLC(0, );
- GLSLC(1, uint c_off; );
- GLSLC(1, uint c_plane; );
- GLSLC(1, uint ws_off; );
- GLSLC(0, );
- GLSLC(1, pos = ivec2(gl_GlobalInvocationID.xy); );
- GLSLC(1, uint comp_idx = uint(gl_WorkGroupID.z) %% nb_components; );
- GLSLC(1, uint invoc_idx = uint(gl_WorkGroupID.z) / nb_components; );
- GLSLC(0, );
- GLSLC(1, c_off = comp_off[comp_idx]; );
- GLSLC(1, c_plane = comp_plane[comp_idx]; );
- GLSLC(1, p = patch_size[comp_idx]; );
- GLSLC(1, s = strength[comp_idx]; );
- GLSLC(1, if (s == 0.0 || pos.x < p || pos.y < p || pos.x >= width[c_plane] - p || pos.y >= height[c_plane] - p) );
- GLSLC(2, return; );
- GLSLC(0, );
- GLSLC(1, offset = integral_size * (invoc_idx * nb_components + comp_idx); );
- GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
- for (int i = 0; i < TYPE_ELEMS; i++)
- GLSLF(1, offs[%i] = xyoffsets[xyoffs_start + %i*invoc_idx + %i]; ,i,TYPE_ELEMS,i);
- GLSLC(0, );
- GLSLC(1, ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x; );
- GLSLC(1, size = imageSize(input_img[c_plane]); );
- GLSLC(0, );
- GLSLC(1, DTYPE a; );
- GLSLC(1, DTYPE b; );
- GLSLC(1, DTYPE c; );
- GLSLC(1, DTYPE d; );
- GLSLC(0, );
- GLSLC(1, DTYPE patch_diff; );
- GLSLC(1, vec4 src; );
- GLSLC(1, vec4 w; );
- GLSLC(1, float w_sum; );
- GLSLC(1, float sum; );
- GLSLC(0, );
- for (int i = 0; i < 4; i++) {
- GLSLF(1, pos_off = pos + offs[%i]; ,i);
- GLSLC(1, if (!IS_WITHIN(uvec2(pos_off), size)) );
- GLSLF(2, src[%i] = imageLoad(input_img[c_plane], pos)[c_off]; ,i);
- GLSLC(1, else );
- GLSLF(2, src[%i] = imageLoad(input_img[c_plane], pos_off)[c_off]; ,i);
- }
- GLSLC(0, );
- GLSLC(1, offset = int_stride * uint64_t(pos.y - p); );
- GLSLC(1, dst = DataBuffer(uint64_t(integral_data) + offset); );
- GLSLC(1, a = dst.v[pos.x - p]; );
- GLSLC(1, c = dst.v[pos.x + p]; );
- GLSLC(1, offset = int_stride * uint64_t(pos.y + p); );
- GLSLC(1, dst = DataBuffer(uint64_t(integral_data) + offset); );
- GLSLC(1, b = dst.v[pos.x - p]; );
- GLSLC(1, d = dst.v[pos.x + p]; );
- GLSLC(0, );
- GLSLC(1, patch_diff = d + a - b - c; );
- GLSLC(1, w = exp(patch_diff * s); );
- GLSLC(1, w_sum = w[0] + w[1] + w[2] + w[3]; );
- GLSLC(1, sum = dot(w, src * 255); );
- GLSLC(0, );
- GLSLC(1, weights[ws_off] += w_sum; );
- GLSLC(1, sums[ws_off] += sum; );
- GLSLC(0, } );
-
- RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
- RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+ RET(ff_vk_shader_link(vkctx, shd,
+ ff_nlmeans_weights_comp_spv_data,
+ ff_nlmeans_weights_comp_spv_len, "main"));
RET(ff_vk_shader_register_exec(vkctx, exec, shd));
fail:
- if (spv_opaque)
- spv->free_shader(spv, &spv_opaque);
-
return err;
}
@@ -485,121 +216,49 @@ typedef struct DenoisePushData {
} DenoisePushData;
static av_cold int init_denoise_pipeline(FFVulkanContext *vkctx, FFVkExecPool *exec,
- FFVulkanShader *shd, FFVkSPIRVCompiler *spv,
- const AVPixFmtDescriptor *desc, int planes)
+ FFVulkanShader *shd, int planes)
{
int err;
- uint8_t *spv_data;
- size_t spv_len;
- void *spv_opaque = NULL;
- FFVulkanDescriptorSetBinding *desc_set;
- RET(ff_vk_shader_init(vkctx, shd, "nlmeans_denoise",
- VK_SHADER_STAGE_COMPUTE_BIT,
- (const char *[]) { "GL_EXT_buffer_reference",
- "GL_EXT_buffer_reference2" }, 2,
- WG_SIZE, WG_SIZE, 1,
- 0));
- GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
- GLSLC(1, uvec4 comp_off; );
- GLSLC(1, uvec4 comp_plane; );
- GLSLC(1, uvec4 ws_offset; );
- GLSLC(1, uvec4 ws_stride; );
- GLSLC(1, uint32_t ws_count; );
- GLSLC(1, uint32_t t; );
- GLSLC(1, uint32_t nb_components; );
- GLSLC(0, }; );
+ ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+ (uint32_t []) { WG_SIZE, WG_SIZE, 1 }, 0);
ff_vk_shader_add_push_const(shd, 0, sizeof(DenoisePushData),
VK_SHADER_STAGE_COMPUTE_BIT);
- desc_set = (FFVulkanDescriptorSetBinding []) {
- {
- .name = "input_img",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
- .mem_layout = ff_vk_shader_rep_fmt(vkctx->input_format, FF_VK_REP_FLOAT),
- .mem_quali = "readonly",
- .dimensions = 2,
- .elems = planes,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ const FFVulkanDescriptorSetBinding desc_set_img[] = {
+ { /* input_img */
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .elems = planes,
},
- {
- .name = "output_img",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
- .mem_layout = ff_vk_shader_rep_fmt(vkctx->output_format, FF_VK_REP_FLOAT),
- .mem_quali = "writeonly",
- .dimensions = 2,
- .elems = planes,
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ { /* output_img */
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+ .elems = planes,
},
};
- RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 2, 0, 0));
+ ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_img, 2, 0, 0);
- desc_set = (FFVulkanDescriptorSetBinding []) {
- {
- .name = "weights_buffer",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .mem_quali = "readonly",
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float weights[];",
+ const FFVulkanDescriptorSetBinding desc_set_ws[] = {
+ { /* weights_buffer */
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
- {
- .name = "sums_buffer",
- .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- .mem_quali = "readonly",
- .stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "float sums[];",
+ { /* sums_buffer */
+ .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ .stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
};
+ ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set_ws, 2, 0, 0);
- RET(ff_vk_shader_add_descriptor_set(vkctx, shd, desc_set, 2, 0, 0));
-
- GLSLC(0, void main() );
- GLSLC(0, { );
- GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
- GLSLC(1, const uint plane = uint(gl_WorkGroupID.z); );
- GLSLC(1, const uvec2 size = imageSize(output_img[plane]); );
- GLSLC(0, );
- GLSLC(1, uint c_off; );
- GLSLC(1, uint c_plane; );
- GLSLC(1, uint ws_off; );
- GLSLC(0, );
- GLSLC(1, float w_sum; );
- GLSLC(1, float sum; );
- GLSLC(1, vec4 src; );
- GLSLC(1, vec4 r; );
- GLSLC(1, uint invoc_idx; );
- GLSLC(1, uint comp_idx; );
- GLSLC(0, );
- GLSLC(1, if (!IS_WITHIN(pos, size)) );
- GLSLC(2, return; );
- GLSLC(0, );
- GLSLC(1, src = imageLoad(input_img[plane], pos); );
- GLSLC(1, for (comp_idx = 0; comp_idx < nb_components; comp_idx++) { );
- GLSLC(2, if (plane == comp_plane[comp_idx]) { );
- GLSLC(3, w_sum = 0.0; );
- GLSLC(3, sum = 0.0; );
- GLSLC(3, for (invoc_idx = 0; invoc_idx < t; invoc_idx++) { );
- GLSLC(4, ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x; );
- GLSLC(4, w_sum += weights[ws_off]; );
- GLSLC(4, sum += sums[ws_off]; );
- GLSLC(3, } );
- GLSLC(3, c_off = comp_off[comp_idx]; );
- GLSLC(3, r[c_off] = (sum + src[c_off] * 255) / (1.0 + w_sum) / 255; );
- GLSLC(2, } );
- GLSLC(1, } );
- GLSLC(1, imageStore(output_img[plane], pos, r); );
- GLSLC(0, } );
-
- RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
- RET(ff_vk_shader_link(vkctx, shd, spv_data, spv_len, "main"));
+ RET(ff_vk_shader_link(vkctx, shd,
+ ff_nlmeans_denoise_comp_spv_data,
+ ff_nlmeans_denoise_comp_spv_len, "main"));
RET(ff_vk_shader_register_exec(vkctx, exec, shd));
fail:
- if (spv_opaque)
- spv->free_shader(spv, &spv_opaque);
-
return err;
}
@@ -610,15 +269,9 @@ static av_cold int init_filter(AVFilterContext *ctx)
NLMeansVulkanContext *s = ctx->priv;
FFVulkanContext *vkctx = &s->vkctx;
const int planes = av_pix_fmt_count_planes(s->vkctx.output_format);
- FFVkSPIRVCompiler *spv = NULL;
int *offsets_buf;
int offsets_dispatched = 0, nb_dispatches = 0;
- const AVPixFmtDescriptor *desc;
- desc = av_pix_fmt_desc_get(vkctx->output_format);
- if (!desc)
- return AVERROR(EINVAL);
-
if (!(s->opts.r & 1)) {
s->opts.r |= 1;
av_log(ctx, AV_LOG_WARNING, "Research size should be odd, setting to %i",
@@ -682,12 +335,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
s->opts.t = FFMIN(s->opts.t, (FFALIGN(s->nb_offsets, TYPE_ELEMS) / TYPE_ELEMS));
- spv = ff_vk_spirv_init();
- if (!spv) {
- av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
- return AVERROR_EXTERNAL;
- }
-
s->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0);
if (!s->qf) {
av_log(ctx, AV_LOG_ERROR, "Device has no compute queues\n");
@@ -698,11 +345,11 @@ static av_cold int init_filter(AVFilterContext *ctx)
RET(ff_vk_exec_pool_init(vkctx, s->qf, &s->e, 1, 0, 0, 0, NULL));
RET(init_integral_pipeline(vkctx, &s->e, &s->shd_horizontal, &s->shd_vertical,
- spv, desc, planes));
+ planes));
- RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, spv, desc, planes));
+ RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, planes));
- RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, spv, desc, planes));
+ RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, planes));
RET(ff_vk_shader_update_desc_buffer(vkctx, &s->e.contexts[0], &s->shd_vertical,
1, 0, 0,
@@ -726,9 +373,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
s->initialized = 1;
fail:
- if (spv)
- spv->uninit(&spv);
-
return err;
}
diff --git a/libavfilter/vulkan/Makefile b/libavfilter/vulkan/Makefile
index 6d25cf8a50..cd303e535e 100644
--- a/libavfilter/vulkan/Makefile
+++ b/libavfilter/vulkan/Makefile
@@ -15,3 +15,7 @@ OBJS-$(CONFIG_TRANSPOSE_VULKAN_FILTER) += vulkan/transpose.comp.spv.o
OBJS-$(CONFIG_V360_VULKAN_FILTER) += vulkan/v360.comp.spv.o
OBJS-$(CONFIG_INTERLACE_VULKAN_FILTER) += vulkan/interlace.comp.spv.o
OBJS-$(CONFIG_XFADE_VULKAN_FILTER) += vulkan/xfade.comp.spv.o
+OBJS-$(CONFIG_NLMEANS_VULKAN_FILTER) += vulkan/nlmeans_horizontal.comp.spv.o \
+ vulkan/nlmeans_vertical.comp.spv.o \
+ vulkan/nlmeans_weights.comp.spv.o \
+ vulkan/nlmeans_denoise.comp.spv.o
diff --git a/libavfilter/vulkan/nlmeans_denoise.comp.glsl b/libavfilter/vulkan/nlmeans_denoise.comp.glsl
new file mode 100644
index 0000000000..974c09318f
--- /dev/null
+++ b/libavfilter/vulkan/nlmeans_denoise.comp.glsl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_shader_image_load_formatted : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_nonuniform_qualifier : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) in; /* workgroup size is taken from specialization constants 253..255 */
+
+layout (push_constant, scalar) uniform pushConstants { /* NOTE(review): presumably mirrors DenoisePushData in vf_nlmeans_vulkan.c - confirm field order */
+ uvec4 comp_off; /* per component: channel index inside its plane's texel */
+ uvec4 comp_plane; /* per component: index of the plane that stores it */
+ uvec4 ws_offset; /* per component: base index into weights[] / sums[] */
+ uvec4 ws_stride; /* per component: row stride, in floats, of that region */
+ uint32_t ws_count; /* index distance between two consecutive accumulation slices */
+ uint32_t t; /* number of accumulation slices summed per pixel in main() */
+ uint32_t nb_components; /* number of entries of the per-component tables that main() visits */
+};
+
+layout (set = 0, binding = 0) uniform readonly image2D input_img[]; /* indexed by plane */
+layout (set = 0, binding = 1) uniform writeonly image2D output_img[]; /* indexed by plane */
+
+layout (set = 1, binding = 0, scalar) readonly buffer weights_buffer {
+ float weights[]; /* accumulated weight totals, addressed with ws_off in main() */
+};
+
+layout (set = 1, binding = 1, scalar) readonly buffer sums_buffer {
+ float sums[]; /* accumulated weighted pixel values, same addressing as weights[] */
+};
+
+void main() /* Per pixel of one plane: fold the accumulated weights/sums into the source value and store the normalized result. */
+{
+ const ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
+ const uint plane = uint(gl_WorkGroupID.z); /* the z workgroup index selects the image plane */
+ const ivec2 size = imageSize(output_img[plane]);
+
+ uint c_off;
+ uint c_plane; /* declared but never used in this shader */
+ uint ws_off;
+
+ float w_sum;
+ float sum;
+ vec4 src;
+ vec4 r; /* NOTE(review): only channels of components mapped to this plane are assigned before imageStore() - confirm the rest are never observable */
+ uint invoc_idx;
+ uint comp_idx;
+
+ if (any(greaterThanEqual(pos, size))) /* invocation lies outside the plane */
+ return;
+
+ src = imageLoad(input_img[plane], pos);
+ for (comp_idx = 0; comp_idx < nb_components; comp_idx++) {
+ if (plane == comp_plane[comp_idx]) { /* handle only the components stored in this plane */
+ w_sum = 0.0;
+ sum = 0.0;
+ for (invoc_idx = 0; invoc_idx < t; invoc_idx++) { /* reduce the t partial accumulations kept for this pixel */
+ ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x;
+ w_sum += weights[ws_off];
+ sum += sums[ws_off];
+ }
+ c_off = comp_off[comp_idx];
+ r[c_off] = (sum + src[c_off] * 255) / (1.0 + w_sum) / 255; /* source pixel joins with weight 1; sums[] is built from src * 255 in nlmeans_weights.comp.glsl, hence the rescaling */
+ }
+ }
+ imageStore(output_img[plane], pos, r);
+}
diff --git a/libavfilter/vulkan/nlmeans_horizontal.comp.glsl b/libavfilter/vulkan/nlmeans_horizontal.comp.glsl
new file mode 100644
index 0000000000..d1bd62ccb1
--- /dev/null
+++ b/libavfilter/vulkan/nlmeans_horizontal.comp.glsl
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_ARB_gpu_shader_int64 : require
+
+/* Must be kept in sync with the definitions in vf_nlmeans_vulkan.c */
+#define DTYPE vec4
+#define T_ALIGN 16
+#define T_BLOCK_ELEMS 16
+#define T_BLOCK_ALIGN 256
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) in; /* workgroup size is taken from specialization constants 253..255 */
+
+layout (buffer_reference, buffer_reference_align = T_ALIGN, scalar) buffer DataBuffer { /* integral-image memory viewed as an array of DTYPE */
+ DTYPE v[];
+};
+
+struct Block { /* T_BLOCK_ELEMS consecutive DTYPE values, loaded and stored as one unit by main() */
+ DTYPE data[T_BLOCK_ELEMS];
+};
+
+layout (buffer_reference, buffer_reference_align = T_BLOCK_ALIGN, scalar) buffer BlockBuffer { /* same memory viewed in whole Blocks */
+ Block v[];
+};
+
+layout (push_constant, scalar) uniform pushConstants { /* same layout as the push constants of nlmeans_vertical.comp.glsl */
+ uvec4 width; /* indexed by plane */
+ uvec4 height; /* indexed by plane */
+ vec4 strength; /* per component; 0.0 makes main() return immediately */
+ uvec4 comp_off; /* not read by this shader */
+ uvec4 comp_plane; /* per component: index of the plane that stores it */
+ DataBuffer integral_base; /* address of the first integral image */
+ uint64_t integral_size; /* byte distance between two consecutive integral images */
+ uint64_t int_stride; /* byte distance between two rows of an integral image */
+ uint xyoffs_start; /* not read by this shader */
+ uint nb_components; /* integral images per slice, see the offset computation in main() */
+};
+
+void main() /* One invocation per row: replaces the row of the selected integral image with its running (prefix) sum, in place. */
+{
+ uint64_t offset;
+ BlockBuffer b_dst;
+ Block block;
+ DTYPE s2;
+ DTYPE prefix_sum;
+ ivec2 pos;
+ int k;
+ int o;
+
+ DataBuffer integral_data;
+
+ uint c_plane;
+
+ uint comp_idx = uint(gl_WorkGroupID.y); /* y workgroup index selects the component */
+ uint invoc_idx = uint(gl_WorkGroupID.z); /* z workgroup index selects the slice */
+
+ if (strength[comp_idx] == 0.0) /* nothing to do for a component with zero strength */
+ return;
+
+ offset = integral_size * (invoc_idx * nb_components + comp_idx); /* integral images are laid out slice-major, then by component */
+ integral_data = DataBuffer(uint64_t(integral_base) + offset);
+
+ c_plane = comp_plane[comp_idx];
+
+ pos.y = int(gl_GlobalInvocationID.x); /* the x invocation index is used as the row number here */
+ if (pos.y < height[c_plane]) {
+ prefix_sum = DTYPE(0);
+ offset = int_stride * uint64_t(pos.y);
+ b_dst = BlockBuffer(uint64_t(integral_data) + offset); /* start of this row, accessed in whole Blocks */
+
+ for (k = 0; k * T_BLOCK_ELEMS < width[c_plane]; k++) { /* NOTE(review): the last Block may reach past width - assumes rows are padded to a multiple of T_BLOCK_ELEMS, confirm against int_stride */
+ block = b_dst.v[k];
+ for (o = 0; o < T_BLOCK_ELEMS; o++) {
+ s2 = block.data[o];
+ block.data[o] = s2 + prefix_sum; /* inclusive sum: the element itself is part of its own total */
+ prefix_sum += s2;
+ }
+ b_dst.v[k] = block;
+ }
+ }
+}
diff --git a/libavfilter/vulkan/nlmeans_vertical.comp.glsl b/libavfilter/vulkan/nlmeans_vertical.comp.glsl
new file mode 100644
index 0000000000..d5842f4a16
--- /dev/null
+++ b/libavfilter/vulkan/nlmeans_vertical.comp.glsl
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_shader_image_load_formatted : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_nonuniform_qualifier : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_ARB_gpu_shader_int64 : require
+
+/* Must be kept in sync with the definitions in vf_nlmeans_vulkan.c */
+#define DTYPE vec4
+#define T_ALIGN 16
+#define T_BLOCK_ELEMS 16
+#define T_BLOCK_ALIGN 256
+#define TYPE_ELEMS 4
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) in; /* workgroup size is taken from specialization constants 253..255 */
+
+layout (buffer_reference, buffer_reference_align = T_ALIGN, scalar) buffer DataBuffer { /* integral-image memory viewed as an array of DTYPE */
+ DTYPE v[];
+};
+
+struct Block { /* not used by main() of this shader */
+ DTYPE data[T_BLOCK_ELEMS];
+};
+
+layout (buffer_reference, buffer_reference_align = T_BLOCK_ALIGN, scalar) buffer BlockBuffer { /* not used by main() of this shader */
+ Block v[];
+};
+
+layout (push_constant, scalar) uniform pushConstants { /* same layout as the push constants of nlmeans_horizontal.comp.glsl */
+ uvec4 width; /* indexed by plane */
+ uvec4 height; /* indexed by plane */
+ vec4 strength; /* per component; 0.0 makes main() return immediately */
+ uvec4 comp_off; /* per component: channel index inside its plane's texel */
+ uvec4 comp_plane; /* per component: index of the plane that stores it */
+ DataBuffer integral_base; /* address of the first integral image */
+ uint64_t integral_size; /* byte distance between two consecutive integral images */
+ uint64_t int_stride; /* byte distance between two rows of an integral image */
+ uint xyoffs_start; /* index of the first entry of xyoffsets[] used by this dispatch */
+ uint nb_components; /* integral images per slice, see the offset computation in main() */
+};
+
+layout (set = 0, binding = 0) uniform readonly image2D input_img[]; /* indexed by plane */
+
+layout (set = 1, binding = 0, scalar) readonly buffer xyoffsets_buffer {
+ ivec2 xyoffsets[]; /* pixel displacements; TYPE_ELEMS consecutive entries are consumed per slice */
+};
+
+void main() /* One invocation per column: walks down the column and writes the running sum of squared differences between each pixel and its TYPE_ELEMS displaced neighbours. */
+{
+ uint64_t offset;
+ DataBuffer dst;
+ float s1;
+ DTYPE s2;
+ DTYPE prefix_sum;
+ uvec2 size;
+ ivec2 pos;
+ ivec2 pos_off;
+
+ DataBuffer integral_data;
+ ivec2 offs[TYPE_ELEMS];
+
+ uint c_off;
+ uint c_plane;
+
+ uint comp_idx = uint(gl_WorkGroupID.y); /* y workgroup index selects the component */
+ uint invoc_idx = uint(gl_WorkGroupID.z); /* z workgroup index selects the slice */
+
+ if (strength[comp_idx] == 0.0) /* nothing to do for a component with zero strength */
+ return;
+
+ offset = integral_size * (invoc_idx * nb_components + comp_idx); /* integral images are laid out slice-major, then by component */
+ integral_data = DataBuffer(uint64_t(integral_base) + offset);
+ for (uint i = 0; i < TYPE_ELEMS; i++)
+ offs[i] = xyoffsets[xyoffs_start + TYPE_ELEMS*invoc_idx + i]; /* one displacement per lane of DTYPE */
+
+ c_off = comp_off[comp_idx];
+ c_plane = comp_plane[comp_idx];
+ size = imageSize(input_img[c_plane]);
+
+ pos.x = int(gl_GlobalInvocationID.x);
+ if (pos.x < width[c_plane]) {
+ prefix_sum = DTYPE(0);
+ for (pos.y = 0; pos.y < height[c_plane]; pos.y++) {
+ offset = int_stride * uint64_t(pos.y);
+ dst = DataBuffer(uint64_t(integral_data) + offset); /* start of row pos.y in the integral image */
+ s1 = imageLoad(input_img[c_plane], pos)[c_off];
+ for (int i = 0; i < TYPE_ELEMS; i++) {
+ pos_off = pos + offs[i];
+ if (any(greaterThanEqual(uvec2(pos_off), size))) /* negative coordinates turn into large unsigned values, so this one test rejects both sides */
+ s2[i] = s1; /* out of range: the difference below becomes zero */
+ else
+ s2[i] = imageLoad(input_img[c_plane], pos_off)[c_off];
+ }
+ s2 = (s1 - s2) * (s1 - s2); /* squared difference, one lane per displacement */
+ dst.v[pos.x] = s2 + prefix_sum; /* inclusive running sum down the column */
+ prefix_sum += s2;
+ }
+ }
+}
diff --git a/libavfilter/vulkan/nlmeans_weights.comp.glsl b/libavfilter/vulkan/nlmeans_weights.comp.glsl
new file mode 100644
index 0000000000..24c918bd0a
--- /dev/null
+++ b/libavfilter/vulkan/nlmeans_weights.comp.glsl
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) Lynne
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_shader_image_load_formatted : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_nonuniform_qualifier : require
+#extension GL_EXT_buffer_reference : require
+#extension GL_EXT_buffer_reference2 : require
+#extension GL_EXT_shader_explicit_arithmetic_types : require
+#extension GL_ARB_gpu_shader_int64 : require
+
+/* Must be kept in sync with the definitions in vf_nlmeans_vulkan.c */
+#define DTYPE vec4
+#define T_ALIGN 16
+#define TYPE_ELEMS 4
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) in; /* workgroup size is taken from specialization constants 253..255 */
+
+layout (buffer_reference, buffer_reference_align = T_ALIGN, scalar) buffer DataBuffer { /* integral-image memory viewed as an array of DTYPE */
+ DTYPE v[];
+};
+
+layout (push_constant, scalar) uniform pushConstants { /* NOTE(review): presumably mirrors WeightsPushData in vf_nlmeans_vulkan.c - confirm field order */
+ uvec4 width; /* indexed by plane */
+ uvec4 height; /* indexed by plane */
+ uvec4 ws_offset; /* per component: base index into weights[] / sums[] */
+ uvec4 ws_stride; /* per component: row stride, in floats, of that region */
+ ivec4 patch_size; /* per component: distance from the pixel to the sampled integral corners */
+ vec4 strength; /* per component: factor applied inside exp(); 0.0 disables the component */
+ uvec4 comp_off; /* per component: channel index inside its plane's texel */
+ uvec4 comp_plane; /* per component: index of the plane that stores it */
+ DataBuffer integral_base; /* address of the first integral image */
+ uint64_t integral_size; /* byte distance between two consecutive integral images */
+ uint64_t int_stride; /* byte distance between two rows of an integral image */
+ uint xyoffs_start; /* index of the first entry of xyoffsets[] used by this dispatch */
+ uint ws_count; /* index distance between two consecutive accumulation slices */
+ uint nb_components; /* used to split gl_WorkGroupID.z into component and slice */
+};
+
+layout (set = 0, binding = 0) uniform readonly image2D input_img[]; /* indexed by plane */
+
+layout (set = 0, binding = 1, scalar) buffer weights_buffer { /* read-modify-written by main(), hence not readonly */
+ float weights[];
+};
+
+layout (set = 0, binding = 2, scalar) buffer sums_buffer { /* read-modify-written by main(), hence not readonly */
+ float sums[];
+};
+
+layout (set = 1, binding = 0, scalar) readonly buffer xyoffsets_buffer {
+ ivec2 xyoffsets[]; /* pixel displacements; TYPE_ELEMS consecutive entries are consumed per slice */
+};
+
+void main() /* Per pixel, component and slice: reads a patch difference from four integral-image corners, turns it into weights and accumulates weight and weighted-pixel totals. */
+{
+ uint64_t offset;
+ DataBuffer dst;
+ uvec2 size;
+ ivec2 pos;
+ ivec2 pos_off;
+ int p;
+ float s;
+
+ DataBuffer integral_data;
+ ivec2 offs[TYPE_ELEMS];
+
+ uint c_off;
+ uint c_plane;
+ uint ws_off;
+
+ pos = ivec2(gl_GlobalInvocationID.xy);
+ uint comp_idx = uint(gl_WorkGroupID.z) % nb_components; /* z packs both indices: component in the remainder... */
+ uint invoc_idx = uint(gl_WorkGroupID.z) / nb_components; /* ...and slice in the quotient */
+
+ c_off = comp_off[comp_idx];
+ c_plane = comp_plane[comp_idx];
+ p = patch_size[comp_idx];
+ s = strength[comp_idx];
+ if (s == 0.0 || pos.x < p || pos.y < p || pos.x >= width[c_plane] - p || pos.y >= height[c_plane] - p) /* skip disabled components and a border of p pixels; NOTE(review): width - p is unsigned and wraps if p exceeds the plane size - confirm the host prevents that */
+ return;
+
+ offset = integral_size * (invoc_idx * nb_components + comp_idx); /* integral images are laid out slice-major, then by component */
+ integral_data = DataBuffer(uint64_t(integral_base) + offset);
+ for (uint i = 0; i < TYPE_ELEMS; i++)
+ offs[i] = xyoffsets[xyoffs_start + TYPE_ELEMS*invoc_idx + i]; /* one displacement per lane of DTYPE */
+
+ ws_off = ws_count * invoc_idx + ws_offset[comp_idx] + pos.y * ws_stride[comp_idx] + pos.x; /* same addressing as in nlmeans_denoise.comp.glsl */
+ size = imageSize(input_img[c_plane]);
+
+ DTYPE a;
+ DTYPE b;
+ DTYPE c;
+ DTYPE d;
+
+ DTYPE patch_diff;
+ vec4 src;
+ vec4 w;
+ float w_sum;
+ float sum;
+
+ for (int i = 0; i < 4; i++) { /* literal 4 matches TYPE_ELEMS and the width of src */
+ pos_off = pos + offs[i];
+ if (any(greaterThanEqual(uvec2(pos_off), size))) /* negative coordinates turn into large unsigned values, so this one test rejects both sides */
+ src[i] = imageLoad(input_img[c_plane], pos)[c_off]; /* out of range: fall back to the pixel itself */
+ else
+ src[i] = imageLoad(input_img[c_plane], pos_off)[c_off];
+ }
+
+ offset = int_stride * uint64_t(pos.y - p);
+ dst = DataBuffer(uint64_t(integral_data) + offset); /* row y - p */
+ a = dst.v[pos.x - p]; /* corner (x - p, y - p) */
+ c = dst.v[pos.x + p]; /* corner (x + p, y - p) */
+ offset = int_stride * uint64_t(pos.y + p);
+ dst = DataBuffer(uint64_t(integral_data) + offset); /* row y + p */
+ b = dst.v[pos.x - p]; /* corner (x - p, y + p) */
+ d = dst.v[pos.x + p]; /* corner (x + p, y + p) */
+
+ patch_diff = d + a - b - c; /* box sum over the patch from its four integral corners */
+ w = exp(patch_diff * s); /* one weight per displacement; NOTE(review): s is presumably negative so larger differences weigh less - confirm on the host side */
+ w_sum = w[0] + w[1] + w[2] + w[3];
+ sum = dot(w, src * 255); /* weighted pixel values on a x255 scale */
+
+ weights[ws_off] += w_sum; /* accumulates on top of what earlier dispatches left in the buffer */
+ sums[ws_off] += sum;
+}
--
2.52.0
From f48c81e5fea86531322ec95cfaaedd610ba57805 Mon Sep 17 00:00:00 2001
From: Lynne <dev(a)lynne.ee>
Date: Tue, 21 Apr 2026 10:00:32 +0200
Subject: [PATCH 2/2] vf_blackdetect_vulkan: port to compile-time SPIR-V
generation
---
configure | 2 +-
libavfilter/vf_blackdetect_vulkan.c | 119 ++++++++---------------
libavfilter/vulkan/Makefile | 1 +
libavfilter/vulkan/blackdetect.comp.glsl | 64 ++++++++++++
4 files changed, 109 insertions(+), 77 deletions(-)
create mode 100644 libavfilter/vulkan/blackdetect.comp.glsl
diff --git a/configure b/configure
index d953074c89..32c9aacc62 100755
--- a/configure
+++ b/configure
@@ -4149,7 +4149,7 @@ ass_filter_deps="libass"
avgblur_opencl_filter_deps="opencl"
avgblur_vulkan_filter_deps="vulkan spirv_compiler"
azmq_filter_deps="libzmq"
-blackdetect_vulkan_filter_deps="vulkan spirv_library"
+blackdetect_vulkan_filter_deps="vulkan spirv_compiler"
blackframe_filter_deps="gpl"
blend_vulkan_filter_deps="vulkan spirv_compiler"
boxblur_filter_deps="gpl"
diff --git a/libavfilter/vf_blackdetect_vulkan.c b/libavfilter/vf_blackdetect_vulkan.c
index 279b057148..3abe2f9fb3 100644
--- a/libavfilter/vf_blackdetect_vulkan.c
+++ b/libavfilter/vf_blackdetect_vulkan.c
@@ -19,13 +19,14 @@
*/
#include <float.h>
-#include "libavutil/vulkan_spirv.h"
#include "libavutil/opt.h"
#include "libavutil/timestamp.h"
#include "vulkan_filter.h"
#include "filters.h"
-#include "video.h"
+
+extern const unsigned char ff_blackdetect_comp_spv_data[];
+extern const unsigned int ff_blackdetect_comp_spv_len;
typedef struct BlackDetectVulkanContext {
FFVulkanContext vkctx;
@@ -36,12 +37,16 @@ typedef struct BlackDetectVulkanContext {
FFVulkanShader shd;
AVBufferPool *sum_buf_pool;
- double black_min_duration_time;
- double picture_black_ratio_th;
- double pixel_black_th;
- int alpha;
+ double picture_black_ratio_th;
+ double pixel_black_th;
+ int alpha;
- int64_t black_start;
+ int black_started;
+ int64_t black_start; ///< pts start time of the first black picture
+ int64_t black_end; ///< pts end time of the last black picture
+ double black_min_duration_time; ///< minimum duration of detected black, in seconds
+ int64_t black_min_duration; ///< minimum duration of detected black, expressed in timebase units
+ AVRational time_base;
} BlackDetectVulkanContext;
typedef struct BlackDetectPushData {
@@ -56,14 +61,9 @@ typedef struct BlackDetectBuf {
static av_cold int init_filter(AVFilterContext *ctx)
{
int err;
- uint8_t *spv_data;
- size_t spv_len;
- void *spv_opaque = NULL;
BlackDetectVulkanContext *s = ctx->priv;
FFVulkanContext *vkctx = &s->vkctx;
- FFVulkanShader *shd;
- FFVkSPIRVCompiler *spv;
- FFVulkanDescriptorSetBinding *desc;
+ const AVFilterLink *inlink = ctx->inputs[0];
const int plane = s->alpha ? 3 : 0;
const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(s->vkctx.input_format);
@@ -72,12 +72,6 @@ static av_cold int init_filter(AVFilterContext *ctx)
return AVERROR(ENOTSUP);
}
- spv = ff_vk_spirv_init();
- if (!spv) {
- av_log(ctx, AV_LOG_ERROR, "Unable to initialize SPIR-V compiler!\n");
- return AVERROR_EXTERNAL;
- }
-
s->qf = ff_vk_qf_find(vkctx, VK_QUEUE_COMPUTE_BIT, 0);
if (!s->qf) {
av_log(ctx, AV_LOG_ERROR, "Device has no compute queues\n");
@@ -86,89 +80,58 @@ static av_cold int init_filter(AVFilterContext *ctx)
}
RET(ff_vk_exec_pool_init(vkctx, s->qf, &s->e, s->qf->num*4, 0, 0, 0, NULL));
- RET(ff_vk_shader_init(vkctx, &s->shd, "blackdetect",
- VK_SHADER_STAGE_COMPUTE_BIT,
- (const char *[]) { "GL_KHR_shader_subgroup_ballot" }, 1,
- 32, 32, 1,
- 0));
- shd = &s->shd;
- GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
- GLSLC(1, float threshold; );
- GLSLC(0, }; );
+ SPEC_LIST_CREATE(sl, 2, 2*sizeof(uint32_t))
+ SPEC_LIST_ADD(sl, 0, 32, plane);
+ SPEC_LIST_ADD(sl, 1, 32, SLICES);
- ff_vk_shader_add_push_const(shd, 0, sizeof(BlackDetectPushData),
+ ff_vk_shader_load(&s->shd, VK_SHADER_STAGE_COMPUTE_BIT, sl,
+ (int []) { 32, 32, 1 }, 0);
+
+ ff_vk_shader_add_push_const(&s->shd, 0, sizeof(BlackDetectPushData),
VK_SHADER_STAGE_COMPUTE_BIT);
- desc = (FFVulkanDescriptorSetBinding []) {
- {
- .name = "input_img",
+ const FFVulkanDescriptorSetBinding desc[] = {
+ { /* input_img */
.type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
- .mem_layout = ff_vk_shader_rep_fmt(s->vkctx.input_format, FF_VK_REP_FLOAT),
- .mem_quali = "readonly",
- .dimensions = 2,
- .elems = av_pix_fmt_count_planes(s->vkctx.input_format),
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
- }, {
- .name = "sum_buffer",
+ .elems = av_pix_fmt_count_planes(s->vkctx.input_format),
+ },
+ { /* sum_buffer */
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
- .buf_content = "uint slice_sum[];",
}
};
+ ff_vk_shader_add_descriptor_set(vkctx, &s->shd, desc, 2, 0, 0);
- RET(ff_vk_shader_add_descriptor_set(vkctx, &s->shd, desc, 2, 0, 0));
-
- GLSLC(0, shared uint wg_sum; );
- GLSLC(0, );
- GLSLC(0, void main() );
- GLSLC(0, { );
- GLSLC(1, wg_sum = 0u; );
- GLSLC(1, barrier(); );
- GLSLC(0, );
- GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
- GLSLF(1, if (!IS_WITHIN(pos, imageSize(input_img[%d]))) ,plane);
- GLSLC(2, return; );
- GLSLF(1, float value = imageLoad(input_img[%d], pos).x; ,plane);
- GLSLC(1, uvec4 isblack = subgroupBallot(value <= threshold); );
- GLSLC(1, if (subgroupElect()) );
- GLSLC(2, atomicAdd(wg_sum, subgroupBallotBitCount(isblack)); );
- GLSLC(1, barrier(); );
- GLSLC(1, if (gl_LocalInvocationIndex == 0u) );
- GLSLF(2, atomicAdd(slice_sum[gl_WorkGroupID.x %% %du], wg_sum); ,SLICES);
- GLSLC(0, } );
-
- RET(spv->compile_shader(vkctx, spv, &s->shd, &spv_data, &spv_len, "main",
- &spv_opaque));
- RET(ff_vk_shader_link(vkctx, &s->shd, spv_data, spv_len, "main"));
+ RET(ff_vk_shader_link(vkctx, &s->shd,
+ ff_blackdetect_comp_spv_data,
+ ff_blackdetect_comp_spv_len, "main"));
RET(ff_vk_shader_register_exec(vkctx, &s->e, &s->shd));
+ s->time_base = inlink->time_base;
+ s->black_min_duration = s->black_min_duration_time / av_q2d(s->time_base);
s->black_start = AV_NOPTS_VALUE;
s->initialized = 1;
fail:
- if (spv_opaque)
- spv->free_shader(spv, &spv_opaque);
- if (spv)
- spv->uninit(&spv);
-
return err;
}
static void report_black_region(AVFilterContext *ctx, int64_t black_end)
{
BlackDetectVulkanContext *s = ctx->priv;
- const AVFilterLink *inlink = ctx->inputs[0];
+
if (s->black_start == AV_NOPTS_VALUE)
return;
- if ((black_end - s->black_start) >= s->black_min_duration_time / av_q2d(inlink->time_base)) {
+ if ((black_end - s->black_start) >= s->black_min_duration) {
av_log(ctx, AV_LOG_INFO,
"black_start:%s black_end:%s black_duration:%s\n",
- av_ts2timestr(s->black_start, &inlink->time_base),
- av_ts2timestr(black_end, &inlink->time_base),
- av_ts2timestr(black_end - s->black_start, &inlink->time_base));
+ av_ts2timestr(s->black_start, &s->time_base),
+ av_ts2timestr(black_end, &s->time_base),
+ av_ts2timestr(black_end - s->black_start, &s->time_base));
}
}
@@ -359,11 +322,15 @@ fail:
static void blackdetect_vulkan_uninit(AVFilterContext *avctx)
{
BlackDetectVulkanContext *s = avctx->priv;
- AVFilterLink *inlink = avctx->inputs[0];
- FilterLink *inl = ff_filter_link(inlink);
FFVulkanContext *vkctx = &s->vkctx;
- report_black_region(avctx, inl->current_pts);
+ /* avctx->inputs[0] is NULL if the filter is freed before its input was
+ * ever linked (e.g. invalid options abort filter creation). s->initialized
+ * guarantees a frame was processed, so the input link is valid. */
+ if (s->initialized) {
+ FilterLink *inl = ff_filter_link(avctx->inputs[0]);
+ report_black_region(avctx, inl->current_pts);
+ }
ff_vk_exec_pool_free(vkctx, &s->e);
ff_vk_shader_free(vkctx, &s->shd);
diff --git a/libavfilter/vulkan/Makefile b/libavfilter/vulkan/Makefile
index cd303e535e..2cfe9cfa93 100644
--- a/libavfilter/vulkan/Makefile
+++ b/libavfilter/vulkan/Makefile
@@ -2,6 +2,7 @@ clean::
$(RM) $(CLEANSUFFIXES:%=libavfilter/vulkan/%)
OBJS-$(CONFIG_AVGBLUR_VULKAN_FILTER) += vulkan/avgblur.comp.spv.o
+OBJS-$(CONFIG_BLACKDETECT_VULKAN_FILTER) += vulkan/blackdetect.comp.spv.o
OBJS-$(CONFIG_BLEND_VULKAN_FILTER) += vulkan/blend.comp.spv.o
OBJS-$(CONFIG_BWDIF_VULKAN_FILTER) += vulkan/bwdif.comp.spv.o
OBJS-$(CONFIG_CHROMABER_VULKAN_FILTER) += vulkan/chromaber.comp.spv.o
diff --git a/libavfilter/vulkan/blackdetect.comp.glsl b/libavfilter/vulkan/blackdetect.comp.glsl
new file mode 100644
index 0000000000..21e7601060
--- /dev/null
+++ b/libavfilter/vulkan/blackdetect.comp.glsl
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2025 (c) Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#pragma shader_stage(compute)
+
+#extension GL_EXT_shader_image_load_formatted : require
+#extension GL_EXT_scalar_block_layout : require
+#extension GL_EXT_nonuniform_qualifier : require
+#extension GL_KHR_shader_subgroup_ballot : require
+#extension GL_EXT_null_initializer : require
+
+layout (constant_id = 0) const uint plane = 0;
+layout (constant_id = 1) const uint slices = 0;
+
+layout (local_size_x_id = 253, local_size_y_id = 254, local_size_z_id = 255) in;
+
+layout (set = 0, binding = 0) uniform readonly image2D input_img[];
+layout (set = 0, binding = 1, scalar) buffer sum_buffer {
+ uint slice_sum[];
+};
+
+layout (push_constant, scalar) uniform pushConstants {
+ float threshold;
+};
+
+shared uint wg_sum = { };
+
+void main()
+{
+ ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
+
+ /* Out-of-bounds invocations must still reach the barrier, but mustn't
+ * be counted: the threshold is positive, so the fake value of 0.0 would
+ * otherwise be counted as black. */
+ bool in_bounds = all(lessThan(pos, imageSize(input_img[plane])));
+ float value = 0.0f;
+ if (in_bounds)
+ value = imageLoad(input_img[plane], pos).x;
+
+ uvec4 isblack = subgroupBallot(in_bounds && value <= threshold);
+ if (subgroupElect())
+ atomicAdd(wg_sum, subgroupBallotBitCount(isblack));
+
+ barrier();
+ if (gl_LocalInvocationIndex == 0)
+ atomicAdd(slice_sum[gl_WorkGroupID.x % slices], wg_sum);
+}
--
2.52.0
1
0
crash in hevc alpha decoding on second asset resize in cinelerra-gg
by Andrew Randrianasulu 15 May '26
by Andrew Randrianasulu 15 May '26
15 May '26
Unfortunately after patching our ffmpeg 8.1 with
https://source.ffmpeg.org/gitweb/ffmpeg.git/commit/3b939ced79655ed084e6bebc…
I still see this crash on x265 created hevc + alpha file:
Thread 301 "av:hevc:df1" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7fff82ffd700 (LWP 6407)]
alloc_frame (s=s@entry=0x7fff401ae8c0, l=l@entry=0x7fff401af698) at
libavcodec/hevc/refs.c:194
194 AVFrame *base = s->layers[0].cur_frame->f;
(gdb) bt full
#0 alloc_frame (s=s@entry=0x7fff401ae8c0, l=l@entry=0x7fff401af698)
at libavcodec/hevc/refs.c:194
alpha = 0x7fff0406bdc0
base = <optimized out>
frame = <optimized out>
vps = <optimized out>
i = <optimized out>
j = <optimized out>
ret = <optimized out>
#1 0x000000000100b7f3 in ff_hevc_set_new_ref
(s=s@entry=0x7fff401ae8c0, l=l@entry=0x7fff401af698, poc=123)
at libavcodec/hevc/refs.c:226
ref = <optimized out>
i = <optimized out>
no_output = <optimized out>
#2 0x0000000000fd7dd6 in hevc_frame_start (nal_idx=1,
l=0x7fff401af698, s=0x7fff401ae8c0)
at libavcodec/hevc/hevcdec.c:3337
pps = 0x7fff402c5640
new_sequence = 0
prev_layers_active_output = 1
ret = <optimized out>
sps = 0x7fff402c0640
pic_size_in_ctb = 2806
prev_layers_active_decode = 3
#3 decode_slice (gb=0x7fff82ffccc0, nal_idx=1, s=0x7fff401ae8c0) at
libavcodec/hevc/hevcdec.c:3580
---Type <return> to continue, or q <return> to quit---
layer_idx = <optimized out>
l = 0x7fff401af698
ret = <optimized out>
#4 decode_nal_unit (nal_idx=1, s=0x7fff401ae8c0) at
libavcodec/hevc/hevcdec.c:3663
nal = <optimized out>
gb = {
buffer = 0x7fff74000e9e
"\002\t\244\366W\346\024\210\022\003\001*$\004A\370\326\372\303\246\260\314\353\214c).z_=\351\021~\304\f\020'\247\222#gb\033/_\325H\261L\211v\037h\206\301\272\256[\"\377/\204|W\002pm\347\265\322\177e\030\220K\216!\224\b+mI\216\017\307\020\177\215#m\353\200\235W\215E\352K\031\275k\346\322\032g>'0\246C\274U\263v\236\215s\265y\312\245\t\255\203T\376\213Y\037\177`y\005\320\016\022R\036\035I\261\204w\202\271\311\311%S\313\322\370\306\250\241\065JW\224>1\247\234-_\243\317XO\361\003
\031\363\246X\363(s\252\214Pf\331\307o\352\266\201/\360c\020\fo\331\004\064/b"...,
index = 136,
size_in_bits = 5878, size_in_bits_plus8 = 5886}
ret = <optimized out>
#5 decode_nal_units (length=<optimized out>, buf=<optimized out>,
s=0x7fff401ae8c0) at libavcodec/hevc/hevcdec.c:3779
nal = <optimized out>
i = 1
ret = <optimized out>
eos_at_start = <optimized out>
flags = <optimized out>
#6 hevc_receive_frame (avctx=<optimized out>, frame=<optimized out>)
at libavcodec/hevc/hevcdec.c:3881
s = 0x7fff401ae8c0
avci = <optimized out>
avpkt = <optimized out>
---Type <return> to continue, or q <return> to quit---
ret = <optimized out>
sd = <optimized out>
sd_size = 0
#7 0x0000000000f1263f in ff_decode_receive_frame_internal
(avctx=avctx@entry=0x7fff402c6c00,
frame=frame@entry=0x7fff880f2f40) at libavcodec/decode.c:625
avci = 0x7fff402c6fc0
ret = <optimized out>
#8 0x000000000117b47a in frame_worker_thread (arg=0x7fff4006ff00) at
libavcodec/pthread_frame.c:291
ret = 0
p = 0x7fff4006ff00
avctx = 0x7fff402c6c00
#9 0x00007ffff540755a in start_thread () from /lib64/libpthread.so.0
No symbol table info available.
#10 0x00007fffedd02e5f in clone () from /lib64/libc.so.6
No symbol table info available.
There was another patch moving some of those `i` local variables into
the loop, but I am not sure if or how it may help here.
Will test more. (I wonder if disabling threaded decode will avoid this?)
2
2
[PR] libavcodec/riscv/flacdsp_rvv.S: prevent loop underflow (PR #23102)
by tguilbert-google 15 May '26
by tguilbert-google 15 May '26
15 May '26
PR #23102 opened by tguilbert-google
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23102
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23102.patch
This PR adds an early return if `len <= pred_order` for RISC-V, matching the logic of other architectures.
From 7ffd61bacebc9d55c2c638218a20758aaaea7a8b Mon Sep 17 00:00:00 2001
From: Thomas Guilbert <tguilbert(a)chromium.org>
Date: Fri, 15 May 2026 00:56:45 +0000
Subject: [PATCH] Guard against loop underflow
This commit adds an early return if `len <= pred_order` for RISC-V
architectures, matching the logic on other architectures.
---
libavcodec/riscv/flacdsp_rvv.S | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libavcodec/riscv/flacdsp_rvv.S b/libavcodec/riscv/flacdsp_rvv.S
index a927f188d3..0db81726f1 100644
--- a/libavcodec/riscv/flacdsp_rvv.S
+++ b/libavcodec/riscv/flacdsp_rvv.S
@@ -26,6 +26,7 @@ func ff_flac_lpc16_rvv, zve32x, b
vsetvl zero, a2, t0
vle32.v v8, (a1)
sub a4, a4, a2
+ blez a4, 2f
vle32.v v16, (a0)
sh2add a0, a2, a0
vmv.s.x v0, zero
@@ -41,7 +42,7 @@ func ff_flac_lpc16_rvv, zve32x, b
sw t0, (a0)
addi a0, a0, 4
bnez a4, 1b
-
+2:
ret
endfunc
--
2.52.0
1
0
14 May '26
PR #23101 opened by Lynne
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23101
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23101.patch
There were two things wrong with the decoder: the tiles were incorrectly indexed (for non-aligned widths, you got bitstream desyncs). The first commit fixes that in a straightforward way.
And the dequantization was indeed incomplete. ProRes RAW bakes in an 8-point curve, which is required to bake in both the camera's native delinearization curve, and any other curve the manufacturer wants. The last patch addresses that for both Vulkan and C.
This math in the C version caused overflows. The reference decoder is float for a reason. Hence, I had to switch to the 32-bit in -> 12-bit out simple iDCT.
Anyhow, with this, the output of both C and Vulkan matches the reference implementation, and support for Apple's decoder (which is much slower than ours, HAHAHA!) could also be added.
From 0aa96233ceb3f4465a89ee2bf10f696742ccd665 Mon Sep 17 00:00:00 2001
From: Lynne <dev(a)lynne.ee>
Date: Fri, 15 May 2026 02:30:08 +0900
Subject: [PATCH 1/4] vulkan/common: fix LOAD64 again
duh, gb.buf is incremented in the loop and I missed that. ugh.
---
libavcodec/vulkan/common.glsl | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libavcodec/vulkan/common.glsl b/libavcodec/vulkan/common.glsl
index 0ff9b45b7d..9f1393bbef 100644
--- a/libavcodec/vulkan/common.glsl
+++ b/libavcodec/vulkan/common.glsl
@@ -289,7 +289,8 @@ shared u32vec4 gb_storage[gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize
gb.bits = 0; \
gb.bits_valid = 0; \
u8buf ptr = u8buf(gb.buf); \
- for (uint i = 0; i < ((4 - uint(gb.buf)) & 3); ++i) { \
+ uint prefix = (4 - uint(gb.buf)) & 3; \
+ for (uint i = 0; i < prefix; ++i) { \
gb.bits |= uint64_t(ptr[i].v) << (56 - i * 8); \
gb.bits_valid += 8; \
gb.buf += 1; \
--
2.52.0
From 081a04e351000f52376507ee93304b8072ebcacd Mon Sep 17 00:00:00 2001
From: Lynne <dev(a)lynne.ee>
Date: Fri, 15 May 2026 02:46:11 +0900
Subject: [PATCH 2/4] prores_raw: fix tile alignment issues
Reverse engineered the decoder a bit more. All tiles are always 16x1.
The issue is that at the edges, tiles don't have the same width.
Instead, the first tile that starts to clip is half, and then the
next tile after that is also half the previous tile's width.
---
libavcodec/prores_raw.c | 66 +++++++++++--------
libavcodec/prores_raw.h | 3 +-
libavcodec/vulkan/prores_raw_decode.comp.glsl | 9 +--
libavcodec/vulkan/prores_raw_idct.comp.glsl | 14 ++--
libavcodec/vulkan_prores_raw.c | 5 +-
5 files changed, 52 insertions(+), 45 deletions(-)
diff --git a/libavcodec/prores_raw.c b/libavcodec/prores_raw.c
index c1c05fd959..314386f339 100644
--- a/libavcodec/prores_raw.c
+++ b/libavcodec/prores_raw.c
@@ -20,6 +20,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "libavutil/avassert.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mem_internal.h"
#include "libavutil/mem.h"
@@ -131,11 +132,10 @@ static int decode_comp(AVCodecContext *avctx, TileContext *tile,
uint16_t *dst = (uint16_t *)(frame->data[0] + tile->y*frame->linesize[0] + 2*tile->x);
int idx;
- const int w = FFMIN(s->tw, avctx->width - tile->x) / 2;
- const int nb_blocks = w / 8;
- const int log2_nb_blocks = 31 - ff_clz(nb_blocks);
- const int block_mask = (1 << log2_nb_blocks) - 1;
- const int nb_codes = 64 * nb_blocks;
+ const int log2_nb_blocks = tile->log2_nb_blocks;
+ const int nb_blocks = 1 << log2_nb_blocks;
+ const int block_mask = nb_blocks - 1;
+ const int nb_codes = 64 * nb_blocks;
LOCAL_ALIGNED_32(int16_t, block, [64*16]);
@@ -426,15 +426,13 @@ static int decode_frame(AVCodecContext *avctx,
ff_permute_scantable(s->qmat, s->prodsp.idct_permutation, qmat);
- s->nb_tw = (w + 15) >> 4;
+ int tw16 = (w + 15) >> 4;
+ s->nb_tw = (tw16 >> align) + av_popcount(~(-1 * (1 << align)) & tw16);
s->nb_th = (h + 15) >> 4;
- s->nb_tw = (s->nb_tw >> align) + av_popcount(~(-1 * (1 << align)) & s->nb_tw);
s->nb_tiles = s->nb_tw * s->nb_th;
av_log(avctx, AV_LOG_DEBUG, "%dx%d | nb_tiles: %d\n", s->nb_tw, s->nb_th, s->nb_tiles);
- s->tw = s->version == 0 ? 128 : 256;
s->th = 16;
- av_log(avctx, AV_LOG_DEBUG, "tile_size: %dx%d\n", s->tw, s->th);
av_fast_mallocz(&s->tiles, &s->tiles_size, s->nb_tiles * sizeof(*s->tiles));
if (!s->tiles)
@@ -443,29 +441,43 @@ static int decode_frame(AVCodecContext *avctx,
if (bytestream2_get_bytes_left(&gb) < s->nb_tiles * 2)
return AVERROR_INVALIDDATA;
- /* Read tile data offsets */
+ /*
+ * Tiles form a nb_tw x nb_th grid over the 16-aligned coded frame, but the
+ * columns are not uniform width: each row's width (in 16-px units) is split
+ * greedily into power-of-two-wide tiles, 2^align down to 2^0, so the right
+ * edge is covered by progressively narrower tiles rather than one clamped
+ * one. A tile is (1 << log2_nb_blocks) blocks wide (block = 16 px), 16 tall.
+ */
int offset = bytestream2_tell(&gb) + s->nb_tiles * 2;
- for (int n = 0; n < s->nb_tiles; n++) {
- TileContext *tile = &s->tiles[n];
+ int n = 0;
+ for (int ty = 0; ty < s->nb_th; ty++) {
+ unsigned tx = 0;
+ int rem = tw16;
+ for (int e = align; rem > 0; e--) {
+ int unit = 1 << e;
+ while (unit <= rem) {
+ TileContext *tile = &s->tiles[n++];
+ int size = bytestream2_get_be16(&gb);
- int size = bytestream2_get_be16(&gb);
- if (offset >= avpkt->size)
- return AVERROR_INVALIDDATA;
- if (size >= avpkt->size)
- return AVERROR_INVALIDDATA;
- if (offset > avpkt->size - size)
- return AVERROR_INVALIDDATA;
+ if (offset >= avpkt->size)
+ return AVERROR_INVALIDDATA;
+ if (size >= avpkt->size)
+ return AVERROR_INVALIDDATA;
+ if (offset > avpkt->size - size)
+ return AVERROR_INVALIDDATA;
- bytestream2_init(&tile->gb, avpkt->data + offset, size);
+ bytestream2_init(&tile->gb, avpkt->data + offset, size);
+ tile->x = tx * 16;
+ tile->y = ty * s->th;
+ tile->log2_nb_blocks = e;
+ offset += size;
- tile->y = (n / s->nb_tw) * s->th;
- tile->x = (n % s->nb_tw) * s->tw;
-
- if (avctx->width - tile->x < 16)
- return AVERROR_PATCHWELCOME;
-
- offset += size;
+ tx += unit;
+ rem -= unit;
+ }
+ }
}
+ av_assert1(n == s->nb_tiles);
ret = ff_thread_get_buffer(avctx, frame, 0);
if (ret < 0)
diff --git a/libavcodec/prores_raw.h b/libavcodec/prores_raw.h
index 3ac8068dd5..23b55661e4 100644
--- a/libavcodec/prores_raw.h
+++ b/libavcodec/prores_raw.h
@@ -33,6 +33,7 @@
typedef struct TileContext {
GetByteContext gb;
unsigned x, y;
+ int log2_nb_blocks;
} TileContext;
typedef struct ProResRAWContext {
@@ -42,7 +43,7 @@ typedef struct ProResRAWContext {
TileContext *tiles;
unsigned int tiles_size;
int nb_tiles;
- int tw, th;
+ int th;
int nb_tw, nb_th;
enum AVPixelFormat pix_fmt;
diff --git a/libavcodec/vulkan/prores_raw_decode.comp.glsl b/libavcodec/vulkan/prores_raw_decode.comp.glsl
index c1ab920e27..92859d59d0 100644
--- a/libavcodec/vulkan/prores_raw_decode.comp.glsl
+++ b/libavcodec/vulkan/prores_raw_decode.comp.glsl
@@ -30,6 +30,7 @@ struct TileData {
ivec2 pos;
uint offset;
uint size;
+ uint log2_nb_blocks;
};
layout (set = 0, binding = 0, r16ui) uniform writeonly uimage2D dst;
@@ -39,7 +40,6 @@ layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf {
layout (push_constant, scalar) uniform pushConstants {
u8buf pkt_data;
- ivec2 tile_size;
};
#define COMP_ID (gl_LocalInvocationID.y)
@@ -215,10 +215,6 @@ void main(void)
const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
TileData td = tile_data[tile_idx];
- int width = imageSize(dst).x;
- if (expectEXT(td.pos.x >= width, false))
- return;
-
uint64_t pkt_offset = uint64_t(pkt_data) + td.offset;
u8vec2buf hdr_data = u8vec2buf(pkt_offset);
int header_len = hdr_data[0].v.x >> 3;
@@ -232,8 +228,7 @@ void main(void)
return;
const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1);
- const int w = min(tile_size.x, width - td.pos.x) >> 1;
- const int nb_blocks = w >> 3;
+ const int nb_blocks = 1 << td.log2_nb_blocks;
const ivec4 comp_offset = ivec4(size[2] + size[1] + size[3],
size[2],
diff --git a/libavcodec/vulkan/prores_raw_idct.comp.glsl b/libavcodec/vulkan/prores_raw_idct.comp.glsl
index 15af6d5a3f..ea16272558 100644
--- a/libavcodec/vulkan/prores_raw_idct.comp.glsl
+++ b/libavcodec/vulkan/prores_raw_idct.comp.glsl
@@ -30,6 +30,7 @@ struct TileData {
ivec2 pos;
uint offset;
uint size;
+ uint log2_nb_blocks;
};
layout (set = 0, binding = 0, r16ui) uniform uimage2D dst;
@@ -39,7 +40,6 @@ layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf {
layout (push_constant, scalar) uniform pushConstants {
u8buf pkt_data;
- ivec2 tile_size;
uint8_t qmat[64];
};
@@ -73,17 +73,12 @@ void main(void)
const uint tile_idx = gl_WorkGroupID.y*gl_NumWorkGroups.x + gl_WorkGroupID.x;
TileData td = tile_data[tile_idx];
- int width = imageSize(dst).x;
- if (expectEXT(td.pos.x >= width, false))
- return;
-
uint64_t pkt_offset = uint64_t(pkt_data) + td.offset;
u8vec2buf hdr_data = u8vec2buf(pkt_offset);
int qscale = pack16(hdr_data[0].v.yx);
const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1);
- const uint w = min(tile_size.x, width - td.pos.x) >> 1;
- const uint nb_blocks = w >> 3;
+ const uint nb_blocks = 1 << td.log2_nb_blocks;
/* Copy push-constant qmat into shared memory for fast non-uniform access */
if (gl_LocalInvocationIndex < 64)
@@ -110,6 +105,11 @@ void main(void)
idct8(BLOCK_ID, COMP_ID*72 + ROW_ID * 9, 1);
barrier();
+ /* Narrow tiles use fewer than the workgroup's block rows; the surplus
+ * rows carry no data and must not be written. No barrier follows. */
+ if (BLOCK_ID >= nb_blocks)
+ return;
+
[[unroll]]
for (uint y = 0; y < 8; y++) {
int v = int(round(blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID]*4095.0));
diff --git a/libavcodec/vulkan_prores_raw.c b/libavcodec/vulkan_prores_raw.c
index 392b74a863..953b67d592 100644
--- a/libavcodec/vulkan_prores_raw.c
+++ b/libavcodec/vulkan_prores_raw.c
@@ -51,7 +51,6 @@ typedef struct ProResRAWVulkanDecodeContext {
typedef struct DecodePushData {
VkDeviceAddress pkt_data;
- int32_t tile_size[2];
uint8_t qmat[64];
} DecodePushData;
@@ -59,6 +58,7 @@ typedef struct TileData {
int32_t pos[2];
uint32_t offset;
uint32_t size;
+ uint32_t log2_nb_blocks;
} TileData;
static int vk_prores_raw_start_frame(AVCodecContext *avctx,
@@ -118,6 +118,7 @@ static int vk_prores_raw_decode_slice(AVCodecContext *avctx,
td[pp->nb_tiles].pos[0] = prr->tiles[pp->nb_tiles].x;
td[pp->nb_tiles].pos[1] = prr->tiles[pp->nb_tiles].y;
td[pp->nb_tiles].size = size;
+ td[pp->nb_tiles].log2_nb_blocks = prr->tiles[pp->nb_tiles].log2_nb_blocks;
if (vp->slices_buf && slices_buf->host_ref) {
td[pp->nb_tiles].offset = data - slices_buf->mapped_mem;
@@ -229,8 +230,6 @@ static int vk_prores_raw_end_frame(AVCodecContext *avctx)
/* Update push data */
DecodePushData pd_decode = (DecodePushData) {
.pkt_data = slices_buf->address,
- .tile_size[0] = prr->tw,
- .tile_size[1] = prr->th,
};
memcpy(pd_decode.qmat, prr->qmat, 64);
ff_vk_shader_update_push_const(&ctx->s, exec, decode_shader,
--
2.52.0
From 83c2277637cb35410239fbe92d2c6e913c0aa36c Mon Sep 17 00:00:00 2001
From: Lynne <dev(a)lynne.ee>
Date: Thu, 14 May 2026 22:50:36 +0900
Subject: [PATCH 3/4] prores_raw: parse the linearization curve from the
bitstream
After an extended Ghidra session, it turns out that the camera/recorder bakes a
custom curve that *has* to be applied. It contains both the camera's inverse
transfer curve, plus whatever else the camera applied. It could (and does) contain
quantization refinements. And it's used to switch between low and high quality encoding
by boosting coeffs (thus acting as an additional dequant curve).
---
libavcodec/prores_raw.c | 11 +++++++++--
libavcodec/prores_raw.h | 5 +++++
2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/libavcodec/prores_raw.c b/libavcodec/prores_raw.c
index 314386f339..5465e3495c 100644
--- a/libavcodec/prores_raw.c
+++ b/libavcodec/prores_raw.c
@@ -420,8 +420,15 @@ static int decode_frame(AVCodecContext *avctx,
bytestream2_get_buffer(&gb_hdr, qmat, 64);
if ((flags >> 4) & 1) {
- bytestream2_skip(&gb_hdr, 2);
- bytestream2_skip(&gb_hdr, 2 * 7);
+ /* 8-point 16-bit control points, defining the combined linearization
+ * curve (inv. transfer fn + encoder-defined shaping) */
+ for (int i = 0; i < 8; i++)
+ s->lin_curve[i] = bytestream2_get_be16(&gb_hdr);
+ } else {
+ /* default curve: powers of two (first point is 0) */
+ static const uint16_t default_lin_curve[8] =
+ { 0, 512, 1024, 2048, 4096, 8192, 16384, 32768 };
+ memcpy(s->lin_curve, default_lin_curve, sizeof(s->lin_curve));
}
ff_permute_scantable(s->qmat, s->prodsp.idct_permutation, qmat);
diff --git a/libavcodec/prores_raw.h b/libavcodec/prores_raw.h
index 23b55661e4..1e7fee435e 100644
--- a/libavcodec/prores_raw.h
+++ b/libavcodec/prores_raw.h
@@ -54,6 +54,11 @@ typedef struct ProResRAWContext {
DECLARE_ALIGNED(32, uint8_t, scan)[64];
DECLARE_ALIGNED(32, uint8_t, qmat)[64];
+
+ /* 8-point combined linearization curve
+ * (inv. transfer fn + encoder-defined shaping) from the frame header,
+ * applied after iDCT */
+ uint16_t lin_curve[8];
} ProResRAWContext;
extern const uint8_t ff_prores_raw_dc_cb[13];
--
2.52.0
From 563e4dc7e11bbaf5d043e033541e006f05afbc79 Mon Sep 17 00:00:00 2001
From: Lynne <dev(a)lynne.ee>
Date: Fri, 15 May 2026 05:25:19 +0900
Subject: [PATCH 4/4] prores_raw: synchronize decoder with reference
implementation
This completes the reverse engineering of the decoder.
The commit applies the linearization curve from the previous patch.
---
libavcodec/prores_raw.c | 19 ++++----
libavcodec/proresdsp.c | 50 ++++++++++++++++++---
libavcodec/proresdsp.h | 3 +-
libavcodec/vulkan/prores_raw_idct.comp.glsl | 39 ++++++++++------
libavcodec/vulkan_prores_raw.c | 8 ++--
5 files changed, 86 insertions(+), 33 deletions(-)
diff --git a/libavcodec/prores_raw.c b/libavcodec/prores_raw.c
index 5465e3495c..dd9d735356 100644
--- a/libavcodec/prores_raw.c
+++ b/libavcodec/prores_raw.c
@@ -45,15 +45,19 @@ static av_cold int decode_init(AVCodecContext *avctx)
{
ProResRAWContext *s = avctx->priv_data;
- avctx->bits_per_raw_sample = 12;
+ /* The codec outputs linear data, with the transfer function of the
+ * camera and any adjustments built into an 8-point linearization curve */
+ avctx->bits_per_raw_sample = 16;
+ avctx->color_trc = AVCOL_TRC_LINEAR;
avctx->color_primaries = AVCOL_PRI_UNSPECIFIED;
- avctx->color_trc = AVCOL_TRC_UNSPECIFIED;
avctx->colorspace = AVCOL_SPC_UNSPECIFIED;
s->pix_fmt = AV_PIX_FMT_NONE;
ff_blockdsp_init(&s->bdsp);
- ff_proresdsp_init(&s->prodsp, avctx->bits_per_raw_sample);
+ /* Coefficients and the iDCT are 12-bit, the linearization curve then
+ * expands the result to the 16-bit linear output range. */
+ ff_proresdsp_init(&s->prodsp, 12);
ff_permute_scantable(s->scan, ff_prores_interlaced_scan, s->prodsp.idct_permutation);
@@ -137,7 +141,7 @@ static int decode_comp(AVCodecContext *avctx, TileContext *tile,
const int block_mask = nb_blocks - 1;
const int nb_codes = 64 * nb_blocks;
- LOCAL_ALIGNED_32(int16_t, block, [64*16]);
+ LOCAL_ALIGNED_32(int32_t, block, [64*16]);
int16_t sign = 0;
int16_t dc_add = 0;
@@ -158,8 +162,7 @@ static int decode_comp(AVCodecContext *avctx, TileContext *tile,
if ((ret = init_get_bits8(&gb, data, size)) < 0)
return ret;
- for (int n = 0; n < nb_blocks; n++)
- s->bdsp.clear_block(block + n*64);
+ memset(block, 0, nb_blocks * 64 * sizeof(*block));
/* Special handling for first block */
int dc = get_value(&gb, 700);
@@ -234,7 +237,7 @@ static int decode_comp(AVCodecContext *avctx, TileContext *tile,
for (int n = 0; n < nb_blocks; n++) {
uint16_t *ptr = dst + n*16;
- s->prodsp.idct_put_bayer(ptr, linesize, block + n*64, qmat);
+ s->prodsp.idct_put_bayer(ptr, linesize, block + n*64, qmat, s->lin_curve);
}
return 0;
@@ -265,7 +268,7 @@ static int decode_tile(AVCodecContext *avctx, TileContext *tile,
return AVERROR_INVALIDDATA;
for (int i = 0; i < 64; i++)
- qmat[i] = s->qmat[i] * scale >> 1;
+ qmat[i] = s->qmat[i] * scale;
const uint8_t *comp_start = gb->buffer_start + header_len;
diff --git a/libavcodec/proresdsp.c b/libavcodec/proresdsp.c
index eb5dbf4799..dec3beb9b1 100644
--- a/libavcodec/proresdsp.c
+++ b/libavcodec/proresdsp.c
@@ -40,6 +40,14 @@
#define BIT_DEPTH 12
#include "simple_idct_template.c"
#undef BIT_DEPTH
+#undef IN_IDCT_DEPTH
+
+/* 32bit iDCT for the ProRes RAW */
+#define IN_IDCT_DEPTH 32
+#define BIT_DEPTH 12
+#include "simple_idct_template.c"
+#undef BIT_DEPTH
+#undef IN_IDCT_DEPTH
/**
* Special version of ff_simple_idct_int16_10bit() which does dequantization
@@ -74,6 +82,24 @@ static void prores_idct_12(int16_t *restrict block, const int16_t *restrict qmat
}
}
+/*
+ * 32-bit iDCT for the ProRes RAW
+ * qmat must be s->qmat[i] * scale
+ */
+static void prores_idct_bayer_32(int32_t *restrict block, const int16_t *restrict qmat)
+{
+ for (int i = 0; i < 64; i++)
+ block[i] = (block[i] * qmat[i]) >> 1;
+
+ for (int i = 0; i < 8; i++)
+ idctRowCondDC_int32_12bit(block + i*8, 0);
+
+ for (int i = 0; i < 8; i++) {
+ block[i] += 8192;
+ idctSparseCol_int32_12bit(block + i);
+ }
+}
+
#define CLIP_MIN (1 << 2) ///< minimum value for clipping resulting pixels
#define CLIP_MAX_10 (1 << 10) - CLIP_MIN - 1 ///< maximum value for clipping resulting pixels
#define CLIP_MAX_12 (1 << 12) - CLIP_MIN - 1 ///< maximum value for clipping resulting pixels
@@ -99,12 +125,21 @@ static inline void put_pixel(uint16_t *dst, ptrdiff_t linesize, const int16_t *i
}
}
-static inline void put_pixel_bayer_12(uint16_t *dst, ptrdiff_t linesize,
- const int16_t *in)
+/* Apply the 8-point combined linearization curve (inv. transfer fn + encoder shaping) */
+static inline void put_pixel_bayer_lin_curve_12(uint16_t *dst, ptrdiff_t linesize,
+ const int32_t *in, const uint16_t *lin_curve)
{
for (int y = 0; y < 8; y++, dst += linesize) {
- for (int x = 0; x < 8; x++)
- dst[x*2] = CLIP_12(in[(y << 3) + x]) << 4;
+ for (int x = 0; x < 8; x++) {
+ /* Convert the 32-bit input into 16-bits (lrintf(x*16 - 15.5f) = 16) */
+ int u = av_clip_uint16(in[(y << 3) + x]*16 - 16);
+ unsigned seg = (unsigned)u >> 13;
+ unsigned frac = (unsigned)u & 0x1FFF;
+ unsigned cp0 = lin_curve[seg];
+ unsigned cp1 = seg < 7 ? lin_curve[seg + 1] : 0;
+ unsigned o = (cp0 * 8192u + ((cp1 - cp0) & 0xFFFFu) * frac + 4096u) >> 13;
+ dst[x*2] = FFMIN(o, 0xFFFFu);
+ }
}
}
@@ -131,10 +166,11 @@ static void prores_idct_put_12_c(uint16_t *out, ptrdiff_t linesize, int16_t *blo
}
static void prores_idct_put_bayer_12_c(uint16_t *out, ptrdiff_t linesize,
- int16_t *block, const int16_t *qmat)
+ int32_t *block, const int16_t *qmat,
+ const uint16_t *lin_curve)
{
- prores_idct_12(block, qmat);
- put_pixel_bayer_12(out, linesize << 1, block);
+ prores_idct_bayer_32(block, qmat);
+ put_pixel_bayer_lin_curve_12(out, linesize << 1, block, lin_curve);
}
av_cold void ff_proresdsp_init(ProresDSPContext *dsp, int bits_per_raw_sample)
diff --git a/libavcodec/proresdsp.h b/libavcodec/proresdsp.h
index f8b57d7e87..75c782fb56 100644
--- a/libavcodec/proresdsp.h
+++ b/libavcodec/proresdsp.h
@@ -30,7 +30,8 @@ typedef struct ProresDSPContext {
int idct_permutation_type;
uint8_t idct_permutation[64];
void (*idct_put)(uint16_t *out, ptrdiff_t linesize, int16_t *block, const int16_t *qmat);
- void (*idct_put_bayer)(uint16_t *out, ptrdiff_t linesize, int16_t *block, const int16_t *qmat);
+ void (*idct_put_bayer)(uint16_t *out, ptrdiff_t linesize, int32_t *block, const int16_t *qmat,
+ const uint16_t *lin_curve);
} ProresDSPContext;
void ff_proresdsp_init(ProresDSPContext *dsp, int bits_per_raw_sample);
diff --git a/libavcodec/vulkan/prores_raw_idct.comp.glsl b/libavcodec/vulkan/prores_raw_idct.comp.glsl
index ea16272558..2989236513 100644
--- a/libavcodec/vulkan/prores_raw_idct.comp.glsl
+++ b/libavcodec/vulkan/prores_raw_idct.comp.glsl
@@ -41,6 +41,7 @@ layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf {
layout (push_constant, scalar) uniform pushConstants {
u8buf pkt_data;
uint8_t qmat[64];
+ uint16_t lin_curve[8];
};
#define COMP_ID (gl_LocalInvocationID.z)
@@ -67,6 +68,7 @@ const u8vec2 scan[64] = {
};
shared uint8_t qmat_buf[64];
+shared uint lin_curve_buf[8];
void main(void)
{
@@ -75,32 +77,30 @@ void main(void)
uint64_t pkt_offset = uint64_t(pkt_data) + td.offset;
u8vec2buf hdr_data = u8vec2buf(pkt_offset);
- int qscale = pack16(hdr_data[0].v.yx);
+ int qscale = int(hdr_data[0].v.y);
const ivec2 offs = td.pos + ivec2(COMP_ID & 1, COMP_ID >> 1);
const uint nb_blocks = 1 << td.log2_nb_blocks;
- /* Copy push-constant qmat into shared memory for fast non-uniform access */
- if (gl_LocalInvocationIndex < 64)
- qmat_buf[gl_LocalInvocationIndex] = qmat[gl_LocalInvocationIndex];
+ if (gl_LocalInvocationIndex == 0) {
+ [[unroll]] for (uint i = 0; i < 64; i++) qmat_buf[i] = qmat[i];
+ [[unroll]] for (uint i = 0; i < 8; i++) lin_curve_buf[i] = uint(lin_curve[i]);
+ }
barrier();
[[unroll]]
for (uint y = 0; y < 8; y++) {
uint block_off = y*8 + ROW_ID;
int v = int(imageLoad(dst, offs + 2*ivec2(BLOCK_ID*8, 0) + scan[block_off])[0]);
- float vf = float(sign_extend(v, 16)) / 32768.0;
- vf *= qmat_buf[block_off] * qscale;
- blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID] = (vf / (64*4.56)) *
- idct_scale[block_off];
+ /* Dequantize (coeff * qmat * qscale), matching the reference decoder */
+ float vf = float(sign_extend(v, 16)) * float(qmat_buf[block_off]) * float(qscale);
+ blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID] = vf * idct_scale[block_off];
}
/* Column-wise iDCT */
idct8(BLOCK_ID, COMP_ID*72 + ROW_ID, 9);
barrier();
- blocks[BLOCK_ID][COMP_ID*72 + ROW_ID * 9] += 0.5f;
-
/* Row-wise iDCT */
idct8(BLOCK_ID, COMP_ID*72 + ROW_ID * 9, 1);
barrier();
@@ -112,11 +112,22 @@ void main(void)
[[unroll]]
for (uint y = 0; y < 8; y++) {
- int v = int(round(blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID]*4095.0));
- v = clamp(v, 0, 4095);
- v <<= 4;
+ /* Bias the signed iDCT output into the reference's unsigned 16-bit space */
+ int u = clamp(int(round(blocks[BLOCK_ID][COMP_ID*72 + y*9 + ROW_ID])) + 32768,
+ 0, 65535);
+
+ /* 8-point combined linearization curve (inv. transfer fn +
+ * encoder-defined shaping). cp1 - cp0 is the segment slope; for the
+ * final segment cp[8] == 0. */
+ uint seg = uint(u) >> 13;
+ uint frac = uint(u) & 0x1FFFu;
+ uint cp0 = lin_curve_buf[seg];
+ uint cp1 = seg < 7u ? lin_curve_buf[seg + 1u] : 0u;
+ uint outv = (cp0 * 8192u + ((cp1 - cp0) & 0xFFFFu) * frac + 4096u) >> 13u;
+ outv = min(outv, 0xFFFFu);
+
imageStore(dst,
offs + 2*ivec2(BLOCK_ID*8 + ROW_ID, y),
- ivec4(v));
+ ivec4(outv));
}
}
diff --git a/libavcodec/vulkan_prores_raw.c b/libavcodec/vulkan_prores_raw.c
index 953b67d592..b6314ab693 100644
--- a/libavcodec/vulkan_prores_raw.c
+++ b/libavcodec/vulkan_prores_raw.c
@@ -52,6 +52,7 @@ typedef struct ProResRAWVulkanDecodeContext {
typedef struct DecodePushData {
VkDeviceAddress pkt_data;
uint8_t qmat[64];
+ uint16_t lin_curve[8];
} DecodePushData;
typedef struct TileData {
@@ -232,9 +233,10 @@ static int vk_prores_raw_end_frame(AVCodecContext *avctx)
.pkt_data = slices_buf->address,
};
memcpy(pd_decode.qmat, prr->qmat, 64);
+ memcpy(pd_decode.lin_curve, prr->lin_curve, sizeof(pd_decode.lin_curve));
ff_vk_shader_update_push_const(&ctx->s, exec, decode_shader,
VK_SHADER_STAGE_COMPUTE_BIT,
- 0, sizeof(pd_decode) - 64, &pd_decode);
+ 0, offsetof(DecodePushData, qmat), &pd_decode);
vk->CmdDispatch(exec->buf, prr->nb_tw, prr->nb_th, 1);
@@ -302,7 +304,7 @@ static int init_decode_shader(AVCodecContext *avctx, FFVulkanContext *s,
{
int err;
- ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData) - 64,
+ ff_vk_shader_add_push_const(shd, 0, offsetof(DecodePushData, qmat),
VK_SHADER_STAGE_COMPUTE_BIT);
ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
(uint32_t []) { 1, 4, 1 }, 0);
@@ -338,7 +340,7 @@ static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s,
};
for (int i = 0; i < 64; i++)
SPEC_LIST_ADD(sl, 18 + i, 32,
- av_float2int(idct_8_scales[i >> 3]*idct_8_scales[i & 7]));
+ av_float2int(8*idct_8_scales[i >> 3]*idct_8_scales[i & 7]));
ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl,
(uint32_t []) { 8, nb_blocks, 4 }, 0);
--
2.52.0
1
0
14 May '26
From: Benedict <benedictcm1(a)gmail.com>
Date: Tue, 6 May 2026
Summary:
Add a bounds check for nb_index_entries in jvdec.c before allocating
index_entries and frames arrays. This aligns with similar validation
present in other demuxers (e.g., rl2.c) and improves robustness when
handling malformed input.
Details:
In read_header(), nb_index_entries is read from the input file and
used directly in allocation expressions:
av_malloc(nb_index_entries * sizeof(AVIndexEntry));
av_malloc(nb_index_entries * sizeof(JVFrame));
Adding a validation check ensures consistency with other demuxers and
prevents potential overflow scenarios on constrained platforms.
Proposed fix:
--- a/libavformat/jvdec.c
+++ b/libavformat/jvdec.c
@@ -95,6 +95,12 @@ static int read_header(AVFormatContext *s)
vst->duration =
vst->nb_frames =
asti->nb_index_entries = avio_rl16(pb);
+
+ /* Validate nb_index_entries to prevent excessive allocation */
+ if (asti->nb_index_entries > INT_MAX / sizeof(AVIndexEntry) ||
+ asti->nb_index_entries > INT_MAX / sizeof(JVFrame)) {
+ return AVERROR_INVALIDDATA;
+ }
avpriv_set_pts_info(vst, 64, avio_rl16(pb), 1000);
avio_skip(pb, 4);
Rationale:
- Improves consistency with rl2.c and similar parsers
- Adds defensive validation against malformed input
- No impact on valid files
Signed-off-by: Benedict <benedictcm1(a)gmail.com>
2
1
[PR] avutil/hwcontext_qsv: free dynamic pool resources on allocation failures (PR #23099)
by jiangjie 14 May '26
by jiangjie 14 May '26
14 May '26
PR #23099 opened by jiangjie
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23099
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23099.patch
Handle av_buffer_create() failure in qsv_dynamic_pool_alloc() via the
common cleanup path so qsv_surface, child_frame, and handle_pairs_internal
are released correctly.
Also free the separately allocated VAAPI surface ID wrapper in the fail
path of qsv_dynamic_pool_map_to().
From e4e39839bd02d968fd65e002954edc5e0c834316 Mon Sep 17 00:00:00 2001
From: jiangjie <jiangjie(a)agora.io>
Date: Thu, 14 May 2026 22:01:05 +0800
Subject: [PATCH] avutil/hwcontext_qsv: free dynamic pool resources on
allocation failures
Handle av_buffer_create() failure in qsv_dynamic_pool_alloc() via the
common cleanup path so qsv_surface, child_frame, and handle_pairs_internal
are released correctly.
Also free the separately allocated VAAPI surface ID wrapper in the fail
path of qsv_dynamic_pool_map_to().
---
libavutil/hwcontext_qsv.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/libavutil/hwcontext_qsv.c b/libavutil/hwcontext_qsv.c
index b92c9cb0ad..8b0c99e733 100644
--- a/libavutil/hwcontext_qsv.c
+++ b/libavutil/hwcontext_qsv.c
@@ -430,6 +430,7 @@ static AVBufferRef *qsv_dynamic_pool_alloc(void *opaque, size_t size)
AVHWFramesContext *child_frames_ctx;
QSVSurface *qsv_surface = NULL;
mfxHDLPair *handle_pairs_internal = NULL;
+ AVBufferRef *buf = NULL;
int ret;
if (!s->child_frames_ref)
@@ -495,14 +496,23 @@ static AVBufferRef *qsv_dynamic_pool_alloc(void *opaque, size_t size)
#endif
qsv_surface->mfx_surface.Data.MemId = (mfxMemId)handle_pairs_internal;
- return av_buffer_create((uint8_t *)qsv_surface, sizeof(*qsv_surface),
- qsv_pool_release, ctx, 0);
+ buf = av_buffer_create((uint8_t *)qsv_surface, sizeof(*qsv_surface),
+ qsv_pool_release, ctx, 0);
+ if (!buf)
+ goto fail;
+
+ return buf;
fail:
if (qsv_surface) {
av_frame_free(&qsv_surface->child_frame);
}
+#if CONFIG_VAAPI
+ if (handle_pairs_internal && child_frames_ctx->device_ctx &&
+ child_frames_ctx->device_ctx->type == AV_HWDEVICE_TYPE_VAAPI)
+ av_freep(&handle_pairs_internal->first);
+#endif
av_freep(&qsv_surface);
av_freep(&handle_pairs_internal);
@@ -2313,6 +2323,10 @@ static int qsv_dynamic_pool_map_to(AVHWFramesContext *dst_ctx,
return 0;
fail:
+#if CONFIG_VAAPI
+ if (handle_pairs_internal && src->format == AV_PIX_FMT_VAAPI)
+ av_freep(&handle_pairs_internal->first);
+#endif
av_freep(&handle_pairs_internal);
av_freep(&surfaces_internal);
return ret;
--
2.52.0
1
0
PR #23098 opened by Zhao Zhili (quink)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23098
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23098.patch
The closing paren was misplaced so the expression parsed as
result = (av_new_packet(...) < 0), which stored 0 or 1 in result instead
of the AVERROR code. On allocation failure the function returned 1
rather than a negative error.
Signed-off-by: Zhao Zhili <zhilizhao(a)tencent.com>
From a7eb2b04cc842ac4538152248b5074938bae6028 Mon Sep 17 00:00:00 2001
From: Zhao Zhili <zhilizhao(a)tencent.com>
Date: Thu, 14 May 2026 19:35:07 +0800
Subject: [PATCH] avformat/rtpdec_av1: fix operator precedence
The closing paren was misplaced so the expression parsed as
result = (av_new_packet(...) < 0), which stored 0 or 1 in result instead
of the AVERROR code. On allocation failure the function returned 1
rather than a negative error.
Signed-off-by: Zhao Zhili <zhilizhao(a)tencent.com>
---
libavformat/rtpdec_av1.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/libavformat/rtpdec_av1.c b/libavformat/rtpdec_av1.c
index 91f75326f7..85c3208a65 100644
--- a/libavformat/rtpdec_av1.c
+++ b/libavformat/rtpdec_av1.c
@@ -278,13 +278,13 @@ static int av1_handle_packet(AVFormatContext *ctx, PayloadContext *data,
if (data->needs_td) {
output_size += 2; // for Temporal Delimiter (TD)
}
- if (pkt->data) {
- if ((result = av_grow_packet(pkt, output_size)) < 0)
- return result;
- } else {
- if ((result = av_new_packet(pkt, output_size) < 0))
- return result;
- }
+ if (pkt->data)
+ result = av_grow_packet(pkt, output_size);
+ else
+ result = av_new_packet(pkt, output_size);
+
+ if (result < 0)
+ return result;
if (data->needs_td) {
// restore TD
--
2.52.0
1
0