[FFmpeg-devel] [PATCH 2/3] scale_cuda frame crop support
Timo Rothenpieler
timo at rothenpieler.org
Tue Sep 10 21:37:25 EEST 2024
On 10.09.2024 20:10, Koushik Dutta wrote:
> The crop filter has no effect on scale_cuda:
>
> -vf crop=100:100,scale_cuda=300x300
>
> Hardware frames (AV_PIX_FMT_FLAG_HWACCEL) are expected to use the crop_* properties,
> as seen in the implementation vf_crop.c.
>
> The current workaround is to hwdownload the full frame
> and perform the crop on CPU.
> ---
> libavfilter/vf_scale_cuda.c | 15 ++++++++++-----
> libavfilter/vf_scale_cuda.cu | 22 ++++++++++++++--------
> 2 files changed, 24 insertions(+), 13 deletions(-)
>
> diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
> index 54a340949d..eb8beee771 100644
> --- a/libavfilter/vf_scale_cuda.c
> +++ b/libavfilter/vf_scale_cuda.c
> @@ -407,7 +407,7 @@ fail:
> }
>
> static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
> - CUtexObject src_tex[4], int src_width, int src_height,
> + CUtexObject src_tex[4], int src_left, int src_top, int src_width, int src_height,
> AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch)
> {
> CUDAScaleContext *s = ctx->priv;
> @@ -422,7 +422,7 @@ static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
> &src_tex[0], &src_tex[1], &src_tex[2], &src_tex[3],
> &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3],
> &dst_width, &dst_height, &dst_pitch,
> - &src_width, &src_height, &s->param
> + &src_left, &src_top, &src_width, &src_height, &s->param
> };
>
> return CHECK_CU(cu->cuLaunchKernel(func,
> @@ -440,6 +440,9 @@ static int scalecuda_resize(AVFilterContext *ctx,
>
> CUtexObject tex[4] = { 0, 0, 0, 0 };
>
> + int crop_width = (in->width - in->crop_right) - in->crop_left;
> + int crop_height = (in->height - in->crop_bottom) - in->crop_top;
Bit of a nit, but I don't think the parentheses are necessary.
> +
> ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
> if (ret < 0)
> return ret;
> @@ -477,7 +480,7 @@ static int scalecuda_resize(AVFilterContext *ctx,
>
> // scale primary plane(s). Usually Y (and A), or single plane of RGB frames.
> ret = call_resize_kernel(ctx, s->cu_func,
> - tex, in->width, in->height,
> + tex, in->crop_left, in->crop_top, crop_width, crop_height,
> out, out->width, out->height, out->linesize[0]);
> if (ret < 0)
> goto exit;
> @@ -485,8 +488,10 @@ static int scalecuda_resize(AVFilterContext *ctx,
> if (s->out_planes > 1) {
> // scale UV plane. Scale function sets both U and V plane, or singular interleaved plane.
> ret = call_resize_kernel(ctx, s->cu_func_uv, tex,
> - AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w),
> - AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h),
> + AV_CEIL_RSHIFT(in->crop_left, s->in_desc->log2_chroma_w),
> + AV_CEIL_RSHIFT(in->crop_top, s->in_desc->log2_chroma_h),
> + AV_CEIL_RSHIFT(crop_width, s->in_desc->log2_chroma_w),
> + AV_CEIL_RSHIFT(crop_height, s->in_desc->log2_chroma_h),
> out,
> AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w),
> AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h),
> diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
> index de06ba9433..271b55cd5d 100644
> --- a/libavfilter/vf_scale_cuda.cu
> +++ b/libavfilter/vf_scale_cuda.cu
> @@ -26,6 +26,7 @@
> template<typename T>
> using subsample_function_t = T (*)(cudaTextureObject_t tex, int xo, int yo,
> int dst_width, int dst_height,
> + int src_left, int src_top,
> int src_width, int src_height,
> int bit_depth, float param);
>
> @@ -64,11 +65,12 @@ static inline __device__ ushort conv_16to10(ushort in)
> subsample_function_t<in_T_uv> subsample_func_uv> \
> __device__ static inline void N(cudaTextureObject_t src_tex[4], T *dst[4], int xo, int yo, \
> int dst_width, int dst_height, int dst_pitch, \
> - int src_width, int src_height, float param)
> + int src_left, int src_top, int src_width, int src_height, float param)
>
> #define SUB_F(m, plane) \
> subsample_func_##m(src_tex[plane], xo, yo, \
> dst_width, dst_height, \
> + src_left, src_top, \
> src_width, src_height, \
> in_bit_depth, param)
>
> @@ -1063,13 +1065,14 @@ template<typename T>
> __device__ static inline T Subsample_Nearest(cudaTextureObject_t tex,
> int xo, int yo,
> int dst_width, int dst_height,
> + int src_left, int src_top,
> int src_width, int src_height,
> int bit_depth, float param)
> {
> float hscale = (float)src_width / (float)dst_width;
> float vscale = (float)src_height / (float)dst_height;
> - float xi = (xo + 0.5f) * hscale;
> - float yi = (yo + 0.5f) * vscale;
> + float xi = (xo + 0.5f) * hscale + src_left;
> + float yi = (yo + 0.5f) * vscale + src_top;
>
> return tex2D<T>(tex, xi, yi);
> }
> @@ -1078,13 +1081,14 @@ template<typename T>
> __device__ static inline T Subsample_Bilinear(cudaTextureObject_t tex,
> int xo, int yo,
> int dst_width, int dst_height,
> + int src_left, int src_top,
> int src_width, int src_height,
> int bit_depth, float param)
> {
> float hscale = (float)src_width / (float)dst_width;
> float vscale = (float)src_height / (float)dst_height;
> - float xi = (xo + 0.5f) * hscale;
> - float yi = (yo + 0.5f) * vscale;
> + float xi = (xo + 0.5f) * hscale + src_left;
> + float yi = (yo + 0.5f) * vscale + src_top;
> // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
> float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
> float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
> @@ -1109,13 +1113,14 @@ template<typename T, coeffs_function_t coeffs_function>
> __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
> int xo, int yo,
> int dst_width, int dst_height,
> + int src_left, int src_top,
> int src_width, int src_height,
> int bit_depth, float param)
> {
> float hscale = (float)src_width / (float)dst_width;
> float vscale = (float)src_height / (float)dst_height;
> - float xi = (xo + 0.5f) * hscale - 0.5f;
> - float yi = (yo + 0.5f) * vscale - 0.5f;
> + float xi = (xo + 0.5f) * hscale - 0.5f + src_left;
> + float yi = (yo + 0.5f) * vscale - 0.5f + src_top;
> float px = floor(xi);
> float py = floor(yi);
> float fx = xi - px;
> @@ -1147,7 +1152,7 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
> cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, \
> T *dst_0, T *dst_1, T *dst_2, T *dst_3, \
> int dst_width, int dst_height, int dst_pitch, \
> - int src_width, int src_height, float param
> + int src_left, int src_top, int src_width, int src_height, float param
>
> #define SUBSAMPLE(Convert, T) \
> cudaTextureObject_t src_tex[4] = \
> @@ -1159,6 +1164,7 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
> Convert( \
> src_tex, dst, xo, yo, \
> dst_width, dst_height, dst_pitch, \
> + src_left, src_top, \
> src_width, src_height, param);
>
> extern "C" {
Looks good to me otherwise, will give it a test later.
More information about the ffmpeg-devel
mailing list