[FFmpeg-devel] [PATCH 2/3] scale_cuda frame crop support

Timo Rothenpieler timo at rothenpieler.org
Tue Sep 10 21:37:25 EEST 2024


On 10.09.2024 20:10, Koushik Dutta wrote:
> The crop filter has no effect on scale_cuda:
> 
> -vf crop=100:100,scale_cuda=300x300
> 
> Hardware frames (AV_PIX_FMT_FLAG_HWACCEL) are expected to use the crop_* properties,
> as seen in the implementation vf_crop.c.
> 
> The current workaround is to hwdownload the full frame
> and perform the crop on CPU.
> ---
>   libavfilter/vf_scale_cuda.c  | 15 ++++++++++-----
>   libavfilter/vf_scale_cuda.cu | 22 ++++++++++++++--------
>   2 files changed, 24 insertions(+), 13 deletions(-)
> 
> diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c
> index 54a340949d..eb8beee771 100644
> --- a/libavfilter/vf_scale_cuda.c
> +++ b/libavfilter/vf_scale_cuda.c
> @@ -407,7 +407,7 @@ fail:
>   }
>   
>   static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
> -                              CUtexObject src_tex[4], int src_width, int src_height,
> +                              CUtexObject src_tex[4], int src_left, int src_top, int src_width, int src_height,
>                                 AVFrame *out_frame, int dst_width, int dst_height, int dst_pitch)
>   {
>       CUDAScaleContext *s = ctx->priv;
> @@ -422,7 +422,7 @@ static int call_resize_kernel(AVFilterContext *ctx, CUfunction func,
>           &src_tex[0], &src_tex[1], &src_tex[2], &src_tex[3],
>           &dst_devptr[0], &dst_devptr[1], &dst_devptr[2], &dst_devptr[3],
>           &dst_width, &dst_height, &dst_pitch,
> -        &src_width, &src_height, &s->param
> +        &src_left, &src_top, &src_width, &src_height, &s->param
>       };
>   
>       return CHECK_CU(cu->cuLaunchKernel(func,
> @@ -440,6 +440,9 @@ static int scalecuda_resize(AVFilterContext *ctx,
>   
>       CUtexObject tex[4] = { 0, 0, 0, 0 };
>   
> +    int crop_width = (in->width - in->crop_right) - in->crop_left;
> +    int crop_height = (in->height - in->crop_bottom) - in->crop_top;

Bit of a nit, but I don't think the parenthesis are neccesary.

> +
>       ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
>       if (ret < 0)
>           return ret;
> @@ -477,7 +480,7 @@ static int scalecuda_resize(AVFilterContext *ctx,
>   
>       // scale primary plane(s). Usually Y (and A), or single plane of RGB frames.
>       ret = call_resize_kernel(ctx, s->cu_func,
> -                             tex, in->width, in->height,
> +                             tex, in->crop_left, in->crop_top, crop_width, crop_height,
>                                out, out->width, out->height, out->linesize[0]);
>       if (ret < 0)
>           goto exit;
> @@ -485,8 +488,10 @@ static int scalecuda_resize(AVFilterContext *ctx,
>       if (s->out_planes > 1) {
>           // scale UV plane. Scale function sets both U and V plane, or singular interleaved plane.
>           ret = call_resize_kernel(ctx, s->cu_func_uv, tex,
> -                                 AV_CEIL_RSHIFT(in->width, s->in_desc->log2_chroma_w),
> -                                 AV_CEIL_RSHIFT(in->height, s->in_desc->log2_chroma_h),
> +                                 AV_CEIL_RSHIFT(in->crop_left, s->in_desc->log2_chroma_w),
> +                                 AV_CEIL_RSHIFT(in->crop_top, s->in_desc->log2_chroma_h),
> +                                 AV_CEIL_RSHIFT(crop_width, s->in_desc->log2_chroma_w),
> +                                 AV_CEIL_RSHIFT(crop_height, s->in_desc->log2_chroma_h),
>                                    out,
>                                    AV_CEIL_RSHIFT(out->width, s->out_desc->log2_chroma_w),
>                                    AV_CEIL_RSHIFT(out->height, s->out_desc->log2_chroma_h),
> diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
> index de06ba9433..271b55cd5d 100644
> --- a/libavfilter/vf_scale_cuda.cu
> +++ b/libavfilter/vf_scale_cuda.cu
> @@ -26,6 +26,7 @@
>   template<typename T>
>   using subsample_function_t = T (*)(cudaTextureObject_t tex, int xo, int yo,
>                                      int dst_width, int dst_height,
> +                                   int src_left, int src_top,
>                                      int src_width, int src_height,
>                                      int bit_depth, float param);
>   
> @@ -64,11 +65,12 @@ static inline __device__ ushort conv_16to10(ushort in)
>                subsample_function_t<in_T_uv> subsample_func_uv>                                  \
>       __device__ static inline void N(cudaTextureObject_t src_tex[4], T *dst[4], int xo, int yo, \
>                                       int dst_width, int dst_height, int dst_pitch,              \
> -                                    int src_width, int src_height, float param)
> +                                    int src_left, int src_top, int src_width, int src_height, float param)
>   
>   #define SUB_F(m, plane) \
>       subsample_func_##m(src_tex[plane], xo, yo, \
>                          dst_width, dst_height,  \
> +                       src_left, src_top,      \
>                          src_width, src_height,  \
>                          in_bit_depth, param)
>   
> @@ -1063,13 +1065,14 @@ template<typename T>
>   __device__ static inline T Subsample_Nearest(cudaTextureObject_t tex,
>                                                int xo, int yo,
>                                                int dst_width, int dst_height,
> +                                             int src_left, int src_top,
>                                                int src_width, int src_height,
>                                                int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale;
> -    float yi = (yo + 0.5f) * vscale;
> +    float xi = (xo + 0.5f) * hscale + src_left;
> +    float yi = (yo + 0.5f) * vscale + src_top;
>   
>       return tex2D<T>(tex, xi, yi);
>   }
> @@ -1078,13 +1081,14 @@ template<typename T>
>   __device__ static inline T Subsample_Bilinear(cudaTextureObject_t tex,
>                                                 int xo, int yo,
>                                                 int dst_width, int dst_height,
> +                                              int src_left, int src_top,
>                                                 int src_width, int src_height,
>                                                 int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale;
> -    float yi = (yo + 0.5f) * vscale;
> +    float xi = (xo + 0.5f) * hscale + src_left;
> +    float yi = (yo + 0.5f) * vscale + src_top;
>       // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
>       float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
>       float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
> @@ -1109,13 +1113,14 @@ template<typename T, coeffs_function_t coeffs_function>
>   __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>                                                int xo, int yo,
>                                                int dst_width, int dst_height,
> +                                             int src_left, int src_top,
>                                                int src_width, int src_height,
>                                                int bit_depth, float param)
>   {
>       float hscale = (float)src_width / (float)dst_width;
>       float vscale = (float)src_height / (float)dst_height;
> -    float xi = (xo + 0.5f) * hscale - 0.5f;
> -    float yi = (yo + 0.5f) * vscale - 0.5f;
> +    float xi = (xo + 0.5f) * hscale - 0.5f + src_left;
> +    float yi = (yo + 0.5f) * vscale - 0.5f + src_top;
>       float px = floor(xi);
>       float py = floor(yi);
>       float fx = xi - px;
> @@ -1147,7 +1152,7 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>       cudaTextureObject_t src_tex_2, cudaTextureObject_t src_tex_3, \
>       T *dst_0, T *dst_1, T *dst_2, T *dst_3,                       \
>       int dst_width, int dst_height, int dst_pitch,                 \
> -    int src_width, int src_height, float param
> +    int src_left, int src_top, int src_width, int src_height, float param
>   
>   #define SUBSAMPLE(Convert, T) \
>       cudaTextureObject_t src_tex[4] =                    \
> @@ -1159,6 +1164,7 @@ __device__ static inline T Subsample_Bicubic(cudaTextureObject_t tex,
>       Convert(                                            \
>           src_tex, dst, xo, yo,                           \
>           dst_width, dst_height, dst_pitch,               \
> +        src_left, src_top,                              \
>           src_width, src_height, param);
>   
>   extern "C" {

Looks good to me otherwise, will give it a test later.


More information about the ffmpeg-devel mailing list