[FFmpeg-devel] GSoC Weekly report (libswscale)

Pedro Arthur bygrandao at gmail.com
Mon Aug 17 00:13:07 CEST 2015


2015-08-15 7:24 GMT-03:00 Michael Niedermayer <michael at niedermayer.cc>:

> these are not git patches
>
Yes, they are raw git diffs.

>
> > A - New code
>
> doesnt compile (but that doesnt matter as you say this is slower anyway)
> libswscale/swscale.c: In function ‘swscale’:
> libswscale/swscale.c:529:18: error: ‘i’ undeclared (first use in this
> function)
>
Fixed.

time ./ffmpeg -i matrixbench_mpeg2.mpg -an -vf
> scale=1920:1080,scale=720:480 -f null -
>
 Performance seems good for C. But this is not a good test for measuring
the difference
between the split vs merged color conversion and horizontal scaling as the
source slice passed
to be scaled is already in YUV format and thus there is no need for color
conversion.
Indeed the split color conversion should perform better as the code path is
shorter, instead
of calling the "process" function twice, one for color conversion and one
for  h scaling, it will
call only the hscaling function.


> also this seems well working except
> make -j4 libswscale/swscale-test
> gdb --args libswscale/swscale-test
>
It seems the api is being used incorrectly in swscale-test.c.

The following code creates a sws context with srcH = H / 12, srcW = W / 12.
Next it calls the scaling functions with srcY = 0, srcH = H. Thus it is
scaling more lines
than were specified when creating the context.
Is it intended or it is a bug? If it is a bug I can put a check in the
sws_scale function, if not
I'll have to think a solution for this, as the new code expects only H/12
lines to be scaled.

sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
        AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
[...]
sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride);


I'm attaching the  diff for the fixed new code with split color
conversion/hscaling
(referenced as A previously) and a new one, that I'll call D, which is A
with line batches.
Thus you can test both approaches, split/merged color conversion
with/without line
batches.
As soon as we decide which approach is better I can send a definitive patch.
-------------- next part --------------
diff --git a/libswscale/Makefile b/libswscale/Makefile
index b11e789..24dae8a 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -15,6 +15,7 @@ OBJS = alphablend.o                                     \
        swscale_unscaled.o                               \
        utils.o                                          \
        yuv2rgb.o                                        \
+       slice.o                                          \
 
 OBJS-$(CONFIG_SHARED)        += log2_tab.o
 
diff --git a/libswscale/hscale.c b/libswscale/hscale.c
new file mode 100644
index 0000000..83f082e
--- /dev/null
+++ b/libswscale/hscale.c
@@ -0,0 +1,274 @@
+static int lum_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
+{
+    FilterContext *instance = desc->instance;
+    int srcW = desc->src->width;
+    int dstW = desc->dst->width;
+    int xInc = instance->xInc;
+
+    int i;
+    for (i = 0; i < sliceH; ++i) {
+        uint8_t ** src = desc->src->plane[0].line;
+        uint8_t ** dst = desc->dst->plane[0].line;
+        int src_pos = sliceY+i - desc->src->plane[0].sliceY;
+        int dst_pos = sliceY+i - desc->dst->plane[0].sliceY;
+
+    
+        if (c->hyscale_fast) {
+            c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
+        } else {
+            c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
+                       instance->filter_pos, instance->filter_size);
+        }
+
+        if (c->lumConvertRange)
+            c->lumConvertRange((int16_t*)dst[dst_pos], dstW);
+
+        desc->dst->plane[0].sliceH += 1;
+
+        if (desc->alpha) {
+            src = desc->src->plane[3].line;
+            dst = desc->dst->plane[3].line;
+
+            src_pos = sliceY+i - desc->src->plane[3].sliceY;
+            dst_pos = sliceY+i - desc->dst->plane[3].sliceY;
+
+            desc->dst->plane[3].sliceH += 1;
+
+            if (c->hyscale_fast) {
+                c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
+            } else {
+                c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
+                            instance->filter_pos, instance->filter_size);
+            }
+        }
+    }
+
+    return sliceH;
+}
+
+static int lum_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
+{
+    int srcW = desc->src->width;
+    ColorContext * instance = desc->instance;
+    uint32_t * pal = instance->pal;
+    int i;
+
+    desc->dst->plane[0].sliceY = sliceY;
+    desc->dst->plane[0].sliceH = sliceH;
+    desc->dst->plane[3].sliceY = sliceY;
+    desc->dst->plane[3].sliceH = sliceH;
+
+    for (i = 0; i < sliceH; ++i) {
+        int sp0 = sliceY+i - desc->src->plane[0].sliceY;
+        int sp1 = ((sliceY+i) >> desc->src->v_chr_sub_sample) - desc->src->plane[1].sliceY;
+        const uint8_t * src[4] = { desc->src->plane[0].line[sp0],
+                        desc->src->plane[1].line[sp1],
+                        desc->src->plane[2].line[sp1],
+                        desc->src->plane[3].line[sp0]};
+        uint8_t * dst = desc->dst->plane[0].line[i];
+
+        if (c->lumToYV12) {
+            c->lumToYV12(dst, src[0], src[1], src[2], srcW, pal);
+        } else if (c->readLumPlanar) {
+            c->readLumPlanar(dst, src, srcW, c->input_rgb2yuv_table);
+        } 
+        
+        
+        if (desc->alpha) {
+            dst = desc->dst->plane[3].line[i];
+            if (c->alpToYV12) {
+                c->alpToYV12(dst, src[3], src[1], src[2], srcW, pal);
+            } else if (c->readAlpPlanar) {
+                c->readAlpPlanar(dst, src, srcW, NULL);
+            }
+        }
+    }
+
+    return sliceH;
+}
+
+static int init_desc_fmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
+{
+    ColorContext * li = av_malloc(sizeof(ColorContext));
+    if (!li)
+        return AVERROR(ENOMEM);
+    li->pal = pal;
+    desc->instance = li;
+
+    desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
+    desc->src =src;
+    desc->dst = dst;
+    desc->process = &lum_convert;
+
+    return 0;
+}
+
+
+static int init_desc_hscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
+{
+    FilterContext *li = av_malloc(sizeof(FilterContext));
+    if (!li)
+        return AVERROR(ENOMEM);
+
+    li->filter = filter;
+    li->filter_pos = filter_pos;
+    li->filter_size = filter_size;
+    li->xInc = xInc;
+
+    desc->instance = li;
+
+    desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
+    desc->src = src;
+    desc->dst = dst;
+
+    desc->process = &lum_h_scale;
+
+    return 0;
+}
+
+static int chr_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
+{
+    FilterContext *instance = desc->instance;
+    int srcW = FF_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
+    int dstW = FF_CEIL_RSHIFT(desc->dst->width, desc->dst->h_chr_sub_sample);
+    int xInc = instance->xInc;
+
+    uint8_t ** src1 = desc->src->plane[1].line;
+    uint8_t ** dst1 = desc->dst->plane[1].line;
+    uint8_t ** src2 = desc->src->plane[2].line;
+    uint8_t ** dst2 = desc->dst->plane[2].line;
+
+    int src_pos1 = sliceY - desc->src->plane[1].sliceY;
+    int dst_pos1 = sliceY - desc->dst->plane[1].sliceY;
+
+    int src_pos2 = sliceY - desc->src->plane[2].sliceY;
+    int dst_pos2 = sliceY - desc->dst->plane[2].sliceY;
+
+    int i;
+    for (i = 0; i < sliceH; ++i) {
+        if (c->hcscale_fast) {
+            c->hcscale_fast(c, (uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW, src1[src_pos1+i], src2[src_pos2+i], srcW, xInc);
+        } else {
+            c->hcScale(c, (uint16_t*)dst1[dst_pos1+i], dstW, src1[src_pos1+i], instance->filter, instance->filter_pos, instance->filter_size);
+            c->hcScale(c, (uint16_t*)dst2[dst_pos2+i], dstW, src2[src_pos2+i], instance->filter, instance->filter_pos, instance->filter_size);
+        }
+
+        if (c->chrConvertRange)
+            c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW);
+
+        desc->dst->plane[1].sliceH += 1;
+        desc->dst->plane[2].sliceH += 1;
+    }
+    return sliceH;
+}
+
+static int chr_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
+{
+    int srcW = FF_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
+    ColorContext * instance = desc->instance;
+    uint32_t * pal = instance->pal;
+
+    int sp0 = (sliceY - (desc->src->plane[0].sliceY >> desc->src->v_chr_sub_sample)) << desc->src->v_chr_sub_sample;
+    int sp1 = sliceY - desc->src->plane[1].sliceY;
+
+    int i;
+
+    desc->dst->plane[1].sliceY = sliceY;
+    desc->dst->plane[1].sliceH = sliceH;
+    desc->dst->plane[2].sliceY = sliceY;
+    desc->dst->plane[2].sliceH = sliceH;
+
+    for (i = 0; i < sliceH; ++i) {
+        const uint8_t * src[4] = { desc->src->plane[0].line[sp0+i],
+                        desc->src->plane[1].line[sp1+i],
+                        desc->src->plane[2].line[sp1+i],
+                        desc->src->plane[3].line[sp0+i]};
+
+        uint8_t * dst1 = desc->dst->plane[1].line[i];
+        uint8_t * dst2 = desc->dst->plane[2].line[i];
+        if (c->chrToYV12) {
+            c->chrToYV12(dst1, dst2, src[0], src[1], src[2], srcW, pal);
+        } else if (c->readChrPlanar) {
+            c->readChrPlanar(dst1, dst2, src, srcW, c->input_rgb2yuv_table);
+        }
+    }
+    return sliceH;
+}
+
+static int init_desc_cfmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
+{
+    ColorContext * li = av_malloc(sizeof(ColorContext));
+    if (!li)
+        return AVERROR(ENOMEM);
+    li->pal = pal;
+    desc->instance = li;
+
+    desc->src =src;
+    desc->dst = dst;
+    desc->process = &chr_convert;
+
+    return 0;
+}
+
+static int init_desc_chscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
+{
+    FilterContext *li = av_malloc(sizeof(FilterContext));
+    if (!li)
+        return AVERROR(ENOMEM);
+
+    li->filter = filter;
+    li->filter_pos = filter_pos;
+    li->filter_size = filter_size;
+    li->xInc = xInc;
+
+    desc->instance = li;
+
+    desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
+    desc->src = src;
+    desc->dst = dst;
+
+    desc->process = &chr_h_scale;
+
+    return 0;
+}
+
+static void fill_ones(SwsSlice *s, int n, int is16bit)
+{
+    int i;
+    for (i = 0; i < 4; ++i) {
+        int j;
+        int size = s->plane[i].available_lines;
+        for (j = 0; j < size; ++j) {
+            int k;
+            int end = is16bit ? n>>1: n;
+            // fill also one extra element
+            end += 1;
+            if (is16bit)
+                for (k = 0; k < end; ++k)
+                    ((int32_t*)(s->plane[i].line[j]))[k] = 1<<18;
+            else
+                for (k = 0; k < end; ++k)
+                    ((int16_t*)(s->plane[i].line[j]))[k] = 1<<14;
+        }   
+    }
+}
+
+static int no_chr_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
+{
+    desc->dst->plane[1].sliceY = sliceY + sliceH - desc->dst->plane[1].available_lines;
+    desc->dst->plane[1].sliceH = desc->dst->plane[1].available_lines;
+    desc->dst->plane[2].sliceY = sliceY + sliceH - desc->dst->plane[2].available_lines;
+    desc->dst->plane[2].sliceH = desc->dst->plane[2].available_lines;
+    return 0;
+}
+
+static int init_desc_no_chr(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst)
+{
+    desc->src = src;
+    desc->dst = dst;
+    desc->alpha = 0;
+    desc->instance = NULL;
+    desc->process = &no_chr_scale;
+    return 0;
+}
+
diff --git a/libswscale/slice.c b/libswscale/slice.c
new file mode 100644
index 0000000..75065bd
--- /dev/null
+++ b/libswscale/slice.c
@@ -0,0 +1,282 @@
+#include "swscale_internal.h"
+
+static void free_lines(SwsSlice *s)
+{
+    int i;
+    for (i = 0; i < 2; ++i) {
+        int n = s->plane[i].available_lines;
+        int j;
+        for (j = 0; j < n; ++j) {
+            av_freep(&s->plane[i].line[j]);
+            if (s->is_ring)
+               s->plane[i].line[j+n] = NULL;
+        }
+    }
+
+    for (i = 0; i < 4; ++i)
+        memset(s->plane[i].line, 0, sizeof(uint8_t*) * s->plane[i].available_lines * (s->is_ring ? 3 : 1));
+    s->should_free_lines = 0;
+}
+
+/*
+ slice lines contains extra bytes for vetorial code thus @size
+ is the allocated memory size and @width is the number of pixels 
+*/
+static int alloc_lines(SwsSlice *s, int size, int width)
+{
+    int i;
+    int idx[2] = {3, 2};
+
+    s->should_free_lines = 1;
+    s->width = width;
+
+    for (i = 0; i < 2; ++i) {
+        int n = s->plane[i].available_lines;
+        int j;
+        int ii = idx[i];
+
+        av_assert0(n == s->plane[ii].available_lines);
+        for (j = 0; j < n; ++j) {
+            // chroma plane line U and V are expected to be contiguous in memory
+            // by mmx vertical scaler code
+            s->plane[i].line[j] = av_malloc(size * 2 + 32);
+            if (!s->plane[i].line[j]) {
+                free_lines(s);
+                return AVERROR(ENOMEM);
+            }
+            s->plane[ii].line[j] = s->plane[i].line[j] + size + 16; 
+            if (s->is_ring) {
+               s->plane[i].line[j+n] = s->plane[i].line[j];
+               s->plane[ii].line[j+n] = s->plane[ii].line[j];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int alloc_slice(SwsSlice *s, enum AVPixelFormat fmt, int lumLines, int chrLines, int h_sub_sample, int v_sub_sample, int ring)
+{
+    int i;
+    int size[4] = { lumLines,
+                    chrLines,
+                    chrLines,
+                    lumLines };
+
+    s->h_chr_sub_sample = h_sub_sample;
+    s->v_chr_sub_sample = v_sub_sample;
+    s->fmt = fmt;
+    s->is_ring = ring;
+    s->should_free_lines = 0;
+
+    for (i = 0; i < 4; ++i) {
+        int n = size[i] * ( ring == 0 ? 1 : 3);
+        s->plane[i].line = av_mallocz_array(sizeof(uint8_t*), n);
+        if (!s->plane[i].line) 
+            return AVERROR(ENOMEM);
+
+        s->plane[i].tmp = ring ? s->plane[i].line + size[i] * 2 : NULL;
+        s->plane[i].available_lines = size[i];
+        s->plane[i].sliceY = 0;
+        s->plane[i].sliceH = 0;
+    }
+    return 0;
+}
+
+static void free_slice(SwsSlice *s)
+{
+    int i;
+    if (s) {
+        if (s->should_free_lines)
+            free_lines(s);
+        for (i = 0; i < 4; ++i) {
+            av_freep(&s->plane[i].line);
+            s->plane[i].tmp = NULL;
+        }
+    }
+}
+
+int ff_rotate_slice(SwsSlice *s, int lum, int chr)
+{
+    int i;
+    if (lum) {
+        for (i = 0; i < 4; i+=3) {
+            int n = s->plane[i].available_lines;
+            int l = lum - s->plane[i].sliceY;
+
+            if (l >= n * 2) {
+                s->plane[i].sliceY += n;
+                s->plane[i].sliceH -= n;
+            }
+        }
+    }
+    if (chr) {
+        for (i = 1; i < 3; ++i) {
+            int n = s->plane[i].available_lines;
+            int l = chr - s->plane[i].sliceY;
+
+            if (l >= n * 2) {
+                s->plane[i].sliceY += n;
+                s->plane[i].sliceH -= n;
+            }
+        }
+    }
+    return 0;
+}
+
+int ff_init_slice_from_src(SwsSlice * s, uint8_t *src[4], int stride[4], int srcW, int lumY, int lumH, int chrY, int chrH)
+{
+    int i = 0;
+
+    const int start[4] = {lumY,
+                    chrY,
+                    chrY,
+                    lumY};
+
+    const int end[4] = {lumY +lumH,
+                        chrY + chrH,
+                        chrY + chrH,
+                        lumY + lumH};
+
+    s->width = srcW;
+
+    for (i = 0; i < 4; ++i) {
+        int j;
+        int lines = end[i];
+        lines = s->plane[i].available_lines < lines ? s->plane[i].available_lines : lines;
+
+        if (end[i] > s->plane[i].sliceY+s->plane[i].sliceH) {
+            if (start[i] <= s->plane[i].sliceY+1)
+                s->plane[i].sliceY = FFMIN(start[i], s->plane[i].sliceY);
+            else
+                s->plane[i].sliceY = start[i];
+            s->plane[i].sliceH = end[i] - s->plane[i].sliceY;
+        } else {
+            if (end[i] >= s->plane[i].sliceY)
+                s->plane[i].sliceH = s->plane[i].sliceY + s->plane[i].sliceH - start[i];
+            else
+                s->plane[i].sliceH = end[i] - start[i];
+            s->plane[i].sliceY = start[i];
+        }
+
+        for (j = start[i]; j < lines; j+= 1)
+            s->plane[i].line[j] = src[i] + (start[i] + j) * stride[i];
+
+    }
+
+    return 0;
+}
+
+#include "hscale.c"
+
+int ff_init_filters(SwsContext * c)
+{
+    int i;
+    int index;
+    int num_ydesc;
+    int num_cdesc;
+    int need_lum_conv = c->lumToYV12 || c->readLumPlanar || c->alpToYV12 || c->readAlpPlanar;
+    int need_chr_conv = c->chrToYV12 || c->readChrPlanar;
+    int srcIdx, dstIdx;
+    int dst_stride = FFALIGN(c->dstW * sizeof(int16_t) + 66, 16);
+
+    uint32_t * pal = usePal(c->srcFormat) ? c->pal_yuv : (uint32_t*)c->input_rgb2yuv_table;
+    int res = 0;
+
+    if (c->dstBpc == 16)
+        dst_stride <<= 1;
+
+    num_ydesc = need_lum_conv ? 2 : 1;
+    num_cdesc = need_chr_conv ? 2 : 1;
+
+    c->numSlice = FFMAX(num_ydesc, num_cdesc) + 1;
+    c->numDesc = num_ydesc + num_cdesc;
+    c->descIndex[0] = num_ydesc;
+    c->descIndex[1] = num_ydesc + num_cdesc;
+
+    
+
+    c->desc = av_mallocz_array(sizeof(SwsFilterDescriptor), c->numDesc);
+    if (!c->desc)
+        return AVERROR(ENOMEM);
+    c->slice = av_mallocz_array(sizeof(SwsSlice), c->numSlice);
+
+
+    res = alloc_slice(&c->slice[0], c->srcFormat, c->srcH, c->chrSrcH, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
+    if (res < 0) goto cleanup;
+    for (i = 1; i < c->numSlice-1; ++i) {
+        res = alloc_slice(&c->slice[i], c->srcFormat, c->vLumFilterSize, c->vChrFilterSize, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
+        if (res < 0) goto cleanup;
+        res = alloc_lines(&c->slice[i], FFALIGN(c->srcW*2+78, 16), c->srcW);
+        if (res < 0) goto cleanup;
+    }
+    res = alloc_slice(&c->slice[i], c->srcFormat, c->vLumFilterSize, c->vChrFilterSize, c->chrDstHSubSample, c->chrDstVSubSample, 1);
+    if (res < 0) goto cleanup;
+    res = alloc_lines(&c->slice[i], dst_stride, c->dstW);
+    if (res < 0) goto cleanup;
+
+    fill_ones(&c->slice[i], dst_stride>>1, c->dstBpc == 16);
+
+    index = 0;
+    srcIdx = 0;
+    dstIdx = 1;
+
+    if (need_lum_conv) {
+        init_desc_fmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
+        c->desc[index].alpha = c->alpPixBuf != 0;
+        ++index;
+        srcIdx = dstIdx;
+    }
+
+
+    dstIdx = FFMAX(num_ydesc, num_cdesc);
+    init_desc_hscale(&c->desc[index], &c->slice[index], &c->slice[dstIdx], c->hLumFilter, c->hLumFilterPos, c->hLumFilterSize, c->lumXInc);
+    c->desc[index].alpha = c->alpPixBuf != 0;
+
+
+    ++index;
+    {
+        srcIdx = 0;
+        dstIdx = 1;
+        if (need_chr_conv) {
+            init_desc_cfmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
+            ++index;
+            srcIdx = dstIdx;
+        }
+
+        dstIdx = FFMAX(num_ydesc, num_cdesc);
+        if (c->needs_hcscale)
+            init_desc_chscale(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], c->hChrFilter, c->hChrFilterPos, c->hChrFilterSize, c->chrXInc);
+        else
+            init_desc_no_chr(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx]);
+    }
+
+    return 0;
+
+cleanup:
+    ff_free_filters(c);
+    return res;
+}
+
+int ff_free_filters(SwsContext *c)
+{
+    int i;
+    if (c->desc) {
+        for (i = 0; i < c->numDesc; ++i)
+            av_freep(&c->desc[i].instance);
+        av_freep(&c->desc);
+    }
+
+    if (c->slice) {
+        for (i = 0; i < c->numSlice; ++i)
+            free_slice(&c->slice[i]);
+        av_freep(&c->slice);
+    }
+    return 0;
+}
+
+
+
+
+
+
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 1945e1d..e96c7ee 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -371,6 +371,15 @@ static int swscale(SwsContext *c, const uint8_t *src[],
     int lastInChrBuf = c->lastInChrBuf;
     int perform_gamma = c->is_internal_gamma;
 
+    int numDesc = c->numDesc;
+    int lumStart = 0;
+    int lumEnd = c->descIndex[0];
+    int chrStart = lumEnd;
+    int chrEnd = c->descIndex[1];
+    SwsSlice *src_slice = &c->slice[lumStart];
+    SwsSlice *dst_slice = &c->slice[c->numSlice-1];
+    SwsFilterDescriptor *desc = c->desc;
+
 
     if (!usePal(c->srcFormat)) {
         pal = c->input_rgb2yuv_table;
@@ -439,6 +448,23 @@ static int swscale(SwsContext *c, const uint8_t *src[],
     }
     lastDstY = dstY;
 
+#define NEW_FILTER 1
+
+    ff_init_slice_from_src(src_slice, (uint8_t**)src, srcStride, c->srcW,
+            srcSliceY, srcSliceH,
+            chrSrcSliceY, chrSrcSliceH);
+
+    dst_slice->plane[0].sliceY = lastInLumBuf + 1;
+    dst_slice->plane[1].sliceY = lastInChrBuf + 1;
+    dst_slice->plane[2].sliceY = lastInChrBuf + 1;
+    dst_slice->plane[3].sliceY = lastInLumBuf + 1;
+
+    dst_slice->plane[0].sliceH =
+    dst_slice->plane[1].sliceH =
+    dst_slice->plane[2].sliceH =
+    dst_slice->plane[3].sliceH = 0;
+    dst_slice->width = dstW;
+
     for (; dstY < dstH; dstY++) {
         const int chrDstY = dstY >> c->chrDstVSubSample;
         uint8_t *dest[4]  = {
@@ -460,12 +486,23 @@ static int swscale(SwsContext *c, const uint8_t *src[],
         int lastLumSrcY2 = FFMIN(c->srcH,    firstLumSrcY2 + vLumFilterSize) - 1;
         int lastChrSrcY  = FFMIN(c->chrSrcH, firstChrSrcY  + vChrFilterSize) - 1;
         int enough_lines;
+        int i;
 
         // handle holes (FAST_BILINEAR & weird filters)
-        if (firstLumSrcY > lastInLumBuf)
+        if (firstLumSrcY > lastInLumBuf) {
             lastInLumBuf = firstLumSrcY - 1;
-        if (firstChrSrcY > lastInChrBuf)
+            dst_slice->plane[0].sliceY = lastInLumBuf + 1;
+            dst_slice->plane[3].sliceY = lastInLumBuf + 1;
+            dst_slice->plane[0].sliceH =
+            dst_slice->plane[3].sliceH = 0;
+        }
+        if (firstChrSrcY > lastInChrBuf) {
             lastInChrBuf = firstChrSrcY - 1;
+            dst_slice->plane[1].sliceY = lastInChrBuf + 1;
+            dst_slice->plane[2].sliceY = lastInChrBuf + 1;
+            dst_slice->plane[1].sliceH =
+            dst_slice->plane[2].sliceH = 0;
+        }
         av_assert0(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
         av_assert0(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
 
@@ -486,6 +523,22 @@ static int swscale(SwsContext *c, const uint8_t *src[],
                           lastLumSrcY, lastChrSrcY);
         }
 
+#if NEW_FILTER
+        ff_rotate_slice(dst_slice, lastLumSrcY, lastChrSrcY);
+
+        if (lastInLumBuf < lastLumSrcY)
+            for (i = lumStart; i < lumEnd; ++i)
+                desc[i].process(c, &desc[i], lastInLumBuf + 1, lastLumSrcY - lastInLumBuf);
+        lumBufIndex += lastLumSrcY - lastInLumBuf;
+        lastInLumBuf = lastLumSrcY;
+
+        if (lastInChrBuf < lastChrSrcY)
+            for (i = chrStart; i < chrEnd; ++i)
+                desc[i].process(c, &desc[i], lastInChrBuf + 1, lastChrSrcY - lastInChrBuf);
+        chrBufIndex += lastChrSrcY - lastInChrBuf;
+        lastInChrBuf = lastChrSrcY;
+
+#else
         // Do horizontal scaling
         while (lastInLumBuf < lastLumSrcY) {
             const uint8_t *src1[4] = {
@@ -499,8 +552,8 @@ static int swscale(SwsContext *c, const uint8_t *src[],
             av_assert0(lastInLumBuf + 1 - srcSliceY < srcSliceH);
             av_assert0(lastInLumBuf + 1 - srcSliceY >= 0);
 
-            if (perform_gamma)
-                gamma_convert((uint8_t **)src1, srcW, c->inv_gamma);
+            //if (perform_gamma)
+            //    gamma_convert((uint8_t **)src1, srcW, c->inv_gamma);
 
             hyscale(c, lumPixBuf[lumBufIndex], dstW, src1, srcW, lumXInc,
                     hLumFilter, hLumFilterPos, hLumFilterSize,
@@ -535,6 +588,7 @@ static int swscale(SwsContext *c, const uint8_t *src[],
             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                           chrBufIndex, lastInChrBuf);
         }
+#endif
         // wrap buf index around to stay inside the ring buffer
         if (lumBufIndex >= vLumBufSize)
             lumBufIndex -= vLumBufSize;
@@ -560,11 +614,19 @@ static int swscale(SwsContext *c, const uint8_t *src[],
         }
 
         {
+#if NEW_FILTER
+            const int16_t **lumSrcPtr  = (const int16_t **)(void*) dst_slice->plane[0].line + firstLumSrcY - dst_slice->plane[0].sliceY;
+            const int16_t **chrUSrcPtr = (const int16_t **)(void*) dst_slice->plane[1].line + firstChrSrcY - dst_slice->plane[1].sliceY;
+            const int16_t **chrVSrcPtr = (const int16_t **)(void*) dst_slice->plane[2].line + firstChrSrcY - dst_slice->plane[2].sliceY;
+            const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ?
+                                         (const int16_t **)(void*) dst_slice->plane[3].line + firstLumSrcY - dst_slice->plane[3].sliceY : NULL;
+#else
             const int16_t **lumSrcPtr  = (const int16_t **)(void*) lumPixBuf  + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
             const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
             const int16_t **chrVSrcPtr = (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
             const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ?
                                          (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+#endif
             int16_t *vLumFilter = c->vLumFilter;
             int16_t *vChrFilter = c->vChrFilter;
 
@@ -629,8 +691,10 @@ static int swscale(SwsContext *c, const uint8_t *src[],
                     }
                 }
             } else if (yuv2packedX) {
+#if !NEW_FILTER
                 av_assert1(lumSrcPtr  + vLumFilterSize - 1 < (const int16_t **)lumPixBuf  + vLumBufSize * 2);
                 av_assert1(chrUSrcPtr + vChrFilterSize - 1 < (const int16_t **)chrUPixBuf + vChrBufSize * 2);
+#endif
                 if (c->yuv2packed1 && vLumFilterSize == 1 &&
                     vChrFilterSize <= 2) { // unscaled RGB
                     int chrAlpha = vChrFilterSize == 1 ? 0 : vChrFilter[2 * dstY + 1];
@@ -663,8 +727,8 @@ static int swscale(SwsContext *c, const uint8_t *src[],
                          chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                          alpSrcPtr, dest, dstW, dstY);
             }
-            if (perform_gamma)
-                gamma_convert(dest, dstW, c->gamma);
+            //if (perform_gamma)
+            //    gamma_convert(dest, dstW, c->gamma);
         }
     }
     if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf) {
@@ -1151,4 +1215,3 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
     av_free(rgb0_tmp);
     return ret;
 }
-
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 852dd94..2e5c45f 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -276,6 +276,9 @@ typedef void (*yuv2anyX_fn)(struct SwsContext *c, const int16_t *lumFilter,
                             const int16_t **alpSrc, uint8_t **dest,
                             int dstW, int y);
 
+struct SwsSlice;
+struct SwsFilterDescriptor;
+
 /* This struct should be aligned on at least a 32-byte boundary. */
 typedef struct SwsContext {
     /**
@@ -326,6 +329,12 @@ typedef struct SwsContext {
     uint16_t *gamma;
     uint16_t *inv_gamma;
 
+    int numDesc;
+    int descIndex[2];
+    int numSlice;
+    struct SwsSlice *slice;
+    struct SwsFilterDescriptor *desc;
+
     uint32_t pal_yuv[256];
     uint32_t pal_rgb[256];
 
@@ -934,4 +943,80 @@ static inline void fillPlane16(uint8_t *plane, int stride, int width, int height
     }
 }
 
+#define MAX_SLICE_PLANES 4
+
+/// Slice plane
+typedef struct SwsPlane
+{
+    int available_lines;    ///< max number of lines that can be hold by this plane
+    int sliceY;             ///< index of first line
+    int sliceH;             ///< number of lines
+    uint8_t **line;         ///< line buffer
+    uint8_t **tmp;          ///< Tmp line buffer used by mmx code
+} SwsPlane;
+
+/**
+ * Struct which defines a slice of an image to be scaled or a output for
+ * a scaled slice.
+ * A slice can also be used as intermediate ring buffer for scaling steps.
+ */
+typedef struct SwsSlice 
+{
+    int width;              ///< Slice line width
+    int h_chr_sub_sample;   ///< horizontal chroma subsampling factor
+    int v_chr_sub_sample;   ///< vertical chroma subsampling factor
+    int is_ring;            ///< flag to identify if this slice is a ring buffer
+    int should_free_lines;  ///< flag to identify if there are dynamic allocated lines
+    enum AVPixelFormat fmt; ///< planes pixel format
+    SwsPlane plane[MAX_SLICE_PLANES];   ///< color planes
+} SwsSlice;
+
+/**
+ * Struct which holds all necessary data for processing a slice.
+ * A processing step can be a color conversion or horizontal/vertical scaling.
+ */
+typedef struct SwsFilterDescriptor
+{
+    SwsSlice *src;  ///< Source slice
+    SwsSlice *dst;  ///< Output slice
+
+    int alpha;      ///< Flag for processing alpha channel
+    void *instance; ///< Filter instance data
+
+    /// Function for processing input slice sliceH lines starting from line sliceY
+    int (*process)(SwsContext *c, struct SwsFilterDescriptor *desc, int sliceY, int sliceH);
+} SwsFilterDescriptor;
+
+/// Color conversion instance data
+typedef struct ColorContext
+{
+    uint32_t *pal;
+} ColorContext;
+
+/// Scaler instance data
+typedef struct FilterContext
+{
+    uint16_t *filter;
+    int *filter_pos;
+    int filter_size;
+    int xInc;
+} FilterContext;
+
+// warp input lines in the form (src + width*i + j) to slice format (line[i][j])
+int ff_init_slice_from_src(SwsSlice * s, uint8_t *src[4], int stride[4], int srcW, int lumY, int lumH, int chrY, int chrH);
+
+// Initialize scaler filter descriptor chain
+int ff_init_filters(SwsContext *c);
+
+// Free all filter data
+int ff_free_filters(SwsContext *c);
+
+/*
+ function for applying ring buffer logic into slice s
+ It checks if the slice can hold more @lum lines, if yes
+ do nothing otherwise remove @lum least used lines.
+ It applyes the same procedure for @chr lines.
+*/
+int ff_rotate_slice(SwsSlice *s, int lum, int chr);
+
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 1f4dc7d..181a48a 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1702,7 +1702,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
     }
 
     c->swscale = ff_getSwsFunc(c);
-    return 0;
+    return ff_init_filters(c);
 fail: // FIXME replace things by appropriate error codes
     if (ret == RETCODE_USE_CASCADE)  {
         int tmpW = sqrt(srcW * (int64_t)dstW);
@@ -2219,6 +2219,7 @@ void sws_freeContext(SwsContext *c)
     av_freep(&c->gamma);
     av_freep(&c->inv_gamma);
 
+    ff_free_filters(c);
 
     av_free(c);
 }
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index d611b76..83c01a0 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -85,9 +85,17 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
 {
     const int dstH= c->dstH;
     const int flags= c->flags;
+#define NEW_FILTER 1
+#if NEW_FILTER
+    SwsPlane *lumPlane = &c->slice[c->numSlice-1].plane[0];
+    SwsPlane *chrUPlane = &c->slice[c->numSlice-1].plane[1];
+    SwsPlane *alpPlane = &c->slice[c->numSlice-1].plane[3];
+#else
     int16_t **lumPixBuf= c->lumPixBuf;
     int16_t **chrUPixBuf= c->chrUPixBuf;
     int16_t **alpPixBuf= c->alpPixBuf;
+#endif
+    int hasAlpha = c->alpPixBuf != NULL;
     const int vLumBufSize= c->vLumBufSize;
     const int vChrBufSize= c->vChrBufSize;
     int32_t *vLumFilterPos= c->vLumFilterPos;
@@ -110,13 +118,22 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
         c->greenDither= ff_dither4[dstY&1];
     c->redDither= ff_dither8[(dstY+1)&1];
     if (dstY < dstH - 2) {
+#if NEW_FILTER
+        const int16_t **lumSrcPtr  = (const int16_t **)(void*) lumPlane->line + firstLumSrcY - lumPlane->sliceY;
+        const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPlane->line + firstChrSrcY - chrUPlane->sliceY;
+        const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) ? (const int16_t **)(void*) alpPlane->line + firstLumSrcY - alpPlane->sliceY : NULL;
+#else
         const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
         const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
         const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+#endif
         int i;
-
         if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
+#if NEW_FILTER
+            const int16_t **tmpY = (const int16_t **) lumPlane->tmp;
+#else
             const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
+#endif
             int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
             for (i = 0; i < neg;            i++)
                 tmpY[i] = lumSrcPtr[neg];
@@ -127,7 +144,11 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
             lumSrcPtr = tmpY;
 
             if (alpSrcPtr) {
+#if NEW_FILTER
+                const int16_t **tmpA = (const int16_t **) alpPlane->tmp;
+#else
                 const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
+#endif
                 for (i = 0; i < neg;            i++)
                     tmpA[i] = alpSrcPtr[neg];
                 for (     ; i < end;            i++)
@@ -138,7 +159,11 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
             }
         }
         if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
+#if NEW_FILTER
+            const int16_t **tmpU = (const int16_t **) chrUPlane->tmp;
+#else
             const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
+#endif
             int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
             for (i = 0; i < neg;            i++) {
                 tmpU[i] = chrUSrcPtr[neg];
@@ -160,7 +185,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
                 lumMmxFilter[s*i+APCK_COEF/4  ]=
                 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
-                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+                if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
                     *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                     *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                     alpMmxFilter[s*i+APCK_COEF/4  ]=
@@ -180,7 +205,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
                 lumMmxFilter[4*i+2]=
                 lumMmxFilter[4*i+3]=
                 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
-                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+                if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
                     *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
                     alpMmxFilter[4*i+2]=
                     alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
-------------- next part --------------
diff --git a/libswscale/Makefile b/libswscale/Makefile
index b11e789..24dae8a 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -15,6 +15,7 @@ OBJS = alphablend.o                                     \
        swscale_unscaled.o                               \
        utils.o                                          \
        yuv2rgb.o                                        \
+       slice.o                                          \
 
 OBJS-$(CONFIG_SHARED)        += log2_tab.o
 
diff --git a/libswscale/hscale.c b/libswscale/hscale.c
new file mode 100644
index 0000000..83f082e
--- /dev/null
+++ b/libswscale/hscale.c
@@ -0,0 +1,274 @@
+static int lum_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
+{
+    FilterContext *instance = desc->instance;
+    int srcW = desc->src->width;
+    int dstW = desc->dst->width;
+    int xInc = instance->xInc;
+
+    int i;
+    for (i = 0; i < sliceH; ++i) {
+        uint8_t ** src = desc->src->plane[0].line;
+        uint8_t ** dst = desc->dst->plane[0].line;
+        int src_pos = sliceY+i - desc->src->plane[0].sliceY;
+        int dst_pos = sliceY+i - desc->dst->plane[0].sliceY;
+
+    
+        if (c->hyscale_fast) {
+            c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
+        } else {
+            c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
+                       instance->filter_pos, instance->filter_size);
+        }
+
+        if (c->lumConvertRange)
+            c->lumConvertRange((int16_t*)dst[dst_pos], dstW);
+
+        desc->dst->plane[0].sliceH += 1;
+
+        if (desc->alpha) {
+            src = desc->src->plane[3].line;
+            dst = desc->dst->plane[3].line;
+
+            src_pos = sliceY+i - desc->src->plane[3].sliceY;
+            dst_pos = sliceY+i - desc->dst->plane[3].sliceY;
+
+            desc->dst->plane[3].sliceH += 1;
+
+            if (c->hyscale_fast) {
+                c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
+            } else {
+                c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
+                            instance->filter_pos, instance->filter_size);
+            }
+        }
+    }
+
+    return sliceH;
+}
+
+static int lum_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
+{
+    int srcW = desc->src->width;
+    ColorContext * instance = desc->instance;
+    uint32_t * pal = instance->pal;
+    int i;
+
+    desc->dst->plane[0].sliceY = sliceY;
+    desc->dst->plane[0].sliceH = sliceH;
+    desc->dst->plane[3].sliceY = sliceY;
+    desc->dst->plane[3].sliceH = sliceH;
+
+    for (i = 0; i < sliceH; ++i) {
+        int sp0 = sliceY+i - desc->src->plane[0].sliceY;
+        int sp1 = ((sliceY+i) >> desc->src->v_chr_sub_sample) - desc->src->plane[1].sliceY;
+        const uint8_t * src[4] = { desc->src->plane[0].line[sp0],
+                        desc->src->plane[1].line[sp1],
+                        desc->src->plane[2].line[sp1],
+                        desc->src->plane[3].line[sp0]};
+        uint8_t * dst = desc->dst->plane[0].line[i];
+
+        if (c->lumToYV12) {
+            c->lumToYV12(dst, src[0], src[1], src[2], srcW, pal);
+        } else if (c->readLumPlanar) {
+            c->readLumPlanar(dst, src, srcW, c->input_rgb2yuv_table);
+        } 
+        
+        
+        if (desc->alpha) {
+            dst = desc->dst->plane[3].line[i];
+            if (c->alpToYV12) {
+                c->alpToYV12(dst, src[3], src[1], src[2], srcW, pal);
+            } else if (c->readAlpPlanar) {
+                c->readAlpPlanar(dst, src, srcW, NULL);
+            }
+        }
+    }
+
+    return sliceH;
+}
+
+static int init_desc_fmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
+{
+    ColorContext * li = av_malloc(sizeof(ColorContext));
+    if (!li)
+        return AVERROR(ENOMEM);
+    li->pal = pal;
+    desc->instance = li;
+
+    desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
+    desc->src =src;
+    desc->dst = dst;
+    desc->process = &lum_convert;
+
+    return 0;
+}
+
+
+static int init_desc_hscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
+{
+    FilterContext *li = av_malloc(sizeof(FilterContext));
+    if (!li)
+        return AVERROR(ENOMEM);
+
+    li->filter = filter;
+    li->filter_pos = filter_pos;
+    li->filter_size = filter_size;
+    li->xInc = xInc;
+
+    desc->instance = li;
+
+    desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
+    desc->src = src;
+    desc->dst = dst;
+
+    desc->process = &lum_h_scale;
+
+    return 0;
+}
+
+static int chr_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
+{
+    FilterContext *instance = desc->instance;
+    int srcW = FF_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
+    int dstW = FF_CEIL_RSHIFT(desc->dst->width, desc->dst->h_chr_sub_sample);
+    int xInc = instance->xInc;
+
+    uint8_t ** src1 = desc->src->plane[1].line;
+    uint8_t ** dst1 = desc->dst->plane[1].line;
+    uint8_t ** src2 = desc->src->plane[2].line;
+    uint8_t ** dst2 = desc->dst->plane[2].line;
+
+    int src_pos1 = sliceY - desc->src->plane[1].sliceY;
+    int dst_pos1 = sliceY - desc->dst->plane[1].sliceY;
+
+    int src_pos2 = sliceY - desc->src->plane[2].sliceY;
+    int dst_pos2 = sliceY - desc->dst->plane[2].sliceY;
+
+    int i;
+    for (i = 0; i < sliceH; ++i) {
+        if (c->hcscale_fast) {
+            c->hcscale_fast(c, (uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW, src1[src_pos1+i], src2[src_pos2+i], srcW, xInc);
+        } else {
+            c->hcScale(c, (uint16_t*)dst1[dst_pos1+i], dstW, src1[src_pos1+i], instance->filter, instance->filter_pos, instance->filter_size);
+            c->hcScale(c, (uint16_t*)dst2[dst_pos2+i], dstW, src2[src_pos2+i], instance->filter, instance->filter_pos, instance->filter_size);
+        }
+
+        if (c->chrConvertRange)
+            c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW);
+
+        desc->dst->plane[1].sliceH += 1;
+        desc->dst->plane[2].sliceH += 1;
+    }
+    return sliceH;
+}
+
+static int chr_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
+{
+    int srcW = FF_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
+    ColorContext * instance = desc->instance;
+    uint32_t * pal = instance->pal;
+
+    int sp0 = (sliceY - (desc->src->plane[0].sliceY >> desc->src->v_chr_sub_sample)) << desc->src->v_chr_sub_sample;
+    int sp1 = sliceY - desc->src->plane[1].sliceY;
+
+    int i;
+
+    desc->dst->plane[1].sliceY = sliceY;
+    desc->dst->plane[1].sliceH = sliceH;
+    desc->dst->plane[2].sliceY = sliceY;
+    desc->dst->plane[2].sliceH = sliceH;
+
+    for (i = 0; i < sliceH; ++i) {
+        const uint8_t * src[4] = { desc->src->plane[0].line[sp0+i],
+                        desc->src->plane[1].line[sp1+i],
+                        desc->src->plane[2].line[sp1+i],
+                        desc->src->plane[3].line[sp0+i]};
+
+        uint8_t * dst1 = desc->dst->plane[1].line[i];
+        uint8_t * dst2 = desc->dst->plane[2].line[i];
+        if (c->chrToYV12) {
+            c->chrToYV12(dst1, dst2, src[0], src[1], src[2], srcW, pal);
+        } else if (c->readChrPlanar) {
+            c->readChrPlanar(dst1, dst2, src, srcW, c->input_rgb2yuv_table);
+        }
+    }
+    return sliceH;
+}
+
+static int init_desc_cfmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
+{
+    ColorContext * li = av_malloc(sizeof(ColorContext));
+    if (!li)
+        return AVERROR(ENOMEM);
+    li->pal = pal;
+    desc->instance = li;
+
+    desc->src =src;
+    desc->dst = dst;
+    desc->process = &chr_convert;
+
+    return 0;
+}
+
+static int init_desc_chscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
+{
+    FilterContext *li = av_malloc(sizeof(FilterContext));
+    if (!li)
+        return AVERROR(ENOMEM);
+
+    li->filter = filter;
+    li->filter_pos = filter_pos;
+    li->filter_size = filter_size;
+    li->xInc = xInc;
+
+    desc->instance = li;
+
+    desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
+    desc->src = src;
+    desc->dst = dst;
+
+    desc->process = &chr_h_scale;
+
+    return 0;
+}
+
+static void fill_ones(SwsSlice *s, int n, int is16bit)
+{
+    int i;
+    for (i = 0; i < 4; ++i) {
+        int j;
+        int size = s->plane[i].available_lines;
+        for (j = 0; j < size; ++j) {
+            int k;
+            int end = is16bit ? n>>1: n;
+            // fill also one extra element
+            end += 1;
+            if (is16bit)
+                for (k = 0; k < end; ++k)
+                    ((int32_t*)(s->plane[i].line[j]))[k] = 1<<18;
+            else
+                for (k = 0; k < end; ++k)
+                    ((int16_t*)(s->plane[i].line[j]))[k] = 1<<14;
+        }   
+    }
+}
+
+static int no_chr_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
+{
+    desc->dst->plane[1].sliceY = sliceY + sliceH - desc->dst->plane[1].available_lines;
+    desc->dst->plane[1].sliceH = desc->dst->plane[1].available_lines;
+    desc->dst->plane[2].sliceY = sliceY + sliceH - desc->dst->plane[2].available_lines;
+    desc->dst->plane[2].sliceH = desc->dst->plane[2].available_lines;
+    return 0;
+}
+
+static int init_desc_no_chr(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst)
+{
+    desc->src = src;
+    desc->dst = dst;
+    desc->alpha = 0;
+    desc->instance = NULL;
+    desc->process = &no_chr_scale;
+    return 0;
+}
+
diff --git a/libswscale/slice.c b/libswscale/slice.c
new file mode 100644
index 0000000..ea5f759
--- /dev/null
+++ b/libswscale/slice.c
@@ -0,0 +1,282 @@
+#include "swscale_internal.h"
+
+static void free_lines(SwsSlice *s)
+{
+    int i;
+    for (i = 0; i < 2; ++i) {
+        int n = s->plane[i].available_lines;
+        int j;
+        for (j = 0; j < n; ++j) {
+            av_freep(&s->plane[i].line[j]);
+            if (s->is_ring)
+               s->plane[i].line[j+n] = NULL;
+        }
+    }
+
+    for (i = 0; i < 4; ++i)
+        memset(s->plane[i].line, 0, sizeof(uint8_t*) * s->plane[i].available_lines * (s->is_ring ? 3 : 1));
+    s->should_free_lines = 0;
+}
+
+/*
+ slice lines contains extra bytes for vetorial code thus @size
+ is the allocated memory size and @width is the number of pixels 
+*/
+static int alloc_lines(SwsSlice *s, int size, int width)
+{
+    int i;
+    int idx[2] = {3, 2};
+
+    s->should_free_lines = 1;
+    s->width = width;
+
+    for (i = 0; i < 2; ++i) {
+        int n = s->plane[i].available_lines;
+        int j;
+        int ii = idx[i];
+
+        av_assert0(n == s->plane[ii].available_lines);
+        for (j = 0; j < n; ++j) {
+            // chroma plane line U and V are expected to be contiguous in memory
+            // by mmx vertical scaler code
+            s->plane[i].line[j] = av_malloc(size * 2 + 32);
+            if (!s->plane[i].line[j]) {
+                free_lines(s);
+                return AVERROR(ENOMEM);
+            }
+            s->plane[ii].line[j] = s->plane[i].line[j] + size + 16; 
+            if (s->is_ring) {
+               s->plane[i].line[j+n] = s->plane[i].line[j];
+               s->plane[ii].line[j+n] = s->plane[ii].line[j];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int alloc_slice(SwsSlice *s, enum AVPixelFormat fmt, int lumLines, int chrLines, int h_sub_sample, int v_sub_sample, int ring)
+{
+    int i;
+    int size[4] = { lumLines,
+                    chrLines,
+                    chrLines,
+                    lumLines };
+
+    s->h_chr_sub_sample = h_sub_sample;
+    s->v_chr_sub_sample = v_sub_sample;
+    s->fmt = fmt;
+    s->is_ring = ring;
+    s->should_free_lines = 0;
+
+    for (i = 0; i < 4; ++i) {
+        int n = size[i] * ( ring == 0 ? 1 : 3);
+        s->plane[i].line = av_mallocz_array(sizeof(uint8_t*), n);
+        if (!s->plane[i].line) 
+            return AVERROR(ENOMEM);
+
+        s->plane[i].tmp = ring ? s->plane[i].line + size[i] * 2 : NULL;
+        s->plane[i].available_lines = size[i];
+        s->plane[i].sliceY = 0;
+        s->plane[i].sliceH = 0;
+    }
+    return 0;
+}
+
+static void free_slice(SwsSlice *s)
+{
+    int i;
+    if (s) {
+        if (s->should_free_lines)
+            free_lines(s);
+        for (i = 0; i < 4; ++i) {
+            av_freep(&s->plane[i].line);
+            s->plane[i].tmp = NULL;
+        }
+    }
+}
+
+int ff_rotate_slice(SwsSlice *s, int lum, int chr)
+{
+    int i;
+    if (lum) {
+        for (i = 0; i < 4; i+=3) {
+            int n = s->plane[i].available_lines;
+            int l = lum - s->plane[i].sliceY;
+
+            if (l >= n * 2) {
+                s->plane[i].sliceY += n;
+                s->plane[i].sliceH -= n;
+            }
+        }
+    }
+    if (chr) {
+        for (i = 1; i < 3; ++i) {
+            int n = s->plane[i].available_lines;
+            int l = chr - s->plane[i].sliceY;
+
+            if (l >= n * 2) {
+                s->plane[i].sliceY += n;
+                s->plane[i].sliceH -= n;
+            }
+        }
+    }
+    return 0;
+}
+
+int ff_init_slice_from_src(SwsSlice * s, uint8_t *src[4], int stride[4], int srcW, int lumY, int lumH, int chrY, int chrH)
+{
+    int i = 0;
+
+    const int start[4] = {lumY,
+                    chrY,
+                    chrY,
+                    lumY};
+
+    const int end[4] = {lumY +lumH,
+                        chrY + chrH,
+                        chrY + chrH,
+                        lumY + lumH};
+
+    s->width = srcW;
+
+    for (i = 0; i < 4; ++i) {
+        int j;
+        int lines = end[i];
+        lines = s->plane[i].available_lines < lines ? s->plane[i].available_lines : lines;
+
+        if (end[i] > s->plane[i].sliceY+s->plane[i].sliceH) {
+            if (start[i] <= s->plane[i].sliceY+1)
+                s->plane[i].sliceY = FFMIN(start[i], s->plane[i].sliceY);
+            else
+                s->plane[i].sliceY = start[i];
+            s->plane[i].sliceH = end[i] - s->plane[i].sliceY;
+        } else {
+            if (end[i] >= s->plane[i].sliceY)
+                s->plane[i].sliceH = s->plane[i].sliceY + s->plane[i].sliceH - start[i];
+            else
+                s->plane[i].sliceH = end[i] - start[i];
+            s->plane[i].sliceY = start[i];
+        }
+
+        for (j = start[i]; j < lines; j+= 1)
+            s->plane[i].line[j] = src[i] + (start[i] + j) * stride[i];
+
+    }
+
+    return 0;
+}
+
+#include "hscale.c"
+#define MAX_LINES_AHEAD 4
+int ff_init_filters(SwsContext * c)
+{
+    int i;
+    int index;
+    int num_ydesc;
+    int num_cdesc;
+    int need_lum_conv = c->lumToYV12 || c->readLumPlanar || c->alpToYV12 || c->readAlpPlanar;
+    int need_chr_conv = c->chrToYV12 || c->readChrPlanar;
+    int srcIdx, dstIdx;
+    int dst_stride = FFALIGN(c->dstW * sizeof(int16_t) + 66, 16);
+
+    uint32_t * pal = usePal(c->srcFormat) ? c->pal_yuv : (uint32_t*)c->input_rgb2yuv_table;
+    int res = 0;
+
+    if (c->dstBpc == 16)
+        dst_stride <<= 1;
+
+    num_ydesc = need_lum_conv ? 2 : 1;
+    num_cdesc = need_chr_conv ? 2 : 1;
+
+    c->numSlice = FFMAX(num_ydesc, num_cdesc) + 1;
+    c->numDesc = num_ydesc + num_cdesc;
+    c->descIndex[0] = num_ydesc;
+    c->descIndex[1] = num_ydesc + num_cdesc;
+
+    
+
+    c->desc = av_mallocz_array(sizeof(SwsFilterDescriptor), c->numDesc);
+    if (!c->desc)
+        return AVERROR(ENOMEM);
+    c->slice = av_mallocz_array(sizeof(SwsSlice), c->numSlice);
+
+
+    res = alloc_slice(&c->slice[0], c->srcFormat, c->srcH, c->chrSrcH, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
+    if (res < 0) goto cleanup;
+    for (i = 1; i < c->numSlice-1; ++i) {
+        res = alloc_slice(&c->slice[i], c->srcFormat, c->vLumFilterSize + MAX_LINES_AHEAD, c->vChrFilterSize + MAX_LINES_AHEAD, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
+        if (res < 0) goto cleanup;
+        res = alloc_lines(&c->slice[i], FFALIGN(c->srcW*2+78, 16), c->srcW);
+        if (res < 0) goto cleanup;
+    }
+    res = alloc_slice(&c->slice[i], c->srcFormat, c->vLumFilterSize + MAX_LINES_AHEAD, c->vChrFilterSize + MAX_LINES_AHEAD, c->chrDstHSubSample, c->chrDstVSubSample, 1);
+    if (res < 0) goto cleanup;
+    res = alloc_lines(&c->slice[i], dst_stride, c->dstW);
+    if (res < 0) goto cleanup;
+
+    fill_ones(&c->slice[i], dst_stride>>1, c->dstBpc == 16);
+
+    index = 0;
+    srcIdx = 0;
+    dstIdx = 1;
+
+    if (need_lum_conv) {
+        init_desc_fmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
+        c->desc[index].alpha = c->alpPixBuf != 0;
+        ++index;
+        srcIdx = dstIdx;
+    }
+
+
+    dstIdx = FFMAX(num_ydesc, num_cdesc);
+    init_desc_hscale(&c->desc[index], &c->slice[index], &c->slice[dstIdx], c->hLumFilter, c->hLumFilterPos, c->hLumFilterSize, c->lumXInc);
+    c->desc[index].alpha = c->alpPixBuf != 0;
+
+
+    ++index;
+    {
+        srcIdx = 0;
+        dstIdx = 1;
+        if (need_chr_conv) {
+            init_desc_cfmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
+            ++index;
+            srcIdx = dstIdx;
+        }
+
+        dstIdx = FFMAX(num_ydesc, num_cdesc);
+        if (c->needs_hcscale)
+            init_desc_chscale(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], c->hChrFilter, c->hChrFilterPos, c->hChrFilterSize, c->chrXInc);
+        else
+            init_desc_no_chr(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx]);
+    }
+
+    return 0;
+
+cleanup:
+    ff_free_filters(c);
+    return res;
+}
+
+int ff_free_filters(SwsContext *c)
+{
+    int i;
+    if (c->desc) {
+        for (i = 0; i < c->numDesc; ++i)
+            av_freep(&c->desc[i].instance);
+        av_freep(&c->desc);
+    }
+
+    if (c->slice) {
+        for (i = 0; i < c->numSlice; ++i)
+            free_slice(&c->slice[i]);
+        av_freep(&c->slice);
+    }
+    return 0;
+}
+
+
+
+
+
+
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 1945e1d..6c9b87b 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -371,6 +371,17 @@ static int swscale(SwsContext *c, const uint8_t *src[],
     int lastInChrBuf = c->lastInChrBuf;
     int perform_gamma = c->is_internal_gamma;
 
+    int numDesc = c->numDesc;
+    int lumStart = 0;
+    int lumEnd = c->descIndex[0];
+    int chrStart = lumEnd;
+    int chrEnd = c->descIndex[1];
+    SwsSlice *src_slice = &c->slice[lumStart];
+    SwsSlice *dst_slice = &c->slice[c->numSlice-1];
+    SwsFilterDescriptor *desc = c->desc;
+    int hasLumHoles = 1;
+    int hasChrHoles = 1;
+
 
     if (!usePal(c->srcFormat)) {
         pal = c->input_rgb2yuv_table;
@@ -439,6 +450,23 @@ static int swscale(SwsContext *c, const uint8_t *src[],
     }
     lastDstY = dstY;
 
+#define NEW_FILTER 1
+
+    ff_init_slice_from_src(src_slice, (uint8_t**)src, srcStride, c->srcW,
+            srcSliceY, srcSliceH,
+            chrSrcSliceY, chrSrcSliceH);
+
+    dst_slice->plane[0].sliceY = lastInLumBuf + 1;
+    dst_slice->plane[1].sliceY = lastInChrBuf + 1;
+    dst_slice->plane[2].sliceY = lastInChrBuf + 1;
+    dst_slice->plane[3].sliceY = lastInLumBuf + 1;
+
+    dst_slice->plane[0].sliceH =
+    dst_slice->plane[1].sliceH =
+    dst_slice->plane[2].sliceH =
+    dst_slice->plane[3].sliceH = 0;
+    dst_slice->width = dstW;
+
     for (; dstY < dstH; dstY++) {
         const int chrDstY = dstY >> c->chrDstVSubSample;
         uint8_t *dest[4]  = {
@@ -460,12 +488,30 @@ static int swscale(SwsContext *c, const uint8_t *src[],
         int lastLumSrcY2 = FFMIN(c->srcH,    firstLumSrcY2 + vLumFilterSize) - 1;
         int lastChrSrcY  = FFMIN(c->chrSrcH, firstChrSrcY  + vChrFilterSize) - 1;
         int enough_lines;
+        int i;
+        int posY, cPosY, firstPosY, lastPosY, firstCPosY, lastCPosY;
 
         // handle holes (FAST_BILINEAR & weird filters)
-        if (firstLumSrcY > lastInLumBuf)
+        if (firstLumSrcY > lastInLumBuf) {
+            hasLumHoles = lastInLumBuf != firstLumSrcY - 1;
             lastInLumBuf = firstLumSrcY - 1;
-        if (firstChrSrcY > lastInChrBuf)
+            if (hasLumHoles) {
+                dst_slice->plane[0].sliceY = lastInLumBuf + 1;
+                dst_slice->plane[3].sliceY = lastInLumBuf + 1;
+                dst_slice->plane[0].sliceH =
+                dst_slice->plane[3].sliceH = 0;
+            } 
+        }
+        if (firstChrSrcY > lastInChrBuf) {
+            hasChrHoles = lastInChrBuf != firstChrSrcY - 1;
             lastInChrBuf = firstChrSrcY - 1;
+            if (hasChrHoles) {
+                dst_slice->plane[1].sliceY = lastInChrBuf + 1;
+                dst_slice->plane[2].sliceY = lastInChrBuf + 1;
+                dst_slice->plane[1].sliceH =
+                dst_slice->plane[2].sliceH = 0;
+            }
+        }
         av_assert0(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
         av_assert0(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
 
@@ -486,6 +532,47 @@ static int swscale(SwsContext *c, const uint8_t *src[],
                           lastLumSrcY, lastChrSrcY);
         }
 
+#if NEW_FILTER
+
+#define MAX_LINES_AHEAD 4
+
+        posY = dst_slice->plane[0].sliceY + dst_slice->plane[0].sliceH;
+        if (posY <= lastLumSrcY && !hasLumHoles) {
+            firstPosY = FFMAX(firstLumSrcY, posY);
+            lastPosY = FFMIN(lastLumSrcY + MAX_LINES_AHEAD, srcSliceY + srcSliceH - 1);
+        } else {
+            firstPosY = lastInLumBuf + 1;
+            lastPosY = lastLumSrcY;
+        }
+
+        cPosY = dst_slice->plane[1].sliceY + dst_slice->plane[1].sliceH;
+        if (cPosY <= lastChrSrcY && !hasChrHoles) {
+            firstCPosY = FFMAX(firstChrSrcY, cPosY);
+            lastCPosY = FFMIN(lastChrSrcY + MAX_LINES_AHEAD, FF_CEIL_RSHIFT(srcSliceY + srcSliceH, c->chrSrcVSubSample) - 1);
+        } else {
+            firstCPosY = lastInChrBuf + 1;
+            lastCPosY = lastChrSrcY;
+        }
+
+        ff_rotate_slice(dst_slice, lastPosY, lastCPosY);
+        
+        if (posY < lastLumSrcY + 1) {
+            for (i = lumStart; i < lumEnd; ++i)
+                desc[i].process(c, &desc[i], firstPosY, lastPosY - firstPosY + 1);
+        }
+
+        lumBufIndex += lastLumSrcY - lastInLumBuf;
+        lastInLumBuf = lastLumSrcY;
+
+        if (cPosY < lastChrSrcY + 1) {
+            for (i = chrStart; i < chrEnd; ++i)
+                desc[i].process(c, &desc[i], firstCPosY, lastCPosY - firstCPosY + 1);
+        }
+
+        chrBufIndex += lastChrSrcY - lastInChrBuf;
+        lastInChrBuf = lastChrSrcY;
+
+#else
         // Do horizontal scaling
         while (lastInLumBuf < lastLumSrcY) {
             const uint8_t *src1[4] = {
@@ -499,8 +586,8 @@ static int swscale(SwsContext *c, const uint8_t *src[],
             av_assert0(lastInLumBuf + 1 - srcSliceY < srcSliceH);
             av_assert0(lastInLumBuf + 1 - srcSliceY >= 0);
 
-            if (perform_gamma)
-                gamma_convert((uint8_t **)src1, srcW, c->inv_gamma);
+            //if (perform_gamma)
+            //    gamma_convert((uint8_t **)src1, srcW, c->inv_gamma);
 
             hyscale(c, lumPixBuf[lumBufIndex], dstW, src1, srcW, lumXInc,
                     hLumFilter, hLumFilterPos, hLumFilterSize,
@@ -535,6 +622,7 @@ static int swscale(SwsContext *c, const uint8_t *src[],
             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                           chrBufIndex, lastInChrBuf);
         }
+#endif
         // wrap buf index around to stay inside the ring buffer
         if (lumBufIndex >= vLumBufSize)
             lumBufIndex -= vLumBufSize;
@@ -560,11 +648,19 @@ static int swscale(SwsContext *c, const uint8_t *src[],
         }
 
         {
+#if NEW_FILTER
+            const int16_t **lumSrcPtr  = (const int16_t **)(void*) dst_slice->plane[0].line + firstLumSrcY - dst_slice->plane[0].sliceY;
+            const int16_t **chrUSrcPtr = (const int16_t **)(void*) dst_slice->plane[1].line + firstChrSrcY - dst_slice->plane[1].sliceY;
+            const int16_t **chrVSrcPtr = (const int16_t **)(void*) dst_slice->plane[2].line + firstChrSrcY - dst_slice->plane[2].sliceY;
+            const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ?
+                                         (const int16_t **)(void*) dst_slice->plane[3].line + firstLumSrcY - dst_slice->plane[3].sliceY : NULL;
+#else
             const int16_t **lumSrcPtr  = (const int16_t **)(void*) lumPixBuf  + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
             const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
             const int16_t **chrVSrcPtr = (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
             const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ?
                                          (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+#endif
             int16_t *vLumFilter = c->vLumFilter;
             int16_t *vChrFilter = c->vChrFilter;
 
@@ -629,8 +725,10 @@ static int swscale(SwsContext *c, const uint8_t *src[],
                     }
                 }
             } else if (yuv2packedX) {
+#if !NEW_FILTER
                 av_assert1(lumSrcPtr  + vLumFilterSize - 1 < (const int16_t **)lumPixBuf  + vLumBufSize * 2);
                 av_assert1(chrUSrcPtr + vChrFilterSize - 1 < (const int16_t **)chrUPixBuf + vChrBufSize * 2);
+#endif
                 if (c->yuv2packed1 && vLumFilterSize == 1 &&
                     vChrFilterSize <= 2) { // unscaled RGB
                     int chrAlpha = vChrFilterSize == 1 ? 0 : vChrFilter[2 * dstY + 1];
@@ -663,8 +761,8 @@ static int swscale(SwsContext *c, const uint8_t *src[],
                          chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                          alpSrcPtr, dest, dstW, dstY);
             }
-            if (perform_gamma)
-                gamma_convert(dest, dstW, c->gamma);
+            //if (perform_gamma)
+            //    gamma_convert(dest, dstW, c->gamma);
         }
     }
     if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf) {
@@ -1151,4 +1249,3 @@ int attribute_align_arg sws_scale(struct SwsContext *c,
     av_free(rgb0_tmp);
     return ret;
 }
-
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 852dd94..2e5c45f 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -276,6 +276,9 @@ typedef void (*yuv2anyX_fn)(struct SwsContext *c, const int16_t *lumFilter,
                             const int16_t **alpSrc, uint8_t **dest,
                             int dstW, int y);
 
+struct SwsSlice;
+struct SwsFilterDescriptor;
+
 /* This struct should be aligned on at least a 32-byte boundary. */
 typedef struct SwsContext {
     /**
@@ -326,6 +329,12 @@ typedef struct SwsContext {
     uint16_t *gamma;
     uint16_t *inv_gamma;
 
+    int numDesc;
+    int descIndex[2];
+    int numSlice;
+    struct SwsSlice *slice;
+    struct SwsFilterDescriptor *desc;
+
     uint32_t pal_yuv[256];
     uint32_t pal_rgb[256];
 
@@ -934,4 +943,80 @@ static inline void fillPlane16(uint8_t *plane, int stride, int width, int height
     }
 }
 
+#define MAX_SLICE_PLANES 4
+
+/// Slice plane
+typedef struct SwsPlane
+{
+    int available_lines;    ///< max number of lines that can be hold by this plane
+    int sliceY;             ///< index of first line
+    int sliceH;             ///< number of lines
+    uint8_t **line;         ///< line buffer
+    uint8_t **tmp;          ///< Tmp line buffer used by mmx code
+} SwsPlane;
+
+/**
+ * Struct which defines a slice of an image to be scaled or a output for
+ * a scaled slice.
+ * A slice can also be used as intermediate ring buffer for scaling steps.
+ */
+typedef struct SwsSlice 
+{
+    int width;              ///< Slice line width
+    int h_chr_sub_sample;   ///< horizontal chroma subsampling factor
+    int v_chr_sub_sample;   ///< vertical chroma subsampling factor
+    int is_ring;            ///< flag to identify if this slice is a ring buffer
+    int should_free_lines;  ///< flag to identify if there are dynamic allocated lines
+    enum AVPixelFormat fmt; ///< planes pixel format
+    SwsPlane plane[MAX_SLICE_PLANES];   ///< color planes
+} SwsSlice;
+
+/**
+ * Struct which holds all necessary data for processing a slice.
+ * A processing step can be a color conversion or horizontal/vertical scaling.
+ */
+typedef struct SwsFilterDescriptor
+{
+    SwsSlice *src;  ///< Source slice
+    SwsSlice *dst;  ///< Output slice
+
+    int alpha;      ///< Flag for processing alpha channel
+    void *instance; ///< Filter instance data
+
+    /// Function for processing input slice sliceH lines starting from line sliceY
+    int (*process)(SwsContext *c, struct SwsFilterDescriptor *desc, int sliceY, int sliceH);
+} SwsFilterDescriptor;
+
+/// Color conversion instance data
+typedef struct ColorContext
+{
+    uint32_t *pal;
+} ColorContext;
+
+/// Scaler instance data
+typedef struct FilterContext
+{
+    uint16_t *filter;
+    int *filter_pos;
+    int filter_size;
+    int xInc;
+} FilterContext;
+
+// warp input lines in the form (src + width*i + j) to slice format (line[i][j])
+int ff_init_slice_from_src(SwsSlice * s, uint8_t *src[4], int stride[4], int srcW, int lumY, int lumH, int chrY, int chrH);
+
+// Initialize scaler filter descriptor chain
+int ff_init_filters(SwsContext *c);
+
+// Free all filter data
+int ff_free_filters(SwsContext *c);
+
+/*
+ function for applying ring buffer logic into slice s
+ It checks if the slice can hold more @lum lines, if yes
+ do nothing otherwise remove @lum least used lines.
+ It applyes the same procedure for @chr lines.
+*/
+int ff_rotate_slice(SwsSlice *s, int lum, int chr);
+
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 1f4dc7d..181a48a 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1702,7 +1702,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
     }
 
     c->swscale = ff_getSwsFunc(c);
-    return 0;
+    return ff_init_filters(c);
 fail: // FIXME replace things by appropriate error codes
     if (ret == RETCODE_USE_CASCADE)  {
         int tmpW = sqrt(srcW * (int64_t)dstW);
@@ -2219,6 +2219,7 @@ void sws_freeContext(SwsContext *c)
     av_freep(&c->gamma);
     av_freep(&c->inv_gamma);
 
+    ff_free_filters(c);
 
     av_free(c);
 }
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index d611b76..83c01a0 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -85,9 +85,17 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
 {
     const int dstH= c->dstH;
     const int flags= c->flags;
+#define NEW_FILTER 1
+#if NEW_FILTER
+    SwsPlane *lumPlane = &c->slice[c->numSlice-1].plane[0];
+    SwsPlane *chrUPlane = &c->slice[c->numSlice-1].plane[1];
+    SwsPlane *alpPlane = &c->slice[c->numSlice-1].plane[3];
+#else
     int16_t **lumPixBuf= c->lumPixBuf;
     int16_t **chrUPixBuf= c->chrUPixBuf;
     int16_t **alpPixBuf= c->alpPixBuf;
+#endif
+    int hasAlpha = c->alpPixBuf != NULL;
     const int vLumBufSize= c->vLumBufSize;
     const int vChrBufSize= c->vChrBufSize;
     int32_t *vLumFilterPos= c->vLumFilterPos;
@@ -110,13 +118,22 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
         c->greenDither= ff_dither4[dstY&1];
     c->redDither= ff_dither8[(dstY+1)&1];
     if (dstY < dstH - 2) {
+#if NEW_FILTER
+        const int16_t **lumSrcPtr  = (const int16_t **)(void*) lumPlane->line + firstLumSrcY - lumPlane->sliceY;
+        const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPlane->line + firstChrSrcY - chrUPlane->sliceY;
+        const int16_t **alpSrcPtr  = (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) ? (const int16_t **)(void*) alpPlane->line + firstLumSrcY - alpPlane->sliceY : NULL;
+#else
         const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
         const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
         const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
+#endif
         int i;
-
         if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
+#if NEW_FILTER
+            const int16_t **tmpY = (const int16_t **) lumPlane->tmp;
+#else
             const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
+#endif
             int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
             for (i = 0; i < neg;            i++)
                 tmpY[i] = lumSrcPtr[neg];
@@ -127,7 +144,11 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
             lumSrcPtr = tmpY;
 
             if (alpSrcPtr) {
+#if NEW_FILTER
+                const int16_t **tmpA = (const int16_t **) alpPlane->tmp;
+#else
                 const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
+#endif
                 for (i = 0; i < neg;            i++)
                     tmpA[i] = alpSrcPtr[neg];
                 for (     ; i < end;            i++)
@@ -138,7 +159,11 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
             }
         }
         if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
+#if NEW_FILTER
+            const int16_t **tmpU = (const int16_t **) chrUPlane->tmp;
+#else
             const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
+#endif
             int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
             for (i = 0; i < neg;            i++) {
                 tmpU[i] = chrUSrcPtr[neg];
@@ -160,7 +185,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
                 lumMmxFilter[s*i+APCK_COEF/4  ]=
                 lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
-                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+                if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
                     *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                     *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                     alpMmxFilter[s*i+APCK_COEF/4  ]=
@@ -180,7 +205,7 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrB
                 lumMmxFilter[4*i+2]=
                 lumMmxFilter[4*i+3]=
                 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
-                if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
+                if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
                     *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
                     alpMmxFilter[4*i+2]=
                     alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];


More information about the ffmpeg-devel mailing list