[FFmpeg-devel] [PATCH v2] mov: Export spherical information

Thu Nov 17 19:52:01 EET 2016

On 11/17/2016 12:53 PM, Vittorio Giovara wrote:
> This implements Spherical Video V1 and V2, as described in the
> spatial-media collection by Google.
> 
> Signed-off-by: Vittorio Giovara <vittorio.giovara at gmail.com>
> ---
> Updated following James' review.
> Please CC.
> Vittorio
> 
>  libavformat/isom.h |   7 ++
>  libavformat/mov.c  | 309 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 315 insertions(+), 1 deletion(-)
> 
> diff --git a/libavformat/isom.h b/libavformat/isom.h
> index 02bfedd..0fd9eb0 100644
> --- a/libavformat/isom.h
> +++ b/libavformat/isom.h
> @@ -24,6 +24,9 @@
>  #ifndef AVFORMAT_ISOM_H
>  #define AVFORMAT_ISOM_H
>  
> +#include "libavutil/spherical.h"
> +#include "libavutil/stereo3d.h"
> +
>  #include "avio.h"
>  #include "internal.h"
>  #include "dv.h"
> @@ -177,6 +180,10 @@ typedef struct MOVStreamContext {
>      int stsd_count;
>  
>      int32_t *display_matrix;
> +    AVStereo3D *stereo3d;
> +    AVSphericalMapping *spherical;
> +    size_t spherical_size;
> +
>      uint32_t format;
>  
>      int has_sidx;  // If there is an sidx entry for this stream.
> diff --git a/libavformat/mov.c b/libavformat/mov.c
> index 6beb027..7d02fe0 100644
> --- a/libavformat/mov.c
> +++ b/libavformat/mov.c
> @@ -42,6 +42,8 @@
>  #include "libavutil/aes.h"
>  #include "libavutil/aes_ctr.h"
>  #include "libavutil/sha.h"
> +#include "libavutil/spherical.h"
> +#include "libavutil/stereo3d.h"
>  #include "libavutil/timecode.h"
>  #include "libavcodec/ac3tab.h"
>  #include "libavcodec/mpegaudiodecheader.h"
> @@ -4497,8 +4499,258 @@ static int mov_read_tmcd(MOVContext *c, AVIOContext *pb, MOVAtom atom)
>      return 0;
>  }
>  
> +static int mov_read_st3d(MOVContext *c, AVIOContext *pb, MOVAtom atom)
> +{
> +    AVStream *st;
> +    MOVStreamContext *sc;
> +    enum AVStereo3DType type;
> +    int mode;
> +
> +    if (c->fc->nb_streams < 1)
> +        return 0;
> +
> +    st = c->fc->streams[c->fc->nb_streams - 1];
> +    sc = st->priv_data;
> +
> +    if (atom.size < 5) {
> +        av_log(c->fc, AV_LOG_ERROR, "Empty stereoscopic video box\n");
> +        return AVERROR_INVALIDDATA;
> +    }

Missing version + flags here. st3d is a fullbox. You're reading the version byte
as the mode byte.

> +
> +    mode = avio_r8(pb);
> +    switch (mode) {
> +    case 0:
> +        type = AV_STEREO3D_2D;
> +        break;
> +    case 1:
> +        type = AV_STEREO3D_TOPBOTTOM;
> +        break;
> +    case 2:
> +        type = AV_STEREO3D_SIDEBYSIDE;
> +        break;
> +    default:
> +        av_log(c->fc, AV_LOG_WARNING, "Unknown st3d mode value %d\n", mode);
> +        return 0;
> +    }
> +
> +    sc->stereo3d = av_stereo3d_alloc();
> +    if (!sc->stereo3d)
> +        return AVERROR(ENOMEM);
> +
> +    sc->stereo3d->type = type;
> +    return 0;
> +}
> +
> +static int mov_read_sv3d(MOVContext *c, AVIOContext *pb, MOVAtom atom)
> +{
> +    AVStream *st;
> +    MOVStreamContext *sc;
> +    int version, size;
> +    int32_t yaw, pitch, roll;
> +    uint32_t tag;
> +    unsigned l, t, r, b;
> +    enum AVSphericalProjection projection;
> +
> +    if (c->fc->nb_streams < 1)
> +        return 0;
> +
> +    st = c->fc->streams[c->fc->nb_streams - 1];
> +    sc = st->priv_data;
> +
> +    if (atom.size < 8) {
> +        av_log(c->fc, AV_LOG_ERROR, "Empty spherical video box\n");
> +        return AVERROR_INVALIDDATA;
> +    }
> +
> +    version = avio_r8(pb);
> +    if (version) {
> +        av_log(c->fc, AV_LOG_WARNING, "Unsupported svhd box version (%d)\n",
> +               version);

Afaics, the order for Fullboxes is

size (4 bytes) -> tag (4 bytes) -> version (1 byte) -> flags (3 bytes) ->
data (size - 12 bytes)
So you're reading the first of four size bytes as if it were the version byte.

> +        return 0;
> +    }
> +
> +    size = avio_rb24(pb);

Sizes are 4 bytes. It was correct in your previous version.

> +    if (size > atom.size)
> +        return AVERROR_INVALIDDATA;
> +
> +    tag = avio_rl32(pb);
> +    if (tag != MKTAG('s','v','h','d')) {
> +        av_log(c->fc, AV_LOG_ERROR, "Missing spherical video header\n");
> +        return 0;
> +    }
> +    avio_skip(pb, size - 8); /* metadata_source */
> +
> +    version = avio_r8(pb);
> +    if (version) {
> +        av_log(c->fc, AV_LOG_WARNING, "Unsupported proj box version (%d)\n",

proj is not defined as a fullbox, so no version + flags.

This is why I said a sample with non-zero values would have been better.
These samples you have are full of zeroed boxes, which makes it hard to verify
that you're reading at the correct offsets.
Look at the box sizes in the equirectangular sample:

st3d has a size of 13, meaning size (4) tag (4) version+flags (4) mode (1)
sv3d has a size of 81, meaning size (4) tag (4) svhd+proj boxes (73)
svhd has a size of 13, meaning size (4) tag (4) version+flags (4) string (1)
proj has a size of 60, meaning size (4) tag (4) prhd+equi boxes (52)
prhd has a size of 24, meaning size (4) tag (4) version+flags (4) yaw+pitch+roll (12)
equi has a size of 28, meaning size (4) tag (4) version+flags (4) t+b+l+r (16)

And the cube sample (only the ones that differ from the above)

sv3d has a size of 73, meaning size (4) tag (4) svhd+proj boxes (65)
proj has a size of 52, meaning size (4) tag (4) prhd+cbmp boxes (44)
cbmp has a size of 20, meaning size (4) tag (4) version+flags (4) layout+padding (8)

All the version, flags, and box-specific fields are zeroed, as are three out of
the four bytes of each size field.

> +               version);
> +        return 0;
> +    }
> +
> +    size = avio_rb24(pb);
> +    if (size > atom.size)
> +        return AVERROR_INVALIDDATA;
> +
> +    tag = avio_rl32(pb);
> +    if (tag != MKTAG('p','r','o','j')) {
> +        av_log(c->fc, AV_LOG_ERROR, "Missing projection box\n");
> +        return 0;
> +    }
> +
> +    version = avio_r8(pb);
> +    if (version) {
> +        av_log(c->fc, AV_LOG_WARNING, "Unsupported prhd box version (%d)\n",
> +               version);
> +        return 0;
> +    }
> +
> +    size = avio_rb24(pb);
> +    if (size > atom.size)
> +        return AVERROR_INVALIDDATA;
> +
> +    tag = avio_rl32(pb);
> +    if (tag != MKTAG('p','r','h','d')) {
> +        av_log(c->fc, AV_LOG_ERROR, "Missing projection header box\n");
> +        return 0;
> +    }
> +
> +    /* 16.16 fixed point */
> +    yaw   = avio_rb32(pb);
> +    pitch = avio_rb32(pb);
> +    roll  = avio_rb32(pb);
> +
> +    avio_skip(pb, size - 20);
> +
> +    version = avio_r8(pb);
> +    if (version) {
> +        av_log(c->fc, AV_LOG_WARNING, "Unsupported box version (%d)\n",
> +               version);

You could instead do (once you correctly read the version after the tag):

av_log(c->fc, AV_LOG_WARNING, "Unsupported %.4s box version (%d)\n",
       (char*)&tag, version);

> +        return 0;
> +    }
> +
> +    size = avio_rb24(pb);
> +    if (size > atom.size)
> +        return AVERROR_INVALIDDATA;
> +
> +    tag = avio_rl32(pb);
> +    switch (tag) {
> +    case MKTAG('c','b','m','p'):
> +        projection = AV_SPHERICAL_CUBEMAP;
> +        avio_skip(pb, 4); /* layout */
> +        l = t = r = b = avio_rb32(pb);
> +        break;
> +    case MKTAG('e','q','u','i'):
> +        projection = AV_SPHERICAL_EQUIRECTANGULAR;
> +        t = avio_rb32(pb);
> +        b = avio_rb32(pb);
> +        l = avio_rb32(pb);
> +        r = avio_rb32(pb);
> +        break;
> +    default:
> +        av_log(c->fc, AV_LOG_ERROR, "Unknown projection type\n");
> +        return 0;
> +    }
> +
> +    sc->spherical = av_spherical_alloc(&sc->spherical_size);
> +    if (!sc->spherical)
> +        return AVERROR(ENOMEM);
> +
> +    sc->spherical->projection = projection;
> +
> +    sc->spherical->yaw   = yaw;
> +    sc->spherical->pitch = pitch;
> +    sc->spherical->roll  = roll;
> +
> +    sc->spherical->left_offset   = l;
> +    sc->spherical->top_offset    = t;
> +    sc->spherical->right_offset  = r;
> +    sc->spherical->bottom_offset = b;
> +
> +    return 0;
> +}
> +
> +static int mov_parse_uuid_spherical(MOVStreamContext *sc, AVIOContext *pb, size_t len)
> +{
> +    int ret = 0;
> +    uint8_t *buffer = av_malloc(len + 1);
> +    const char *val;
> +
> +    if (!buffer)
> +        return AVERROR(ENOMEM);
> +    buffer[len] = '\0';
> +
> +    ret = ffio_read_size(pb, buffer, len);
> +    if (ret < 0)
> +        goto out;
> +
> +    /* Check for mandatory keys and values, try to support XML as best-effort */
> +    if (av_stristr(buffer, "<GSpherical:StitchingSoftware>") &&
> +        (val = av_stristr(buffer, "<GSpherical:Spherical>")) &&
> +        av_stristr(val, "true") &&
> +        (val = av_stristr(buffer, "<GSpherical:Stitched>")) &&
> +        av_stristr(val, "true") &&
> +        (val = av_stristr(buffer, "<GSpherical:ProjectionType>")) &&
> +        av_stristr(val, "equirectangular")) {
> +        sc->spherical = av_spherical_alloc(&sc->spherical_size);
> +        if (!sc->spherical)
> +            goto out;
> +
> +        sc->spherical->projection = AV_SPHERICAL_EQUIRECTANGULAR;
> +
> +        if (av_stristr(buffer, "<GSpherical:StereoMode>")) {
> +            enum AVStereo3DType mode;
> +
> +            if (av_stristr(buffer, "left-right"))
> +                mode = AV_STEREO3D_SIDEBYSIDE;
> +            else if (av_stristr(buffer, "top-bottom"))
> +                mode = AV_STEREO3D_TOPBOTTOM;
> +            else
> +                mode = AV_STEREO3D_2D;
> +
> +            sc->stereo3d = av_stereo3d_alloc();
> +            if (!sc->stereo3d)
> +                goto out;
> +
> +            sc->stereo3d->type = mode;
> +        }
> +
> +        /* orientation */
> +        val = av_stristr(buffer, "<GSpherical:InitialViewHeadingDegrees>");
> +        if (val)
> +            sc->spherical->yaw = strtol(val, NULL, 10) * (1 << 16);
> +        val = av_stristr(buffer, "<GSpherical:InitialViewPitchDegrees>");
> +        if (val)
> +            sc->spherical->pitch = strtol(val, NULL, 10) * (1 << 16);
> +        val = av_stristr(buffer, "<GSpherical:InitialViewRollDegrees>");
> +        if (val)
> +            sc->spherical->roll = strtol(val, NULL, 10) * (1 << 16);
> +
> +        /* cropping */
> +        val = av_stristr(buffer, "<GSpherical:CroppedAreaLeftPixels>");
> +        if (val)
> +            sc->spherical->left_offset = strtol(val, NULL, 10);
> +        val = av_stristr(buffer, "<GSpherical:CroppedAreaTopPixels>");
> +        if (val)
> +            sc->spherical->top_offset = strtol(val, NULL, 10);
> +        val = av_stristr(buffer, "<GSpherical:CroppedAreaImageWidthPixels>");
> +        if (val)
> +            sc->spherical->right_offset =
> +                sc->width - sc->spherical->left_offset - strtol(val, NULL, 10);
> +        val = av_stristr(buffer, "<GSpherical:CroppedAreaImageHeightPixels>");
> +        if (val)
> +            sc->spherical->bottom_offset =
> +                sc->height - sc->spherical->top_offset - strtol(val, NULL, 10);
> +    }
> +
> +out:
> +    av_free(buffer);
> +    return ret;
> +}
> +
>  static int mov_read_uuid(MOVContext *c, AVIOContext *pb, MOVAtom atom)
>  {
> +    AVStream *st;
> +    MOVStreamContext *sc;
>      int ret;
>      uint8_t uuid[16];
>      static const uint8_t uuid_isml_manifest[] = {
> @@ -4509,10 +4761,19 @@ static int mov_read_uuid(MOVContext *c, AVIOContext *pb, MOVAtom atom)
>          0xbe, 0x7a, 0xcf, 0xcb, 0x97, 0xa9, 0x42, 0xe8,
>          0x9c, 0x71, 0x99, 0x94, 0x91, 0xe3, 0xaf, 0xac
>      };
> +    static const uint8_t uuid_spherical[] = {
> +        0xff, 0xcc, 0x82, 0x63, 0xf8, 0x55, 0x4a, 0x93,
> +        0x88, 0x14, 0x58, 0x7a, 0x02, 0x52, 0x1f, 0xdd,
> +    };
>  
>      if (atom.size < sizeof(uuid) || atom.size == INT64_MAX)
>          return AVERROR_INVALIDDATA;
>  
> +    if (c->fc->nb_streams < 1)
> +        return 0;
> +    st = c->fc->streams[c->fc->nb_streams - 1];
> +    sc = st->priv_data;
> +
>      ret = avio_read(pb, uuid, sizeof(uuid));
>      if (ret < 0) {
>          return ret;
> @@ -4584,7 +4845,14 @@ static int mov_read_uuid(MOVContext *c, AVIOContext *pb, MOVAtom atom)
>              av_dict_set(&c->fc->metadata, "xmp", buffer, 0);
>          }
>          av_free(buffer);
> -    }
> +    } else if (!memcmp(uuid, uuid_spherical, sizeof(uuid))) {
> +        size_t len = atom.size - sizeof(uuid);
> +        ret = mov_parse_uuid_spherical(sc, pb, len);
> +        if (ret < 0)
> +            return ret;
> +        if (!sc->spherical)
> +            av_log(c->fc, AV_LOG_WARNING, "Invalid spherical metadata found\n");    }
> +
>      return 0;
>  }
>  
> @@ -4934,6 +5202,8 @@ static const MOVParseTableEntry mov_default_parse_table[] = {
>  { MKTAG('f','r','m','a'), mov_read_frma },
>  { MKTAG('s','e','n','c'), mov_read_senc },
>  { MKTAG('s','a','i','z'), mov_read_saiz },
> +{ MKTAG('s','t','3','d'), mov_read_st3d }, /* stereoscopic 3D video box */
> +{ MKTAG('s','v','3','d'), mov_read_sv3d }, /* spherical video box */
>  { 0, NULL }
>  };
>  
> @@ -5354,6 +5624,9 @@ static int mov_read_close(AVFormatContext *s)
>          av_freep(&sc->cenc.auxiliary_info);
>          av_freep(&sc->cenc.auxiliary_info_sizes);
>          av_aes_ctr_free(sc->cenc.aes_ctr);
> +
> +        av_freep(&sc->stereo3d);
> +        av_freep(&sc->spherical);
>      }
>  
>      if (mov->dv_demux) {
> @@ -5681,6 +5954,40 @@ static int mov_read_header(AVFormatContext *s)
>                  sd->data = (uint8_t*)sc->display_matrix;
>                  sc->display_matrix = NULL;
>              }
> +            if (sc->stereo3d) {
> +                AVPacketSideData *sd, *tmp;
> +
> +                tmp = av_realloc_array(st->side_data,
> +                                       st->nb_side_data + 1, sizeof(*tmp));
> +                if (!tmp)
> +                    return AVERROR(ENOMEM);
> +
> +                st->side_data = tmp;
> +                st->nb_side_data++;
> +
> +                sd = &st->side_data[st->nb_side_data - 1];
> +                sd->type = AV_PKT_DATA_STEREO3D;
> +                sd->size = sizeof(*sc->stereo3d);
> +                sd->data = (uint8_t *)sc->stereo3d;
> +                sc->stereo3d = NULL;
> +            }
> +            if (sc->spherical) {
> +                AVPacketSideData *sd, *tmp;
> +
> +                tmp = av_realloc_array(st->side_data,
> +                                       st->nb_side_data + 1, sizeof(*tmp));
> +                if (!tmp)
> +                    return AVERROR(ENOMEM);
> +
> +                st->side_data = tmp;
> +                st->nb_side_data++;
> +
> +                sd = &st->side_data[st->nb_side_data - 1];
> +                sd->type = AV_PKT_DATA_SPHERICAL;
> +                sd->size = sc->spherical_size;
> +                sd->data = (uint8_t *)sc->spherical;
> +                sc->spherical = NULL;
> +            }

I sent a patchset to add a new av_stream_add_side_data() function to remove
this kind of code duplication.
If it's reviewed and committed before you send a new version of this patch
then consider using it so a separate commit replacing the above can be
avoided.

>              break;
>          }
>      }
>