From c787781efbc572442946d4f95da6ce375d9f63dc Mon Sep 17 00:00:00 2001
From: Andrzej Kapolka
Date: Tue, 30 Jul 2013 15:11:32 -0700
Subject: [PATCH] Send color and depth as separate streams (rather than one on
 top of the other) so that we can control their bitrates separately.

---
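Notes: with this change, the payload of a PACKET_TYPE_AVATAR_FACE_VIDEO message
carries two independently encoded VP8 streams instead of a single double-height
image: a float aspect ratio, the byte length of the color frame, the color frame
itself, and (optionally) a depth frame occupying the remainder. The sketch below
shows how a receiver splits such a payload, mirroring the decode path in
Face::processVideoMessage; the FaceVideoPayload and parseFaceVideoPayload names
are illustrative only and are not part of this patch:

    #include <cstring>
    #include <QByteArray>

    struct FaceVideoPayload {
        float aspectRatio;     // smoothed face rect width / height
        QByteArray colorFrame; // VP8 color frame
        QByteArray depthFrame; // VP8 depth frame; empty if none was sent
    };

    FaceVideoPayload parseFaceVideoPayload(const QByteArray& payload) {
        FaceVideoPayload result;
        const char* data = payload.constData();
        // leading aspect ratio, then the color frame length; the length is
        // written as a raw size_t, so sender and receiver must agree on its
        // width (an assumption the patch itself makes)
        memcpy(&result.aspectRatio, data, sizeof(float));
        size_t colorSize;
        memcpy(&colorSize, data + sizeof(float), sizeof(size_t));
        const char* colorData = data + sizeof(float) + sizeof(size_t);
        result.colorFrame = QByteArray(colorData, (int)colorSize);
        // any bytes left over are the optional depth frame
        int depthSize = payload.size() - (int)(sizeof(float) + sizeof(size_t) + colorSize);
        if (depthSize > 0) {
            result.depthFrame = QByteArray(colorData + colorSize, depthSize);
        }
        return result;
    }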
 interface/src/Webcam.cpp               | 88 +++++++++++++++---------
 interface/src/Webcam.h                 |  3 +-
 interface/src/avatar/Face.cpp          | 93 +++++++++++++++-----------
 interface/src/avatar/Face.h            |  3 +-
 libraries/shared/src/PacketHeaders.cpp |  7 +-
 5 files changed, 120 insertions(+), 74 deletions(-)

diff --git a/interface/src/Webcam.cpp b/interface/src/Webcam.cpp
index b73537f6ae..2a58d51dae 100644
--- a/interface/src/Webcam.cpp
+++ b/interface/src/Webcam.cpp
@@ -269,7 +269,7 @@ void Webcam::setFrame(const Mat& color, int format, const Mat& depth, float mean
 }
 
 FrameGrabber::FrameGrabber() : _initialized(false), _capture(0), _searchWindow(0, 0, 0, 0),
-    _smoothedMeanFaceDepth(UNINITIALIZED_FACE_DEPTH), _codec(), _frameCount(0) {
+    _smoothedMeanFaceDepth(UNINITIALIZED_FACE_DEPTH), _colorCodec(), _depthCodec(), _frameCount(0) {
 }
 
 FrameGrabber::~FrameGrabber() {
@@ -377,9 +377,13 @@ void FrameGrabber::shutdown() {
         cvReleaseCapture(&_capture);
         _capture = 0;
     }
-    if (_codec.name != 0) {
-        vpx_codec_destroy(&_codec);
-        _codec.name = 0;
+    if (_colorCodec.name != 0) {
+        vpx_codec_destroy(&_colorCodec);
+        _colorCodec.name = 0;
+    }
+    if (_depthCodec.name != 0) {
+        vpx_codec_destroy(&_depthCodec);
+        _depthCodec.name = 0;
     }
     _initialized = false;
 
@@ -492,17 +496,19 @@ void FrameGrabber::grabFrame() {
 
     const int ENCODED_FACE_WIDTH = 128;
     const int ENCODED_FACE_HEIGHT = 128;
-    int combinedFaceHeight = ENCODED_FACE_HEIGHT * (depth.empty() ? 1 : 2);
-    if (_codec.name == 0) {
-        // initialize encoder context
+    if (_colorCodec.name == 0) {
+        // initialize encoder context(s)
         vpx_codec_enc_cfg_t codecConfig;
         vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &codecConfig, 0);
-        const int QUALITY_MULTIPLIER = 2;
-        codecConfig.rc_target_bitrate = QUALITY_MULTIPLIER * ENCODED_FACE_WIDTH * combinedFaceHeight *
+        codecConfig.rc_target_bitrate = ENCODED_FACE_WIDTH * ENCODED_FACE_HEIGHT *
             codecConfig.rc_target_bitrate / codecConfig.g_w / codecConfig.g_h;
         codecConfig.g_w = ENCODED_FACE_WIDTH;
-        codecConfig.g_h = combinedFaceHeight;
-        vpx_codec_enc_init(&_codec, vpx_codec_vp8_cx(), &codecConfig, 0);
+        codecConfig.g_h = ENCODED_FACE_HEIGHT;
+        vpx_codec_enc_init(&_colorCodec, vpx_codec_vp8_cx(), &codecConfig, 0);
+
+        if (!depth.empty()) {
+            vpx_codec_enc_init(&_depthCodec, vpx_codec_vp8_cx(), &codecConfig, 0);
+        }
     }
 
     // correct for 180 degree rotations
@@ -539,9 +545,9 @@ void FrameGrabber::grabFrame() {
     const int ENCODED_BITS_PER_VU = 2;
     const int ENCODED_BITS_PER_PIXEL = ENCODED_BITS_PER_Y + 2 * ENCODED_BITS_PER_VU;
     const int BITS_PER_BYTE = 8;
-    _encodedFace.fill(128, ENCODED_FACE_WIDTH * combinedFaceHeight * ENCODED_BITS_PER_PIXEL / BITS_PER_BYTE);
+    _encodedFace.resize(ENCODED_FACE_WIDTH * ENCODED_FACE_HEIGHT * ENCODED_BITS_PER_PIXEL / BITS_PER_BYTE);
     vpx_image_t vpxImage;
-    vpx_img_wrap(&vpxImage, VPX_IMG_FMT_YV12, ENCODED_FACE_WIDTH, combinedFaceHeight, 1, (unsigned char*)_encodedFace.data());
+    vpx_img_wrap(&vpxImage, VPX_IMG_FMT_YV12, ENCODED_FACE_WIDTH, ENCODED_FACE_HEIGHT, 1, (unsigned char*)_encodedFace.data());
     uchar* yline = vpxImage.planes[0];
     uchar* vline = vpxImage.planes[1];
     uchar* uline = vpxImage.planes[2];
@@ -588,6 +594,24 @@ void FrameGrabber::grabFrame() {
         uline += vpxImage.stride[2];
     }
 
+    // encode the frame
+    vpx_codec_encode(&_colorCodec, &vpxImage, ++_frameCount, 1, 0, VPX_DL_REALTIME);
+
+    // start the payload off with the aspect ratio
+    QByteArray payload(sizeof(float), 0);
+    *(float*)payload.data() = _smoothedFaceRect.size.width / _smoothedFaceRect.size.height;
+
+    // extract the encoded frame
+    vpx_codec_iter_t iterator = 0;
+    const vpx_codec_cx_pkt_t* packet;
+    while ((packet = vpx_codec_get_cx_data(&_colorCodec, &iterator)) != 0) {
+        if (packet->kind == VPX_CODEC_CX_FRAME_PKT) {
+            // prepend the length, which will indicate whether there's a depth frame too
+            payload.append((const char*)&packet->data.frame.sz, sizeof(packet->data.frame.sz));
+            payload.append((const char*)packet->data.frame.buf, packet->data.frame.sz);
+        }
+    }
+
     if (!depth.empty()) {
         // warp the face depth without interpolation (because it will contain invalid zero values)
         _faceDepth.create(ENCODED_FACE_WIDTH, ENCODED_FACE_HEIGHT, CV_16UC1);
@@ -621,12 +645,14 @@ void FrameGrabber::grabFrame() {
         depth.convertTo(_grayDepthFrame, CV_8UC1, 1.0, depthOffset);
 
         // likewise for the encoded representation
-        uchar* yline = vpxImage.planes[0] + vpxImage.stride[0] * ENCODED_FACE_HEIGHT;
-        uchar* vline = vpxImage.planes[1] + vpxImage.stride[1] * (ENCODED_FACE_HEIGHT / 2);
+        uchar* yline = vpxImage.planes[0];
+        uchar* vline = vpxImage.planes[1];
+        uchar* uline = vpxImage.planes[2];
        const uchar EIGHT_BIT_MAXIMUM = 255;
        for (int i = 0; i < ENCODED_FACE_HEIGHT; i += 2) {
            uchar* ydest = yline;
            uchar* vdest = vline;
+            uchar* udest = uline;
            for (int j = 0; j < ENCODED_FACE_WIDTH; j += 2) {
                ushort tl = *_faceDepth.ptr(i, j);
                ushort tr = *_faceDepth.ptr(i, j + 1);
@@ -644,28 +670,28 @@ void FrameGrabber::grabFrame() {
                 ydest += 2;
 
                 *vdest++ = mask;
+                *udest++ = EIGHT_BIT_MIDPOINT;
             }
             yline += vpxImage.stride[0] * 2;
             vline += vpxImage.stride[1];
+            uline += vpxImage.stride[2];
+        }
+
+        // encode the frame
+        vpx_codec_encode(&_depthCodec, &vpxImage, _frameCount, 1, 0, VPX_DL_REALTIME);
+
+        // extract the encoded frame
+        vpx_codec_iter_t iterator = 0;
+        const vpx_codec_cx_pkt_t* packet;
+        while ((packet = vpx_codec_get_cx_data(&_depthCodec, &iterator)) != 0) {
+            if (packet->kind == VPX_CODEC_CX_FRAME_PKT) {
+                payload.append((const char*)packet->data.frame.buf, packet->data.frame.sz);
+            }
         }
     }
 
-    // encode the frame
-    vpx_codec_encode(&_codec, &vpxImage, ++_frameCount, 1, 0, VPX_DL_REALTIME);
-
-    // extract the encoded frame
-    vpx_codec_iter_t iterator = 0;
-    const vpx_codec_cx_pkt_t* packet;
-    while ((packet = vpx_codec_get_cx_data(&_codec, &iterator)) != 0) {
-        if (packet->kind == VPX_CODEC_CX_FRAME_PKT) {
-            // prepend the aspect ratio
-            QByteArray payload(sizeof(float), 0);
-            *(float*)payload.data() = _smoothedFaceRect.size.width / _smoothedFaceRect.size.height;
-            payload.append((const char*)packet->data.frame.buf, packet->data.frame.sz);
-            QMetaObject::invokeMethod(Application::getInstance(), "sendAvatarFaceVideoMessage", Q_ARG(int, _frameCount),
-                Q_ARG(QByteArray, payload));
-        }
-    }
+    QMetaObject::invokeMethod(Application::getInstance(), "sendAvatarFaceVideoMessage",
+        Q_ARG(int, _frameCount), Q_ARG(QByteArray, payload));
 
     QMetaObject::invokeMethod(Application::getInstance()->getWebcam(), "setFrame", Q_ARG(cv::Mat, color),
         Q_ARG(int, format), Q_ARG(cv::Mat, _grayDepthFrame), Q_ARG(float, _smoothedMeanFaceDepth),
diff --git a/interface/src/Webcam.h b/interface/src/Webcam.h
index 16cf3a33a1..6c6d250897 100644
--- a/interface/src/Webcam.h
+++ b/interface/src/Webcam.h
@@ -120,7 +120,8 @@ private:
     cv::Mat _grayDepthFrame;
     float _smoothedMeanFaceDepth;
 
-    vpx_codec_ctx_t _codec;
+    vpx_codec_ctx_t _colorCodec;
+    vpx_codec_ctx_t _depthCodec;
     int _frameCount;
     cv::Mat _faceColor;
     cv::Mat _faceDepth;
diff --git a/interface/src/avatar/Face.cpp b/interface/src/avatar/Face.cpp
index 5d6c4984bd..f25426a5be 100644
--- a/interface/src/avatar/Face.cpp
+++ b/interface/src/avatar/Face.cpp
@@ -30,19 +30,25 @@ GLuint Face::_vboID;
 GLuint Face::_iboID;
 
 Face::Face(Head* owningHead) : _owningHead(owningHead), _renderMode(MESH),
-    _colorTextureID(0), _depthTextureID(0), _codec(), _frameCount(0) {
+    _colorTextureID(0), _depthTextureID(0), _colorCodec(), _depthCodec(), _frameCount(0) {
     // we may have been created in the network thread, but we live in the main thread
     moveToThread(Application::getInstance()->thread());
 }
 
 Face::~Face() {
-    if (_codec.name != 0) {
-        vpx_codec_destroy(&_codec);
+    if (_colorCodec.name != 0) {
+        vpx_codec_destroy(&_colorCodec);
 
-        // delete our textures, since we know that we own them
+        // delete our texture, since we know that we own it
         if (_colorTextureID != 0) {
             glDeleteTextures(1, &_colorTextureID);
         }
+
+    }
+    if (_depthCodec.name != 0) {
+        vpx_codec_destroy(&_depthCodec);
+
+        // delete our texture, since we know that we own it
         if (_depthTextureID != 0) {
             glDeleteTextures(1, &_depthTextureID);
         }
@@ -55,9 +61,9 @@ void Face::setTextureRect(const cv::RotatedRect& textureRect) {
 }
 
 int Face::processVideoMessage(unsigned char* packetData, size_t dataBytes) {
-    if (_codec.name == 0) {
+    if (_colorCodec.name == 0) {
         // initialize decoder context
-        vpx_codec_dec_init(&_codec, vpx_codec_vp8_dx(), 0, 0);
+        vpx_codec_dec_init(&_colorCodec, vpx_codec_vp8_dx(), 0, 0);
     }
     // skip the header
     unsigned char* packetPosition = packetData;
@@ -85,14 +91,14 @@ int Face::processVideoMessage(unsigned char* packetData, size_t dataBytes) {
 
     if ((_frameBytesRemaining -= payloadSize) <= 0) {
         float aspectRatio = *(const float*)_arrivingFrame.constData();
-        vpx_codec_decode(&_codec, (const uint8_t*)_arrivingFrame.constData() + sizeof(float),
-            _arrivingFrame.size() - sizeof(float), 0, 0);
+        size_t colorSize = *(const size_t*)(_arrivingFrame.constData() + sizeof(float));
+        const uint8_t* colorData = (const uint8_t*)(_arrivingFrame.constData() + sizeof(float) + sizeof(size_t));
+        vpx_codec_decode(&_colorCodec, colorData, colorSize, 0, 0);
         vpx_codec_iter_t iterator = 0;
         vpx_image_t* image;
-        while ((image = vpx_codec_get_frame(&_codec, &iterator)) != 0) {
+        while ((image = vpx_codec_get_frame(&_colorCodec, &iterator)) != 0) {
             // convert from YV12 to RGB
-            const int imageHeight = image->d_w;
-            Mat color(imageHeight, image->d_w, CV_8UC3);
+            Mat color(image->d_h, image->d_w, CV_8UC3);
             uchar* yline = image->planes[0];
             uchar* vline = image->planes[1];
             uchar* uline = image->planes[2];
@@ -100,7 +106,7 @@ int Face::processVideoMessage(unsigned char* packetData, size_t dataBytes) {
             const int GREEN_V_WEIGHT = (int)(0.714 * 256);
             const int GREEN_U_WEIGHT = (int)(0.344 * 256);
             const int BLUE_U_WEIGHT = (int)(1.773 * 256);
-            for (int i = 0; i < imageHeight; i += 2) {
+            for (int i = 0; i < image->d_h; i += 2) {
                 uchar* ysrc = yline;
                 uchar* vsrc = vline;
                 uchar* usrc = uline;
@@ -144,34 +150,45 @@ int Face::processVideoMessage(unsigned char* packetData, size_t dataBytes) {
                 uline += image->stride[2];
             }
             Mat depth;
-            if (image->d_h > imageHeight) {
-                // if the height is greater than the width, we have depth data
-                depth.create(imageHeight, image->d_w, CV_8UC1);
-                uchar* yline = image->planes[0] + image->stride[0] * imageHeight;
-                uchar* vline = image->planes[1] + image->stride[1] * (imageHeight / 2);
-                const uchar EIGHT_BIT_MAXIMUM = 255;
-                const uchar MASK_THRESHOLD = 192;
-                for (int i = 0; i < imageHeight; i += 2) {
-                    uchar* ysrc = yline;
-                    uchar* vsrc = vline;
-                    for (int j = 0; j < image->d_w; j += 2) {
-
-                        if (*vsrc++ >= MASK_THRESHOLD) {
-                            *depth.ptr(i, j) = EIGHT_BIT_MAXIMUM;
-                            *depth.ptr(i, j + 1) = EIGHT_BIT_MAXIMUM;
-                            *depth.ptr(i + 1, j) = EIGHT_BIT_MAXIMUM;
-                            *depth.ptr(i + 1, j + 1) = EIGHT_BIT_MAXIMUM;
-
-                        } else {
-                            *depth.ptr(i, j) = ysrc[0];
-                            *depth.ptr(i, j + 1) = ysrc[1];
-                            *depth.ptr(i + 1, j) = ysrc[image->stride[0]];
-                            *depth.ptr(i + 1, j + 1) = ysrc[image->stride[0] + 1];
+
+            const uint8_t* depthData = colorData + colorSize;
+            int depthSize = _arrivingFrame.size() - ((const char*)depthData - _arrivingFrame.constData());
+            if (depthSize > 0) {
+                if (_depthCodec.name == 0) {
+                    // initialize decoder context
+                    vpx_codec_dec_init(&_depthCodec, vpx_codec_vp8_dx(), 0, 0);
+                }
+                vpx_codec_decode(&_depthCodec, depthData, depthSize, 0, 0);
+                vpx_codec_iter_t iterator = 0;
+                vpx_image_t* image;
+                while ((image = vpx_codec_get_frame(&_depthCodec, &iterator)) != 0) {
+                    depth.create(image->d_h, image->d_w, CV_8UC1);
+                    uchar* yline = image->planes[0];
+                    uchar* vline = image->planes[1];
+                    const uchar EIGHT_BIT_MAXIMUM = 255;
+                    const uchar MASK_THRESHOLD = 192;
+                    for (int i = 0; i < image->d_h; i += 2) {
+                        uchar* ysrc = yline;
+                        uchar* vsrc = vline;
+                        for (int j = 0; j < image->d_w; j += 2) {
+
+                            if (*vsrc++ >= MASK_THRESHOLD) {
+                                *depth.ptr(i, j) = EIGHT_BIT_MAXIMUM;
+                                *depth.ptr(i, j + 1) = EIGHT_BIT_MAXIMUM;
+                                *depth.ptr(i + 1, j) = EIGHT_BIT_MAXIMUM;
+                                *depth.ptr(i + 1, j + 1) = EIGHT_BIT_MAXIMUM;
+
+                            } else {
+                                *depth.ptr(i, j) = ysrc[0];
+                                *depth.ptr(i, j + 1) = ysrc[1];
+                                *depth.ptr(i + 1, j) = ysrc[image->stride[0]];
+                                *depth.ptr(i + 1, j + 1) = ysrc[image->stride[0] + 1];
+                            }
+                            ysrc += 2;
                         }
-                        ysrc += 2;
+                        yline += image->stride[0] * 2;
+                        vline += image->stride[1];
                     }
-                    yline += image->stride[0] * 2;
-                    vline += image->stride[1];
                 }
             }
             QMetaObject::invokeMethod(this, "setFrame", Q_ARG(cv::Mat, color),
diff --git a/interface/src/avatar/Face.h b/interface/src/avatar/Face.h
index 1f9d41a1b3..d4812fecfb 100644
--- a/interface/src/avatar/Face.h
+++ b/interface/src/avatar/Face.h
@@ -57,7 +57,8 @@ private:
     cv::RotatedRect _textureRect;
     float _aspectRatio;
 
-    vpx_codec_ctx_t _codec;
+    vpx_codec_ctx_t _colorCodec;
+    vpx_codec_ctx_t _depthCodec;
     QByteArray _arrivingFrame;
     int _frameCount;
 
diff --git a/libraries/shared/src/PacketHeaders.cpp b/libraries/shared/src/PacketHeaders.cpp
index 292c4bbc0a..2e7b95c7f7 100644
--- a/libraries/shared/src/PacketHeaders.cpp
+++ b/libraries/shared/src/PacketHeaders.cpp
@@ -18,14 +18,15 @@ PACKET_VERSION versionForPacketType(PACKET_TYPE type) {
         case PACKET_TYPE_MICROPHONE_AUDIO_NO_ECHO:
         case PACKET_TYPE_MICROPHONE_AUDIO_WITH_ECHO:
             return 1;
-            break;
 
         case PACKET_TYPE_HEAD_DATA:
            return 2;
-            break;
+
+        case PACKET_TYPE_AVATAR_FACE_VIDEO:
+            return 1;
+
        default:
            return 0;
-            break;
     }
 }
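Note on bitrate control: as committed, both encoders are still initialized from
the same codecConfig, so the color and depth bitrates remain equal; the separate
contexts are what make independent tuning possible. A hypothetical follow-up
(not part of this commit) could give the depth stream its own, lower target:

    // sketch only: derive a cheaper configuration for the depth encoder
    vpx_codec_enc_cfg_t depthConfig = codecConfig;  // copy the color settings
    const int DEPTH_BITRATE_DIVISOR = 2;            // assumed ratio, for illustration
    depthConfig.rc_target_bitrate = codecConfig.rc_target_bitrate / DEPTH_BITRATE_DIVISOR;
    vpx_codec_enc_init(&_depthCodec, vpx_codec_vp8_cx(), &depthConfig, 0);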