Send color and depth as separate streams (rather than one on top of the other) so that we can control their bitrates separately.
Andrzej Kapolka 2013-07-30 15:11:32 -07:00
parent f8ba1c4be1
commit c787781efb
5 changed files with 120 additions and 74 deletions
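On the wire, each video message now carries the aspect ratio (a float), the size of the encoded color frame (a size_t), the VP8 color frame itself, and, when depth was captured, the VP8 depth frame occupying the remainder of the payload; a nonzero remainder is what tells the receiver a depth frame is present. Below is a minimal sketch of a reader for that layout, assuming the complete frame has already been reassembled from packets; FaceVideoFrame and parseFaceVideoFrame are illustrative names, not code from this commit:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative helper mirroring the framing introduced by this commit:
// [float aspectRatio][size_t colorSize][VP8 color frame][optional VP8 depth frame]
struct FaceVideoFrame {
    float aspectRatio;              // smoothed face rect width / height
    const std::uint8_t* colorData;  // VP8 color frame
    std::size_t colorSize;
    const std::uint8_t* depthData;  // VP8 depth frame, or 0 if none was sent
    std::size_t depthSize;
};

bool parseFaceVideoFrame(const char* data, std::size_t totalSize, FaceVideoFrame& out) {
    const std::size_t headerSize = sizeof(float) + sizeof(std::size_t);
    if (totalSize < headerSize) {
        return false; // too short to hold the fixed-size header
    }
    std::memcpy(&out.aspectRatio, data, sizeof(float));
    std::memcpy(&out.colorSize, data + sizeof(float), sizeof(std::size_t));
    if (headerSize + out.colorSize > totalSize) {
        return false; // declared color size overruns the payload
    }
    out.colorData = (const std::uint8_t*)data + headerSize;
    // everything after the color frame is the (optional) depth frame
    out.depthSize = totalSize - headerSize - out.colorSize;
    out.depthData = out.depthSize > 0 ? out.colorData + out.colorSize : 0;
    return true;
}

Writing a raw size_t into the stream makes the header width (and byte order) build-dependent, which is workable here because sender and receiver run the same client code; the length prefix doubles as the depth-presence flag, so no separate flag byte is needed.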

Webcam.cpp

@@ -269,7 +269,7 @@ void Webcam::setFrame(const Mat& color, int format, const Mat& depth, float mean
 }
 
 FrameGrabber::FrameGrabber() : _initialized(false), _capture(0), _searchWindow(0, 0, 0, 0),
-    _smoothedMeanFaceDepth(UNINITIALIZED_FACE_DEPTH), _codec(), _frameCount(0) {
+    _smoothedMeanFaceDepth(UNINITIALIZED_FACE_DEPTH), _colorCodec(), _depthCodec(), _frameCount(0) {
 }
 
 FrameGrabber::~FrameGrabber() {
@@ -377,9 +377,13 @@ void FrameGrabber::shutdown() {
         cvReleaseCapture(&_capture);
         _capture = 0;
     }
-    if (_codec.name != 0) {
-        vpx_codec_destroy(&_codec);
-        _codec.name = 0;
+    if (_colorCodec.name != 0) {
+        vpx_codec_destroy(&_colorCodec);
+        _colorCodec.name = 0;
+    }
+    if (_depthCodec.name != 0) {
+        vpx_codec_destroy(&_depthCodec);
+        _depthCodec.name = 0;
     }
     _initialized = false;
@@ -492,17 +496,19 @@ void FrameGrabber::grabFrame() {
     const int ENCODED_FACE_WIDTH = 128;
     const int ENCODED_FACE_HEIGHT = 128;
-    int combinedFaceHeight = ENCODED_FACE_HEIGHT * (depth.empty() ? 1 : 2);
-    if (_codec.name == 0) {
-        // initialize encoder context
+    if (_colorCodec.name == 0) {
+        // initialize encoder context(s)
         vpx_codec_enc_cfg_t codecConfig;
         vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &codecConfig, 0);
-        const int QUALITY_MULTIPLIER = 2;
-        codecConfig.rc_target_bitrate = QUALITY_MULTIPLIER * ENCODED_FACE_WIDTH * combinedFaceHeight *
+        codecConfig.rc_target_bitrate = ENCODED_FACE_WIDTH * ENCODED_FACE_HEIGHT *
             codecConfig.rc_target_bitrate / codecConfig.g_w / codecConfig.g_h;
         codecConfig.g_w = ENCODED_FACE_WIDTH;
-        codecConfig.g_h = combinedFaceHeight;
-        vpx_codec_enc_init(&_codec, vpx_codec_vp8_cx(), &codecConfig, 0);
+        codecConfig.g_h = ENCODED_FACE_HEIGHT;
+        vpx_codec_enc_init(&_colorCodec, vpx_codec_vp8_cx(), &codecConfig, 0);
+        if (!depth.empty()) {
+            vpx_codec_enc_init(&_depthCodec, vpx_codec_vp8_cx(), &codecConfig, 0);
+        }
     }
 
     // correct for 180 degree rotations
@@ -539,9 +545,9 @@ void FrameGrabber::grabFrame() {
     const int ENCODED_BITS_PER_VU = 2;
     const int ENCODED_BITS_PER_PIXEL = ENCODED_BITS_PER_Y + 2 * ENCODED_BITS_PER_VU;
     const int BITS_PER_BYTE = 8;
-    _encodedFace.fill(128, ENCODED_FACE_WIDTH * combinedFaceHeight * ENCODED_BITS_PER_PIXEL / BITS_PER_BYTE);
+    _encodedFace.resize(ENCODED_FACE_WIDTH * ENCODED_FACE_HEIGHT * ENCODED_BITS_PER_PIXEL / BITS_PER_BYTE);
     vpx_image_t vpxImage;
-    vpx_img_wrap(&vpxImage, VPX_IMG_FMT_YV12, ENCODED_FACE_WIDTH, combinedFaceHeight, 1, (unsigned char*)_encodedFace.data());
+    vpx_img_wrap(&vpxImage, VPX_IMG_FMT_YV12, ENCODED_FACE_WIDTH, ENCODED_FACE_HEIGHT, 1, (unsigned char*)_encodedFace.data());
     uchar* yline = vpxImage.planes[0];
     uchar* vline = vpxImage.planes[1];
     uchar* uline = vpxImage.planes[2];
@@ -588,6 +594,24 @@ void FrameGrabber::grabFrame() {
         uline += vpxImage.stride[2];
     }
+
+    // encode the frame
+    vpx_codec_encode(&_colorCodec, &vpxImage, ++_frameCount, 1, 0, VPX_DL_REALTIME);
+
+    // start the payload off with the aspect ratio
+    QByteArray payload(sizeof(float), 0);
+    *(float*)payload.data() = _smoothedFaceRect.size.width / _smoothedFaceRect.size.height;
+
+    // extract the encoded frame
+    vpx_codec_iter_t iterator = 0;
+    const vpx_codec_cx_pkt_t* packet;
+    while ((packet = vpx_codec_get_cx_data(&_colorCodec, &iterator)) != 0) {
+        if (packet->kind == VPX_CODEC_CX_FRAME_PKT) {
+            // prepend the length, which will indicate whether there's a depth frame too
+            payload.append((const char*)&packet->data.frame.sz, sizeof(packet->data.frame.sz));
+            payload.append((const char*)packet->data.frame.buf, packet->data.frame.sz);
+        }
+    }
+
     if (!depth.empty()) {
         // warp the face depth without interpolation (because it will contain invalid zero values)
         _faceDepth.create(ENCODED_FACE_WIDTH, ENCODED_FACE_HEIGHT, CV_16UC1);
@@ -621,12 +645,14 @@ void FrameGrabber::grabFrame() {
         depth.convertTo(_grayDepthFrame, CV_8UC1, 1.0, depthOffset);
 
         // likewise for the encoded representation
-        uchar* yline = vpxImage.planes[0] + vpxImage.stride[0] * ENCODED_FACE_HEIGHT;
-        uchar* vline = vpxImage.planes[1] + vpxImage.stride[1] * (ENCODED_FACE_HEIGHT / 2);
+        uchar* yline = vpxImage.planes[0];
+        uchar* vline = vpxImage.planes[1];
+        uchar* uline = vpxImage.planes[2];
         const uchar EIGHT_BIT_MAXIMUM = 255;
         for (int i = 0; i < ENCODED_FACE_HEIGHT; i += 2) {
             uchar* ydest = yline;
             uchar* vdest = vline;
+            uchar* udest = uline;
             for (int j = 0; j < ENCODED_FACE_WIDTH; j += 2) {
                 ushort tl = *_faceDepth.ptr<ushort>(i, j);
                 ushort tr = *_faceDepth.ptr<ushort>(i, j + 1);
@@ -644,28 +670,28 @@ void FrameGrabber::grabFrame() {
                 ydest += 2;
                 *vdest++ = mask;
+                *udest++ = EIGHT_BIT_MIDPOINT;
             }
             yline += vpxImage.stride[0] * 2;
             vline += vpxImage.stride[1];
+            uline += vpxImage.stride[2];
         }
+
+        // encode the frame
+        vpx_codec_encode(&_depthCodec, &vpxImage, _frameCount, 1, 0, VPX_DL_REALTIME);
+
+        // extract the encoded frame
+        vpx_codec_iter_t iterator = 0;
+        const vpx_codec_cx_pkt_t* packet;
+        while ((packet = vpx_codec_get_cx_data(&_depthCodec, &iterator)) != 0) {
+            if (packet->kind == VPX_CODEC_CX_FRAME_PKT) {
+                payload.append((const char*)packet->data.frame.buf, packet->data.frame.sz);
+            }
+        }
     }
-
-    // encode the frame
-    vpx_codec_encode(&_codec, &vpxImage, ++_frameCount, 1, 0, VPX_DL_REALTIME);
-
-    // extract the encoded frame
-    vpx_codec_iter_t iterator = 0;
-    const vpx_codec_cx_pkt_t* packet;
-    while ((packet = vpx_codec_get_cx_data(&_codec, &iterator)) != 0) {
-        if (packet->kind == VPX_CODEC_CX_FRAME_PKT) {
-            // prepend the aspect ratio
-            QByteArray payload(sizeof(float), 0);
-            *(float*)payload.data() = _smoothedFaceRect.size.width / _smoothedFaceRect.size.height;
-            payload.append((const char*)packet->data.frame.buf, packet->data.frame.sz);
-            QMetaObject::invokeMethod(Application::getInstance(), "sendAvatarFaceVideoMessage", Q_ARG(int, _frameCount),
-                Q_ARG(QByteArray, payload));
-        }
-    }
+    QMetaObject::invokeMethod(Application::getInstance(), "sendAvatarFaceVideoMessage",
+        Q_ARG(int, _frameCount), Q_ARG(QByteArray, payload));
     QMetaObject::invokeMethod(Application::getInstance()->getWebcam(), "setFrame",
         Q_ARG(cv::Mat, color), Q_ARG(int, format), Q_ARG(cv::Mat, _grayDepthFrame), Q_ARG(float, _smoothedMeanFaceDepth),

Webcam.h

@@ -120,7 +120,8 @@ private:
     cv::Mat _grayDepthFrame;
     float _smoothedMeanFaceDepth;
-    vpx_codec_ctx_t _codec;
+    vpx_codec_ctx_t _colorCodec;
+    vpx_codec_ctx_t _depthCodec;
     int _frameCount;
     cv::Mat _faceColor;
     cv::Mat _faceDepth;

Face.cpp

@@ -30,19 +30,25 @@ GLuint Face::_vboID;
 GLuint Face::_iboID;
 
 Face::Face(Head* owningHead) : _owningHead(owningHead), _renderMode(MESH),
-    _colorTextureID(0), _depthTextureID(0), _codec(), _frameCount(0) {
+    _colorTextureID(0), _depthTextureID(0), _colorCodec(), _depthCodec(), _frameCount(0) {
     // we may have been created in the network thread, but we live in the main thread
     moveToThread(Application::getInstance()->thread());
 }
 
 Face::~Face() {
-    if (_codec.name != 0) {
-        vpx_codec_destroy(&_codec);
+    if (_colorCodec.name != 0) {
+        vpx_codec_destroy(&_colorCodec);
 
-        // delete our textures, since we know that we own them
+        // delete our texture, since we know that we own it
         if (_colorTextureID != 0) {
             glDeleteTextures(1, &_colorTextureID);
         }
+    }
+    if (_depthCodec.name != 0) {
+        vpx_codec_destroy(&_depthCodec);
+
+        // delete our texture, since we know that we own it
         if (_depthTextureID != 0) {
             glDeleteTextures(1, &_depthTextureID);
         }
@@ -55,9 +61,9 @@ void Face::setTextureRect(const cv::RotatedRect& textureRect) {
 }
 
 int Face::processVideoMessage(unsigned char* packetData, size_t dataBytes) {
-    if (_codec.name == 0) {
+    if (_colorCodec.name == 0) {
         // initialize decoder context
-        vpx_codec_dec_init(&_codec, vpx_codec_vp8_dx(), 0, 0);
+        vpx_codec_dec_init(&_colorCodec, vpx_codec_vp8_dx(), 0, 0);
     }
     // skip the header
     unsigned char* packetPosition = packetData;
@@ -85,14 +91,14 @@ int Face::processVideoMessage(unsigned char* packetData, size_t dataBytes) {
     if ((_frameBytesRemaining -= payloadSize) <= 0) {
         float aspectRatio = *(const float*)_arrivingFrame.constData();
-        vpx_codec_decode(&_codec, (const uint8_t*)_arrivingFrame.constData() + sizeof(float),
-            _arrivingFrame.size() - sizeof(float), 0, 0);
+        size_t colorSize = *(const size_t*)(_arrivingFrame.constData() + sizeof(float));
+        const uint8_t* colorData = (const uint8_t*)(_arrivingFrame.constData() + sizeof(float) + sizeof(size_t));
+        vpx_codec_decode(&_colorCodec, colorData, colorSize, 0, 0);
         vpx_codec_iter_t iterator = 0;
         vpx_image_t* image;
-        while ((image = vpx_codec_get_frame(&_codec, &iterator)) != 0) {
+        while ((image = vpx_codec_get_frame(&_colorCodec, &iterator)) != 0) {
             // convert from YV12 to RGB
-            const int imageHeight = image->d_w;
-            Mat color(imageHeight, image->d_w, CV_8UC3);
+            Mat color(image->d_h, image->d_w, CV_8UC3);
             uchar* yline = image->planes[0];
             uchar* vline = image->planes[1];
             uchar* uline = image->planes[2];
@@ -100,7 +106,7 @@ int Face::processVideoMessage(unsigned char* packetData, size_t dataBytes) {
             const int GREEN_V_WEIGHT = (int)(0.714 * 256);
             const int GREEN_U_WEIGHT = (int)(0.344 * 256);
             const int BLUE_U_WEIGHT = (int)(1.773 * 256);
-            for (int i = 0; i < imageHeight; i += 2) {
+            for (int i = 0; i < image->d_h; i += 2) {
                 uchar* ysrc = yline;
                 uchar* vsrc = vline;
                 uchar* usrc = uline;
@@ -144,34 +150,45 @@ int Face::processVideoMessage(unsigned char* packetData, size_t dataBytes) {
                 uline += image->stride[2];
             }
 
             Mat depth;
-            if (image->d_h > imageHeight) {
-                // if the height is greater than the width, we have depth data
-                depth.create(imageHeight, image->d_w, CV_8UC1);
-                uchar* yline = image->planes[0] + image->stride[0] * imageHeight;
-                uchar* vline = image->planes[1] + image->stride[1] * (imageHeight / 2);
-                const uchar EIGHT_BIT_MAXIMUM = 255;
-                const uchar MASK_THRESHOLD = 192;
-                for (int i = 0; i < imageHeight; i += 2) {
-                    uchar* ysrc = yline;
-                    uchar* vsrc = vline;
-                    for (int j = 0; j < image->d_w; j += 2) {
-                        if (*vsrc++ >= MASK_THRESHOLD) {
-                            *depth.ptr(i, j) = EIGHT_BIT_MAXIMUM;
-                            *depth.ptr(i, j + 1) = EIGHT_BIT_MAXIMUM;
-                            *depth.ptr(i + 1, j) = EIGHT_BIT_MAXIMUM;
-                            *depth.ptr(i + 1, j + 1) = EIGHT_BIT_MAXIMUM;
-                        } else {
-                            *depth.ptr(i, j) = ysrc[0];
-                            *depth.ptr(i, j + 1) = ysrc[1];
-                            *depth.ptr(i + 1, j) = ysrc[image->stride[0]];
-                            *depth.ptr(i + 1, j + 1) = ysrc[image->stride[0] + 1];
+            const uint8_t* depthData = colorData + colorSize;
+            int depthSize = _arrivingFrame.size() - ((const char*)depthData - _arrivingFrame.constData());
+            if (depthSize > 0) {
+                if (_depthCodec.name == 0) {
+                    // initialize decoder context
+                    vpx_codec_dec_init(&_depthCodec, vpx_codec_vp8_dx(), 0, 0);
+                }
+                vpx_codec_decode(&_depthCodec, depthData, depthSize, 0, 0);
+                vpx_codec_iter_t iterator = 0;
+                vpx_image_t* image;
+                while ((image = vpx_codec_get_frame(&_depthCodec, &iterator)) != 0) {
+                    depth.create(image->d_h, image->d_w, CV_8UC1);
+                    uchar* yline = image->planes[0];
+                    uchar* vline = image->planes[1];
+                    const uchar EIGHT_BIT_MAXIMUM = 255;
+                    const uchar MASK_THRESHOLD = 192;
+                    for (int i = 0; i < image->d_h; i += 2) {
+                        uchar* ysrc = yline;
+                        uchar* vsrc = vline;
+                        for (int j = 0; j < image->d_w; j += 2) {
+                            if (*vsrc++ >= MASK_THRESHOLD) {
+                                *depth.ptr(i, j) = EIGHT_BIT_MAXIMUM;
+                                *depth.ptr(i, j + 1) = EIGHT_BIT_MAXIMUM;
+                                *depth.ptr(i + 1, j) = EIGHT_BIT_MAXIMUM;
+                                *depth.ptr(i + 1, j + 1) = EIGHT_BIT_MAXIMUM;
+                            } else {
+                                *depth.ptr(i, j) = ysrc[0];
+                                *depth.ptr(i, j + 1) = ysrc[1];
+                                *depth.ptr(i + 1, j) = ysrc[image->stride[0]];
+                                *depth.ptr(i + 1, j + 1) = ysrc[image->stride[0] + 1];
+                            }
+                            ysrc += 2;
                         }
-                        ysrc += 2;
+                        yline += image->stride[0] * 2;
+                        vline += image->stride[1];
                     }
-                    yline += image->stride[0] * 2;
-                    vline += image->stride[1];
                 }
             }
 
             QMetaObject::invokeMethod(this, "setFrame", Q_ARG(cv::Mat, color),

Face.h

@@ -57,7 +57,8 @@ private:
     cv::RotatedRect _textureRect;
     float _aspectRatio;
-    vpx_codec_ctx_t _codec;
+    vpx_codec_ctx_t _colorCodec;
+    vpx_codec_ctx_t _depthCodec;
     QByteArray _arrivingFrame;
     int _frameCount;

PacketHeaders.cpp

@@ -18,14 +18,15 @@ PACKET_VERSION versionForPacketType(PACKET_TYPE type) {
         case PACKET_TYPE_MICROPHONE_AUDIO_NO_ECHO:
         case PACKET_TYPE_MICROPHONE_AUDIO_WITH_ECHO:
             return 1;
             break;
         case PACKET_TYPE_HEAD_DATA:
             return 2;
             break;
+        case PACKET_TYPE_AVATAR_FACE_VIDEO:
+            return 1;
         default:
             return 0;
             break;
     }
 }