From deaa4a747b57d1996365df5a5d45a1bcfa70942d Mon Sep 17 00:00:00 2001
From: Brad Davis <bdavis@saintandreas.org>
Date: Tue, 22 Sep 2015 10:11:49 -0700
Subject: [PATCH] Batch side implementation of multi-draw indirect

---
 libraries/gpu/src/gpu/Batch.cpp              |  27 +++
 libraries/gpu/src/gpu/Batch.h                |  36 +++-
 libraries/gpu/src/gpu/GLBackend.cpp          |  56 +++++-
 libraries/gpu/src/gpu/GLBackend.h            |  13 +-
 libraries/gpu/src/gpu/GLBackendInput.cpp     |  37 ++--
 libraries/render-utils/src/GeometryCache.cpp |  17 +-
 libraries/render-utils/src/GeometryCache.h   |  14 +-
 tests/gpu-test/src/main.cpp                  | 196 +++++++++++++++----
 8 files changed, 318 insertions(+), 78 deletions(-)

diff --git a/libraries/gpu/src/gpu/Batch.cpp b/libraries/gpu/src/gpu/Batch.cpp
index e6e176be88..15b841dd04 100644
--- a/libraries/gpu/src/gpu/Batch.cpp
+++ b/libraries/gpu/src/gpu/Batch.cpp
@@ -102,6 +102,19 @@ void Batch::drawIndexedInstanced(uint32 nbInstances, Primitive primitiveType, ui
     _params.push_back(nbInstances);
 }
 
+
+void Batch::multiDrawIndirect(uint32 nbCommands, Primitive primitiveType) {
+    ADD_COMMAND(multiDrawIndirect);
+    _params.push_back(nbCommands);     // read by GLBackend::do_multiDrawIndirect at paramOffset + 0
+    _params.push_back(primitiveType);  // read by GLBackend::do_multiDrawIndirect at paramOffset + 1
+}
+
+void Batch::multiDrawIndexedIndirect(uint32 nbCommands, Primitive primitiveType) {
+    ADD_COMMAND(multiDrawIndexedIndirect);
+    _params.push_back(nbCommands);     // read by GLBackend::do_multiDrawIndexedIndirect at paramOffset + 0
+    _params.push_back(primitiveType);  // read by GLBackend::do_multiDrawIndexedIndirect at paramOffset + 1
+}
+
 void Batch::setInputFormat(const Stream::FormatPointer& format) {
     ADD_COMMAND(setInputFormat);
 
@@ -144,6 +157,15 @@ void Batch::setIndexBuffer(const BufferView& buffer) {
     setIndexBuffer(buffer._element.getType(), buffer._buffer, buffer._offset);
 }
 
+void Batch::setIndirectBuffer(const BufferPointer& buffer, Offset offset, Offset stride) {
+    ADD_COMMAND(setIndirectBuffer);
+    // Pushed in forward order; GLBackend::do_setIndirectBuffer reads buffer/offset/stride at +0/+1/+2.
+    _params.push_back(_buffers.cache(buffer));
+    _params.push_back(offset);
+    _params.push_back(stride);
+}
+
+
 void Batch::setModelTransform(const Transform& model) {
     ADD_COMMAND(setModelTransform);
 
@@ -288,6 +310,11 @@ void Batch::resetStages() {
     ADD_COMMAND(resetStages);
 }
 
+void Batch::runLambda(std::function<void()> f) {
+    ADD_COMMAND(runLambda);
+    _params.push_back(_lambdas.cache(f)); // keep the callable in the batch; the param is its cache index
+}
+
 void Batch::enableStereo(bool enable) {
     _enableStereo = enable;
 }
diff --git a/libraries/gpu/src/gpu/Batch.h b/libraries/gpu/src/gpu/Batch.h
index ec6fb26c34..6dd92739c5 100644
--- a/libraries/gpu/src/gpu/Batch.h
+++ b/libraries/gpu/src/gpu/Batch.h
@@ -63,8 +63,8 @@ public:
 
         void process(Batch& batch) {
             if (_function) {
-            _function(batch, *this);
-        }
+                _function(batch, *this);
+            }
         }
     };
 
@@ -96,12 +96,15 @@ public:
     void drawIndexed(Primitive primitiveType, uint32 nbIndices, uint32 startIndex = 0);
     void drawInstanced(uint32 nbInstances, Primitive primitiveType, uint32 nbVertices, uint32 startVertex = 0, uint32 startInstance = 0);
     void drawIndexedInstanced(uint32 nbInstances, Primitive primitiveType, uint32 nbIndices, uint32 startIndex = 0, uint32 startInstance = 0);
+    void multiDrawIndirect(uint32 nbCommands, Primitive primitiveType);
+    void multiDrawIndexedIndirect(uint32 nbCommands, Primitive primitiveType);
 
 
     void setupNamedCalls(const std::string& instanceName, size_t count, NamedBatchData::Function function);
     void setupNamedCalls(const std::string& instanceName, NamedBatchData::Function function);
     BufferPointer getNamedBuffer(const std::string& instanceName, uint8_t index = 0);
-    
+    void setNamedBuffer(const std::string& instanceName, BufferPointer& buffer, uint8_t index = 0);
+
     
 
     // Input Stage
@@ -117,6 +120,8 @@ public:
     void setIndexBuffer(Type type, const BufferPointer& buffer, Offset offset);
     void setIndexBuffer(const BufferView& buffer); // not a command, just a shortcut from a BufferView
 
+    void setIndirectBuffer(const BufferPointer& buffer, Offset offset = 0, Offset stride = 0);
+
     // Transform Stage
     // Vertex position is transformed by ModelTransform from object space to world space
     // Then by the inverse of the ViewTransform from world space to eye space
@@ -169,6 +174,8 @@ public:
     // Reset the stage caches and states
     void resetStages();
 
+    void runLambda(std::function<void()> f);
+
     // TODO: As long as we have gl calls explicitely issued from interface
     // code, we need to be able to record and batch these calls. THe long 
     // term strategy is to get rid of any GL calls in favor of the HIFI GPU API
@@ -194,10 +201,13 @@ public:
         COMMAND_drawIndexed,
         COMMAND_drawInstanced,
         COMMAND_drawIndexedInstanced,
+        COMMAND_multiDrawIndirect,
+        COMMAND_multiDrawIndexedIndirect,
 
         COMMAND_setInputFormat,
         COMMAND_setInputBuffer,
         COMMAND_setIndexBuffer,
+        COMMAND_setIndirectBuffer,
 
         COMMAND_setModelTransform,
         COMMAND_setViewTransform,
@@ -221,6 +231,8 @@ public:
 
         COMMAND_resetStages,
 
+        COMMAND_runLambda,
+
         // TODO: As long as we have gl calls explicitely issued from interface
         // code, we need to be able to record and batch these calls. THe long 
         // term strategy is to get rid of any GL calls in favor of the HIFI GPU API
@@ -302,6 +314,7 @@ public:
     typedef Cache<PipelinePointer>::Vector PipelineCaches;
     typedef Cache<FramebufferPointer>::Vector FramebufferCaches;
     typedef Cache<QueryPointer>::Vector QueryCaches;
+    typedef Cache<std::function<void()>>::Vector LambdaCache;
 
     // Cache Data in a byte array if too big to fit in Param
     // FOr example Mat4s are going there
@@ -327,6 +340,7 @@ public:
     PipelineCaches _pipelines;
     FramebufferCaches _framebuffers;
     QueryCaches _queries;
+    LambdaCache _lambdas;
 
     NamedBatchDataMap _namedData;
 
@@ -336,6 +350,20 @@ public:
 protected:
 };
 
-};
+template <typename V>
+void popVectorParam(Batch::Params& params, uint32& paramOffset, V& v) {
+    for (size_t component = 0; component < v.length(); ++component, ++paramOffset) {
+        v[component] = params[paramOffset]._float; // consume one float param per component
+    }
+}
+
+template <typename V>
+void pushVectorParam(Batch::Params& params, const V& v) {
+    for (size_t component = 0; component < v.length(); ++component) {
+        params.push_back(v[component]); // append one param per component, in index order
+    }
+}
+
+}
 
 #endif
diff --git a/libraries/gpu/src/gpu/GLBackend.cpp b/libraries/gpu/src/gpu/GLBackend.cpp
index 62508f273c..79b37ddc0e 100644
--- a/libraries/gpu/src/gpu/GLBackend.cpp
+++ b/libraries/gpu/src/gpu/GLBackend.cpp
@@ -23,10 +23,13 @@ GLBackend::CommandCall GLBackend::_commandCalls[Batch::NUM_COMMANDS] =
     (&::gpu::GLBackend::do_drawIndexed),
     (&::gpu::GLBackend::do_drawInstanced),
     (&::gpu::GLBackend::do_drawIndexedInstanced),
-    
+    (&::gpu::GLBackend::do_multiDrawIndirect),
+    (&::gpu::GLBackend::do_multiDrawIndexedIndirect),
+
     (&::gpu::GLBackend::do_setInputFormat),
     (&::gpu::GLBackend::do_setInputBuffer),
     (&::gpu::GLBackend::do_setIndexBuffer),
+    (&::gpu::GLBackend::do_setIndirectBuffer),
 
     (&::gpu::GLBackend::do_setModelTransform),
     (&::gpu::GLBackend::do_setViewTransform),
@@ -50,6 +53,8 @@ GLBackend::CommandCall GLBackend::_commandCalls[Batch::NUM_COMMANDS] =
 
     (&::gpu::GLBackend::do_resetStages),
 
+    (&::gpu::GLBackend::do_runLambda),
+
     (&::gpu::GLBackend::do_glActiveBindTexture),
 
     (&::gpu::GLBackend::do_glUniform1i),
@@ -323,6 +328,9 @@ void GLBackend::do_drawInstanced(Batch& batch, uint32 paramOffset) {
     (void) CHECK_GL_ERROR();
 }
 
+// DO NOT MERGE THIS, it will break mac clients
+#define GL_430
+
 void GLBackend::do_drawIndexedInstanced(Batch& batch, uint32 paramOffset) {
     updateInput();
     updateTransform();
@@ -332,17 +340,63 @@ void GLBackend::do_drawIndexedInstanced(Batch& batch, uint32 paramOffset) {
     GLenum mode = _primitiveToGLmode[(Primitive)batch._params[paramOffset + 3]._uint];
     uint32 numIndices = batch._params[paramOffset + 2]._uint;
     uint32 startIndex = batch._params[paramOffset + 1]._uint;
+    // FIXME glDrawElementsInstancedBaseVertexBaseInstance is only available in GL 4.3
+    // and higher, so startInstance is ignored unless GL_430 is defined
     uint32 startInstance = batch._params[paramOffset + 0]._uint;
     GLenum glType = _elementTypeToGLType[_input._indexBufferType];
 
+#ifdef GL_430
+    glDrawElementsInstancedBaseVertexBaseInstance(mode, numIndices, glType, reinterpret_cast<GLvoid*>(startIndex + _input._indexBufferOffset), numInstances, 0, startInstance);
+#else
     glDrawElementsInstanced(mode, numIndices, glType, reinterpret_cast<GLvoid*>(startIndex + _input._indexBufferOffset), numInstances);
+#endif
     (void)CHECK_GL_ERROR();
 }
 
+
+void GLBackend::do_multiDrawIndirect(Batch& batch, uint32 paramOffset) {
+#ifdef GL_430
+    updateInput();
+    updateTransform();
+    updatePipeline();
+    // Params mirror the push order in Batch::multiDrawIndirect: [nbCommands, primitiveType].
+    uint32 commandCount = batch._params[paramOffset + 0]._uint;
+    GLenum mode = _primitiveToGLmode[(Primitive)batch._params[paramOffset + 1]._uint];
+    // The offset/stride recorded by do_setIndirectBuffer locate the draw commands for this call.
+    glMultiDrawArraysIndirect(mode, reinterpret_cast<GLvoid*>(_input._indirectBufferOffset), commandCount, _input._indirectBufferStride);
+#else
+    // FIXME implement the slow path
+#endif
+    (void)CHECK_GL_ERROR();
+}
+
+void GLBackend::do_multiDrawIndexedIndirect(Batch& batch, uint32 paramOffset) {
+#ifdef GL_430
+    updateInput();
+    updateTransform();
+    updatePipeline();
+    // Params mirror the push order in Batch::multiDrawIndexedIndirect: [nbCommands, primitiveType].
+    uint32 commandCount = batch._params[paramOffset + 0]._uint;
+    GLenum mode = _primitiveToGLmode[(Primitive)batch._params[paramOffset + 1]._uint];
+    GLenum indexType = _elementTypeToGLType[_input._indexBufferType];
+    // Index type comes from the last do_setIndexBuffer; offset/stride from the last do_setIndirectBuffer.
+    glMultiDrawElementsIndirect(mode, indexType, reinterpret_cast<GLvoid*>(_input._indirectBufferOffset), commandCount, _input._indirectBufferStride);
+#else
+    // FIXME implement the slow path
+#endif
+    (void)CHECK_GL_ERROR();
+}
+
+
 void GLBackend::do_resetStages(Batch& batch, uint32 paramOffset) {
     resetStages();
 }
 
+void GLBackend::do_runLambda(Batch& batch, uint32 paramOffset) {
+    std::function<void()> f = batch._lambdas.get(batch._params[paramOffset]._uint); // index pushed by Batch::runLambda
+    f(); // invoked synchronously as part of executing this command
+}
+
 void GLBackend::resetStages() {
     resetInputStage();
     resetPipelineStage();
diff --git a/libraries/gpu/src/gpu/GLBackend.h b/libraries/gpu/src/gpu/GLBackend.h
index dabc69dedb..f12cda827a 100644
--- a/libraries/gpu/src/gpu/GLBackend.h
+++ b/libraries/gpu/src/gpu/GLBackend.h
@@ -252,11 +252,14 @@ protected:
     void do_drawIndexed(Batch& batch, uint32 paramOffset);
     void do_drawInstanced(Batch& batch, uint32 paramOffset);
     void do_drawIndexedInstanced(Batch& batch, uint32 paramOffset);
-
+    void do_multiDrawIndirect(Batch& batch, uint32 paramOffset);
+    void do_multiDrawIndexedIndirect(Batch& batch, uint32 paramOffset);
+    
     // Input Stage
     void do_setInputFormat(Batch& batch, uint32 paramOffset);
     void do_setInputBuffer(Batch& batch, uint32 paramOffset);
     void do_setIndexBuffer(Batch& batch, uint32 paramOffset);
+    void do_setIndirectBuffer(Batch& batch, uint32 paramOffset);
 
     void initInput();
     void killInput();
@@ -284,6 +287,10 @@ protected:
         Offset _indexBufferOffset;
         Type _indexBufferType;
 
+        BufferPointer _indirectBuffer;
+        Offset _indirectBufferOffset{ 0 };
+        Offset _indirectBufferStride{ 0 };
+
         GLuint _defaultVAO;
 
         InputStageState() :
@@ -448,6 +455,9 @@ protected:
 
     // Reset stages
     void do_resetStages(Batch& batch, uint32 paramOffset);
+
+    void do_runLambda(Batch& batch, uint32 paramOffset);
+
     void resetStages();
 
     // TODO: As long as we have gl calls explicitely issued from interface
@@ -471,7 +481,6 @@ protected:
     static CommandCall _commandCalls[Batch::NUM_COMMANDS];
 };
 
-
 };
 
 #endif
diff --git a/libraries/gpu/src/gpu/GLBackendInput.cpp b/libraries/gpu/src/gpu/GLBackendInput.cpp
index 7f021fd5c5..2b14e4d7f0 100755
--- a/libraries/gpu/src/gpu/GLBackendInput.cpp
+++ b/libraries/gpu/src/gpu/GLBackendInput.cpp
@@ -273,21 +273,36 @@ void GLBackend::resetInputStage() {
 }
 
 void GLBackend::do_setIndexBuffer(Batch& batch, uint32 paramOffset) {
-    _input._indexBufferType = (Type) batch._params[paramOffset + 2]._uint;
-    BufferPointer indexBuffer = batch._buffers.get(batch._params[paramOffset + 1]._uint);
+    _input._indexBufferType = (Type)batch._params[paramOffset + 2]._uint;
     _input._indexBufferOffset = batch._params[paramOffset + 0]._uint;
-    _input._indexBuffer = indexBuffer;
-    if (indexBuffer) {
-        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, getBufferID(*indexBuffer));
-    } else {
-        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
+    // Type and offset are always refreshed; the GL bind below is skipped when the buffer is unchanged.
+    BufferPointer indexBuffer = batch._buffers.get(batch._params[paramOffset + 1]._uint);
+    if (indexBuffer != _input._indexBuffer) {
+        _input._indexBuffer = indexBuffer;
+        if (indexBuffer) {
+            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, getBufferID(*indexBuffer));
+        } else {
+            // FIXME do we really need this?  Is there ever a draw call where we care that the element buffer is null?
+            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
+        }
     }
     (void) CHECK_GL_ERROR();
 }
 
-template <typename V>
-void popParam(Batch::Params& params, uint32& paramOffset, V& v) {
-    for (size_t i = 0; i < v.length(); ++i) {
-        v[i] = params[paramOffset++]._float;
+void GLBackend::do_setIndirectBuffer(Batch& batch, uint32 paramOffset) {
+    // Params are laid out as pushed by Batch::setIndirectBuffer: [buffer, offset, stride].
+    _input._indirectBufferOffset = batch._params[paramOffset + 1]._uint;
+    _input._indirectBufferStride = batch._params[paramOffset + 2]._uint;
+    BufferPointer buffer = batch._buffers.get(batch._params[paramOffset]._uint);
+    if (buffer != _input._indirectBuffer) {
+        // Only touch GL state when the indirect buffer actually changes.
+        _input._indirectBuffer = buffer;
+        if (buffer) {
+            glBindBuffer(GL_DRAW_INDIRECT_BUFFER, getBufferID(*buffer));
+        } else {
+            // Unbind the indirect target itself; unbinding GL_ELEMENT_ARRAY_BUFFER here would desync the cached _input._indexBuffer.
+            glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
+        }
     }
+
+    (void)CHECK_GL_ERROR();
 }
diff --git a/libraries/render-utils/src/GeometryCache.cpp b/libraries/render-utils/src/GeometryCache.cpp
index 093434f079..5e04fe867f 100644
--- a/libraries/render-utils/src/GeometryCache.cpp
+++ b/libraries/render-utils/src/GeometryCache.cpp
@@ -87,37 +87,34 @@ void GeometryCache::ShapeData::setupIndices(gpu::BufferPointer& indexBuffer, con
 void GeometryCache::ShapeData::setupBatch(gpu::Batch& batch) const {
     batch.setInputBuffer(gpu::Stream::POSITION, _positionView);
     batch.setInputBuffer(gpu::Stream::NORMAL, _normalView);
+    batch.setIndexBuffer(gpu::UINT16, _indices, 0); // bound at offset 0; each draw call passes its own start offset
 }
 
 void GeometryCache::ShapeData::draw(gpu::Batch& batch) const {
     if (_indexCount) {
         setupBatch(batch);
-        batch.setIndexBuffer(gpu::UINT16, _indices, _indexOffset);
-        batch.drawIndexed(gpu::TRIANGLES, _indexCount);
+        batch.drawIndexed(gpu::TRIANGLES, _indexCount, _indexOffset); // offset now travels as startIndex; NOTE(review): presumably bytes — confirm against do_drawIndexed
     }
 }
 
 void GeometryCache::ShapeData::drawWire(gpu::Batch& batch) const {
     if (_wireIndexCount) {
         setupBatch(batch);
-        batch.setIndexBuffer(gpu::UINT16, _indices, _wireIndexOffset);
-        batch.drawIndexed(gpu::LINES, _wireIndexCount);
+        batch.drawIndexed(gpu::LINES, _wireIndexCount, _wireIndexOffset); // wire indices share _indices; selected via startIndex
     }
 }
 
 void GeometryCache::ShapeData::drawInstances(gpu::Batch& batch, size_t count) const {
     if (_indexCount) {
         setupBatch(batch);
-        batch.setIndexBuffer(gpu::UINT16, _indices, _indexOffset);
-        batch.drawIndexedInstanced(count, gpu::TRIANGLES, _indexCount);
+        batch.drawIndexedInstanced(count, gpu::TRIANGLES, _indexCount, _indexOffset); // GL backend adds startIndex to the index buffer offset
     }
 }
 
 void GeometryCache::ShapeData::drawWireInstances(gpu::Batch& batch, size_t count) const {
     if (_wireIndexCount) {
         setupBatch(batch);
-        batch.setIndexBuffer(gpu::UINT16, _indices, _wireIndexOffset);
-        batch.drawIndexedInstanced(count, gpu::LINES, _wireIndexCount);
+        batch.drawIndexedInstanced(count, gpu::LINES, _wireIndexCount, _wireIndexOffset); // GL backend adds startIndex to the index buffer offset
     }
 }
 
@@ -323,7 +320,7 @@ void GeometryCache::buildShapes() {
             20, 21, 21, 22, 22, 23, 23, 20, // back
             0, 23, 1, 22, 2, 21, 3, 20 // sides
         };
-        for (int i = 0; i < wireIndices.size(); ++i) {
-            indices[i] += startingIndex;
+        for (size_t i = 0; i < wireIndices.size(); ++i) {
+            wireIndices[i] += startingIndex; // offset the wire indices themselves, matching the other wire-index loop in this function
         }
 
@@ -374,7 +371,7 @@ void GeometryCache::buildShapes() {
             0, 3, 1, 3, 2, 3,
         };
 
-        for (int i = 0; i < wireIndices.size(); ++i) {
+        for (size_t i = 0; i < wireIndices.size(); ++i) {
             wireIndices[i] += startingIndex;
         }
 
diff --git a/libraries/render-utils/src/GeometryCache.h b/libraries/render-utils/src/GeometryCache.h
index 2e0d0a5493..aa1593db78 100644
--- a/libraries/render-utils/src/GeometryCache.h
+++ b/libraries/render-utils/src/GeometryCache.h
@@ -232,14 +232,6 @@ public:
     /// Set a batch to the simple pipeline, returning the previous pipeline
     void useSimpleDrawPipeline(gpu::Batch& batch, bool noBlend = false);
 
-private:
-    GeometryCache();
-    virtual ~GeometryCache();
-    void buildShapes();
-
-    typedef QPair<int, int> IntPair;
-    typedef QPair<unsigned int, unsigned int> VerticesIndices;
-
     struct ShapeData {
         size_t _indexOffset{ 0 };
         size_t _indexCount{ 0 };
@@ -263,7 +255,13 @@ private:
 
     VShape _shapes;
 
+private:
+    GeometryCache();
+    virtual ~GeometryCache();
+    void buildShapes();
 
+    typedef QPair<int, int> IntPair;
+    typedef QPair<unsigned int, unsigned int> VerticesIndices;
 
     gpu::PipelinePointer _standardDrawPipeline;
     gpu::PipelinePointer _standardDrawPipelineNoBlend;
diff --git a/tests/gpu-test/src/main.cpp b/tests/gpu-test/src/main.cpp
index ad9ed9bb4a..0acbbcd725 100644
--- a/tests/gpu-test/src/main.cpp
+++ b/tests/gpu-test/src/main.cpp
@@ -35,6 +35,7 @@
 
 // Must come after GL headers
 #include <QtGui/QOpenGLContext>
+#include <QtGui/QOpenGLDebugLogger>
 
 #include <GLMHelpers.h>
 #include <PathUtils.h>
@@ -101,7 +102,24 @@ float getSeconds(quint64 start = 0) {
     return seconds;
 }
 
+struct DrawElementsIndirectCommand { // one record appended to the indirect buffer used by multiDrawIndexedIndirect
+    uint  _count{ 0 };          // number of indices to draw
+    uint  _instanceCount{ 0 };  // number of instances to draw
+    uint  _firstIndex{ 0 };     // first index, in index elements rather than bytes
+    uint  _baseVertex{ 0 };     // NOTE(review): GL documents this field as signed — confirm unsigned is intended
+    uint  _baseInstance{ 0 };   // first instance; the test sets it to i * ITEM_COUNT per shape
+};
 
+static const size_t TYPE_COUNT = 4; // number of entries in SHAPE; also the command count passed to multiDrawIndexedIndirect
+static GeometryCache::Shape SHAPE[TYPE_COUNT] = {
+    GeometryCache::Icosahedron,
+    GeometryCache::Cube,
+    GeometryCache::Sphere,
+    GeometryCache::Tetrahedron,
+    //GeometryCache::Line,
+};
+
+gpu::Stream::FormatPointer& getInstancedSolidStreamFormat();
 
 // Creates an OpenGL window that renders a simple unlit scene using the gpu library and GeometryCache
 // Should eventually get refactored into something that supports multiple gpu backends.
@@ -134,7 +152,7 @@ public:
         // Qt Quick may need a depth and stencil buffer. Always make sure these are available.
         format.setDepthBufferSize(16);
         format.setStencilBufferSize(8);
-        format.setVersion(4, 1);
+        format.setVersion(4, 3);
         format.setProfile(QSurfaceFormat::OpenGLContextProfile::CoreProfile);
         format.setOption(QSurfaceFormat::DebugContext);
         format.setSwapInterval(0);
@@ -147,6 +165,13 @@ public:
 
         show();
         makeCurrent();
+        QOpenGLDebugLogger *logger = new QOpenGLDebugLogger(this);
+        logger->initialize(); // initializes in the current context, i.e. ctx
+        connect(logger, &QOpenGLDebugLogger::messageLogged, [](const QOpenGLDebugMessage& message){
+            qDebug() << message;
+        });
+        logger->startLogging(QOpenGLDebugLogger::SynchronousLogging);
+
 
         gpu::Context::init<gpu::GLBackend>();
         _context = std::make_shared<gpu::Context>();
@@ -177,7 +202,9 @@ public:
     void draw() {
         static auto startTime = usecTimestampNow();
 
-        if (!isVisible()) {
+        // Attempting to draw before we're visible and have a valid size will
+        // produce GL errors.
+        if (!isVisible() || _size.width() <= 0 || _size.height() <= 0) {
             return;
         }
         makeCurrent();
@@ -192,7 +219,8 @@ public:
         glm::vec3 unitscale { 1.0f };
         glm::vec3 up { 0.0f, 1.0f, 0.0f };
 
-        glm::vec3 camera_position { 1.5f * sinf(t), 0.0f, 1.5f * cos(t) };
+        float distance = 3.0f;
+        glm::vec3 camera_position{ distance * sinf(t), 0.0f, distance * cos(t) };
 
         static const vec3 camera_focus(0);
         static const vec3 camera_up(0, 1, 0);
@@ -202,57 +230,141 @@ public:
         batch.setModelTransform(Transform());
 
         auto geometryCache = DependencyManager::get<GeometryCache>();
-        
+
         // Render grid on xz plane (not the optimal way to do things, but w/e)
         // Note: GeometryCache::renderGrid will *not* work, as it is apparenly unaffected by batch rotations and renders xy only
-        static const std::string GRID_INSTANCE = "Grid";
-        static auto compactColor1 = toCompactColor(vec4{ 0.35f, 0.25f, 0.15f, 1.0f });
-        static auto compactColor2 = toCompactColor(vec4{ 0.15f, 0.25f, 0.35f, 1.0f });
-        auto transformBuffer = batch.getNamedBuffer(GRID_INSTANCE, 0);
-        auto colorBuffer = batch.getNamedBuffer(GRID_INSTANCE, 1);
-        for (int i = 0; i < 100; ++i) {
-            {
-                glm::mat4 transform = glm::translate(mat4(), vec3(0, -1, -50 + i));
-                transform = glm::scale(transform, vec3(100, 1, 1));
-                transformBuffer->append(transform);
-                colorBuffer->append(compactColor1);
-            }
+        {
+            static const std::string GRID_INSTANCE = "Grid";
+            static auto compactColor1 = toCompactColor(vec4{ 0.35f, 0.25f, 0.15f, 1.0f });
+            static auto compactColor2 = toCompactColor(vec4{ 0.15f, 0.25f, 0.35f, 1.0f });
+            static gpu::BufferPointer transformBuffer; 
+            static gpu::BufferPointer colorBuffer;
+            if (!transformBuffer) {
+                transformBuffer = std::make_shared<gpu::Buffer>();
+                colorBuffer = std::make_shared<gpu::Buffer>();
+                for (int i = 0; i < 100; ++i) {
+                    {
+                        glm::mat4 transform = glm::translate(mat4(), vec3(0, -1, -50 + i));
+                        transform = glm::scale(transform, vec3(100, 1, 1));
+                        transformBuffer->append(transform);
+                        colorBuffer->append(compactColor1);
+                    }
 
-            {
-                glm::mat4 transform = glm::mat4_cast(quat(vec3(0, PI / 2.0f, 0)));
-                transform = glm::translate(transform, vec3(0, -1, -50 + i));
-                transform = glm::scale(transform, vec3(100, 1, 1));
-                transformBuffer->append(transform);
-                colorBuffer->append(compactColor2);
+                    {
+                        glm::mat4 transform = glm::mat4_cast(quat(vec3(0, PI / 2.0f, 0)));
+                        transform = glm::translate(transform, vec3(0, -1, -50 + i));
+                        transform = glm::scale(transform, vec3(100, 1, 1));
+                        transformBuffer->append(transform);
+                        colorBuffer->append(compactColor2);
+                    }
+                }
             }
+            
+            batch.setupNamedCalls(GRID_INSTANCE, 200, [=](gpu::Batch& batch, gpu::Batch::NamedBatchData& data) {
+                batch.setViewTransform(camera);
+                batch.setModelTransform(Transform());
+                batch.setPipeline(_pipeline);
+                batch._glUniform1i(_instanceLocation, 1);
+                geometryCache->renderWireShapeInstances(batch, GeometryCache::Line, data._count, transformBuffer, colorBuffer);
+                batch._glUniform1i(_instanceLocation, 0);
+            });
         }
 
-        batch.setupNamedCalls(GRID_INSTANCE, 200, [=](gpu::Batch& batch, gpu::Batch::NamedBatchData& data) {
+        {
+            static const size_t ITEM_COUNT = 1000;
+            static const float SHAPE_INTERVAL = (PI * 2.0f) / ITEM_COUNT;
+            static const float ITEM_INTERVAL = SHAPE_INTERVAL / TYPE_COUNT;
+
+            static const gpu::Element POSITION_ELEMENT{ gpu::VEC3, gpu::FLOAT, gpu::XYZ };
+            static const gpu::Element NORMAL_ELEMENT{ gpu::VEC3, gpu::FLOAT, gpu::XYZ };
+            static const gpu::Element COLOR_ELEMENT{ gpu::VEC4, gpu::NUINT8, gpu::RGBA };
+            static const gpu::Element TRANSFORM_ELEMENT{ gpu::MAT4, gpu::FLOAT, gpu::XYZW };
+
+
+            static std::vector<Transform> transforms;
+            static std::vector<vec4> colors;
+            static gpu::BufferPointer indirectBuffer;
+            static gpu::BufferPointer transformBuffer;
+            static gpu::BufferPointer colorBuffer;
+            static gpu::BufferView colorView; 
+            static gpu::BufferView instanceXfmView; 
+
+            if (!transformBuffer) {
+                transformBuffer = std::make_shared<gpu::Buffer>();
+                colorBuffer = std::make_shared<gpu::Buffer>();
+                indirectBuffer = std::make_shared<gpu::Buffer>();
+
+                static const float ITEM_RADIUS = 20;
+                static const vec3 ITEM_TRANSLATION{ 0, 0, -ITEM_RADIUS };
+                for (size_t i = 0; i < TYPE_COUNT; ++i) {
+                    GeometryCache::Shape shape = SHAPE[i];
+                    GeometryCache::ShapeData shapeData = geometryCache->_shapes[shape];
+                    {
+                        DrawElementsIndirectCommand indirectCommand;
+                        indirectCommand._count = shapeData._indexCount;
+                        indirectCommand._instanceCount = ITEM_COUNT;
+                        indirectCommand._baseInstance = i * ITEM_COUNT;
+                        indirectCommand._firstIndex = shapeData._indexOffset / 2;
+                        indirectCommand._baseVertex = 0;
+                        indirectBuffer->append(indirectCommand);
+                    }
+
+                    //indirectCommand._count
+                    float startingInterval = ITEM_INTERVAL * i;
+                    for (size_t j = 0; j < ITEM_COUNT; ++j) {
+                        float theta = j * SHAPE_INTERVAL + startingInterval;
+                        auto transform = glm::rotate(mat4(), theta, Vectors::UP);
+                        transform = glm::rotate(transform, (randFloat() - 0.5f) * PI / 4.0f, Vectors::UNIT_X);
+                        transform = glm::translate(transform, ITEM_TRANSLATION);
+                        transform = glm::scale(transform, vec3(randFloat() / 2.0f + 0.5f));
+                        transformBuffer->append(transform);
+                        transforms.push_back(transform);
+                        auto color = vec4{ randomColorValue(64), randomColorValue(64), randomColorValue(64), 255 };
+                        color /= 255.0f;
+                        colors.push_back(color);
+                        colorBuffer->append(toCompactColor(color));
+                    }
+                }
+                colorView = gpu::BufferView(colorBuffer, COLOR_ELEMENT);
+                instanceXfmView = gpu::BufferView(transformBuffer, TRANSFORM_ELEMENT);
+            }
+
+#if 1
+            GeometryCache::ShapeData shapeData = geometryCache->_shapes[GeometryCache::Icosahedron];
+            {
+                batch.setViewTransform(camera);
+                batch.setModelTransform(Transform());
+                batch.setPipeline(_pipeline);
+                batch._glUniform1i(_instanceLocation, 1);
+                batch.setInputFormat(getInstancedSolidStreamFormat());
+                batch.setInputBuffer(gpu::Stream::COLOR, colorView);
+                batch.setInputBuffer(gpu::Stream::INSTANCE_XFM, instanceXfmView);
+                batch.setIndirectBuffer(indirectBuffer);
+                shapeData.setupBatch(batch);
+                batch.multiDrawIndexedIndirect(TYPE_COUNT, gpu::TRIANGLES);
+                batch._glUniform1i(_instanceLocation, 0);
+            }
+#else
             batch.setViewTransform(camera);
-            batch.setModelTransform(Transform());
             batch.setPipeline(_pipeline);
-            auto& xfm = data._buffers[0];
-            auto& color = data._buffers[1];
-            batch._glUniform1i(_instanceLocation, 1);
-            geometryCache->renderWireShapeInstances(batch, GeometryCache::Line, data._count, xfm, color);
-            batch._glUniform1i(_instanceLocation, 0);
-        });
-
-
+            for (size_t i = 0; i < TYPE_COUNT; ++i) {
+                GeometryCache::Shape shape = SHAPE[i];
+                for (size_t j = 0; j < ITEM_COUNT; ++j) {
+                    int index = i * ITEM_COUNT + j;
+                    batch.setModelTransform(transforms[index]);
+                    const vec4& color = colors[index];
+                    batch._glColor4f(color.r, color.g, color.b, 1.0);
+                    geometryCache->renderShape(batch, shape);
+                }
+            }
+#endif
+        }
 
         // Render unlit cube + sphere
-
-        static GeometryCache::Shape SHAPE[] = {
-            GeometryCache::Cube,
-            GeometryCache::Sphere,
-            GeometryCache::Tetrahedron,
-            GeometryCache::Icosahedron,
-        };
-
         static auto startUsecs = usecTimestampNow(); 
         float seconds = getSeconds(startUsecs);
         seconds /= 4.0;
-        int shapeIndex = ((int)seconds) % 4;
+        int shapeIndex = ((int)seconds) % TYPE_COUNT;
         bool wire = seconds - floor(seconds) > 0.5f;
         batch.setModelTransform(Transform());
         batch._glColor4f(0.8f, 0.25f, 0.25f, 1.0f);
@@ -263,7 +375,7 @@ public:
             geometryCache->renderShape(batch, SHAPE[shapeIndex]);
         }
         
-        batch.setModelTransform(Transform().setScale(1.05f));
+        batch.setModelTransform(Transform().setScale(2.05f));
         batch._glColor4f(1, 1, 1, 1);
         geometryCache->renderWireCube(batch);