Merge pull request #5886 from samcake/calvin

Expose multi Draw Indirect in Batch (Hijack Austin's work...)
2025-04-14 05:27:07 +02:00 · 2015-09-23 19:13:28 -07:00 · 2015-09-23 19:13:28 -07:00 · 5a80c4d0e7
commit 5a80c4d0e7
parent 14bfdebdca 7d8f3661ad
12 changed files with 395 additions and 127 deletions
--- a/libraries/gpu/src/gpu/Batch.cpp
+++ b/libraries/gpu/src/gpu/Batch.cpp
@ -59,47 +59,60 @@ void Batch::clear() {

 uint32 Batch::cacheData(uint32 size, const void* data) {
    uint32 offset = _data.size();
-    uint32 nbBytes = size;
-    _data.resize(offset + nbBytes);
+    uint32 numBytes = size;
+    _data.resize(offset + numBytes);
    memcpy(_data.data() + offset, data, size);

    return offset;
 }

-void Batch::draw(Primitive primitiveType, uint32 nbVertices, uint32 startVertex) {
+void Batch::draw(Primitive primitiveType, uint32 numVertices, uint32 startVertex) {
    ADD_COMMAND(draw);

    _params.push_back(startVertex);
-    _params.push_back(nbVertices);
+    _params.push_back(numVertices);
    _params.push_back(primitiveType);
 }

-void Batch::drawIndexed(Primitive primitiveType, uint32 nbIndices, uint32 startIndex) {
+void Batch::drawIndexed(Primitive primitiveType, uint32 numIndices, uint32 startIndex) {
    ADD_COMMAND(drawIndexed);

    _params.push_back(startIndex);
-    _params.push_back(nbIndices);
+    _params.push_back(numIndices);
    _params.push_back(primitiveType);
 }

-void Batch::drawInstanced(uint32 nbInstances, Primitive primitiveType, uint32 nbVertices, uint32 startVertex, uint32 startInstance) {
+void Batch::drawInstanced(uint32 numInstances, Primitive primitiveType, uint32 numVertices, uint32 startVertex, uint32 startInstance) {
    ADD_COMMAND(drawInstanced);

    _params.push_back(startInstance);
    _params.push_back(startVertex);
-    _params.push_back(nbVertices);
+    _params.push_back(numVertices);
    _params.push_back(primitiveType);
-    _params.push_back(nbInstances);
+    _params.push_back(numInstances);
 }

-void Batch::drawIndexedInstanced(uint32 nbInstances, Primitive primitiveType, uint32 nbIndices, uint32 startIndex, uint32 startInstance) {
+void Batch::drawIndexedInstanced(uint32 numInstances, Primitive primitiveType, uint32 numIndices, uint32 startIndex, uint32 startInstance) {
    ADD_COMMAND(drawIndexedInstanced);

    _params.push_back(startInstance);
    _params.push_back(startIndex);
-    _params.push_back(nbIndices);
+    _params.push_back(numIndices);
+    _params.push_back(primitiveType);
+    _params.push_back(numInstances);
+}
+
+
+void Batch::multiDrawIndirect(uint32 numCommands, Primitive primitiveType) {
+    ADD_COMMAND(multiDrawIndirect);
+    _params.push_back(numCommands);
+    _params.push_back(primitiveType);
+}
+
+void Batch::multiDrawIndexedIndirect(uint32 numCommands, Primitive primitiveType) {
+    ADD_COMMAND(multiDrawIndexedIndirect);
+    // Pushed in the order GLBackend::do_multiDrawIndexedIndirect reads them back: [0] command count, [1] primitive
+    _params.push_back(numCommands);
     _params.push_back(primitiveType);
-    _params.push_back(nbInstances);
 }

 void Batch::setInputFormat(const Stream::FormatPointer& format) {
@ -144,6 +157,15 @@ void Batch::setIndexBuffer(const BufferView& buffer) {
    setIndexBuffer(buffer._element.getType(), buffer._buffer, buffer._offset);
 }

+void Batch::setIndirectBuffer(const BufferPointer& buffer, Offset offset, Offset stride) {
+    ADD_COMMAND(setIndirectBuffer);
+
+    _params.push_back(_buffers.cache(buffer));
+    _params.push_back(offset);
+    _params.push_back(stride);
+}
+
+
 void Batch::setModelTransform(const Transform& model) {
    ADD_COMMAND(setModelTransform);

@ -288,6 +310,11 @@ void Batch::resetStages() {
    ADD_COMMAND(resetStages);
 }

+void Batch::runLambda(std::function<void()> f) {
+    ADD_COMMAND(runLambda);
+    _params.push_back(_lambdas.cache(f));
+}
+
 void Batch::enableStereo(bool enable) {
    _enableStereo = enable;
 }
--- a/libraries/gpu/src/gpu/Batch.h
+++ b/libraries/gpu/src/gpu/Batch.h
@ -63,8 +63,8 @@ public:

        void process(Batch& batch) {
            if (_function) {
-            _function(batch, *this);
-        }
+                _function(batch, *this);
+            }
        }
    };

@ -93,15 +93,18 @@ public:

    // Drawcalls
    void draw(Primitive primitiveType, uint32 numVertices, uint32 startVertex = 0);
-    void drawIndexed(Primitive primitiveType, uint32 nbIndices, uint32 startIndex = 0);
-    void drawInstanced(uint32 nbInstances, Primitive primitiveType, uint32 nbVertices, uint32 startVertex = 0, uint32 startInstance = 0);
-    void drawIndexedInstanced(uint32 nbInstances, Primitive primitiveType, uint32 nbIndices, uint32 startIndex = 0, uint32 startInstance = 0);
+    void drawIndexed(Primitive primitiveType, uint32 numIndices, uint32 startIndex = 0);
+    void drawInstanced(uint32 numInstances, Primitive primitiveType, uint32 numVertices, uint32 startVertex = 0, uint32 startInstance = 0);
+    void drawIndexedInstanced(uint32 numInstances, Primitive primitiveType, uint32 numIndices, uint32 startIndex = 0, uint32 startInstance = 0);
+    void multiDrawIndirect(uint32 numCommands, Primitive primitiveType);
+    void multiDrawIndexedIndirect(uint32 numCommands, Primitive primitiveType);


    void setupNamedCalls(const std::string& instanceName, size_t count, NamedBatchData::Function function);
    void setupNamedCalls(const std::string& instanceName, NamedBatchData::Function function);
    BufferPointer getNamedBuffer(const std::string& instanceName, uint8_t index = 0);
-    
+    void setNamedBuffer(const std::string& instanceName, BufferPointer& buffer, uint8_t index = 0);
+
    

    // Input Stage
@ -117,6 +120,29 @@ public:
    void setIndexBuffer(Type type, const BufferPointer& buffer, Offset offset);
    void setIndexBuffer(const BufferView& buffer); // not a command, just a shortcut from a BufferView

+    // Indirect buffer is used by the multiDrawXXXIndirect calls
+    // The indirect buffer contains the command descriptions to execute multiple drawcalls in a single call
+    void setIndirectBuffer(const BufferPointer& buffer, Offset offset = 0, Offset stride = 0);
+    
+    // multi command description for multiDrawIndirect
+    // NOTE(review): presumably one of these per draw is what the indirect buffer holds for the non-indexed call - confirm the field order against the GL indirect command layout
+    class DrawIndirectCommand {
+    public:
+        uint  _count{ 0 };
+        uint  _instanceCount{ 0 };
+        uint  _firstIndex{ 0 };
+        uint  _baseInstance{ 0 };
+    };
+
+    // multi command description for multiDrawIndexedIndirect
+    // NOTE(review): presumably one of these per draw is what the indirect buffer holds for the indexed call - confirm the field order against the GL indirect command layout
+    class DrawIndexedIndirectCommand {
+    public:
+        uint  _count{ 0 };
+        uint  _instanceCount{ 0 };
+        uint  _firstIndex{ 0 };
+        uint  _baseVertex{ 0 };
+        uint  _baseInstance{ 0 };
+    };
+
    // Transform Stage
    // Vertex position is transformed by ModelTransform from object space to world space
    // Then by the inverse of the ViewTransform from world space to eye space
@ -194,10 +220,13 @@ public:
        COMMAND_drawIndexed,
        COMMAND_drawInstanced,
        COMMAND_drawIndexedInstanced,
+        COMMAND_multiDrawIndirect,
+        COMMAND_multiDrawIndexedIndirect,

        COMMAND_setInputFormat,
        COMMAND_setInputBuffer,
        COMMAND_setIndexBuffer,
+        COMMAND_setIndirectBuffer,

        COMMAND_setModelTransform,
        COMMAND_setViewTransform,
@ -221,6 +250,8 @@ public:

        COMMAND_resetStages,

+        COMMAND_runLambda,
+
        // TODO: As long as we have gl calls explicitely issued from interface
        // code, we need to be able to record and batch these calls. THe long 
        // term strategy is to get rid of any GL calls in favor of the HIFI GPU API
@ -302,6 +333,7 @@ public:
    typedef Cache<PipelinePointer>::Vector PipelineCaches;
    typedef Cache<FramebufferPointer>::Vector FramebufferCaches;
    typedef Cache<QueryPointer>::Vector QueryCaches;
+    typedef Cache<std::function<void()>>::Vector LambdaCache;

    // Cache Data in a byte array if too big to fit in Param
    // FOr example Mat4s are going there
@ -327,6 +359,7 @@ public:
    PipelineCaches _pipelines;
    FramebufferCaches _framebuffers;
    QueryCaches _queries;
+    LambdaCache _lambdas;

    NamedBatchDataMap _namedData;

@ -334,8 +367,10 @@ public:
    bool _enableSkybox{ false };

 protected:
+    // Maybe useful but shouldn't be public. Please convince me otherwise
+    void runLambda(std::function<void()> f);
 };

-};
+}

 #endif
--- a/libraries/gpu/src/gpu/Format.h
+++ b/libraries/gpu/src/gpu/Format.h
@ -130,8 +130,8 @@ static const int LOCATION_COUNT[NUM_DIMENSIONS] = {
    4,
 };

-// Count (of scalars) in an Element for a given Dimension
-static const int DIMENSION_COUNT[NUM_DIMENSIONS] = {
+// Count (of scalars) in an Element for a given Dimension's location
+static const int SCALAR_COUNT_PER_LOCATION[NUM_DIMENSIONS] = {
    1,
    2,
    3,
@ -141,6 +141,17 @@ static const int DIMENSION_COUNT[NUM_DIMENSIONS] = {
    4,
 };

+// Count (of scalars) in an Element for a given Dimension
+static const int SCALAR_COUNT[NUM_DIMENSIONS] = {
+    1,
+    2,
+    3,
+    4,
+    4,
+    9,
+    16,
+};
+
 // Semantic of an Element
 // Provide information on how to use the element
 enum Semantic {
@ -194,14 +205,18 @@ public:
    Semantic getSemantic() const { return (Semantic)_semantic; }

    Dimension getDimension() const { return (Dimension)_dimension; }
-    uint8 getDimensionCount() const { return  DIMENSION_COUNT[(Dimension)_dimension]; }
-    uint8 getLocationCount() const { return  LOCATION_COUNT[(Dimension)_dimension]; }
+    

    Type getType() const { return (Type)_type; }
    bool isNormalized() const { return (getType() >= NORMALIZED_START); }
    bool isInteger() const { return TYPE_IS_INTEGER[getType()]; }

-    uint32 getSize() const { return DIMENSION_COUNT[_dimension] * TYPE_SIZE[_type]; }
+    uint8 getScalarCount() const { return  SCALAR_COUNT[(Dimension)_dimension]; }
+    uint32 getSize() const { return SCALAR_COUNT[_dimension] * TYPE_SIZE[_type]; }
+
+    uint8 getLocationCount() const { return  LOCATION_COUNT[(Dimension)_dimension]; }
+    uint8 getLocationScalarCount() const { return  SCALAR_COUNT_PER_LOCATION[(Dimension)_dimension]; }
+    uint32 getLocationSize() const { return SCALAR_COUNT_PER_LOCATION[_dimension] * TYPE_SIZE[_type]; }

    uint16 getRaw() const { return *((uint16*) (this)); }

--- a/libraries/gpu/src/gpu/GLBackend.cpp
+++ b/libraries/gpu/src/gpu/GLBackend.cpp
@ -23,10 +23,13 @@ GLBackend::CommandCall GLBackend::_commandCalls[Batch::NUM_COMMANDS] =
    (&::gpu::GLBackend::do_drawIndexed),
    (&::gpu::GLBackend::do_drawInstanced),
    (&::gpu::GLBackend::do_drawIndexedInstanced),
-    
+    (&::gpu::GLBackend::do_multiDrawIndirect),
+    (&::gpu::GLBackend::do_multiDrawIndexedIndirect),
+
    (&::gpu::GLBackend::do_setInputFormat),
    (&::gpu::GLBackend::do_setInputBuffer),
    (&::gpu::GLBackend::do_setIndexBuffer),
+    (&::gpu::GLBackend::do_setIndirectBuffer),

    (&::gpu::GLBackend::do_setModelTransform),
    (&::gpu::GLBackend::do_setViewTransform),
@ -50,6 +53,8 @@ GLBackend::CommandCall GLBackend::_commandCalls[Batch::NUM_COMMANDS] =

    (&::gpu::GLBackend::do_resetStages),

+    (&::gpu::GLBackend::do_runLambda),
+
    (&::gpu::GLBackend::do_glActiveBindTexture),

    (&::gpu::GLBackend::do_glUniform1i),
@ -332,19 +337,65 @@ void GLBackend::do_drawIndexedInstanced(Batch& batch, uint32 paramOffset) {
    GLenum mode = _primitiveToGLmode[(Primitive)batch._params[paramOffset + 3]._uint];
    uint32 numIndices = batch._params[paramOffset + 2]._uint;
    uint32 startIndex = batch._params[paramOffset + 1]._uint;
+    // FIXME glDrawElementsInstancedBaseVertexBaseInstance is only available in GL 4.3
+    // and higher, so startInstance is ignored unless GPU_INPUT_PROFILE is GPU_CORE_43
    uint32 startInstance = batch._params[paramOffset + 0]._uint;
    GLenum glType = _elementTypeToGLType[_input._indexBufferType];

+#if (GPU_INPUT_PROFILE == GPU_CORE_43)
+    glDrawElementsInstancedBaseVertexBaseInstance(mode, numIndices, glType, reinterpret_cast<GLvoid*>(startIndex + _input._indexBufferOffset), numInstances, 0, startInstance);
+#else
    glDrawElementsInstanced(mode, numIndices, glType, reinterpret_cast<GLvoid*>(startIndex + _input._indexBufferOffset), numInstances);
+    Q_UNUSED(startInstance); 
+#endif
+    (void)CHECK_GL_ERROR();
+}
+
+
+void GLBackend::do_multiDrawIndirect(Batch& batch, uint32 paramOffset) {
+#if (GPU_INPUT_PROFILE == GPU_CORE_43)
+    updateInput();
+    updateTransform();
+    updatePipeline();
+
+    uint commandCount = batch._params[paramOffset + 0]._uint;
+    GLenum mode = _primitiveToGLmode[(Primitive)batch._params[paramOffset + 1]._uint];
+
+    glMultiDrawArraysIndirect(mode, reinterpret_cast<GLvoid*>(_input._indirectBufferOffset), commandCount, _input._indirectBufferStride);
+#else
+    // FIXME implement the slow path
+#endif
    (void)CHECK_GL_ERROR();

-    Q_UNUSED(startInstance);
 }

+void GLBackend::do_multiDrawIndexedIndirect(Batch& batch, uint32 paramOffset) {
+#if (GPU_INPUT_PROFILE == GPU_CORE_43)
+    updateInput();
+    updateTransform();
+    updatePipeline();
+
+    uint commandCount = batch._params[paramOffset + 0]._uint;
+    GLenum mode = _primitiveToGLmode[(Primitive)batch._params[paramOffset + 1]._uint];
+    GLenum indexType = _elementTypeToGLType[_input._indexBufferType];
+
+    glMultiDrawElementsIndirect(mode, indexType, reinterpret_cast<GLvoid*>(_input._indirectBufferOffset), commandCount, _input._indirectBufferStride);
+#else
+    // FIXME implement the slow path
+#endif
+    (void)CHECK_GL_ERROR();
+}
+
+
 void GLBackend::do_resetStages(Batch& batch, uint32 paramOffset) {
    resetStages();
 }

+void GLBackend::do_runLambda(Batch& batch, uint32 paramOffset) {
+    std::function<void()> f = batch._lambdas.get(batch._params[paramOffset]._uint);
+    f();
+}
+
 void GLBackend::resetStages() {
    resetInputStage();
    resetPipelineStage();
--- a/libraries/gpu/src/gpu/GLBackend.h
+++ b/libraries/gpu/src/gpu/GLBackend.h
@ -252,11 +252,14 @@ protected:
    void do_drawIndexed(Batch& batch, uint32 paramOffset);
    void do_drawInstanced(Batch& batch, uint32 paramOffset);
    void do_drawIndexedInstanced(Batch& batch, uint32 paramOffset);
-
+    void do_multiDrawIndirect(Batch& batch, uint32 paramOffset);
+    void do_multiDrawIndexedIndirect(Batch& batch, uint32 paramOffset);
+    
    // Input Stage
    void do_setInputFormat(Batch& batch, uint32 paramOffset);
    void do_setInputBuffer(Batch& batch, uint32 paramOffset);
    void do_setIndexBuffer(Batch& batch, uint32 paramOffset);
+    void do_setIndirectBuffer(Batch& batch, uint32 paramOffset);

    void initInput();
    void killInput();
@ -284,6 +287,10 @@ protected:
        Offset _indexBufferOffset;
        Type _indexBufferType;

+        BufferPointer _indirectBuffer;
+        Offset _indirectBufferOffset{ 0 };
+        Offset _indirectBufferStride{ 0 };
+
        GLuint _defaultVAO;

        InputStageState() :
@ -448,6 +455,9 @@ protected:

    // Reset stages
    void do_resetStages(Batch& batch, uint32 paramOffset);
+
+    void do_runLambda(Batch& batch, uint32 paramOffset);
+
    void resetStages();

    // TODO: As long as we have gl calls explicitely issued from interface
@ -471,7 +481,6 @@ protected:
    static CommandCall _commandCalls[Batch::NUM_COMMANDS];
 };

-
 };

 #endif
--- a/libraries/gpu/src/gpu/GLBackendInput.cpp
+++ b/libraries/gpu/src/gpu/GLBackendInput.cpp
@ -57,11 +57,7 @@ void GLBackend::do_setInputBuffer(Batch& batch, uint32 paramOffset) {
    }
 }

-#if (GPU_INPUT_PROFILE == GPU_CORE_41)
-#define NO_SUPPORT_VERTEX_ATTRIB_FORMAT
-#else
-#define SUPPORT_VERTEX_ATTRIB_FORMAT
-#endif
+


 void GLBackend::initInput() {
@ -90,6 +86,14 @@ void GLBackend::syncInputStateCache() {
    glBindVertexArray(_input._defaultVAO);
 }

+// Core 41 doesn't expose the features to really separate the vertex format from the vertex buffers binding
+// Core 43 does :)
+#if (GPU_INPUT_PROFILE == GPU_CORE_41)
+#define NO_SUPPORT_VERTEX_ATTRIB_FORMAT
+#else
+#define SUPPORT_VERTEX_ATTRIB_FORMAT
+#endif
+
 void GLBackend::updateInput() {
 #if defined(SUPPORT_VERTEX_ATTRIB_FORMAT)
    if (_input._invalidFormat) {
@ -100,19 +104,28 @@ void GLBackend::updateInput() {
        if (_input._format) {
            for (auto& it : _input._format->getAttributes()) {
                const Stream::Attribute& attrib = (it).second;
-                newActivation.set(attrib._slot);
-                glVertexAttribFormat(
-                    attrib._slot,
-                    attrib._element.getDimensionCount(),
-                    _elementTypeToGLType[attrib._element.getType()],
-                    attrib._element.isNormalized(),
-                    attrib._offset);
+
+                GLuint slot = attrib._slot;
+                GLuint count = attrib._element.getLocationScalarCount();
+                uint8_t locationCount = attrib._element.getLocationCount();
+                GLenum type = _elementTypeToGLType[attrib._element.getType()];
+                GLuint offset = attrib._offset;;
+                GLboolean isNormalized = attrib._element.isNormalized();
+
+                GLenum perLocationSize = attrib._element.getLocationSize();
+
+                for (size_t locNum = 0; locNum < locationCount; ++locNum) {
+                    newActivation.set(slot + locNum);
+                    glVertexAttribFormat(slot + locNum, count, type, isNormalized, offset + locNum * perLocationSize);
+                    glVertexAttribDivisor(slot + locNum, attrib._frequency);
+                    glVertexAttribBinding(slot + locNum, attrib._channel);
+                }
            }
            (void) CHECK_GL_ERROR();
        }

        // Manage Activation what was and what is expected now
-        for (int i = 0; i < newActivation.size(); i++) {
+        for (size_t i = 0; i < newActivation.size(); i++) {
            bool newState = newActivation[i];
            if (newState != _input._attributeActivation[i]) {
                if (newState) {
@ -213,18 +226,19 @@ void GLBackend::updateInput() {
                        for (unsigned int i = 0; i < channel._slots.size(); i++) {
                            const Stream::Attribute& attrib = attributes.at(channel._slots[i]);
                            GLuint slot = attrib._slot;
-                            GLuint count = attrib._element.getDimensionCount();
+                            GLuint count = attrib._element.getLocationScalarCount();
                            uint8_t locationCount = attrib._element.getLocationCount();
                            GLenum type = _elementTypeToGLType[attrib._element.getType()];
-                            GLenum perLocationStride = strides[bufferNum];
-                            GLuint stride = perLocationStride * locationCount;
+                            // GLenum perLocationStride = strides[bufferNum];
+                            GLenum perLocationStride = attrib._element.getLocationSize();
+                            GLuint stride = strides[bufferNum];
                            GLuint pointer = attrib._offset + offsets[bufferNum];
                            GLboolean isNormalized = attrib._element.isNormalized();

-                            for (int j = 0; j < locationCount; ++j) {
-                                glVertexAttribPointer(slot + j, count, type, isNormalized, stride,
-                                    reinterpret_cast<GLvoid*>(pointer + perLocationStride * j));
-                                glVertexAttribDivisor(slot + j, attrib._frequency);
+                            for (size_t locNum = 0; locNum < locationCount; ++locNum) {
+                                glVertexAttribPointer(slot + locNum, count, type, isNormalized, stride,
+                                    reinterpret_cast<GLvoid*>(pointer + perLocationStride * locNum));
+                                glVertexAttribDivisor(slot + locNum, attrib._frequency);
                            }
                            
                            // TODO: Support properly the IAttrib version
@ -273,21 +287,36 @@ void GLBackend::resetInputStage() {
 }

 void GLBackend::do_setIndexBuffer(Batch& batch, uint32 paramOffset) {
-    _input._indexBufferType = (Type) batch._params[paramOffset + 2]._uint;
-    BufferPointer indexBuffer = batch._buffers.get(batch._params[paramOffset + 1]._uint);
+    _input._indexBufferType = (Type)batch._params[paramOffset + 2]._uint;
    _input._indexBufferOffset = batch._params[paramOffset + 0]._uint;
-    _input._indexBuffer = indexBuffer;
-    if (indexBuffer) {
-        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, getBufferID(*indexBuffer));
-    } else {
-        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
+
+    BufferPointer indexBuffer = batch._buffers.get(batch._params[paramOffset + 1]._uint);
+    if (indexBuffer != _input._indexBuffer) {
+        _input._indexBuffer = indexBuffer;
+        if (indexBuffer) {
+            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, getBufferID(*indexBuffer));
+        } else {
+            // FIXME do we really need this?  Is there ever a draw call where we care that the element buffer is null?
+            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
+        }
    }
    (void) CHECK_GL_ERROR();
 }

-template <typename V>
-void popParam(Batch::Params& params, uint32& paramOffset, V& v) {
-    for (size_t i = 0; i < v.length(); ++i) {
-        v[i] = params[paramOffset++]._float;
+// Records the indirect buffer state consumed by the multiDrawXXXIndirect calls.
+// Params, as pushed by Batch::setIndirectBuffer: [0] cached buffer index, [1] offset, [2] stride.
+void GLBackend::do_setIndirectBuffer(Batch& batch, uint32 paramOffset) {
+    _input._indirectBufferOffset = batch._params[paramOffset + 1]._uint;
+    _input._indirectBufferStride = batch._params[paramOffset + 2]._uint;
+
+    BufferPointer buffer = batch._buffers.get(batch._params[paramOffset]._uint);
+    // Only touch the GL binding when the buffer actually changes
+    if (buffer != _input._indirectBuffer) {
+        _input._indirectBuffer = buffer;
+        if (buffer) {
+            glBindBuffer(GL_DRAW_INDIRECT_BUFFER, getBufferID(*buffer));
+        } else {
+            // Unbind the same target that is bound above. Unbinding GL_ELEMENT_ARRAY_BUFFER here would
+            // leave the previous indirect buffer bound and drop the binding made by do_setIndexBuffer.
+            // FIXME do we really need this?  Is there ever a draw call where we care that the indirect buffer is null?
+            glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
+        }
    }
+
+    (void)CHECK_GL_ERROR();
 }
--- a/libraries/gpu/src/gpu/GPUConfig.h
+++ b/libraries/gpu/src/gpu/GPUConfig.h
@ -17,6 +17,8 @@

 #define GPU_CORE 1
 #define GPU_LEGACY 0
+#define GPU_CORE_41 410
+#define GPU_CORE_43 430

 #if defined(__APPLE__)

@ -33,7 +35,7 @@
 #include <GL/wglew.h>

 #define GPU_FEATURE_PROFILE GPU_CORE
-#define GPU_INPUT_PROFILE GPU_CORE_41
+#define GPU_INPUT_PROFILE GPU_CORE_43

 #elif defined(ANDROID)

@ -42,7 +44,7 @@
 #include <GL/glew.h>

 #define GPU_FEATURE_PROFILE GPU_CORE
-#define GPU_INPUT_PROFILE GPU_CORE_41
+#define GPU_INPUT_PROFILE GPU_CORE_43

 #endif

--- a/libraries/gpu/src/gpu/Texture.cpp
+++ b/libraries/gpu/src/gpu/Texture.cpp
@ -629,7 +629,7 @@ bool sphericalHarmonicsFromTexture(const gpu::Texture& cubeTexture, std::vector<
    // for each face of cube texture
    for(int face=0; face < gpu::Texture::NUM_CUBE_FACES; face++) {

-        auto numComponents = cubeTexture.accessStoredMipFace(0,face)->_format.getDimensionCount();
+        auto numComponents = cubeTexture.accessStoredMipFace(0,face)->_format.getScalarCount();
        auto data = cubeTexture.accessStoredMipFace(0,face)->_sysmem.readData();
        if (data == nullptr) {
            continue;
--- a/libraries/render-utils/src/DeferredGlobalLight.slh
+++ b/libraries/render-utils/src/DeferredGlobalLight.slh
@ -147,11 +147,9 @@ vec3 evalLightmappedColor(mat4 invViewMat, float shadowAttenuation, vec3 normal,
    // it should be just 0, but we have innacurracy so we need to overshoot
    const float PERPENDICULAR_THRESHOLD = -0.005;
    float facingLight = step(PERPENDICULAR_THRESHOLD, diffuseDot); 
-    //float facingLight = step(PERPENDICULAR_THRESHOLD, diffuseDot); 
-        
+    //float facingLight = step(PERPENDICULAR_THRESHOLD, diffuseDot);
    // evaluate the shadow test but only relevant for light facing fragments
    float lightAttenuation = (1 - facingLight) + facingLight * shadowAttenuation;
-            
    // diffuse light is the lightmap dimmed by shadow
    vec3 diffuseLight = lightAttenuation * lightmap;

--- a/libraries/render-utils/src/GeometryCache.cpp
+++ b/libraries/render-utils/src/GeometryCache.cpp
@ -88,37 +88,34 @@ void GeometryCache::ShapeData::setupIndices(gpu::BufferPointer& indexBuffer, con
 void GeometryCache::ShapeData::setupBatch(gpu::Batch& batch) const {
    batch.setInputBuffer(gpu::Stream::POSITION, _positionView);
    batch.setInputBuffer(gpu::Stream::NORMAL, _normalView);
+    batch.setIndexBuffer(gpu::UINT16, _indices, 0);
 }

 void GeometryCache::ShapeData::draw(gpu::Batch& batch) const {
    if (_indexCount) {
        setupBatch(batch);
-        batch.setIndexBuffer(gpu::UINT16, _indices, _indexOffset);
-        batch.drawIndexed(gpu::TRIANGLES, _indexCount);
+        batch.drawIndexed(gpu::TRIANGLES, _indexCount, _indexOffset);
    }
 }

 void GeometryCache::ShapeData::drawWire(gpu::Batch& batch) const {
    if (_wireIndexCount) {
        setupBatch(batch);
-        batch.setIndexBuffer(gpu::UINT16, _indices, _wireIndexOffset);
-        batch.drawIndexed(gpu::LINES, _wireIndexCount);
+        batch.drawIndexed(gpu::LINES, _wireIndexCount, _wireIndexOffset);
    }
 }

 void GeometryCache::ShapeData::drawInstances(gpu::Batch& batch, size_t count) const {
    if (_indexCount) {
        setupBatch(batch);
-        batch.setIndexBuffer(gpu::UINT16, _indices, _indexOffset);
-        batch.drawIndexedInstanced(count, gpu::TRIANGLES, _indexCount);
+        batch.drawIndexedInstanced(count, gpu::TRIANGLES, _indexCount, _indexOffset);
    }
 }

 void GeometryCache::ShapeData::drawWireInstances(gpu::Batch& batch, size_t count) const {
    if (_wireIndexCount) {
        setupBatch(batch);
-        batch.setIndexBuffer(gpu::UINT16, _indices, _wireIndexOffset);
-        batch.drawIndexedInstanced(count, gpu::LINES, _wireIndexCount);
+        batch.drawIndexedInstanced(count, gpu::LINES, _wireIndexCount, _wireIndexOffset);
    }
 }

@ -323,7 +320,8 @@ void GeometryCache::buildShapes() {
            20, 21, 21, 22, 22, 23, 23, 20, // back
            0, 23, 1, 22, 2, 21, 3, 20 // sides
        };
-        for (unsigned int i = 0; i < wireIndices.size(); ++i) {
+
+        for (size_t i = 0; i < wireIndices.size(); ++i) {
            indices[i] += startingIndex;
        }

@ -374,7 +372,7 @@ void GeometryCache::buildShapes() {
            0, 3, 1, 3, 2, 3,
        };

-        for (unsigned int i = 0; i < wireIndices.size(); ++i) {
+        for (size_t i = 0; i < wireIndices.size(); ++i) {
            wireIndices[i] += startingIndex;
        }

--- a/libraries/render-utils/src/GeometryCache.h
+++ b/libraries/render-utils/src/GeometryCache.h
@ -232,14 +232,6 @@ public:
    /// Set a batch to the simple pipeline, returning the previous pipeline
    void useSimpleDrawPipeline(gpu::Batch& batch, bool noBlend = false);

-private:
-    GeometryCache();
-    virtual ~GeometryCache();
-    void buildShapes();
-
-    typedef QPair<int, int> IntPair;
-    typedef QPair<unsigned int, unsigned int> VerticesIndices;
-
    struct ShapeData {
        size_t _indexOffset{ 0 };
        size_t _indexCount{ 0 };
@ -263,7 +255,13 @@ private:

    VShape _shapes;

+private:
+    GeometryCache();
+    virtual ~GeometryCache();
+    void buildShapes();

+    typedef QPair<int, int> IntPair;
+    typedef QPair<unsigned int, unsigned int> VerticesIndices;

    gpu::PipelinePointer _standardDrawPipeline;
    gpu::PipelinePointer _standardDrawPipelineNoBlend;
--- a/tests/gpu-test/src/main.cpp
+++ b/tests/gpu-test/src/main.cpp
@ -35,6 +35,7 @@

 // Must come after GL headers
 #include <QtGui/QOpenGLContext>
+#include <QtGui/QOpenGLDebugLogger>

 #include <GLMHelpers.h>
 #include <PathUtils.h>
@ -101,7 +102,16 @@ float getSeconds(quint64 start = 0) {
    return seconds;
 }

+static const size_t TYPE_COUNT = 4;
+static GeometryCache::Shape SHAPE[TYPE_COUNT] = {
+    GeometryCache::Icosahedron,
+    GeometryCache::Cube,
+    GeometryCache::Sphere,
+    GeometryCache::Tetrahedron,
+    //GeometryCache::Line,
+};

+gpu::Stream::FormatPointer& getInstancedSolidStreamFormat();

 // Creates an OpenGL window that renders a simple unlit scene using the gpu library and GeometryCache
 // Should eventually get refactored into something that supports multiple gpu backends.
@ -134,7 +144,7 @@ public:
        // Qt Quick may need a depth and stencil buffer. Always make sure these are available.
        format.setDepthBufferSize(16);
        format.setStencilBufferSize(8);
-        format.setVersion(4, 1);
+        format.setVersion(4, 3);
        format.setProfile(QSurfaceFormat::OpenGLContextProfile::CoreProfile);
        format.setOption(QSurfaceFormat::DebugContext);
        format.setSwapInterval(0);
@ -147,6 +157,13 @@ public:

        show();
        makeCurrent();
+        QOpenGLDebugLogger* logger = new QOpenGLDebugLogger(this);
+        logger->initialize(); // initializes in the current context, i.e. ctx
+        connect(logger, &QOpenGLDebugLogger::messageLogged, [](const QOpenGLDebugMessage& message){
+            qDebug() << message;
+        });
+        logger->startLogging(QOpenGLDebugLogger::SynchronousLogging);
+

        gpu::Context::init<gpu::GLBackend>();
        _context = std::make_shared<gpu::Context>();
@ -175,7 +192,9 @@ public:
    }

    void draw() {
-        if (!isVisible()) {
+        // Attempting to draw before we're visible and have a valid size will
+        // produce GL errors.
+        if (!isVisible() || _size.width() <= 0 || _size.height() <= 0) {
            return;
        }
        makeCurrent();
@ -186,11 +205,12 @@ public:
        batch.setViewportTransform({ 0, 0, _size.width() * devicePixelRatio(), _size.height() * devicePixelRatio() });
        batch.setProjectionTransform(_projectionMatrix);
        
-        double t = _time.elapsed() * 1e-3;
+        float t = _time.elapsed() * 1e-3f;
        glm::vec3 unitscale { 1.0f };
        glm::vec3 up { 0.0f, 1.0f, 0.0f };

-        glm::vec3 camera_position { 1.5f * sinf(t), 0.0f, 1.5f * cosf(t) };
+        float distance = 3.0f;
+        glm::vec3 camera_position{ distance * sinf(t), 0.0f, distance * cosf(t) };

        static const vec3 camera_focus(0);
        static const vec3 camera_up(0, 1, 0);
@ -200,58 +220,143 @@ public:
        batch.setModelTransform(Transform());

        auto geometryCache = DependencyManager::get<GeometryCache>();
-        
+
        // Render grid on xz plane (not the optimal way to do things, but w/e)
        // Note: GeometryCache::renderGrid will *not* work, as it is apparently unaffected by batch rotations and renders xy only
-        static const std::string GRID_INSTANCE = "Grid";
-        static auto compactColor1 = toCompactColor(vec4{ 0.35f, 0.25f, 0.15f, 1.0f });
-        static auto compactColor2 = toCompactColor(vec4{ 0.15f, 0.25f, 0.35f, 1.0f });
-        auto transformBuffer = batch.getNamedBuffer(GRID_INSTANCE, 0);
-        auto colorBuffer = batch.getNamedBuffer(GRID_INSTANCE, 1);
-        for (int i = 0; i < 100; ++i) {
-            {
-                glm::mat4 transform = glm::translate(mat4(), vec3(0, -1, -50 + i));
-                transform = glm::scale(transform, vec3(100, 1, 1));
-                transformBuffer->append(transform);
-                colorBuffer->append(compactColor1);
-            }
+        {
+            static const std::string GRID_INSTANCE = "Grid";
+            static auto compactColor1 = toCompactColor(vec4{ 0.35f, 0.25f, 0.15f, 1.0f });
+            static auto compactColor2 = toCompactColor(vec4{ 0.15f, 0.25f, 0.35f, 1.0f });
+            static gpu::BufferPointer transformBuffer; 
+            static gpu::BufferPointer colorBuffer;
+            if (!transformBuffer) {
+                transformBuffer = std::make_shared<gpu::Buffer>();
+                colorBuffer = std::make_shared<gpu::Buffer>();
+                for (int i = 0; i < 100; ++i) {
+                    {
+                        glm::mat4 transform = glm::translate(mat4(), vec3(0, -1, -50 + i));
+                        transform = glm::scale(transform, vec3(100, 1, 1));
+                        transformBuffer->append(transform);
+                        colorBuffer->append(compactColor1);
+                    }

-            {
-                glm::mat4 transform = glm::mat4_cast(quat(vec3(0, PI / 2.0f, 0)));
-                transform = glm::translate(transform, vec3(0, -1, -50 + i));
-                transform = glm::scale(transform, vec3(100, 1, 1));
-                transformBuffer->append(transform);
-                colorBuffer->append(compactColor2);
+                    {
+                        glm::mat4 transform = glm::mat4_cast(quat(vec3(0, PI / 2.0f, 0)));
+                        transform = glm::translate(transform, vec3(0, -1, -50 + i));
+                        transform = glm::scale(transform, vec3(100, 1, 1));
+                        transformBuffer->append(transform);
+                        colorBuffer->append(compactColor2);
+                    }
+                }
            }
+            
+            batch.setupNamedCalls(GRID_INSTANCE, 200, [=](gpu::Batch& batch, gpu::Batch::NamedBatchData& data) {
+                batch.setViewTransform(camera);
+                batch.setModelTransform(Transform());
+                batch.setPipeline(_pipeline);
+                batch._glUniform1i(_instanceLocation, 1);
+                geometryCache->renderWireShapeInstances(batch, GeometryCache::Line, data._count, transformBuffer, colorBuffer);
+                batch._glUniform1i(_instanceLocation, 0);
+            });
        }

-        batch.setupNamedCalls(GRID_INSTANCE, 200, [=](gpu::Batch& batch, gpu::Batch::NamedBatchData& data) {
+        {
+            static const size_t ITEM_COUNT = 1000;
+            static const float SHAPE_INTERVAL = (PI * 2.0f) / ITEM_COUNT;
+            static const float ITEM_INTERVAL = SHAPE_INTERVAL / TYPE_COUNT;
+
+            static const gpu::Element POSITION_ELEMENT{ gpu::VEC3, gpu::FLOAT, gpu::XYZ };
+            static const gpu::Element NORMAL_ELEMENT{ gpu::VEC3, gpu::FLOAT, gpu::XYZ };
+            static const gpu::Element COLOR_ELEMENT{ gpu::VEC4, gpu::NUINT8, gpu::RGBA };
+            static const gpu::Element TRANSFORM_ELEMENT{ gpu::MAT4, gpu::FLOAT, gpu::XYZW };
+
+
+            static std::vector<Transform> transforms;
+            static std::vector<vec4> colors;
+            static gpu::BufferPointer indirectBuffer;
+            static gpu::BufferPointer transformBuffer;
+            static gpu::BufferPointer colorBuffer;
+            static gpu::BufferView colorView; 
+            static gpu::BufferView instanceXfmView; 
+
+            if (!transformBuffer) {
+                transformBuffer = std::make_shared<gpu::Buffer>();
+                colorBuffer = std::make_shared<gpu::Buffer>();
+                indirectBuffer = std::make_shared<gpu::Buffer>();
+
+                static const float ITEM_RADIUS = 20;
+                static const vec3 ITEM_TRANSLATION{ 0, 0, -ITEM_RADIUS };
+                for (size_t i = 0; i < TYPE_COUNT; ++i) {
+                    GeometryCache::Shape shape = SHAPE[i];
+                    GeometryCache::ShapeData shapeData = geometryCache->_shapes[shape];
+                    {
+                        gpu::Batch::DrawIndexedIndirectCommand indirectCommand;
+                        indirectCommand._count = shapeData._indexCount;
+                        indirectCommand._instanceCount = ITEM_COUNT;
+                        indirectCommand._baseInstance = i * ITEM_COUNT;
+                        indirectCommand._firstIndex = shapeData._indexOffset / 2;
+                        indirectCommand._baseVertex = 0;
+                        indirectBuffer->append(indirectCommand);
+                    }
+
+                    //indirectCommand._count
+                    float startingInterval = ITEM_INTERVAL * i;
+                    for (size_t j = 0; j < ITEM_COUNT; ++j) {
+                        float theta = j * SHAPE_INTERVAL + startingInterval;
+                        auto transform = glm::rotate(mat4(), theta, Vectors::UP);
+                        transform = glm::rotate(transform, (randFloat() - 0.5f) * PI / 4.0f, Vectors::UNIT_X);
+                        transform = glm::translate(transform, ITEM_TRANSLATION);
+                        transform = glm::scale(transform, vec3(randFloat() / 2.0f + 0.5f));
+                        transformBuffer->append(transform);
+                        transforms.push_back(transform);
+                        auto color = vec4{ randomColorValue(64), randomColorValue(64), randomColorValue(64), 255 };
+                        color /= 255.0f;
+                        colors.push_back(color);
+                        colorBuffer->append(toCompactColor(color));
+                    }
+                }
+                colorView = gpu::BufferView(colorBuffer, COLOR_ELEMENT);
+                instanceXfmView = gpu::BufferView(transformBuffer, TRANSFORM_ELEMENT);
+            }
+
+#if 1
+            GeometryCache::ShapeData shapeData = geometryCache->_shapes[GeometryCache::Icosahedron];
+            {
+                batch.setViewTransform(camera);
+                batch.setModelTransform(Transform());
+                batch.setPipeline(_pipeline);
+                batch._glUniform1i(_instanceLocation, 1);
+                batch.setInputFormat(getInstancedSolidStreamFormat());
+                batch.setInputBuffer(gpu::Stream::COLOR, colorView);
+                batch.setInputBuffer(gpu::Stream::INSTANCE_XFM, instanceXfmView);
+                batch.setIndirectBuffer(indirectBuffer);
+                shapeData.setupBatch(batch);
+                batch.multiDrawIndexedIndirect(TYPE_COUNT, gpu::TRIANGLES);
+                batch._glUniform1i(_instanceLocation, 0);
+            }
+#else
            batch.setViewTransform(camera);
-            batch.setModelTransform(Transform());
            batch.setPipeline(_pipeline);
-            auto& xfm = data._buffers[0];
-            auto& color = data._buffers[1];
-            batch._glUniform1i(_instanceLocation, 1);
-            geometryCache->renderWireShapeInstances(batch, GeometryCache::Line, data._count, xfm, color);
-            batch._glUniform1i(_instanceLocation, 0);
-        });
-
-
+            for (size_t i = 0; i < TYPE_COUNT; ++i) {
+                GeometryCache::Shape shape = SHAPE[i];
+                for (size_t j = 0; j < ITEM_COUNT; ++j) {
+                    int index = i * ITEM_COUNT + j;
+                    batch.setModelTransform(transforms[index]);
+                    const vec4& color = colors[index];
+                    batch._glColor4f(color.r, color.g, color.b, 1.0);
+                    geometryCache->renderShape(batch, shape);
+                }
+            }
+#endif
+        }

        // Render unlit cube + sphere
-
-        static GeometryCache::Shape SHAPE[] = {
-            GeometryCache::Cube,
-            GeometryCache::Sphere,
-            GeometryCache::Tetrahedron,
-            GeometryCache::Icosahedron,
-        };
-
        static auto startUsecs = usecTimestampNow(); 
        float seconds = getSeconds(startUsecs);
+
        seconds /= 4.0f;
-        int shapeIndex = ((int)seconds) % 4;
-        bool wire = seconds - (float)floor(seconds) > 0.5f;
+        int shapeIndex = ((int)seconds) % TYPE_COUNT;
+        bool wire = (seconds - floorf(seconds) > 0.5f);
        batch.setModelTransform(Transform());
        batch._glColor4f(0.8f, 0.25f, 0.25f, 1.0f);

@ -261,7 +366,7 @@ public:
            geometryCache->renderShape(batch, SHAPE[shapeIndex]);
        }
        
-        batch.setModelTransform(Transform().setScale(1.05f));
+        batch.setModelTransform(Transform().setScale(2.05f));
        batch._glColor4f(1, 1, 1, 1);
        geometryCache->renderWireCube(batch);

@ -305,3 +410,4 @@ int main(int argc, char** argv) {
 }

 #include "main.moc"
+