I want to use a vkImage rendered at a previous render pass as Texture to do the composite operation in a fragment shader. From here I learned vkCmdPipelineBarrier is used to wait for GPU finish a rendering operation and I write this code. It works well on Snapdragon devices. But not on Mali-G52. The Write-after-write error is partly happed. Is this code not enough? Any suggestions?
vkCmdEndRenderPass(cb);
vkCmdBeginRenderPass(cb, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
VkViewport viewport = vks::initializers::viewport((float)offscreenPass.width, (float)offscreenPass.height, 0.0f, 1.0f);
vkCmdSetViewport(cb, 0, 1, &viewport);
VkRect2D scissor = vks::initializers::rect2D(offscreenPass.width, offscreenPass.height, 0, 0);
vkCmdSetScissor(cb, 0, 1, &scissor);
// https://github.com/KhronosGroup/Vulkan-Samples/blob/master/samples/performance/pipeline_barriers/pipeline_barriers.cpp
VkImageMemoryBarrier imageMemoryBarrier = vks::initializers::imageMemoryBarrier();
imageMemoryBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
imageMemoryBarrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
imageMemoryBarrier.srcAccessMask = 0;
imageMemoryBarrier.dstAccessMask = 0;
imageMemoryBarrier.image = offscreenPass.color[drawframe].image;
imageMemoryBarrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
imageMemoryBarrier.subresourceRange.baseMipLevel = 0;
imageMemoryBarrier.subresourceRange.levelCount = 1;
imageMemoryBarrier.subresourceRange.baseArrayLayer = 0;
imageMemoryBarrier.subresourceRange.layerCount = 1;
vkCmdPipelineBarrier(
cb,
VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
0, 0, nullptr, 0, nullptr, 1, &imageMemoryBarrier);
imageMemoryBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
imageMemoryBarrier.newLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL;
imageMemoryBarrier.image = offscreenPass.depth.image;
imageMemoryBarrier.srcAccessMask = 0;
imageMemoryBarrier.dstAccessMask = 0;
vkCmdPipelineBarrier(
cb,
VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
0, 0, nullptr, 0, nullptr, 1, &imageMemoryBarrier);
I have tried every pattern written here.
If you want to synchronize render passes then your pipeline barrier must be outside of the render pass in the command stream. I.e. it must be after the vkCmdEndRenderPass() of the first pass, and before the vkCmdBeginRenderPass() of the second pass. Pipeline barriers issued inside a render pass, as you are currently doing, are used for synchronization only within the current subpass.
Also, try to avoid:
srcStage=VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT
dstStage=VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT
... for pipeline barriers when you only consume the output of the first pass as a fragment shader input in the second. This is overly conservative and needlessly serializes execution of the geometry processing too. In this case, you should use:
srcStage=VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT
dstStage=VK_PIPELINE_STAGE_FRAGMENT_BIT
... which allows the non-dependent vertex shading and binning for the second pass to run in parallel to the first pass.
Self solved.
The difference in the precision of sampler2D between Adreno and Mali causes this issue. I can read correct data using "precision highp sampler2D".
Related
I am very much a Vulkan/ graphics APIs beginner. I've read some resources on Vulkan synchronization and understand it more than at the beginning but the code still doesn't work. I'm expecting the ray tracing pipeline to output a flat color bule image, but it flickers intensly between blue and just black. Validation layers scream every frame that "images passed to present must be in layout VK_IMAGE_LAYOUT_PRESENT_SRC_KHR or VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR but is in VK_IMAGE_LAYOUT_UNDEFINED."
This is more or less what my code looks like:
vkBeginCommandBuffer();
// ... bind pipeline and descriptor sets
vkCmdTraceRaysKHR();
// Prepare current swap chain image as transfer destination
VkImageMemoryBarrier barrier{};
barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.image = swapchainImage;
barrier.subresourceRange = subresource_range;
// No need to make anything available
barrier.srcAccessMask = 0;
// The result of this transition should be visible for transfers
barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_TRANSFER_READ_BIT;
vkCmdPipelineBarrier(
cmdBuffer,
VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, // No need to wait for anything
VK_PIPELINE_STAGE_TRANSFER_BIT, // Should make transfers wait
0,
0, nullptr,
0, nullptr,
1, &barrier
);
// Prepare ray tracing output image as transfer source
VkImageMemoryBarrier barrier{};
barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL;
barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.image = renderImage.image;
barrier.subresourceRange = subresource_range;
// The data written by the ray tracing should be made available
barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
// The transition and data should be visible for transitions
barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_TRANSFER_READ_BIT;
vkCmdPipelineBarrier(
cmdBuffer,
VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR, // Should wait until ray tracing is done
VK_PIPELINE_STAGE_TRANSFER_BIT, // Should make transfers wait
0,
0, nullptr,
0, nullptr,
1, &barrier
);
vkCmdCopyImage();
// Transition swap chain image back for presentation
VkImageMemoryBarrier barrier{};
barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
barrier.newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.image = swapchainImage;
barrier.subresourceRange = subresource_range;
// The effects of the transfer should be made available
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
// The effects of the transfer should be made visible for swapchain presentation
barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT;
vkCmdPipelineBarrier(
cmdBuffer,
VK_PIPELINE_STAGE_TRANSFER_BIT, // Wait for transfers
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, // Block all commands after this barrier
0,
0, nullptr,
0, nullptr,
1, &barrier
);
// Transition ray tracing output image back to general layout
VkImageMemoryBarrier barrier{};
barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.image = renderImage.image;
barrier.subresourceRange = subresource_range;
// The effects of the transfer should be made available (possibly unnecessary?)
barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
// (possibly unnecessary?)
barrier.dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
vkCmdPipelineBarrier(
cmdBuffer,
VK_PIPELINE_STAGE_TRANSFER_BIT, // Wait for transfers
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, // Block all commands after this barrier
0,
0, nullptr,
0, nullptr,
1, &barrier
);
After all this the queue is submitted with two semaphores:
one wait semaphore that is signalled by vkAcquireNextImageKHR. It's wait stage is set to VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR.
one signal semaphore that is later used as a wait semaphore in vkQueuePresentKHR.
So how do I get rid of the vulkan validation layer message and properly display the rendered image?
Edit: The culrpit was found somwhere else (choosing wrong swapchain image for rendering), but I would still appreciate it if someone could confirm/correct my rationale behind the chosen stage and access masks. Especially that now I can't even make it freak out on purpouse, for example by setting the semaphore wait stage to BOTTOM_OF_PIPE (I thought it would mean that no stages wait so the render runs and writes without a swapchain image)
There are two ways that can copy data to image(using stage buffer or not).In the first way that using stage buffer, when the image format is VK_FORMAT_R8G8B8A8_UNORM or VK_FORMAT_R8G8B8_UNORM, it works correctly.But in the way that not using stage buffer, the image format is VK_FORMAT_R8G8B8A8_UNORM, it works well. While changing the format to VK_FORMAT_R8G8B8_UNORM, the result of sample is not correct.The source data can be assured correct when setting different image format .
The code used is from [https://github.com/SaschaWillems/Vulkan/blob/master/examples/texture/texture.cpp](https://www.stackoverflow.com/
if (0/*useStaging*/) {
// Copy data to an optimal tiled image
// This loads the texture data into a host local buffer that is copied to the optimal tiled image on the device
// Create a host-visible staging buffer that contains the raw image data
// This buffer will be the data source for copying texture data to the optimal tiled image on the device
VkBuffer stagingBuffer;
VkDeviceMemory stagingMemory;
VkBufferCreateInfo bufferCreateInfo = vks::initializers::bufferCreateInfo();
bufferCreateInfo.size = ktxTextureSize;
// This buffer is used as a transfer source for the buffer copy
bufferCreateInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
VK_CHECK_RESULT(vkCreateBuffer(device, &bufferCreateInfo, nullptr, &stagingBuffer));
// Get memory requirements for the staging buffer (alignment, memory type bits)
vkGetBufferMemoryRequirements(device, stagingBuffer, &memReqs);
memAllocInfo.allocationSize = memReqs.size;
// Get memory type index for a host visible buffer
memAllocInfo.memoryTypeIndex = vulkanDevice->getMemoryType(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAllocInfo, nullptr, &stagingMemory));
VK_CHECK_RESULT(vkBindBufferMemory(device, stagingBuffer, stagingMemory, 0));
// Copy texture data into host local staging buffer
uint8_t *data;
VK_CHECK_RESULT(vkMapMemory(device, stagingMemory, 0, memReqs.size, 0, (void **)&data));
memcpy(data, ktxTextureData, ktxTextureSize);
vkUnmapMemory(device, stagingMemory);
// Setup buffer copy regions for each mip level
std::vector<VkBufferImageCopy> bufferCopyRegions;
uint32_t offset = 0;
for (uint32_t i = 0; i < texture.mipLevels; i++) {
// Calculate offset into staging buffer for the current mip level
ktx_size_t offset;
KTX_error_code ret = ktxTexture_GetImageOffset(ktxTexture, i, 0, 0, &offset);
assert(ret == KTX_SUCCESS);
// Setup a buffer image copy structure for the current mip level
VkBufferImageCopy bufferCopyRegion = {};
bufferCopyRegion.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
bufferCopyRegion.imageSubresource.mipLevel = i;
bufferCopyRegion.imageSubresource.baseArrayLayer = 0;
bufferCopyRegion.imageSubresource.layerCount = 1;
bufferCopyRegion.imageExtent.width = ktxTexture->baseWidth >> i;
bufferCopyRegion.imageExtent.height = ktxTexture->baseHeight >> i;
bufferCopyRegion.imageExtent.depth = 1;
bufferCopyRegion.bufferOffset = offset;
bufferCopyRegions.push_back(bufferCopyRegion);
}
// Create optimal tiled target image on the device
VkImageCreateInfo imageCreateInfo = vks::initializers::imageCreateInfo();
imageCreateInfo.imageType = VK_IMAGE_TYPE_2D;
imageCreateInfo.format = format;
imageCreateInfo.mipLevels = texture.mipLevels;
imageCreateInfo.arrayLayers = 1;
imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
imageCreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL;
imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
// Set initial layout of the image to undefined
imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
imageCreateInfo.extent = { texture.width, texture.height, 1 };
imageCreateInfo.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
VK_CHECK_RESULT(vkCreateImage(device, &imageCreateInfo, nullptr, &texture.image));
vkGetImageMemoryRequirements(device, texture.image, &memReqs);
memAllocInfo.allocationSize = memReqs.size;
memAllocInfo.memoryTypeIndex = vulkanDevice->getMemoryType(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAllocInfo, nullptr, &texture.deviceMemory));
VK_CHECK_RESULT(vkBindImageMemory(device, texture.image, texture.deviceMemory, 0));
VkCommandBuffer copyCmd = vulkanDevice->createCommandBuffer(VK_COMMAND_BUFFER_LEVEL_PRIMARY, true);
// Image memory barriers for the texture image
// The sub resource range describes the regions of the image that will be transitioned using the memory barriers below
VkImageSubresourceRange subresourceRange = {};
// Image only contains color data
subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
// Start at first mip level
subresourceRange.baseMipLevel = 0;
// We will transition on all mip levels
subresourceRange.levelCount = texture.mipLevels;
// The 2D texture only has one layer
subresourceRange.layerCount = 1;
// Transition the texture image layout to transfer target, so we can safely copy our buffer data to it.
VkImageMemoryBarrier imageMemoryBarrier = vks::initializers::imageMemoryBarrier();;
imageMemoryBarrier.image = texture.image;
imageMemoryBarrier.subresourceRange = subresourceRange;
imageMemoryBarrier.srcAccessMask = 0;
imageMemoryBarrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
imageMemoryBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
imageMemoryBarrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
// Insert a memory dependency at the proper pipeline stages that will execute the image layout transition
// Source pipeline stage is host write/read execution (VK_PIPELINE_STAGE_HOST_BIT)
// Destination pipeline stage is copy command execution (VK_PIPELINE_STAGE_TRANSFER_BIT)
vkCmdPipelineBarrier(
copyCmd,
VK_PIPELINE_STAGE_HOST_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT,
0,
0, nullptr,
0, nullptr,
1, &imageMemoryBarrier);
// Copy mip levels from staging buffer
vkCmdCopyBufferToImage(
copyCmd,
stagingBuffer,
texture.image,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
static_cast<uint32_t>(bufferCopyRegions.size()),
bufferCopyRegions.data());
// Once the data has been uploaded we transfer to the texture image to the shader read layout, so it can be sampled from
imageMemoryBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
imageMemoryBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
imageMemoryBarrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
imageMemoryBarrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
// Insert a memory dependency at the proper pipeline stages that will execute the image layout transition
// Source pipeline stage is copy command execution (VK_PIPELINE_STAGE_TRANSFER_BIT)
// Destination pipeline stage fragment shader access (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT)
vkCmdPipelineBarrier(
copyCmd,
VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
0,
0, nullptr,
0, nullptr,
1, &imageMemoryBarrier);
// Store current layout for later reuse
texture.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
vulkanDevice->flushCommandBuffer(copyCmd, queue, true);
// Clean up staging resources
vkFreeMemory(device, stagingMemory, nullptr);
vkDestroyBuffer(device, stagingBuffer, nullptr);
} else {
// Copy data to a linear tiled image
VkImage mappableImage;
VkDeviceMemory mappableMemory;
// Load mip map level 0 to linear tiling image
VkImageCreateInfo imageCreateInfo = vks::initializers::imageCreateInfo();
imageCreateInfo.imageType = VK_IMAGE_TYPE_2D;
imageCreateInfo.format = format;
imageCreateInfo.mipLevels = 1;
imageCreateInfo.arrayLayers = 1;
imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
imageCreateInfo.tiling = VK_IMAGE_TILING_LINEAR;
imageCreateInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT;
imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_PREINITIALIZED;
imageCreateInfo.extent = { texture.width, texture.height, 1 };
VK_CHECK_RESULT(vkCreateImage(device, &imageCreateInfo, nullptr, &mappableImage));
// Get memory requirements for this image like size and alignment
vkGetImageMemoryRequirements(device, mappableImage, &memReqs);
// Set memory allocation size to required memory size
memAllocInfo.allocationSize = memReqs.size;
// Get memory type that can be mapped to host memory
memAllocInfo.memoryTypeIndex = vulkanDevice->getMemoryType(memReqs.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
VK_CHECK_RESULT(vkAllocateMemory(device, &memAllocInfo, nullptr, &mappableMemory));
VK_CHECK_RESULT(vkBindImageMemory(device, mappableImage, mappableMemory, 0));
// Map image memory
void *data;
VK_CHECK_RESULT(vkMapMemory(device, mappableMemory, 0, memReqs.size, 0, &data));
// Copy image data of the first mip level into memory
memcpy(data, ktxTextureData, memReqs.size);
vkUnmapMemory(device, mappableMemory);
// Linear tiled images don't need to be staged and can be directly used as textures
texture.image = mappableImage;
texture.deviceMemory = mappableMemory;
texture.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
// Setup image memory barrier transfer image to shader read layout
VkCommandBuffer copyCmd = vulkanDevice->createCommandBuffer(VK_COMMAND_BUFFER_LEVEL_PRIMARY, true);
// The sub resource range describes the regions of the image we will be transition
VkImageSubresourceRange subresourceRange = {};
subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
subresourceRange.baseMipLevel = 0;
subresourceRange.levelCount = 1;
subresourceRange.layerCount = 1;
// Transition the texture image layout to shader read, so it can be sampled from
VkImageMemoryBarrier imageMemoryBarrier = vks::initializers::imageMemoryBarrier();;
imageMemoryBarrier.image = texture.image;
imageMemoryBarrier.subresourceRange = subresourceRange;
imageMemoryBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
imageMemoryBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
imageMemoryBarrier.oldLayout = VK_IMAGE_LAYOUT_PREINITIALIZED;
imageMemoryBarrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
// Insert a memory dependency at the proper pipeline stages that will execute the image layout transition
// Source pipeline stage is host write/read execution (VK_PIPELINE_STAGE_HOST_BIT)
// Destination pipeline stage fragment shader access (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT)
vkCmdPipelineBarrier(
copyCmd,
VK_PIPELINE_STAGE_HOST_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
0,
0, nullptr,
0, nullptr,
1, &imageMemoryBarrier);
vulkanDevice->flushCommandBuffer(copyCmd, queue, true);
}
)
I have a small Vulkan program that runs a compute shader in a loop.
There is only one commandBuffer that is allocated from the only commandPool I have.
After the commandBuffer is built, I submit it to the queue, and wait for it to comple with vkQueueWaitIddle. I does indeed wait for a while in that line of code. After that, I call vkResetCommandPool, which should reset all commandBuffer allocated with that pool (there is only one anyways).
...
vkEndCommandBuffer(commandBuffer);
{
VkSubmitInfo info = {};
info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
info.commandBufferCount = 1;
info.pCommandBuffers = &commandBuffer;
vkQueueSubmit(queue, 1, &info, VK_NULL_HANDLE);
}
vkQueueWaitIdle(queue);
vkResetCommandPool(device, commandPool, VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
When it tries to reset the commandPool the validation gives me the following error.
VUID-vkResetCommandPool-commandPool-00040(ERROR / SPEC): msgNum: -1254218959
- Validation Error: [ VUID-vkResetCommandPool-commandPool-00040 ]
Object 0: handle = 0x20d2ce0b718, type = VK_OBJECT_TYPE_COMMAND_BUFFER; |
MessageID = 0xb53e2331 |
Attempt to reset command pool with VkCommandBuffer 0x20d2ce0b718[] which is in use.
The Vulkan spec states: All VkCommandBuffer objects allocated from commandPool must not be in the pending state
(https://vulkan.lunarg.com/doc/view/1.2.176.1/windows/1.2-extensions/vkspec.html#VUID-vkResetCommandPool-commandPool-00040)
Objects: 1
[0] 0x20d2ce0b718, type: 6, name: NULL
But I don't understand why, since I'm already waiting with vkQueueWaitIdle. According to the documentation, once the commandBuffer is done executing, it should go to the invalid state, and I should be able to reset it.
Here's the relevan surrounding code:
VkCommandBufferBeginInfo beginInfo = {};
beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
beginInfo.pInheritanceInfo = nullptr;
for (i64 i = 0; i < numIterations; i++)
{
vkBeginCommandBuffer(commandBuffer, &beginInfo);
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout,
0, 2, descriptorSets, 0, nullptr);
uniforms.start = i * numThreads;
vkCmdUpdateBuffer(commandBuffer, unifsBuffer, 0, sizeof(uniforms), &uniforms);
vkCmdPipelineBarrier(commandBuffer,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
0, nullptr,
1, &memBarriers[0],
0, nullptr);
vkCmdDispatch(commandBuffer, numThreads, 1, 1);
vkCmdPipelineBarrier(commandBuffer,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
0, nullptr,
1, &memBarriers[1],
0, nullptr);
VkBufferCopy copyInfo = {};
copyInfo.srcOffset = 0;
copyInfo.dstOffset = 0;
copyInfo.size = sizeof(i64) * numThreads;
vkCmdCopyBuffer(commandBuffer,
buffer, stagingBuffer, 1, ©Info);
vkEndCommandBuffer(commandBuffer);
{
VkSubmitInfo info = {};
info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
info.commandBufferCount = 1;
info.pCommandBuffers = &commandBuffer;
vkQueueSubmit(queue, 1, &info, VK_NULL_HANDLE);
}
vkQueueWaitIdle(queue);
vkResetCommandPool(device, commandPool, VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
i64* result;
vkMapMemory(device, stagingBufferMem, 0, sizeof(i64) * numThreads, 0, (void**)&result);
for (int i = 0; i < numThreads; i++)
{
if (result[i]) {
auto res = result[i];
vkUnmapMemory(device, stagingBufferMem);
return res;
}
}
vkUnmapMemory(device, stagingBufferMem);
}
I have found my problem. In vkCmdDispatch, I thought the paremeters specify the global size (number of compute shader invocations) but it's actually the number of work groups. Therefore, I was dispatching more threads than I intended, and my buffer wasn't big enough, so the threads were writing out of bounds.
I believe the validation layer wasn't giving me the right hints though.
I don't quite understand here.:
https://github.com/SaschaWillems/Vulkan/blob/master/examples/computeshader/computeshader.cpp
void draw()
{
VulkanExampleBase::prepareFrame();
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &drawCmdBuffers[currentBuffer];
VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE));
VulkanExampleBase::submitFrame();
// Submit compute commands
// Use a fence to ensure that compute command buffer has finished executin before using it again
vkWaitForFences(device, 1, &compute.fence, VK_TRUE, UINT64_MAX);
vkResetFences(device, 1, &compute.fence);
VkSubmitInfo computeSubmitInfo = vks::initializers::submitInfo();
computeSubmitInfo.commandBufferCount = 1;
computeSubmitInfo.pCommandBuffers = &compute.commandBuffer;
VK_CHECK_RESULT(vkQueueSubmit(compute.queue, 1, &computeSubmitInfo, compute.fence));
}
drawCmdBuffers[currentBuffer] runs before compute.commandBuffer, but the consumer drawCmdBuffers[currentBuffer] requires the textureComputeTarget produced by the producer compute.commandBuffer.
I don't understand why drawCmdBuffers[currentBuffer] is called before compute.commandBuffer.
In the following code, only the first frame is rendered, while the right picture does not get the textureComputeTarget, so it is rendered with a blue background.
void draw()
{
VulkanExampleBase::prepareFrame();
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &drawCmdBuffers[currentBuffer];
VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE));
VulkanExampleBase::submitFrame();
// Submit compute commands
// Use a fence to ensure that compute command buffer has finished executin before using it again
vkWaitForFences(device, 1, &compute.fence, VK_TRUE, UINT64_MAX);
vkResetFences(device, 1, &compute.fence);
VkSubmitInfo computeSubmitInfo = vks::initializers::submitInfo();
computeSubmitInfo.commandBufferCount = 1;
computeSubmitInfo.pCommandBuffers = &compute.commandBuffer;
VK_CHECK_RESULT(vkQueueSubmit(compute.queue, 1, &computeSubmitInfo, compute.fence));
sleep(1000) // <-------- Wait
}
Executed when calling vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE):
VkImageMemoryBarrier imageMemoryBarrier = {};
imageMemoryBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
// We won't be changing the layout of the image
imageMemoryBarrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL;
imageMemoryBarrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
imageMemoryBarrier.image = textureComputeTarget.image;
imageMemoryBarrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 };
imageMemoryBarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
imageMemoryBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
vkCmdPipelineBarrier(
drawCmdBuffers[i],
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
VK_FLAGS_NONE,
0, nullptr,
0, nullptr,
1, &imageMemoryBarrier);
vkCmdBeginRenderPass(drawCmdBuffers[i], &renderPassBeginInfo, VK_SUBPASS_CONTENTS_INLINE);
Wait for VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, but this phase has not been executed before, why is the pipeline not stuck? Is it because
there is no pipeline before, so there is no need to wait?
In section 6.6 Pipeline Barriers
vkCmdPipelineBarrier is a synchronization command that inserts a dependency between commands submitted to the same queue, or between commands in the same subpass.
void draw()
{
printf("%p, %p\n", queue, compute.queue);
VulkanExampleBase::prepareFrame();
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = &drawCmdBuffers[currentBuffer];
VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE));
VulkanExampleBase::submitFrame();
// Submit compute commands
// Use a fence to ensure that compute command buffer has finished executin before using it again
vkWaitForFences(device, 1, &compute.fence, VK_TRUE, UINT64_MAX);
vkResetFences(device, 1, &compute.fence);
VkSubmitInfo computeSubmitInfo = vks::initializers::submitInfo();
computeSubmitInfo.commandBufferCount = 1;
computeSubmitInfo.pCommandBuffers = &compute.commandBuffer;
VK_CHECK_RESULT(vkQueueSubmit(compute.queue, 1, &computeSubmitInfo, compute.fence));
sleep(1000);
}
Print results:
0x6000039c4a20, 0x6000039c4a20
The current queue and compute.queue are the same queue.But it is possible that the above code may generate different queue.
Can VkImageMemoryBarrier be synchronized in multiple queues?
vkCmdPipelineBarrier is a synchronization command that inserts a dependency between commands submitted to the same queue, or
between commands in the same subpass. why use "or", why not use
"and"?
I don't understand why drawCmdBuffers[currentBuffer] is called before compute.commandBuffer.
Dunno, it is an example. Author was probably not awfully woried what happens in the first frame. It would simply be drawn with one frame delay. Swapping the compute before draw should also work with some effort.
Wait for VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, but this phase has not been executed before, why is the pipeline not stuck? Is it because there is no pipeline before, so there is no need to wait?
Because that is not how pipeline and dependencies work. vkCmdPipelineBarrier makes sure any command\operation in queue before the barrier reaches (and finishes) at least the srcStage stage (i.e. COMPUTE) before any command\op recorded after it reach dstStage.
Such dependency is satisfied even if there are no commands recorded before. I.e. by definition of "nothing", there are no commands that have not reached COMPUTE stage yet.
Can VkImageMemoryBarrier be synchronized in multiple queues?
Yes, with the help of a Semaphore.
For VK_SHARING_MODE_EXCLUSIVE and different queue family it is called Queue Family Ownership Transfer (QFOT).
Otherwisely, a Semaphore already performs a memory dependency and a VkImageMemoryBarrier is not needed.
vkCmdPipelineBarrier is a synchronization command that inserts a dependency between commands submitted to the same queue, or between commands in the same subpass. why use "or", why not use "and"?
vkCmdPipelineBarrier is either outside subpass, then it forms a dependency with commands recorded before and after in the queue.
Or vkCmdPipelineBarrier is inside a subpass, in which case it is called "subpass self-dependency" and its scope is limited only to that subpass (among other restrictions).
I'm trying to synchronize a host stage into my pipeline, where I basically edit some data on the host during the execution of a command buffer on the device. From reading the specification I think I'm doing the correct synchronization, execution/memory dependencies and availability/visibility operations, but it neither works on NV nor AMD hardware. Is this even possible? If so, what am I doing wrong in terms of synchronization?
In summary I'm doing the following:
[D] A device buffer (VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) is copied to a host visible and coherent one (VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT).
[D] The first event is set.
[D] The second event is waited for.
[H] Meanwhile the host waits for the first event.
[H] After it has been set, it increments the numbers in the host visible buffer.
[H] Then it sets the second event.
[D] The device then continues to copy the host visible buffer back to the device local buffer.
What happens?
On NV the first part works, the correct data arrives at the host side, but the altered data never arrives at the device side. On AMD not even the first part works and I already don't get the data on the host.
Command buffer recording:
// ...
VkMemoryBarrier barrier = {};
barrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
barrier.srcAccessMask = ...;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
vkCmdPipelineBarrier(command_buffer, ..., VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);
copyWholeBuffer(command_buffer, host_buffer, device_buffer);
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT;
vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, 1, &barrier, 0, nullptr, 0, nullptr);
vkCmdSetEvent(command_buffer, device_to_host_sync_event, VK_PIPELINE_STAGE_TRANSFER_BIT);
barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
vkCmdWaitEvents(command_buffer, 1, &host_to_device_sync_event, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 1, &barrier, 0, nullptr, 0, nullptr);
copyWholeBuffer(command_buffer, device_buffer, host_buffer);
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.dstAccessMask = ...;
vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TRANSFER_BIT, ..., 0, 1, &barrier, 0, nullptr, 0, nullptr);
// ...
Execution
vkQueueSubmit(queue, 1, &submitInfo, VK_NULL_HANDLE);
while(vkGetEventStatus(device, device_to_host_sync_event) != VK_EVENT_SET)
std::this_thread::sleep_for(std::chrono::microseconds(10));
void* data;
vkMapMemory(device, host_buffer, 0, BUFFER_SIZE, 0, &data);
// read and write parts of the memory
vkUnmapMemory(device, host_buffer);
vkSetEvent(device, host_to_device_sync_event);
vkDeviceWaitIdle(device);
I've uploaded a working example: https://gist.github.com/neXyon/859b2e52bac9a5a56b804d8a9d5fa4a5
The interesting bits start at line 292! Please have a look if it works for you?
I opened an issue on github: https://github.com/KhronosGroup/Vulkan-Docs/issues/755
After a bit of discussion there, the conclusion is that Device to Host synchronization is not possible with an event and a fence has to be used.