I tried to get some simple MT for voxelization using std::thread, but performance is terrible.
The code does this:
Divide the space into smaller cubes of voxels, dispatch one std::thread for each. These cubes are like leaf nodes of an octree.
After (a configurable multiple) of 8 leafs has been dispatched, join first group of its 8 threads, compress results and build the hierarchy bottom up. (This is done in the main tread but probably not relevant.)
After that, dispatch 8 new voxelization threads and ensure the total number of concurrent threads stays at a constant number. I always create new threads here and do not use a thread pool like i would do. I thought a job system would be overkill for a preprocessing tool.
To isolate the bad threading performance i have replaced voxelization with just clearing the memory, so there is no work done other than copying memory around. Also compression is off and cube per thread is small.
With this setting, i get this release timings on 8-core Ryzen, Win10, and a 512^3 volume of empty space (2MB):
Single threaded: 10ms (expected with some other things going on per frame).
Multi threaded: 3.4 seconds, launching a total 4096 threads over time. (Changing max concurrent threads count between 8-64 has not much effect.)
Strange: Running Diagnostic Tools for CPU usage i only get 340ms with MT, showing thread constructor taking 30% if the time but nothing else:
Function Name Total CPU (%) Self CPU (%) Total CPU (ms) Self CPU (ms) Module
- [External Code] 100.00% 60.24% 12880 7759 43 modules
- __scrt_common_main_seh 33.51% 0.00% 4316 0 GIPreProcess.exe
- WinMain 33.48% 0.00% 4312 0 GIPreProcess.exe
- VisMesh::VisualizeLMUVBuilder 30.75% 0.01% 3960 1 GIPreProcess.exe
- MeshProcessing::VoxelVolume::VoxelizeBox 30.26% 0.09% 3898 12 GIPreProcess.exe
+ std::thread::thread<void (__cdecl MeshProcessing::VoxelVolume::*)(MeshProcessing::VoxelVolume::TaskData & __ptr64) __ptr64,MeshProcessing::VoxelVolume * __ptr64 const,std::reference_wrapper<MeshProcessing::VoxelVolume::TaskData>,void> 26.48% 0.00% 3411 0 GIPreProcess.exe
[External Code] 1.79% 1.79% 230 230 4 modules
+ MeshProcessing::VoxelVolume::VisTaskData 0.84% 0.00% 108 0 GIPreProcess.exe
+ ImGui::TextColored 0.47% 0.00% 60 0 GIPreProcess.exe
So my question is: Can it be that the cost of launching / joining a thread is indeed close to 1 ms?
I knew this has some overhead but that's hard to believe.
Maybe i did something wrong, so posting code as well.
Thanks!
bool VoxelizeBox (
const vec center, const float scale,
const int brickDimensionX, const int brickDimensionY, const int brickDimensionZ,
bool vis)
{
static int taskLevelsX = 2; ImGui::DragInt("voxHierarchyLevels", &taskLevelsX, 0.01f);
static int compressionX = 0; ImGui::DragInt("compression (NONE, SVO, DAG)", &compressionX, 0.01f);
static bool multiThreading = 1; ImGui::Checkbox("multiThreading", &multiThreading);
static int threadFactor = 8; ImGui::DragInt("threadFactor", &threadFactor, 0.01f);
static bool lowRes = 1; ImGui::Checkbox("lowRes", &lowRes);
static bool voxelize = 1; ImGui::Checkbox("[-voxelize-]", &voxelize);
static int visLevel = 6; ImGui::DragInt("visLevel", &visLevel, 0.01f);
voxelizationSettings.globalCenter = center;
//voxelizationSettings.compression = DAG;
voxelizationSettings.compression = CompressionType(compressionX);
int maxDim = max(max(brickDimensionX, brickDimensionY), brickDimensionZ);
int base = WidthToBase(maxDim);
voxelizationSettings.taskLevels = taskLevelsX; // 3 or 4 seems practical
voxelizationSettings.taskBrickBase = ((voxelizationSettings.taskLevels-1)*2);
int bricksPerTask = BaseToWidth(voxelizationSettings.taskBrickBase);
int subdivRes = maxDim / bricksPerTask;
int subdivLevels = WidthToBase(subdivRes);
int subdivVol = BaseToVolume(subdivLevels);
maxDim = bricksPerTask<<subdivLevels;
voxelizationSettings.scale = scale * float(bricksPerTask) / 4;
ImGui::Text("voxelVol %i^3 maxDim %i bricksPerTask %i subdivRes %i \nsubdivLevels %i subdivVol %i voxBrickBase %i",
maxDim*4, maxDim, bricksPerTask, subdivRes, subdivLevels, subdivVol, voxelizationSettings.taskBrickBase);
RenderPoint (center, 1,1,1);
vec boxDim( float(brickDimensionX) * scale/2,
float(brickDimensionY) * scale/2,
float(brickDimensionZ) * scale/2);
RenderAABBox(center-boxDim, center+boxDim, 1,1,1);
vec cubeDim(float(maxDim) * scale/2);
vec cubeCenter = center-boxDim + cubeDim;
RenderAABBox(cubeCenter-cubeDim, cubeCenter+cubeDim, 1,0,0);
vec taskDim(float(bricksPerTask) * scale/2);
if (!voxelize) return false;
struct VoxTask
{
std::array<std::thread, 8> thread;
std::array<TaskData, 8> data;
};
struct MergeTask
{
std::array<TaskData, 8> data;
bool ready;
MergeTask ()
{
ready = false;
}
};
int numVoxTasks = subdivVol/8;
int numVoxWorkers = threadFactor;
std::vector<VoxTask> voxTasks (numVoxWorkers);
std::vector<MergeTask> mergeTask (subdivLevels);
int dbgDispatchedThreadsCount = 0;
int workIterations = numVoxTasks + numVoxWorkers;
for (int workIndex = 0; workIndex < workIterations; workIndex++)
{
// start to merge child trees form main thread after enough threads filled up with voxelization work
if (workIndex >= numVoxWorkers)
{
int mergeTaskIndex = (workIndex + (numVoxWorkers)) % numVoxWorkers;
VoxTask &task = voxTasks[mergeTaskIndex];
int morton = (workIndex - (numVoxWorkers))*8;
uint64_t x,y,z;
MortonToCoords3D (x,y,z, morton, subdivLevels);
uint64_t px=(x>>1), py=(y>>1), pz=(z>>1);
uint64_t parentMorton = Coords3DToMorton(px,py,pz, subdivLevels-1);
for (int i=0; i<8; i++)
{
if (multiThreading && task.thread[i].joinable())
task.thread[i].join();
if (vis && visLevel==task.data[i].brickBase) VisTaskData (task.data[i], lowRes);
}
// merge if 8 voxelization tasks done
ExecuteMergeTask(mergeTask[0].data[parentMorton & 7], task.data, parentMorton);
mergeTask[0].ready = ((parentMorton & 7) == 7);
ImGui::TextColored(ImVec4(0,1,0,1), " merge from voxelization task[%i]: morton %08x -> %08x task.data->mergeTask[0].data[%i]",
mergeTaskIndex, morton, parentMorton, parentMorton & 7);
if (vis && visLevel==mergeTask[0].data[parentMorton & 7].brickBase) VisTaskData (mergeTask[0].data[parentMorton & 7], lowRes);
// merge upwards the hierarchy
for (int level = 0; level<subdivLevels-1; level++)
{
if (mergeTask[level].ready)
{
uint64_t childMorton = mergeTask[level].data[0].morton;
uint64_t x,y,z;
MortonToCoords3D (x,y,z, childMorton, subdivLevels-1-level);
uint64_t px=(x>>1), py=(y>>1), pz=(z>>1);
uint64_t parentMorton = Coords3DToMorton(px,py,pz, subdivLevels-2-level);
ExecuteMergeTask(mergeTask[level+1].data[parentMorton & 7], mergeTask[level].data, parentMorton);
mergeTask[level+1].ready = ((parentMorton & 7) == 7);
mergeTask[level].ready = false;
ImGui::TextColored(ImVec4(0,1,1,1), " merge from merge. Base %i morton %08x -> %08x mergeTask[%i].data->mergeTask[%i].data[%i]",
mergeTask[level+1].data[parentMorton & 7].brickBase, childMorton, parentMorton, level, level+1, parentMorton & 7);
if (vis && visLevel==mergeTask[level+1].data[parentMorton & 7].brickBase) VisTaskData (mergeTask[level+1].data[parentMorton & 7], lowRes);
}
}
}
// dispatch voxelization threads
if (workIndex < numVoxTasks)
{
int voxTaskIndex = workIndex % numVoxWorkers;
VoxTask &task = voxTasks[voxTaskIndex];
int morton = workIndex*8;
ImGui::TextColored(ImVec4(1,0,0,1), "voxelization task[%i]: morton %08x", voxTaskIndex, morton);
for (int i=0; i<8; i++, morton++)
{
uint64_t x,y,z;
MortonToCoords3D (x,y,z, morton, subdivLevels);
vec taskCenter = vec(float(x)+0.5f, float(y)+0.5f, float(z)+0.5f);
taskCenter = center-boxDim + taskCenter * (float(bricksPerTask) * scale);
task.data[i].localCenter = taskCenter;
task.data[i].scale = voxelizationSettings.scale;
task.data[i].morton = morton;
task.data[i].brickBase = voxelizationSettings.taskBrickBase;
if (multiThreading) task.thread[i] = std::thread(
&VoxelVolume::ExecuteVoxelizationTask, this, std::ref(task.data[i]));
else
ExecuteVoxelizationTask (task.data[i]);
dbgDispatchedThreadsCount++;
}
std::this_thread::yield();
}
}
TaskData &result = mergeTask[subdivLevels-1].data[0];
ImGui::Text("total RAW size %i", result.bricks.size() * sizeof(uint64_t));
ImGui::Text("total SVO size %i", result.svo.size() * sizeof(uint64_t));
ImGui::Text("total DAG size %i", result.dag.size() * sizeof(uint32_t) + result.dagBrickDictionary.size() * sizeof(uint64_t));
ImGui::Text("dbgDispatchedThreadsCount %i", dbgDispatchedThreadsCount);
return true;
}