Using an 8x8 workgroup size and dispatching correspondingly fewer workgroups improves performance 20x

6 years ago · 314fa3e4af
parent 711e678969
commit 314fa3e4af
5 changed files with 69 additions and 123 deletions
--- a/resources/shaders/simple-edge.compute
+++ b/resources/shaders/simple-edge.compute
@ -1,6 +1,6 @@
 #version 450
-layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
 layout(set = 0, binding = 0) buffer wData {
    int buf[];
@ -35,44 +35,39 @@ void main() {
    uint idx = get_idx(0,0);
-//    ivec4 p  = separate(read_buffer.buf[get_idx(0 , 0)]);
+    ivec4 p  = separate(read_buffer.buf[get_idx(0 , 0)]);
-//    ivec4 p0 = separate(read_buffer.buf[get_idx(0 , 1)]);
+    ivec4 p0 = separate(read_buffer.buf[get_idx(0 , 1)]);
-//    ivec4 p1 = separate(read_buffer.buf[get_idx(0 ,-1)]);
+    ivec4 p1 = separate(read_buffer.buf[get_idx(0 ,-1)]);
-//    ivec4 p2 = separate(read_buffer.buf[get_idx(1 , 1)]);
+    ivec4 p2 = separate(read_buffer.buf[get_idx(1 , 1)]);
-//    ivec4 p3 = separate(read_buffer.buf[get_idx(-1,-1)]);
+    ivec4 p3 = separate(read_buffer.buf[get_idx(-1,-1)]);
-//    ivec4 p4 = separate(read_buffer.buf[get_idx(1 , 0)]);
+    ivec4 p4 = separate(read_buffer.buf[get_idx(1 , 0)]);
-//    ivec4 p5 = separate(read_buffer.buf[get_idx(-1, 0)]);
+    ivec4 p5 = separate(read_buffer.buf[get_idx(-1, 0)]);
-//    ivec4 p6 = separate(read_buffer.buf[get_idx(1 ,-1)]);
+    ivec4 p6 = separate(read_buffer.buf[get_idx(1 ,-1)]);
-//    ivec4 p7 = separate(read_buffer.buf[get_idx(-1, 1)]);
+    ivec4 p7 = separate(read_buffer.buf[get_idx(-1, 1)]);
-//
+
-//    ivec3 d0 = abs(p0.xyz - p1.xyz);
+    ivec3 d0 = abs(p0.xyz - p1.xyz);
-//    ivec3 d1 = abs(p2.xyz - p3.xyz);
+    ivec3 d1 = abs(p2.xyz - p3.xyz);
-//    ivec3 d2 = abs(p4.xyz - p5.xyz);
+    ivec3 d2 = abs(p4.xyz - p5.xyz);
-//    ivec3 d3 = abs(p6.xyz - p7.xyz);
+    ivec3 d3 = abs(p6.xyz - p7.xyz);
-//
+
-//    ivec3 m = max(max(max(d0, d1), d2), d3);
+    ivec3 m = max(max(max(d0, d1), d2), d3);
-//
+
-//    if ((m.x + m.y + m.z) > 200){
+    if ((m.x + m.y + m.z) > 200){
-//       p.x = 0;
+       p.x = 0;
-//       p.y = 0;
+       p.y = 0;
-//       p.z = 255;
+       p.z = 255;
-//    }
+    }
-//    else {
+    else {
-//
+        //p.w = 125;
-//        //p.w = 125;
+    }
-//    }
+
-
+    write_buffer.buf[idx] = (write_buffer.buf[idx] & (~0x000000FF) ) | (p.x);
-//    write_buffer.buf[idx] = (write_buffer.buf[idx] & (~0x000000FF) ) | (p.x);
+    write_buffer.buf[idx] = (write_buffer.buf[idx] & (~0x0000FF00) ) | (p.y << 8);
-//    write_buffer.buf[idx] = (write_buffer.buf[idx] & (~0x0000FF00) ) | (p.y << 8);
+    write_buffer.buf[idx] = (write_buffer.buf[idx] & (~0x00FF0000) ) | (p.z << 16);
-//    write_buffer.buf[idx] = (write_buffer.buf[idx] & (~0x00FF0000) ) | (p.z << 16);
+    write_buffer.buf[idx] = (write_buffer.buf[idx] & (~0xFF000000) ) | (p.w << 24);
 //    write_buffer.buf[idx] = (write_buffer.buf[idx] & (~0xFF000000) ) | (p.w << 24);
 }
 // Just gonna keep this around
 //  read_buffer.buf[idx] = (read_buffer.buf[idx] & (~0x000000FF) ) | (p.x);
 //  read_buffer.buf[idx] = (read_buffer.buf[idx] & (~0x0000FF00) ) | (p.y << 8);
--- a/src/canvas.rs
+++ b/src/canvas.rs
@ -329,31 +329,10 @@ impl CanvasState {
        */
        //TODO should probably use cpu accessible buffer instead of recreating immutes each frame
        /*
          CpuAccessibleBuffer::from_iter(
                    device.clone(),
                    BufferUsage::vertex_buffer(),
                    self.colored_drawables.iter().cloned(),
                ).unwrap().0;
        */
        //if self.colored_vertex_buffer.len() == 0 {
        self.colored_vertex_buffer.clear();
        {
-            let g = hprof::enter("Colored Vertex Buffer : From Data");
+            let g = hprof::enter("Colored Vertex Buffer");
            self.colored_vertex_buffer.push(
                ImmutableBuffer::from_data(self.colored_drawables.clone(),
                                           BufferUsage::vertex_buffer(),
                                           self.queue.clone()).unwrap().0
            );
        }
        {
            let g = hprof::enter("Colored Vertex Buffer : From Iter");
            self.colored_vertex_buffer.push(
                ImmutableBuffer::from_iter(
                    self.colored_drawables.iter().cloned(),
@ -363,18 +342,10 @@ impl CanvasState {
            );
        }
        self.colored_vertex_buffer.clear();
        //println!("{:?}", self.colored_vertex_buffer.last().unwrap().size());
        //self.colored_vertex_buffer.pop();
        // }
        //  println!("{} {}", self.textured_vertex_buffer.len() , self.textured_drawables.len());
        //   if self.textured_vertex_buffer.len() != self.textured_drawables.len() {
        self.textured_vertex_buffer.clear();
-        for (k, v) in self.textured_drawables.drain() {
+        {
-            {
+            let g = hprof::enter("Textured Vertex Buffer");
-                let g = hprof::enter("Textured Vertex Buffer : From Iter");
+            for (k, v) in self.textured_drawables.drain() {
                self.textured_vertex_buffer.insert(
                    k.clone(),
                    ImmutableBuffer::from_iter(
@ -384,19 +355,7 @@ impl CanvasState {
                    ).unwrap().0,
                );
            }
 //                {
 //                    let g = hprof::enter("Textured Vertex Buffer : From Data");
 //                    self.textured_vertex_buffer.insert(
 //                        k.clone(),
 //                        ImmutableBuffer::from_data(v.first().unwrap().clone(),
 //                                                   BufferUsage::vertex_buffer(),
 //                                                   self.queue.clone()
 //                        ).unwrap().0
 //                    );
 //                }
            //self.textured_vertex_buffer.pop();
        }
        //    }
    }
    fn get_solid_color_descriptor_set(&self, kernel: Arc<CanvasShader>) -> Box<dyn DescriptorSet + Send + Sync> {
--- a/src/compu_state.rs
+++ b/src/compu_state.rs
@ -75,6 +75,7 @@ impl CompuState {
        handle
    }
    // TODO : THIS IS BROKEN
    pub fn get_kernel_handle(&self, kernel_name: String) -> Option<Arc<CompuKernelHandle>> {
        for i in self.kernels.clone() {
            if i.get_name() == kernel_name {
@ -104,7 +105,7 @@ impl CompuState {
            let size = buffer.get_size();
            command_buffer = command_buffer
-                .dispatch([size.0,size.1,1], p, d, ()).unwrap()
+                .dispatch([size.0/8,size.1/8,1], p, d, ()).unwrap()
        }
        // i = (Buffer, Image, Kernel)
--- a/src/main.rs
+++ b/src/main.rs
@ -165,13 +165,13 @@ fn main() {
        }
        let mut compu_frame = CompuFrame::new();
-       // compu_frame.add(compute_buffer.clone(), compute_kernel.clone());
+        compu_frame.add(compute_buffer.clone(), compute_kernel.clone());
       // compu_frame.add_with_image_swap(compute_buffer.clone(), compute_kernel.clone(), &compu_sprite1);
        let mut canvas = CanvasFrame::new();
-        canvas.draw(&sprite);
+//        canvas.draw(&sprite);
-        //canvas.draw(&sprite2);
+//        canvas.draw(&sprite2);
-        //canvas.draw(&sprite3);
+//        canvas.draw(&sprite3);
        //canvas.draw(&compu_sprite1);
        {
            let g = hprof::enter("Run");
--- a/src/vkprocessor.rs
+++ b/src/vkprocessor.rs
@ -95,7 +95,7 @@ impl<'a> VkProcessor<'a> {
            Swapchain::new(self.device.clone(),
                           surface.clone(),
-                           capabilities.min_image_count + 10, // number of attachment images
+                           capabilities.min_image_count, // number of attachment images
                           format,
                           initial_dimensions,
                           1, // Layers
@ -103,7 +103,7 @@ impl<'a> VkProcessor<'a> {
                           &self.queue,
                           SurfaceTransform::Identity,
                           alpha,
-                           PresentMode::Mailbox, true, None).unwrap()
+                           PresentMode::Immediate, true, None).unwrap()
        };
        self.swapchain = Some(swapchain);
@ -174,12 +174,10 @@ impl<'a> VkProcessor<'a> {
    pub fn run(&mut self,
               surface: &'a Arc<Surface<Window>>,
    //           mut frame_future: Box<dyn GpuFuture>,
               canvas_frame: CanvasFrame,
               compute_frame: CompuFrame,
-    )
+    ) {
-    //      -> Box<dyn GpuFuture> {
+
    {
        {
            let g = hprof::enter("Waiting at queue");
            self.queue.wait();
@ -189,9 +187,6 @@ impl<'a> VkProcessor<'a> {
        let mut framebuffers =
            self.canvas.window_size_dependent_setup(&self.swapchain_images.clone().unwrap().clone());
        // The docs said to call this on each loop.
   //     frame_future.cleanup_finished();
        // Whenever the window resizes we need to recreate everything dependent on the window size.
        // In this example that includes the swapchain, the framebuffers and the dynamic state viewport.
        if self.swapchain_recreate_needed {
@ -201,33 +196,27 @@ impl<'a> VkProcessor<'a> {
            self.swapchain_recreate_needed = false;
        }
        // This function can block if no image is available. The parameter is an optional timeout
        // after which the function call will return an error.
        let (image_num, acquire_future) =
            match vulkano::swapchain::acquire_next_image(
                self.swapchain.clone().unwrap().clone(),
                //Some(Duration::from_millis(3)),
                None,
            ) {
                Ok(r) => r,
                Err(AcquireError::OutOfDate) => {
                    self.swapchain_recreate_needed = true;
                    //return Box::new(sync::now(self.device.clone())) as Box<_>;
                    return;
                }
                Err(err) => panic!("{:?}", err)
            };
        drop(g);
        let g = hprof::enter("Joining the future");
     //   let future = frame_future.join(acquire_future);
        drop(g);
        {
            let g = hprof::enter("Canvas creates GPU buffers");
            // take the canvas frame and create the vertex buffers
-            // TODO: This performs gpu buffer creation. Shouldn't be in hotpath
+            // TODO: This performs gpu buffer creation. Shouldn't be in hotpath??
            self.canvas.draw(canvas_frame);
        }
@ -236,41 +225,43 @@ impl<'a> VkProcessor<'a> {
        let g = hprof::enter("Push compute commands to command buffer");
        // Add the compute commands
-        //  let mut command_buffer = self.compute_state.compute_commands(compute_frame, command_buffer, &self.canvas);
+        let mut command_buffer = self.compute_state.compute_commands(compute_frame, command_buffer, &self.canvas);
        drop(g);
        let g = hprof::enter("Push draw commands to command buffer");
        // Add the draw commands
-        //   let mut command_buffer = self.canvas.draw_commands(command_buffer, framebuffers, image_num);
+        let mut command_buffer = self.canvas.draw_commands(command_buffer, framebuffers, image_num);
        drop(g);
        // And build
        let command_buffer = command_buffer.build().unwrap();
        drop(g);
        // Wait on the previous frame, then execute the command buffer and present the image
        {
            let g = hprof::enter("Mussing with the frame future");
-            //let future = future //frame_future.join(acquire_future)
+            let g = hprof::enter("Joining on the framebuffer");
-            let future = sync::now(self.device.clone())
+            let mut future = sync::now(self.device.clone())
                .join(acquire_future);
            drop(g);
            let g = hprof::enter("Running the kernel and waiting on the future");
            let future = future
                .then_execute(self.queue.clone(), command_buffer).unwrap()
                .then_swapchain_present(self.queue.clone(), self.swapchain.clone().unwrap().clone(), image_num)
                .then_signal_fence_and_flush();
-            future.unwrap().wait(None).unwrap();
+
-
+            match future {
-//            match future {
+                Ok(future) => {
-//                Ok(future) => {
+                    future.wait(None).unwrap();
-//                    (Box::new(future) as Box<_>)
+                }
-//                }
+                Err(FlushError::OutOfDate) => {
-//                Err(FlushError::OutOfDate) => {
+                    self.swapchain_recreate_needed = true;
-//                    self.swapchain_recreate_needed = true;
+                }
-//                    (Box::new(sync::now(self.device.clone())) as Box<_>)
+                Err(e) => {
-//                }
+                    println!("{:?}", e);
-//                Err(e) => {
+                }
-//                    println!("{:?}", e);
+            }
 //                    (Box::new(sync::now(self.device.clone())) as Box<_>)
 //                }
 //            }
        }
    }
 }