diff --git a/examples/matmul_clamp_f16_f16_f16p/matmul_clamp_f16_f16_f16p.cpp b/examples/matmul_clamp_f16_f16_f16p/matmul_clamp_f16_f16_f16p.cpp
index 0baee907cf93e9c14e5adddad4b65261d455e92b..9428255449ca96c81b3e28681301a93113ae09b1 100644
--- a/examples/matmul_clamp_f16_f16_f16p/matmul_clamp_f16_f16_f16p.cpp
+++ b/examples/matmul_clamp_f16_f16_f16p/matmul_clamp_f16_f16_f16p.cpp
@@ -105,12 +105,17 @@ bool is_output_correct(
 
 int main() {
     int ret = 0;
-
-    // Parameters of the matrix multiplication. Change these values to see how the micro-kernels operate on different
-    // sized matrices
-    const size_t M = 6;   // Rows of LHS and DST matrices
-    const size_t N = 24;  // Columns of RHS and DST matrices, and length of the Bias vector.
-    const size_t K = 4;   // Columns of LHS, rows of RHS matrices
+    // 1x1 Convolution operator in NHWC format.
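+    // Change these shape values to see how the micro-kernels operate on
+    // differently sized operators.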
+    const size_t nhwc_n = 2;
+    const size_t nhwc_h = 2;
+    const size_t nhwc_w = 4;
+    const size_t nhwc_c_in = 4;    // Input channels
+    const size_t nhwc_c_out = 24;  // Output channels
+
+    // Map the operator's NHWC shape to GEMM terminology
+    const size_t M = nhwc_h * nhwc_w * nhwc_n;  // Rows of LHS and DST matrices
+    const size_t N = nhwc_c_out;                // Columns of RHS and DST matrices
+    const size_t K = nhwc_c_in;                 // Columns of LHS, rows of RHS matrices
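+    // With a 1x1 kernel, the convolution is exactly a GEMM: each (n, h, w)
+    // position of the input maps to one row of the LHS and DST, and each
+    // output channel maps to one column of the DST.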
 
     const size_t lhs_size = M * K;
     const size_t rhs_size = N * K;
@@ -186,22 +191,39 @@ int main() {
 
     float16_t* dst = new float16_t[dst_size];
 
-    const auto timer_matmul_start = std::chrono::high_resolution_clock::now();
+    // Framework scheduling params
 
-    ukernel.run_matmul(
-        M, N, K,           // Dimensions
-        lhs,               // LHS
-        lhs_stride,        // LHS stride
-        rhs_packed,        // RHS packed
-        dst,               // DST
-        dst_stride_row,    // DST stride (row)
-        dst_stride_col,    // DST stride (col)
-        -FLT_MAX, FLT_MAX  // Min and max for the clamp operation
-    );
+    // Example alternative values to try: ukernel.get_m_step() * 2, or M to process all rows at once.
+    const size_t m_step = ukernel.get_m_step();  // Scheduling along M
 
-    const auto timer_matmul_end = std::chrono::high_resolution_clock::now();
-    const auto time_matmul =
-        std::chrono::duration_cast<std::chrono::nanoseconds>(timer_matmul_end - timer_matmul_start);
+    // Example alternative value to try: n_step = N to process all columns at once.
+    const size_t n_step = ukernel.get_n_step();  // Scheduling along N
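+    // The loops below tile the DST matrix into m_step x n_step blocks. Each
+    // block is independent of the others, so a framework could, for example,
+    // distribute blocks across threads.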
+
+    for (size_t i_m_step = 0; i_m_step < M; i_m_step += m_step) {
+        for (size_t i_n_step = 0; i_n_step < N; i_n_step += n_step) {
+            // The support functions return offsets in bytes
+            const uint8_t* lhs_ptr =
+                (const uint8_t*)lhs + (ukernel.get_lhs_packed_offset(i_m_step, K * sizeof(uint16_t)));
+            const uint8_t* rhs_ptr = (const uint8_t*)rhs_packed + (ukernel.get_rhs_packed_offset(i_n_step, K));
+            uint8_t* dst_ptr = (uint8_t*)dst + (ukernel.get_dst_offset(i_m_step, i_n_step, N * sizeof(uint16_t)));
+#ifdef KAI_DEBUG
+            printf("Processing a %zux%zu ouptut block starting at (%zu, %zu)\n", m_step, n_step, i_m_step, i_n_step);
+#endif
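+            // The last block along M and N can be smaller than m_step and
+            // n_step, so clamp the dimensions passed to the micro-kernel.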
+            const size_t actual_m = std::min(M - i_m_step, m_step);
+            const size_t actual_n = std::min(N - i_n_step, n_step);
+
+            ukernel.run_matmul(
+                actual_m, actual_n, K,  // Dimensions
+                lhs_ptr,                // LHS
+                lhs_stride,             // LHS stride
+                rhs_ptr,                // RHS packed
+                dst_ptr,                // DST
+                dst_stride_row,         // DST stride (row)
+                dst_stride_col,         // DST stride (col)
+                -FLT_MAX, FLT_MAX       // Min and max for the clamp operation
+            );
+        }
+    }
 
 #ifdef KAI_DEBUG
     print_matrix(M, N, "dst", dst);
@@ -213,7 +235,6 @@ int main() {
     std::cout << "- ukernel: matmul_clamp_f16_f16_f16p16x1biasf16_6x16x8_neon_mla\n";
     if (is_valid) {
         std::cout << "- Status: PASSED\n";
-        std::cout << "- Performance: " << time_matmul.count() << "ns\n";
     } else {
         std::cout << "- Status: FAILED\n";
         ret = 1;