+4
-3
lines changedFilter options
+4
-3
lines changed Original file line number Diff line number Diff line change
@@ -364,6 +364,10 @@ fp8_gemm_kernel(__nv_bfloat16* gmem_d, float* scales_b, int* grouped_layout,
364
364
DG_STATIC_ASSERT(static_cast<int>(kSwizzleDMode > 0) + static_cast<int>(BLOCK_N_PADDING > 0) <= 1,
365
365
"Swizzling and padding are not compatible");
366
366
367
+
// Wait last TMA store to be finished
368
+
if (threadIdx.x < BLOCK_N / TMA_D_BLOCK_N)
369
+
cute::tma_store_wait<0>();
370
+
367
371
// Write back to shared memory using STSM and issue TMA stores
368
372
DG_STATIC_ASSERT(WGMMA::kNumAccum % 4 == 0, "Invalid STSM x2 vectorization");
369
373
#pragma unroll
@@ -424,10 +428,7 @@ fp8_gemm_kernel(__nv_bfloat16* gmem_d, float* scales_b, int* grouped_layout,
424
428
cute::SM90_TMA_STORE_2D::copy(&tensor_map_d, smem_ptr,
425
429
n_block_idx * BLOCK_N + in_block_n_offset,
426
430
scheduler.get_global_idx(shape_m, BLOCK_M, m_block_idx));
427
-
428
-
// Wait TMA to be finished
429
431
cute::tma_store_arrive();
430
-
cute::tma_store_wait<0>();
431
432
}
432
433
__syncwarp();
433
434
}
You can’t perform that action at this time.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4