A RetroSearch Logo

Home - News ( United States | United Kingdom | Italy | Germany ) - Football scores

Search Query:

Showing content from https://github.com/deepseek-ai/DeepGEMM/commit/340d9880f4a418d943d34260d20a79f41f4c0526 below:

Overlap TMA store · deepseek-ai/DeepGEMM@340d988 · GitHub

File tree Expand file treeCollapse file tree 1 file changed

+4

-3

lines changed

Filter options

Expand file treeCollapse file tree 1 file changed

+4

-3

lines changed Original file line number Diff line number Diff line change

@@ -364,6 +364,10 @@ fp8_gemm_kernel(__nv_bfloat16* gmem_d, float* scales_b, int* grouped_layout,

364 364

DG_STATIC_ASSERT(static_cast<int>(kSwizzleDMode > 0) + static_cast<int>(BLOCK_N_PADDING > 0) <= 1,

365 365

"Swizzling and padding are not compatible");

366 366 367 +

// Wait last TMA store to be finished

368 +

if (threadIdx.x < BLOCK_N / TMA_D_BLOCK_N)

369 +

cute::tma_store_wait<0>();

370 + 367 371

// Write back to shared memory using STSM and issue TMA stores

368 372

DG_STATIC_ASSERT(WGMMA::kNumAccum % 4 == 0, "Invalid STSM x2 vectorization");

369 373

#pragma unroll

@@ -424,10 +428,7 @@ fp8_gemm_kernel(__nv_bfloat16* gmem_d, float* scales_b, int* grouped_layout,

424 428

cute::SM90_TMA_STORE_2D::copy(&tensor_map_d, smem_ptr,

425 429

n_block_idx * BLOCK_N + in_block_n_offset,

426 430

scheduler.get_global_idx(shape_m, BLOCK_M, m_block_idx));

427 - 428 -

// Wait TMA to be finished

429 431

cute::tma_store_arrive();

430 -

cute::tma_store_wait<0>();

431 432

}

432 433

__syncwarp();

433 434

}

You can’t perform that action at this time.


RetroSearch is an open source project built by @garambo | Open a GitHub Issue

Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo

HTML: 3.2 | Encoding: UTF-8 | Version: 0.7.4