CUDA: experimental native mxfp4 support for blackwell [WIP] #17906
base: master
Changes from all commits
e214110
41e876a
40eb6c7
65f944b
a6dcaa5
b7deb96
928cc55
a1672f6
61c41a0
```diff
@@ -50,6 +50,7 @@
 #define GGML_CUDA_CC_TURING 750
 #define GGML_CUDA_CC_AMPERE 800
 #define GGML_CUDA_CC_ADA_LOVELACE 890
+#define GGML_CUDA_CC_BLACKWELL 1000
```
Contributor:
Distinguish Blackwell DC and smaller dies (maybe marking as `_120`, `_GB20X`?) more clearly here.
```diff
 #define GGML_CUDA_CC_OFFSET_AMD 0x1000000
 #define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
 #define GGML_CUDA_CC_IS_NVIDIA(cc) (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
```
@@ -243,6 +244,10 @@ static const char * cu_get_error_str(CUresult err) { | |
| #define AMPERE_MMA_AVAILABLE | ||
| #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE | ||
|
|
||
| #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL | ||
| # define BLACKWELL_MMA_AVAILABLE | ||
| #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_BLACKWELL | ||
|
|
||
| #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE | ||
| #define CP_ASYNC_AVAILABLE | ||
| #endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE | ||
|
|
@@ -313,6 +318,10 @@ static bool cp_async_available(const int cc) { | |
| return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE; | ||
| } | ||
|
|
||
| static bool blackwell_mma_available(const int cc) { | ||
| return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_BLACKWELL; | ||
| } | ||
|
|
||
| static constexpr __device__ int ggml_cuda_get_physical_warp_size() { | ||
| #if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__)) | ||
| return 64; | ||
|
|
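Together these two additions follow the gating pair already used for Ampere in this file: the `BLACKWELL_MMA_AVAILABLE` macro guards device code at compile time, while `blackwell_mma_available(cc)` lets the host pick a kernel at run time. A hedged sketch of how such a pair is typically consumed (the kernel and dispatch names below are illustrative, not from this diff):

```cuda
// Illustrative only: compile-time guard inside the kernel body,
// runtime guard when the host selects an implementation.
__global__ void mul_mat_mxfp4_native(/* args elided */) {
#ifdef BLACKWELL_MMA_AVAILABLE
    // native mxfp4 mma path, only compiled when targeting Blackwell
#endif
}

static void dispatch_mul_mat(const int cc) {
    if (blackwell_mma_available(cc)) {
        // launch mul_mat_mxfp4_native<<<grid, block>>>(...);
    } else {
        // fall back to the existing dequantize + mma path
    }
}
```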
```diff
@@ -698,6 +707,33 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
 #endif // CUDART_VERSION >= 12050
 }

+__device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
+    // Handle exact zero early
+    if (x == 0.0f) {
+        return 0;
+    }
+
+    const uint8_t sign_bit = x < 0.0f ? 0x8 : 0;
```
Collaborator:
I don't know if the compiler is smart enough to do this optimization, but I meant to transplant the sign bit directly, without using conditional statements at all: cast the float to an unsigned integer, shift 28 bits to the right, and apply a `0x8` mask.
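A minimal sketch of that branchless variant (illustrative, not part of the PR; `fp4_sign_bit` is a hypothetical helper name, `__float_as_uint` is CUDA's float-to-uint bit-cast intrinsic):

```cuda
// The IEEE-754 sign bit (bit 31), shifted right by 28, lands exactly on the
// FP4 sign position (bit 3, value 0x8); the mask drops the exponent bits.
__device__ __forceinline__ uint8_t fp4_sign_bit(float x) {
    return static_cast<uint8_t>((__float_as_uint(x) >> 28) & 0x8);
}
```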
```diff
+    float ax = fabsf(x) * e;
+
+    // Positive LUT
+    static constexpr float pos_lut[8] = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f };
+
+    int best_i = 0;
+    float best_err = fabsf(ax - pos_lut[0]);
+
+#pragma unroll
+    for (int i = 1; i < 8; ++i) {
+        const float err = fabsf(ax - pos_lut[i]);
+        if (err < best_err) {
+            best_err = err;
+            best_i = i;
+        }
+    }
+
+    return static_cast<uint8_t>(best_i | sign_bit);
+}
+
 // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
 // Precompute mp (m' in the paper) and L such that division
 // can be computed using a multiply (high 32b of 64b result)
```
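For intuition about the rounding behaviour, the same LUT search can be mirrored on the host and compiled standalone; a sanity-check sketch (hypothetical harness, not part of the PR):

```cuda
#include <cmath>
#include <cstdint>
#include <cstdio>

// Host-side mirror of ggml_cuda_float_to_fp4_e2m1 for quick sanity checks.
static uint8_t fp4_e2m1_ref(float x, float e) {
    if (x == 0.0f) {
        return 0;
    }
    const uint8_t sign_bit = x < 0.0f ? 0x8 : 0;
    const float ax = std::fabs(x) * e; // scale, then round to the nearest E2M1 value
    static const float pos_lut[8] = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f };
    int best_i = 0;
    float best_err = std::fabs(ax - pos_lut[0]);
    for (int i = 1; i < 8; ++i) {
        const float err = std::fabs(ax - pos_lut[i]);
        if (err < best_err) {
            best_err = err;
            best_i = i;
        }
    }
    return static_cast<uint8_t>(best_i | sign_bit);
}

int main() {
    printf("%#x\n", fp4_e2m1_ref( 2.6f, 1.0f)); // 0x5: 2.6 rounds to 3.0
    printf("%#x\n", fp4_e2m1_ref(-2.6f, 1.0f)); // 0xd: same index with sign bit 0x8 set
    printf("%#x\n", fp4_e2m1_ref( 8.0f, 1.0f)); // 0x7: saturates at 6.0, the largest code
    return 0;
}
```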
On the `GGML_CUDA_CC_BLACKWELL` value: 120 please. Blackwell DC uses a different tensor core design which works very differently; `.block_scale` mma tensor core ops (non-tcgen05) will not compile on sm_100/103/110.
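What the requested split might look like; a sketch only (the `_DC` name and the 1200 value are assumptions following the existing major*100 + minor*10 encoding of these constants, not from this PR):

```cuda
// Hypothetical: keep datacenter Blackwell (tcgen05 tensor cores) distinct
// from consumer sm_120 / GB20X, where the non-tcgen05 .block_scale mma
// ops actually compile.
#define GGML_CUDA_CC_BLACKWELL_DC 1000  // sm_100 / sm_103 / sm_110
#define GGML_CUDA_CC_BLACKWELL    1200  // sm_120 (GB20X consumer dies)
```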