vllm.model_executor.layers.fused_moe.router.gate_linear ¶

GateLinear ¶

Bases: ReplicatedLinear

MoE gate linear layer with four-tier GEMM dispatch:

DSV3 specialized kernel (SM90+, batch<=16, supported dims)
gpt-oss specialized kernel (SM90+, batch<=128, supported dims)
cuBLAS bf16×bf16→fp32 (SM90+ + bf16 + fp32 out_dtype)
F.linear via ReplicatedLinear (ultimate fallback)

The out_dtype attribute is mutable and can be set after init (e.g. when the required dtype depends on the expert quantization method which is only known later).

Source code in vllm/model_executor/layers/fused_moe/router/gate_linear.py

@PluggableLayer.register("gate_linear")
class GateLinear(ReplicatedLinear):
    """MoE gate (router) linear layer with four-tier GEMM dispatch.

    ``forward`` walks the tiers in order and uses the first eligible one:

    1. DSV3 specialized kernel (SM90+, batch<=16, supported dims)
    2. gpt-oss specialized kernel (SM90+, batch<=128, supported dims)
    3. cuBLAS bf16×bf16→fp32 (SM90+ + bf16 + fp32 out_dtype)
    4. F.linear via ReplicatedLinear (ultimate fallback)

    ``out_dtype`` may be left as ``None`` at construction and supplied once
    afterwards through ``set_out_dtype`` (e.g. when the required dtype
    depends on the expert quantization method which is only known later).
    ``set_out_dtype`` also refreshes the Tier 3 eligibility flag; assigning
    the attribute directly does not.
    """

    # Dimensions supported by the DSV3 specialized kernel
    DSV3_SUPPORTED_NUM_EXPERTS = [256, 384]
    DSV3_SUPPORTED_HIDDEN_SIZES = [7168]

    # Dimensions supported by the gpt-oss specialized kernel
    GPT_OSS_SUPPORTED_NUM_EXPERTS = [32, 128]
    GPT_OSS_SUPPORTED_HIDDEN_SIZES = [2880]

    def __init__(
        self,
        input_size: int,
        output_size: int,
        bias: bool = False,
        out_dtype: torch.dtype | None = None,
        params_dtype: torch.dtype | None = None,
        force_fp32_compute: bool = False,
        prefix: str = "",
    ):
        """Build the gate and precompute which GEMM tiers it may use.

        Args:
            input_size: Input feature size; compared against the
                ``*_SUPPORTED_HIDDEN_SIZES`` lists.
            output_size: Output feature size; compared against the
                ``*_SUPPORTED_NUM_EXPERTS`` lists.
            bias: Whether the layer has a bias. A bias disables Tier 1 and
                Tier 3; Tier 2 forwards the bias to its kernel.
            out_dtype: Desired dtype of the router logits, or ``None`` to
                decide later via ``set_out_dtype``.
            params_dtype: Weight dtype forwarded to ``ReplicatedLinear``.
            force_fp32_compute: When true and the specialized kernels are
                unavailable, ``params_dtype`` is overridden to fp32.
            prefix: Forwarded to ``ReplicatedLinear``.
        """
        is_hopper_or_blackwell = current_platform.is_device_capability(
            (9, 0)
        ) or current_platform.is_device_capability_family(100)
        # Platform gate shared by every specialized tier.
        on_supported_gpu = current_platform.is_cuda() and is_hopper_or_blackwell
        can_use_specialized_kernels = on_supported_gpu and not bias

        # If fp32 compute is required and no specialized kernel is available,
        # store weights in fp32 so the Tier 4 F.linear fallback computes in
        # fp32 natively.
        # NOTE(review): this check looks only at platform and bias, not at
        # the weight dtype the later tiers also require.
        if force_fp32_compute and not can_use_specialized_kernels:
            params_dtype = torch.float32

        super().__init__(
            input_size,
            output_size,
            bias=bias,
            params_dtype=params_dtype,
            quant_config=None,
            prefix=prefix,
        )
        self.out_dtype = out_dtype

        # DSV3 specialized kernel eligibility (SM90+, exact dims)
        self.allow_specialized_router_gemm = can_use_specialized_kernels
        self.allow_dsv3_router_gemm = (
            self.allow_specialized_router_gemm
            and output_size in self.DSV3_SUPPORTED_NUM_EXPERTS
            and input_size in self.DSV3_SUPPORTED_HIDDEN_SIZES
        )

        # gpt-oss specialized kernel eligibility (SM90+, exact dims).
        # Deliberately not gated on ``not bias``: forward() hands the bias
        # to this kernel.
        self.allow_gpt_oss_router_gemm = (
            self.weight.dtype == torch.bfloat16
            and on_supported_gpu
            and output_size in self.GPT_OSS_SUPPORTED_NUM_EXPERTS
            and input_size in self.GPT_OSS_SUPPORTED_HIDDEN_SIZES
        )

        # cuBLAS bf16→fp32 eligibility
        self.allow_cublas_router_gemm = (
            self.allow_specialized_router_gemm
            and self.weight.dtype == torch.bfloat16
            and self.out_dtype == torch.float32
        )

    def set_out_dtype(self, out_dtype: torch.dtype) -> None:
        """Set output dtype for the router logits after init.

        Useful when the required dtype depends on the expert quantization
        method which is only known after the gate is constructed.

        Args:
            out_dtype: Dtype the router logits should be produced in.

        Raises:
            ValueError: If ``out_dtype`` is already set (is not ``None``).
        """
        if self.out_dtype is not None:
            raise ValueError("out_dtype has already been set")
        self.out_dtype = out_dtype

        # The cuBLAS tier was evaluated in __init__ against the old
        # out_dtype; re-evaluate it now that an fp32 output is requested.
        if (
            not self.allow_cublas_router_gemm
            and self.allow_specialized_router_gemm
            and out_dtype == torch.float32
        ):
            self.allow_cublas_router_gemm = self.weight.dtype == torch.bfloat16

    def forward(
        self, x: torch.Tensor
    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
        """Compute router logits with the first eligible GEMM tier.

        Args:
            x: Input activations; ``x.shape[0]`` is used as the batch size
                for the Tier 1 limit.

        Returns:
            ``(output, output_bias)``. The specialized tiers (1-3) return
            ``None`` for ``output_bias``; Tier 4 returns whatever
            ``ReplicatedLinear.forward`` produced.
        """
        # Tier 1: DSV3 specialized kernel
        if self.allow_dsv3_router_gemm and x.shape[0] <= 16:
            output = ops.dsv3_router_gemm(
                hidden_states=x,
                router_weight=self.weight,
                output_dtype=self.out_dtype,
            )
            return output, None

        # Tier 2: gpt-oss specialized kernel.
        # No batch-size check here; presumably the registered op applies its
        # own token-count limit. ``out_dtype`` is not applied on this path.
        if self.allow_gpt_oss_router_gemm:
            output = torch.ops.vllm.gpt_oss_router_gemm(x, self.weight, self.bias)
            return output, None

        # Tier 3: cuBLAS bf16→fp32
        if self.allow_cublas_router_gemm and x.dtype == torch.bfloat16:
            output = ops.router_gemm_bf16_fp32(x, self.weight)
            return output, None

        # Tier 4: F.linear (ReplicatedLinear)
        # Only touch dtypes when the caller asked for a specific out_dtype:
        # match the input to the weights, then cast the result.
        if self.out_dtype is not None and x.dtype != self.weight.dtype:
            x = x.to(self.weight.dtype)
        output, output_bias = super().forward(x)
        if self.out_dtype is not None and output.dtype != self.out_dtype:
            output = output.to(self.out_dtype)
        return output, output_bias

set_out_dtype ¶

set_out_dtype(out_dtype: dtype) -> None

Set output dtype for the router logits after init.

Useful when the required dtype depends on the expert quantization method which is only known after the gate is constructed.

Source code in vllm/model_executor/layers/fused_moe/router/gate_linear.py

def set_out_dtype(self, out_dtype: torch.dtype) -> None:
    """Record the router-logit output dtype once it becomes known.

    Intended for the case where the dtype depends on the expert
    quantization method, which is only decided after the gate exists.

    Args:
        out_dtype: Dtype the router logits should be produced in.

    Raises:
        ValueError: If ``out_dtype`` is already set (is not ``None``).
    """
    if self.out_dtype is not None:
        raise ValueError("out_dtype has already been set")
    self.out_dtype = out_dtype

    # Nothing to refresh when the cuBLAS tier is already enabled.
    if self.allow_cublas_router_gemm:
        return
    # The cuBLAS tier needs the specialized-kernel gate to be open.
    if not self.allow_specialized_router_gemm:
        return
    # The cuBLAS tier is only considered for an fp32 output.
    if not (out_dtype == torch.float32):
        return

    weights_are_bf16 = self.weight.dtype == torch.bfloat16
    self.allow_cublas_router_gemm = weights_are_bf16

gpt_oss_router_gemm_impl ¶

gpt_oss_router_gemm_impl(
    x: Tensor, weight: Tensor, bias: Tensor
) -> Tensor

Dynamically run min-latency gemm if num_tokens <= 128. This must be wrapped in a custom op because our torch.compile integration does not support runtime dispatching on num_tokens.

Source code in vllm/model_executor/layers/fused_moe/router/gate_linear.py

def gpt_oss_router_gemm_impl(
    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
) -> torch.Tensor:
    """
    Pick the router GEMM implementation from the number of tokens.

    Up to 128 tokens (``x.shape[0]``) the min-latency gpt-oss kernel is
    used; larger inputs go through ``torch.nn.functional.linear``.
    This must be wrapped in a custom op because our torch.compile integration
    does not support runtime dispatching on num_tokens.
    """
    num_tokens = x.shape[0]
    # Large inputs: use the generic linear instead of the min-latency kernel.
    if num_tokens > 128:
        return torch.nn.functional.linear(x, weight, bias)
    return ops.gpt_oss_router_gemm(x, weight, bias)