From 9d096729fae0d72eb165ef0660f60c78b8f1a600 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 18 Feb 2025 14:45:15 -0800
Subject: [PATCH 001/207] first commit

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 00000000..a272e6f3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# discord-cluster-manager
+# reference-kernels

From 0f27472dd5eb7a2443865badc2e462d3884deeb8 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Wed, 19 Feb 2025 17:39:29 -0800
Subject: [PATCH 002/207] Main Practice Problems

---
 README.md                                     |  25 +-
 problems/beta.yaml                            |  16 +
 problems/pmpp/conv2d_py/reference.py          |  62 ++++
 problems/pmpp/conv2d_py/task.py               |  18 ++
 problems/pmpp/conv2d_py/task.yml              |  31 ++
 problems/pmpp/eval.py                         | 274 ++++++++++++++++++
 problems/pmpp/grayscale_py/reference.py       |  44 +++
 problems/pmpp/grayscale_py/task.py            |   9 +
 problems/pmpp/grayscale_py/task.yml           |  33 +++
 problems/pmpp/histogram_py/reference.py       |  47 +++
 problems/pmpp/histogram_py/task.py            |  16 +
 problems/pmpp/histogram_py/task.yml           |  31 ++
 problems/pmpp/matmul_py/reference.py          |  26 ++
 problems/pmpp/matmul_py/submission.py         |   5 +
 problems/pmpp/matmul_py/task.py               |  11 +
 problems/pmpp/matmul_py/task.yml              |  36 +++
 problems/pmpp/prefixsum_py/reference.py       |  35 +++
 problems/pmpp/prefixsum_py/task.py            |   9 +
 problems/pmpp/prefixsum_py/task.yml           |  31 ++
 problems/pmpp/sort_py/reference.py            |  35 +++
 problems/pmpp/sort_py/submission.py           |  14 +
 problems/pmpp/sort_py/task.py                 |   9 +
 problems/pmpp/sort_py/task.yml                |  31 ++
 problems/pmpp/utils.py                        |  94 ++++++
 problems/pmpp/vectoradd_py/reference.py       |  38 +++
 .../vectoradd_py/submission_cuda_inline.py    | 122 ++++++++
 .../pmpp/vectoradd_py/submission_triton.py    |  38 +++
 problems/pmpp/vectoradd_py/task.py            |  11 +
 problems/pmpp/vectoradd_py/task.yml           |  31 ++
 problems/pmpp/vectorsum_py/reference.py       |  35 +++
 problems/pmpp/vectorsum_py/submission.py      |  57 ++++
 problems/pmpp/vectorsum_py/task.py            |   9 +
 problems/pmpp/vectorsum_py/task.yml           |  31 ++
 33 files changed, 1312 insertions(+), 2 deletions(-)
 create mode 100644 problems/beta.yaml
 create mode 100644 problems/pmpp/conv2d_py/reference.py
 create mode 100644 problems/pmpp/conv2d_py/task.py
 create mode 100644 problems/pmpp/conv2d_py/task.yml
 create mode 100644 problems/pmpp/eval.py
 create mode 100644 problems/pmpp/grayscale_py/reference.py
 create mode 100644 problems/pmpp/grayscale_py/task.py
 create mode 100644 problems/pmpp/grayscale_py/task.yml
 create mode 100644 problems/pmpp/histogram_py/reference.py
 create mode 100644 problems/pmpp/histogram_py/task.py
 create mode 100644 problems/pmpp/histogram_py/task.yml
 create mode 100644 problems/pmpp/matmul_py/reference.py
 create mode 100644 problems/pmpp/matmul_py/submission.py
 create mode 100644 problems/pmpp/matmul_py/task.py
 create mode 100644 problems/pmpp/matmul_py/task.yml
 create mode 100644 problems/pmpp/prefixsum_py/reference.py
 create mode 100644 problems/pmpp/prefixsum_py/task.py
 create mode 100644 problems/pmpp/prefixsum_py/task.yml
 create mode 100644 problems/pmpp/sort_py/reference.py
 create mode 100644 problems/pmpp/sort_py/submission.py
 create mode 100644 problems/pmpp/sort_py/task.py
 create mode 100644 problems/pmpp/sort_py/task.yml
 create mode 100644 problems/pmpp/utils.py
 create mode 100644 problems/pmpp/vectoradd_py/reference.py
 create mode 100644 problems/pmpp/vectoradd_py/submission_cuda_inline.py
 create mode 100644 problems/pmpp/vectoradd_py/submission_triton.py
 create mode 100644 problems/pmpp/vectoradd_py/task.py
 create mode 100644 problems/pmpp/vectoradd_py/task.yml
 create mode 100644 problems/pmpp/vectorsum_py/reference.py
 create mode 100644 problems/pmpp/vectorsum_py/submission.py
 create mode 100644 problems/pmpp/vectorsum_py/task.py
 create mode 100644 problems/pmpp/vectorsum_py/task.yml

diff --git a/README.md b/README.md
index a272e6f3..19d5463b 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,23 @@
-# discord-cluster-manager
-# reference-kernels
+## Reference Kernels
+
+This repo holds reference kernels for the KernelBot which hosts regular competitions on discord.gg/gpumode
+
+## Competition
+1. PMPP practice problems: Starting on Sunday Feb 21, 2025
+2. LLM competition: Coming soon!
+
+## Making a submission
+
+Please take a look at `vectoradd_py` to see multiple examples of expected submissions ranging from PyTorch code to Triton to inline CUDA.
+
+
+## Contributing New Problems
+
+To add a new problem, create a new folder in the `problems/glory` directory where you need to add the following files:
+- `reference.py` - This is the PyTorch reference implementation of the problem.
+- `task.yml` - This is the problem specification that will be used to generate test cases for different shapes.
+- `task.py` - Specifies the schema of the inputs and outputs for the problem.
+
+
+
+
diff --git a/problems/beta.yaml b/problems/beta.yaml
new file mode 100644
index 00000000..67cad93c
--- /dev/null
+++ b/problems/beta.yaml
@@ -0,0 +1,16 @@
+name: PMPP Practice Problems
+# when does this end (individual problems might close earlier)
+deadline: ""
+# A description for this particular competition
+description: ""
+# the list of problems
+problems:
+  - directory: pmpp/conv2d_py
+    name: conv2d
+    deadline: "2025-12-31"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+    # Scoring rule/weights etc
\ No newline at end of file
diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
new file mode 100644
index 00000000..0eb48e6f
--- /dev/null
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -0,0 +1,62 @@
+from utils import verbose_allclose
+import torch
+import torch.nn.functional as F
+from task import input_t, output_t, KernelSpec
+
+def ref_kernel(data: input_t, spec: KernelSpec) -> output_t:
+    """
+    Reference implementation of 2D convolution using PyTorch.
+    Args:
+        data: Tuple of (input tensor, kernel tensor)
+        spec: Convolution specifications (stride, padding)
+    Returns:
+        Output tensor after convolution
+    """
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor, 
+        kernel,
+        stride=spec.stride,
+        padding=spec.padding
+    )
+
+def generate_input(size: int, kernel_size: int, channels: int, batch: int, seed: int) -> input_t:
+    """
+    Generates random input and kernel tensors.
+    Returns:
+        Tuple of (input tensor, kernel tensor)
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    
+    # Generate input tensor: [batch, in_channels, height, width]
+    input_tensor = torch.randn(
+        batch, channels, size, size,
+        device='cuda', 
+        dtype=torch.float32, 
+        generator=gen
+    ).contiguous()
+    
+    # Generate kernel tensor: [out_channels, in_channels, kernel_height, kernel_width]
+    # Here we use same number of output channels as input channels for simplicity
+    kernel = torch.randn(
+        channels, channels, kernel_size, kernel_size,
+        device='cuda',
+        dtype=torch.float32,
+        generator=gen
+    ).contiguous()
+    
+    return (input_tensor, kernel)
+
+def check_implementation(
+    data: input_t,
+    spec: KernelSpec,
+    output: output_t,
+) -> str:
+    expected = ref_kernel(data, spec)
+    reasons = verbose_allclose(output, expected, rtol=1e-3, atol=1e-3)
+    
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    
+    return '' 
\ No newline at end of file
diff --git a/problems/pmpp/conv2d_py/task.py b/problems/pmpp/conv2d_py/task.py
new file mode 100644
index 00000000..6cce0e6e
--- /dev/null
+++ b/problems/pmpp/conv2d_py/task.py
@@ -0,0 +1,18 @@
+from typing import TypedDict, TypeVar, Tuple
+import torch
+from dataclasses import dataclass
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+@dataclass
+class KernelSpec:
+    stride: int
+    padding: int
+
+class TestSpec(TypedDict):
+    size: int
+    kernel_size: int
+    channels: int
+    batch: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp/conv2d_py/task.yml b/problems/pmpp/conv2d_py/task.yml
new file mode 100644
index 00000000..5b0955ed
--- /dev/null
+++ b/problems/pmpp/conv2d_py/task.yml
@@ -0,0 +1,31 @@
+# name: conv2d-py
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a 2D convolution kernel that matches the reference implementation.
+  The kernel should perform 2D convolution with the given specifications (stride and padding).
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 32, "kernel_size": 3, "channels": 16, "batch": 1, "seed": 4242}
+  - {"size": 32, "kernel_size": 5, "channels": 16, "batch": 2, "seed": 5236}
+  - {"size": 64, "kernel_size": 3, "channels": 32, "batch": 1, "seed": 1001}
+  - {"size": 64, "kernel_size": 5, "channels": 32, "batch": 2, "seed": 5531}
+  - {"size": 128, "kernel_size": 3, "channels": 64, "batch": 1, "seed": 9173}
+
+benchmarks:
+  - {"size": 128, "kernel_size": 3, "channels": 64, "batch": 4, "seed": 54352}
+  - {"size": 128, "kernel_size": 5, "channels": 64, "batch": 4, "seed": 93246}
+  - {"size": 256, "kernel_size": 3, "channels": 128, "batch": 2, "seed": 6256}
+  - {"size": 256, "kernel_size": 5, "channels": 128, "batch": 2, "seed": 8841}
+  - {"size": 512, "kernel_size": 3, "channels": 256, "batch": 1, "seed": 6252} 
\ No newline at end of file
diff --git a/problems/pmpp/eval.py b/problems/pmpp/eval.py
new file mode 100644
index 00000000..abdaee81
--- /dev/null
+++ b/problems/pmpp/eval.py
@@ -0,0 +1,274 @@
+import dataclasses
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any
+
+import torch.cuda
+
+from utils import set_seed
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from submission import custom_kernel
+from reference import check_implementation, generate_input
+
+WARMUP_RUNS = 10
+TIMED_RUNS = 100
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+    
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+    
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def get_test_cases(file_name: str) -> list[TestCase]:
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        exit(113)
+
+    tests = []
+    lines = content.splitlines()
+    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
+    for line in lines:
+        parts = line.split(";")
+        case = {}
+        for part in parts:
+            matched = re.match(match, part)
+            if not re.fullmatch(match, part):
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                exit(113)
+            key = matched[1]
+            val = matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    return tests
+
+
+def warm_up(test: TestCase):
+    data = generate_input(**test.args)
+    start = time.perf_counter()
+    while time.perf_counter() - start < 0.2:
+        custom_kernel(data)
+        torch.cuda.synchronize()
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a list of durations.
+
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    variance = sum(map(lambda x: (x - avg)**2, durations))
+    std = math.sqrt(variance / (runs - 1))
+    err = std / math.sqrt(runs)
+
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best),
+                 worst=float(worst))
+
+
+def run_testing(logger: PopcornOutput, tests: list[TestCase]):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+        submission_output = custom_kernel(data)
+        torch.cuda.synchronize()
+        error = check_implementation(data, submission_output)
+        if error:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", error)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    #  first, one obligatory correctness check
+    output = custom_kernel(data)
+    error = check_implementation(data, output)
+    if error:
+        return error
+
+    # now, do multiple timing runs (re-checking correctness only if `recheck` is set);
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until the accumulated runtime exceeds `max_time_ns`,
+    # or the relative error of the mean is below 1%.
+
+    for i in range(max_repeats):
+        if recheck:
+            data = generate_input(**test.args)
+        torch.cuda.synchronize()
+        start = time.perf_counter_ns()
+        output = custom_kernel(data)
+        torch.cuda.synchronize()
+        end = time.perf_counter_ns()
+
+        if recheck:
+            error = check_implementation(data, output)
+            if error:
+                return error
+
+        del output
+        durations.append(end-start)
+
+        if i > 1:
+            stats = calculate_stats(durations)
+            if stats.err / stats.mean < 0.01 or stats.mean *  stats.runs > max_time_ns:
+                break
+
+    return calculate_stats(durations)
+
+
+def run_benchmarking(logger: PopcornOutput, tests: list[TestCase]):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    warm_up(tests[0])
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = benchmark(test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def main():
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    tests = get_test_cases(sys.argv[2])
+
+    with PopcornOutput(int(fd)) as logger:
+        seed = os.getenv("POPCORN_SEED")
+        seed = int(seed) if seed else 42
+        set_seed(seed)
+
+        if mode == "test":
+            return run_testing(logger, tests)
+
+        if mode == "benchmark":
+            return run_benchmarking(logger, tests)
+        
+        if mode == "leaderboard":
+            warm_up(tests[0])
+            result = benchmark(tests[-1], True, 100, 30e9)
+            if isinstance(result, Stats):
+                logger.log("benchmark-count", 1)
+                logger.log(f"benchmark.0.spec", tests[-1].spec)
+                logger.log(f"benchmark.0.runs", result.runs)
+                logger.log(f"benchmark.0.mean", result.mean)
+                logger.log(f"benchmark.0.std", result.std)
+                logger.log(f"benchmark.0.err", result.err)
+                logger.log("check", "pass")
+            else:
+                logger.log("test-count", 1)
+                logger.log("test.0.status", "fail")
+                logger.log("test.0.error", str(result)) #TODO: Make sure result implements __str__?
+        
+        else:
+            # TODO: Implement script and profile mode
+            return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/problems/pmpp/grayscale_py/reference.py b/problems/pmpp/grayscale_py/reference.py
new file mode 100644
index 00000000..264e733e
--- /dev/null
+++ b/problems/pmpp/grayscale_py/reference.py
@@ -0,0 +1,44 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of RGB to grayscale conversion using PyTorch.
+    Uses the standard coefficients: Y = 0.2989 R + 0.5870 G + 0.1140 B
+    
+    Args:
+        data: RGB tensor of shape (H, W, 3) with values in [0, 1]
+    Returns:
+        Grayscale tensor of shape (H, W) with values in [0, 1]
+    """
+    # Standard RGB to Grayscale coefficients
+    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
+                         device=data.device, 
+                         dtype=data.dtype)
+    return torch.sum(data * weights, dim=-1)
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random RGB image tensor of specified size.
+    Returns:
+        Tensor of shape (size, size, 3) with values in [0, 1]
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    return torch.rand(size, size, 3, 
+                     device='cuda', 
+                     dtype=torch.float32, 
+                     generator=gen).contiguous()
+
+def check_implementation(
+    data: input_t,
+    output: output_t,
+) -> str:
+    expected = ref_kernel(data)
+    reasons = verbose_allclose(output, expected, rtol=1e-4, atol=1e-4)
+    
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    
+    return '' 
\ No newline at end of file
diff --git a/problems/pmpp/grayscale_py/task.py b/problems/pmpp/grayscale_py/task.py
new file mode 100644
index 00000000..4a717fcc
--- /dev/null
+++ b/problems/pmpp/grayscale_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)  # Input will be (H, W, 3) RGB tensor
+output_t = TypeVar("output_t", bound=torch.Tensor)  # Output will be (H, W) grayscale tensor
+
+class TestSpec(TypedDict):
+    size: int  # Size of the square image (H=W)
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
new file mode 100644
index 00000000..b14a81b5
--- /dev/null
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -0,0 +1,33 @@
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement an RGB to grayscale conversion kernel that matches the reference implementation.
+  The kernel should convert RGB images to grayscale using the standard coefficients:
+  Y = 0.2989 R + 0.5870 G + 0.1140 B
+  
+  Input: RGB tensor of shape (H, W, 3) with values in [0, 1]
+  Output: Grayscale tensor of shape (H, W) with values in [0, 1]
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 127, "seed": 4242}
+  - {"size": 128, "seed": 5236}
+  - {"size": 129, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  - {"size": 1024, "seed": 54352}
+  - {"size": 2048, "seed": 93246}
+  - {"size": 4096, "seed": 6256}
+  - {"size": 8192, "seed": 8841}
+  - {"size": 16384, "seed": 6252} 
\ No newline at end of file
diff --git a/problems/pmpp/histogram_py/reference.py b/problems/pmpp/histogram_py/reference.py
new file mode 100644
index 00000000..8fb766b8
--- /dev/null
+++ b/problems/pmpp/histogram_py/reference.py
@@ -0,0 +1,47 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t, HistogramSpec
+
+def ref_kernel(data: input_t, spec: HistogramSpec) -> output_t:
+    """
+    Reference implementation of histogram using PyTorch.
+    Args:
+        data: Input tensor to compute histogram on
+        spec: Histogram specifications (num_bins, min_val, max_val)
+    Returns:
+        Tensor containing bin counts
+    """
+    # Clip values to range
+    clipped = torch.clamp(data, spec.min_val, spec.max_val)
+    
+    # Scale to bin indices
+    bin_width = (spec.max_val - spec.min_val) / spec.num_bins
+    indices = ((clipped - spec.min_val) / bin_width).long()
+    indices = torch.clamp(indices, 0, spec.num_bins - 1)
+    
+    # Count values in each bin
+    return torch.bincount(indices, minlength=spec.num_bins).to(torch.float32)
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensor with standard-normal values (not confined to [0, 1]).
+    Returns:
+        Tensor to compute histogram on
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    # Generate values with normal distribution for interesting histograms
+    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+
+def check_implementation(
+    data: input_t,
+    spec: HistogramSpec,
+    output: output_t,
+) -> str:
+    expected = ref_kernel(data, spec)
+    reasons = verbose_allclose(output, expected)
+    
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    
+    return '' 
\ No newline at end of file
diff --git a/problems/pmpp/histogram_py/task.py b/problems/pmpp/histogram_py/task.py
new file mode 100644
index 00000000..e9d7fadf
--- /dev/null
+++ b/problems/pmpp/histogram_py/task.py
@@ -0,0 +1,16 @@
+from typing import TypedDict, TypeVar
+import torch
+from dataclasses import dataclass
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+@dataclass
+class HistogramSpec:
+    num_bins: int
+    min_val: float
+    max_val: float
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
new file mode 100644
index 00000000..a1bfeb31
--- /dev/null
+++ b/problems/pmpp/histogram_py/task.yml
@@ -0,0 +1,31 @@
+# name: histogram-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a histogram kernel using CUDA inline function that matches the reference implementation.
+  The kernel should count the number of elements falling into each bin across the specified range.
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 8192, "seed": 54352}
+  - {"size": 16384, "seed": 93246}
+  - {"size": 32768, "seed": 6256}
+  - {"size": 65536, "seed": 8841}
+  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
diff --git a/problems/pmpp/matmul_py/reference.py b/problems/pmpp/matmul_py/reference.py
new file mode 100644
index 00000000..76da5c6a
--- /dev/null
+++ b/problems/pmpp/matmul_py/reference.py
@@ -0,0 +1,26 @@
+import torch
+from task import input_t, output_t
+from utils import verbose_allclose
+
+def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    a = torch.empty(m, k, device='cuda', dtype=torch.float16)
+    a.uniform_(0, 1, generator=gen)
+    b = torch.empty(k, n, device='cuda', dtype=torch.float16)
+    b.uniform_(0, 1, generator=gen)
+    return (a, b)
+
+def ref_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
+
+def check_implementation(data: input_t, output: output_t) -> str:
+    expected = ref_kernel(data)
+    reasons = verbose_allclose(output, expected)
+    if len(reasons) > 0:
+        # TODO better processing of reasons
+        return "mismatch found! custom implementation doesn't match reference.: " + reasons[0]
+
+    return ''
+
diff --git a/problems/pmpp/matmul_py/submission.py b/problems/pmpp/matmul_py/submission.py
new file mode 100644
index 00000000..97d17433
--- /dev/null
+++ b/problems/pmpp/matmul_py/submission.py
@@ -0,0 +1,5 @@
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
diff --git a/problems/pmpp/matmul_py/task.py b/problems/pmpp/matmul_py/task.py
new file mode 100644
index 00000000..1c72c782
--- /dev/null
+++ b/problems/pmpp/matmul_py/task.py
@@ -0,0 +1,11 @@
+import torch
+from typing import TypeVar, TypedDict
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    m: int
+    n: int
+    k: int
+    seed: int
diff --git a/problems/pmpp/matmul_py/task.yml b/problems/pmpp/matmul_py/task.yml
new file mode 100644
index 00000000..9e19eb4f
--- /dev/null
+++ b/problems/pmpp/matmul_py/task.yml
@@ -0,0 +1,36 @@
+# name: matmul-py
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a custom matmul function that matches the reference implementation.
+  The function should handle a tuple of input tensors and apply matmul
+
+config:
+  main: "eval.py"
+
+tests:
+  - {"m": 64, "n": 64, "k": 64, "seed": 53124}
+  - {"m": 128, "n": 128, "k": 128, "seed": 3321}
+  - {"m": 256, "n": 256, "k": 256, "seed": 1200}
+
+  - {"m": 32, "n": 512, "k": 32, "seed": 32523}
+  - {"m": 64, "n": 1024, "k": 64, "seed": 4327}
+
+benchmarks:
+  - {"m": 128, "n": 128, "k": 128, "seed": 43214}
+  - {"m": 256, "n": 256, "k": 256, "seed": 423011}
+  - {"m": 512, "n": 512, "k": 512, "seed": 123456}
+  - {"m": 1024, "n": 1024, "k": 1024, "seed": 1029}
+  - {"m": 2048, "n": 2048, "k": 2048, "seed": 75342}
+
+  - {"m": 1024, "n": 1536, "k": 1024, "seed": 321}
+  - {"m": 2048, "n": 3072, "k": 2048, "seed": 32412}
+  - {"m": 4096, "n": 5120, "k": 4096, "seed": 123456}
diff --git a/problems/pmpp/prefixsum_py/reference.py b/problems/pmpp/prefixsum_py/reference.py
new file mode 100644
index 00000000..bce90273
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/reference.py
@@ -0,0 +1,35 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of inclusive prefix sum using PyTorch.
+    Args:
+        data: Input tensor to compute prefix sum on
+    Returns:
+        Tensor containing the inclusive prefix sum
+    """
+    return torch.cumsum(data, dim=0)
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensor.
+    Returns:
+        Tensor to compute prefix sum on
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+
+def check_implementation(
+    data: input_t,
+    output: output_t,
+) -> str:
+    expected = ref_kernel(data)
+    reasons = verbose_allclose(output, expected, rtol=1e-5, atol=1e-5)
+    
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    
+    return '' 
\ No newline at end of file
diff --git a/problems/pmpp/prefixsum_py/task.py b/problems/pmpp/prefixsum_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp/prefixsum_py/task.yml b/problems/pmpp/prefixsum_py/task.yml
new file mode 100644
index 00000000..36cfab42
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/task.yml
@@ -0,0 +1,31 @@
+# name: prefixsum-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement an inclusive prefix sum (scan) kernel using CUDA inline function that matches the reference implementation.
+  The kernel should compute the cumulative sum of all elements up to each position.
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 8192, "seed": 54352}
+  - {"size": 16384, "seed": 93246}
+  - {"size": 32768, "seed": 6256}
+  - {"size": 65536, "seed": 8841}
+  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
diff --git a/problems/pmpp/sort_py/reference.py b/problems/pmpp/sort_py/reference.py
new file mode 100644
index 00000000..c8d05f77
--- /dev/null
+++ b/problems/pmpp/sort_py/reference.py
@@ -0,0 +1,35 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of sort using PyTorch.
+    Args:
+        data: Input tensor to be sorted
+    Returns:
+        Sorted tensor
+    """
+    return torch.sort(data)[0]
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensor.
+    Returns:
+        Tensor to be sorted
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+
+def check_implementation(
+    data: input_t,
+    output: output_t,
+) -> str:
+    expected = ref_kernel(data)
+    reasons = verbose_allclose(output, expected)
+    
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    
+    return '' 
\ No newline at end of file
diff --git a/problems/pmpp/sort_py/submission.py b/problems/pmpp/sort_py/submission.py
new file mode 100644
index 00000000..5a4915c9
--- /dev/null
+++ b/problems/pmpp/sort_py/submission.py
@@ -0,0 +1,14 @@
+import torch
+from task import input_t, output_t
+
+def _custom_kernel(data: input_t) -> output_t:
+    """
+    Implements sort using PyTorch.
+    Args:
+        data: Input tensor to be sorted
+    Returns:
+        Sorted tensor
+    """
+    return torch.sort(data)[0]
+
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
\ No newline at end of file
diff --git a/problems/pmpp/sort_py/task.py b/problems/pmpp/sort_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp/sort_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp/sort_py/task.yml b/problems/pmpp/sort_py/task.yml
new file mode 100644
index 00000000..17c99107
--- /dev/null
+++ b/problems/pmpp/sort_py/task.yml
@@ -0,0 +1,31 @@
+# name: mergesort-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a sort kernel that matches the reference implementation.
+  The kernel should sort the input array in ascending order using the merge sort algorithm.
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 8192, "seed": 54352}
+  - {"size": 16384, "seed": 93246}
+  - {"size": 32768, "seed": 6256}
+  - {"size": 65536, "seed": 8841}
+  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
diff --git a/problems/pmpp/utils.py b/problems/pmpp/utils.py
new file mode 100644
index 00000000..6b715d51
--- /dev/null
+++ b/problems/pmpp/utils.py
@@ -0,0 +1,94 @@
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+def verbose_allclose(
+        tensor1: torch.Tensor,
+        tensor2: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Compare two tensors element-wise within a tolerance and return detailed information about any mismatches.
+
+    Parameters:
+    tensor1 (torch.Tensor): First tensor to compare.
+    tensor2 (torch.Tensor): Second tensor to compare.
+    rtol (float): Relative tolerance.
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+    list[str]: Descriptions of the mismatches found (["SIZE MISMATCH"] if the shapes differ); empty if the tensors are all close within the given tolerance.
+    """
+    # Check if the shapes of the tensors match
+    if tensor1.shape != tensor2.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(tensor1 - tensor2)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(tensor2)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(tensor1), torch.isnan(tensor2))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(tensor1), torch.isposinf(tensor2))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(tensor1), torch.isneginf(tensor2))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.sum().item()
+
+    # Check if all elements are close
+    all_close = num_mismatched == 0
+
+    # Return detailed information if there are mismatches
+    if not all_close and num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}",
+                            f"Mismatched elements: {mismatched_indices}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {tensor1[i]} {tensor2[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
\ No newline at end of file
diff --git a/problems/pmpp/vectoradd_py/reference.py b/problems/pmpp/vectoradd_py/reference.py
new file mode 100644
index 00000000..06677423
--- /dev/null
+++ b/problems/pmpp/vectoradd_py/reference.py
@@ -0,0 +1,38 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of vector addition using PyTorch.
+    Args:
+        data: Tuple of tensors [A, B] to be added.
+    Returns:
+        Tensor containing element-wise sums.
+    """
+    A, B = data
+    return A + B
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensors of specified shapes.
+    Returns:
+        Tuple of tensors [A, B] to be added.
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    A = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
+    B = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
+    return (A, B)
+
+def check_implementation(
+    data: input_t,
+    output: output_t,
+) -> str:
+    expected = ref_kernel(data)
+    reasons = verbose_allclose(output, expected)
+    # An empty list means no mismatches; otherwise only the first reason is reported.
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    
+    return ''
diff --git a/problems/pmpp/vectoradd_py/submission_cuda_inline.py b/problems/pmpp/vectoradd_py/submission_cuda_inline.py
new file mode 100644
index 00000000..abdd10f3
--- /dev/null
+++ b/problems/pmpp/vectoradd_py/submission_cuda_inline.py
@@ -0,0 +1,122 @@
+import torch
+from torch.utils.cpp_extension import load_inline
+from typing import List
+from task import input_t, output_t
+
+add_cuda_source = """
+template <typename scalar_t>
+__global__ void add_kernel(const scalar_t* __restrict__ A, 
+                           const scalar_t* __restrict__ B, 
+                           scalar_t* __restrict__ C, 
+                           int N) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < N) {
+        C[idx] = A[idx] + B[idx];
+    }
+}
+
+torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B) {
+    TORCH_CHECK(A.device().is_cuda(), "Tensor A must be a CUDA tensor");
+    TORCH_CHECK(B.device().is_cuda(), "Tensor B must be a CUDA tensor");
+    TORCH_CHECK(A.sizes() == B.sizes(), "Input tensors must have the same size");
+    
+    int N = A.numel();  
+    auto C = torch::empty_like(A); 
+
+    const int threads = 256; 
+    const int blocks = (N + threads - 1) / threads;  
+    
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(A.scalar_type(), "add_kernel", ([&] {
+        add_kernel<scalar_t><<<blocks, threads>>>(
+            A.data_ptr<scalar_t>(),
+            B.data_ptr<scalar_t>(),
+            C.data_ptr<scalar_t>(),
+            N
+        );
+    }));
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+
+    return C;
+}
+"""
+
+add_cpp_source = """
+#include <torch/extension.h>
+
+torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B);
+"""
+
+
+
+add_module = load_inline(
+    name='add_cuda',
+    cpp_sources=add_cpp_source,
+    cuda_sources=add_cuda_source,
+    functions=['add_cuda'],
+    verbose=True,
+)
+
+def add(A, B):
+    if not A.is_cuda or not B.is_cuda:
+        raise RuntimeError("Both tensors must be on GPU")
+    return add_module.add_cuda(A, B)
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Custom implementation of vector addition using CUDA inline function.
+    Args:
+        data: Tuple of tensors (A, B) to be added.
+    Returns:
+        Tensor containing element-wise sums.
+    """
+    A, B = data
+
+    assert A.is_cuda and B.is_cuda, "Input tensors must be on GPU"
+    assert A.shape == B.shape, "Input tensors must have the same shape"
+    assert A.dtype == torch.float16 and B.dtype == torch.float16, "Input tensors must be float16"
+    
+    M, N = A.shape
+    C = torch.empty_like(A)
+    
+    n_threads = 256
+    n_blocks = (M * N + n_threads - 1) // n_threads
+    
+    cuda_source = """
+    extern "C" __global__ void add_kernel(
+        const half* __restrict__ A,
+        const half* __restrict__ B,
+        half* __restrict__ C,
+        const int n_elements
+    ) {
+        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+        if (idx < n_elements) {
+            C[idx] = __hadd(A[idx], B[idx]);
+        }
+    }
+    """
+    
+    module = torch.utils.cpp_extension.load_inline(
+        name=f"add_kernel_{M}_{N}",
+        cpp_sources="",
+        cuda_sources=cuda_source,
+        functions=["add_kernel"],
+        with_cuda=True,
+        extra_cuda_cflags=["-arch=sm_70"],  # Adjust based on your GPU architecture
+    )
+    
+    module.add_kernel(
+        cuda_stream=torch.cuda.current_stream(),
+        args=[
+            A.reshape(-1), B.reshape(-1), C.reshape(-1),
+            M * N,
+        ],
+        blocks=n_blocks,
+        threads=n_threads,
+    )
+    
+    return C
diff --git a/problems/pmpp/vectoradd_py/submission_triton.py b/problems/pmpp/vectoradd_py/submission_triton.py
new file mode 100644
index 00000000..75cda2c9
--- /dev/null
+++ b/problems/pmpp/vectoradd_py/submission_triton.py
@@ -0,0 +1,38 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+@triton.jit
+def add_kernel(
+    A_ptr, B_ptr, C_ptr, M, N,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    col_idx = tl.program_id(1) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    
+    mask_row = row_idx < M
+    mask_col = col_idx < N
+
+    A = tl.load(A_ptr + row_idx[:, None] * N + col_idx[None, :], mask=mask_row[:, None] & mask_col[None, :], other=0.0)
+    B = tl.load(B_ptr + row_idx[:, None] * N + col_idx[None, :], mask=mask_row[:, None] & mask_col[None, :], other=0.0)
+
+    C = A + B
+    tl.store(C_ptr + row_idx[:, None] * N + col_idx[None, :], C, mask=mask_row[:, None] & mask_col[None, :])
+
+def custom_kernel(data: input_t) -> output_t:
+    A, B = data
+    M, N = A.shape
+
+    C = torch.empty_like(A)
+
+    BLOCK_SIZE = 32
+    grid = (triton.cdiv(M, BLOCK_SIZE), triton.cdiv(N, BLOCK_SIZE))
+
+    add_kernel[grid](
+        A, B, C, M, N,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+
+    return C
diff --git a/problems/pmpp/vectoradd_py/task.py b/problems/pmpp/vectoradd_py/task.py
new file mode 100644
index 00000000..0596f28f
--- /dev/null
+++ b/problems/pmpp/vectoradd_py/task.py
@@ -0,0 +1,11 @@
+from typing import TypedDict, TypeVar
+import torch
+
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int
diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
new file mode 100644
index 00000000..7e63a421
--- /dev/null
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -0,0 +1,31 @@
+# name: vectoradd-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a vector addition kernel using CUDA inline function that matches the reference implementation.
+  The kernel should add pairs of tensors element-wise.
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 127, "seed": 4242}
+  - {"size": 128, "seed": 5236}
+  - {"size": 129, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  - {"size": 1024, "seed": 54352}
+  - {"size": 2048, "seed": 93246}
+  - {"size": 4096, "seed": 6256}
+  - {"size": 8192, "seed": 8841}
+  - {"size": 16384, "seed": 6252}
diff --git a/problems/pmpp/vectorsum_py/reference.py b/problems/pmpp/vectorsum_py/reference.py
new file mode 100644
index 00000000..2b662f81
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/reference.py
@@ -0,0 +1,35 @@
+from utils import verbose_allclose
+import torch
+from task import input_t, output_t
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of vector sum reduction using PyTorch.
+    Args:
+        data: Input tensor to be reduced
+    Returns:
+        Tensor containing the sum of all elements
+    """
+    return data.sum()
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensor of specified shape.
+    Returns:
+        Tensor to be reduced
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+
+def check_implementation(
+    data: input_t,
+    output: output_t,
+) -> str:
+    expected = ref_kernel(data)
+    reasons = verbose_allclose(output, expected)
+    # An empty list means no mismatches; otherwise only the first reason is reported.
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
+    
+    return '' 
\ No newline at end of file
diff --git a/problems/pmpp/vectorsum_py/submission.py b/problems/pmpp/vectorsum_py/submission.py
new file mode 100644
index 00000000..8ac3ac13
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/submission.py
@@ -0,0 +1,57 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+@triton.jit
+def sum_kernel(
+    x_ptr,
+    output_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Parallel reduction kernel that sums elements in chunks.
+    Each thread block reduces BLOCK_SIZE elements.
+    """
+    pid = tl.program_id(0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    
+    # Load data
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
+    
+    # Compute local reduction
+    block_sum = tl.sum(x, axis=0)
+    
+    # Store the partial sum
+    tl.atomic_add(output_ptr, block_sum)
+
+def _custom_kernel(data: input_t) -> output_t:
+    """
+    Performs parallel reduction to compute sum of all elements.
+    Args:
+        data: Input tensor to be reduced
+    Returns:
+        Tensor containing the sum of all elements
+    """
+    n_elements = data.numel()
+    output = torch.zeros(1, device=data.device, dtype=data.dtype)
+    
+    # Configure kernel
+    BLOCK_SIZE = 1024
+    grid = (triton.cdiv(n_elements, BLOCK_SIZE),)
+    
+    # Launch kernel
+    sum_kernel[grid](
+        data,
+        output,
+        n_elements,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    
+    return output[0]
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead") 
\ No newline at end of file
diff --git a/problems/pmpp/vectorsum_py/task.py b/problems/pmpp/vectorsum_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
new file mode 100644
index 00000000..1c8b6018
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -0,0 +1,31 @@
+# name: vectorsum-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a vector sum reduction kernel using CUDA inline function that matches the reference implementation.
+  The kernel should compute the sum of all elements in the input tensor.
+
+config:
+  main: "eval.py" 
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 8192, "seed": 54352}
+  - {"size": 16384, "seed": 93246}
+  - {"size": 32768, "seed": 6256}
+  - {"size": 65536, "seed": 8841}
+  - {"size": 131072, "seed": 6252} 
\ No newline at end of file

From da9f2377e550ec3f9372a238b8d0aff7a87f84fe Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 21 Feb 2025 10:52:31 -0800
Subject: [PATCH 003/207] update beta.yaml

---
 problems/beta.yaml | 59 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/problems/beta.yaml b/problems/beta.yaml
index 67cad93c..fed11c76 100644
--- a/problems/beta.yaml
+++ b/problems/beta.yaml
@@ -7,10 +7,65 @@ description: ""
 problems:
   - directory: pmpp/conv2d_py
     name: conv2d
-    deadline: "2025-12-31"
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/grayscale_py
+    name: grayscale
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/histogram_py
+    name: histogram
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/matmul_py
+    name: matmul
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/prefixsum_py
+    name: prefixsum
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/sort_py
+    name: sort
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/vectoradd_py
+    name: vectoradd
+    deadline: "2025-04-30"
+    gpus:
+      - H100
+      - A100
+      - T4
+      - L4
+  - directory: pmpp/vectorsum_py
+    name: vectorsum
+    deadline: "2025-04-30"
     gpus:
       - H100
       - A100
       - T4
       - L4
-    # Scoring rule/weights etc
\ No newline at end of file

From 555a1c4c57f762dc91dcb5260b56e1311b25d059 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Fri, 21 Feb 2025 20:24:20 +0000
Subject: [PATCH 004/207] Fix: eval/utils paths in task

---
 problems/pmpp/grayscale_py/task.yml | 4 ++--
 problems/pmpp/histogram_py/task.yml | 4 ++--
 problems/pmpp/matmul_py/task.yml    | 4 ++--
 problems/pmpp/prefixsum_py/task.yml | 4 ++--
 problems/pmpp/sort_py/task.yml      | 4 ++--
 problems/pmpp/vectoradd_py/task.yml | 4 ++--
 problems/pmpp/vectorsum_py/task.yml | 4 ++--
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index b14a81b5..9ba74756 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -1,9 +1,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
index a1bfeb31..9d30c0c3 100644
--- a/problems/pmpp/histogram_py/task.yml
+++ b/problems/pmpp/histogram_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/matmul_py/task.yml b/problems/pmpp/matmul_py/task.yml
index 9e19eb4f..6f14aa5b 100644
--- a/problems/pmpp/matmul_py/task.yml
+++ b/problems/pmpp/matmul_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/prefixsum_py/task.yml b/problems/pmpp/prefixsum_py/task.yml
index 36cfab42..ff13e391 100644
--- a/problems/pmpp/prefixsum_py/task.yml
+++ b/problems/pmpp/prefixsum_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/sort_py/task.yml b/problems/pmpp/sort_py/task.yml
index 17c99107..9fcb8fe1 100644
--- a/problems/pmpp/sort_py/task.yml
+++ b/problems/pmpp/sort_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
index 7e63a421..9a0cf5e3 100644
--- a/problems/pmpp/vectoradd_py/task.yml
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index 1c8b6018..9c9ad53e 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -3,9 +3,9 @@
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 

From 92fa5a25d108e47a8d39d2cae295071474d7e32f Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 09:50:59 -0800
Subject: [PATCH 005/207] Updates to Grayscale

---
 problems/pmpp/grayscale_py/submission.py |  8 ++++++++
 problems/pmpp/grayscale_py/task.yml      | 10 +++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)
 create mode 100644 problems/pmpp/grayscale_py/submission.py

diff --git a/problems/pmpp/grayscale_py/submission.py b/problems/pmpp/grayscale_py/submission.py
new file mode 100644
index 00000000..de0c1494
--- /dev/null
+++ b/problems/pmpp/grayscale_py/submission.py
@@ -0,0 +1,8 @@
+from task import input_t, output_t
+import torch
+
+def custom_kernel(data: input_t) -> output_t:
+    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
+                         device=data.device, 
+                         dtype=data.dtype)
+    return torch.sum(data * weights, dim=-1)
diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index 9ba74756..11f19ff8 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -9,7 +9,7 @@ lang: "py"
 
 description: |
   Implement an RGB to grayscale conversion kernel that matches the reference implementation.
-  The kernel should convert RGB images to grayscale using the standard coefficients:
+  The kernel should convert square RGB images to grayscale using the standard coefficients:
   Y = 0.2989 R + 0.5870 G + 0.1140 B
   
   Input: RGB tensor of shape (H, W, 3) with values in [0, 1]
@@ -26,8 +26,8 @@ tests:
   - {"size": 512, "seed": 9173}
 
 benchmarks:
-  - {"size": 1024, "seed": 54352}
-  - {"size": 2048, "seed": 93246}
-  - {"size": 4096, "seed": 6256}
-  - {"size": 8192, "seed": 8841}
+  - {"size": 8192, "seed": 54352}
+  - {"size": 10240, "seed": 93246}
+  - {"size": 12288, "seed": 6256}
+  - {"size": 14336, "seed": 8841}
   - {"size": 16384, "seed": 6252} 
\ No newline at end of file

From 1f33e7539e4b54e1224e99d860141618ad8aa09e Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 10:01:00 -0800
Subject: [PATCH 006/207] update

---
 problems/pmpp/grayscale_py/task.yml | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index 11f19ff8..8041e647 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -19,15 +19,14 @@ config:
   main: "eval.py" 
 
 tests:
-  - {"size": 127, "seed": 4242}
-  - {"size": 128, "seed": 5236}
-  - {"size": 129, "seed": 1001}
+
+  - {"size": 128, "seed": 1001}
   - {"size": 256, "seed": 5531}
   - {"size": 512, "seed": 9173}
 
 benchmarks:
-  - {"size": 8192, "seed": 54352}
-  - {"size": 10240, "seed": 93246}
-  - {"size": 12288, "seed": 6256}
-  - {"size": 14336, "seed": 8841}
-  - {"size": 16384, "seed": 6252} 
\ No newline at end of file
+  - {"size": 32768, "seed": 54352}
+  - {"size": 40960, "seed": 93246}
+  - {"size": 49152, "seed": 6256}
+  - {"size": 57344, "seed": 8841}
+  - {"size": 65536, "seed": 6252} 
\ No newline at end of file

From 91e2b0c249f7ce69d562491eaa84989e01462f1c Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 22 Feb 2025 19:16:09 +0100
Subject: [PATCH 007/207] Fix: vectorsum desc

---
 problems/pmpp/vectorsum_py/task.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index 9c9ad53e..0cb3df9e 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -10,8 +10,10 @@ files:
 lang: "py"
 
 description: |
-  Implement a vector sum reduction kernel using CUDA inline function that matches the reference implementation.
-  The kernel should compute the sum of all elements in the input tensor.
+  Implement a vector sum reduction kernel. This kernel computes the sum of all elements in the input tensor.
+
+  Input: A tensor of shape `(N,)` with values from a normal distribution with mean 0 and variance 1.
+  Output: A scalar value equal to the sum of all elements in the input tensor.
 
 config:
   main: "eval.py" 
@@ -28,4 +30,4 @@ benchmarks:
   - {"size": 16384, "seed": 93246}
   - {"size": 32768, "seed": 6256}
   - {"size": 65536, "seed": 8841}
-  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
+  - {"size": 131072, "seed": 6252} 

From 978708ea777f471138818b10e9e4c3e424cc2031 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 22 Feb 2025 19:20:49 +0100
Subject: [PATCH 008/207] Vectorsum: more sizes

---
 problems/pmpp/vectorsum_py/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index 0cb3df9e..ea881af7 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -26,8 +26,8 @@ tests:
   - {"size": 4096, "seed": 9173}
 
 benchmarks:
-  - {"size": 8192, "seed": 54352}
   - {"size": 16384, "seed": 93246}
   - {"size": 32768, "seed": 6256}
   - {"size": 65536, "seed": 8841}
   - {"size": 131072, "seed": 6252} 
+  - {"size": 262144, "seed": 82135}

From 3fc6f493fd98607174140e211356c1654aa2bfd1 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 10:52:29 -0800
Subject: [PATCH 009/207] grayscale

---
 problems/pmpp/grayscale_py/task.yml | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index 8041e647..62c44e0c 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -9,7 +9,7 @@ lang: "py"
 
 description: |
   Implement an RGB to grayscale conversion kernel that matches the reference implementation.
-  The kernel should convert square RGB images to grayscale using the standard coefficients:
+  The kernel should convert square RGB images with even sizes to grayscale using the standard coefficients:
   Y = 0.2989 R + 0.5870 G + 0.1140 B
   
   Input: RGB tensor of shape (H, W, 3) with values in [0, 1]
@@ -25,8 +25,10 @@ tests:
   - {"size": 512, "seed": 9173}
 
 benchmarks:
-  - {"size": 32768, "seed": 54352}
-  - {"size": 40960, "seed": 93246}
-  - {"size": 49152, "seed": 6256}
-  - {"size": 57344, "seed": 8841}
-  - {"size": 65536, "seed": 6252} 
\ No newline at end of file
+  
+  - {"size": 512, "seed": 54352}
+  - {"size": 1024, "seed": 93246}
+  - {"size": 2048, "seed": 6256}
+  - {"size": 4096, "seed": 8841}
+  - {"size": 8192, "seed": 6252}
+  - {"size": 16384, "seed": 54352}
\ No newline at end of file

From 4c6bc94296ef0b953fd58c82b44603db1b31847d Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 22 Feb 2025 20:01:42 +0100
Subject: [PATCH 010/207] Vectorsum

---
 problems/pmpp/vectorsum_py/submission.py | 19 ++++++++++++-------
 problems/pmpp/vectorsum_py/task.yml      | 11 ++++++-----
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/problems/pmpp/vectorsum_py/submission.py b/problems/pmpp/vectorsum_py/submission.py
index 8ac3ac13..5c672d98 100644
--- a/problems/pmpp/vectorsum_py/submission.py
+++ b/problems/pmpp/vectorsum_py/submission.py
@@ -1,8 +1,11 @@
+#!POPCORN leaderboard vectorsum_py
+
 import torch
 import triton
 import triton.language as tl
 from task import input_t, output_t
 
+
 @triton.jit
 def sum_kernel(
     x_ptr,
@@ -18,16 +21,17 @@ def sum_kernel(
     block_start = pid * BLOCK_SIZE
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
     mask = offsets < n_elements
-    
+
     # Load data
     x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
-    
+
     # Compute local reduction
     block_sum = tl.sum(x, axis=0)
-    
+
     # Store the partial sum
     tl.atomic_add(output_ptr, block_sum)
 
+
 def _custom_kernel(data: input_t) -> output_t:
     """
     Performs parallel reduction to compute sum of all elements.
@@ -38,11 +42,11 @@ def _custom_kernel(data: input_t) -> output_t:
     """
     n_elements = data.numel()
     output = torch.zeros(1, device=data.device, dtype=data.dtype)
-    
+
     # Configure kernel
     BLOCK_SIZE = 1024
     grid = (triton.cdiv(n_elements, BLOCK_SIZE),)
-    
+
     # Launch kernel
     sum_kernel[grid](
         data,
@@ -50,8 +54,9 @@ def _custom_kernel(data: input_t) -> output_t:
         n_elements,
         BLOCK_SIZE=BLOCK_SIZE,
     )
-    
+
     return output[0]
 
+
 # Compile the kernel for better performance
-custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead") 
\ No newline at end of file
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index ea881af7..e23ce33d 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -26,8 +26,9 @@ tests:
   - {"size": 4096, "seed": 9173}
 
 benchmarks:
-  - {"size": 16384, "seed": 93246}
-  - {"size": 32768, "seed": 6256}
-  - {"size": 65536, "seed": 8841}
-  - {"size": 131072, "seed": 6252} 
-  - {"size": 262144, "seed": 82135}
+  - {"size": 1638400, "seed": 93246}
+  - {"size": 3276800, "seed": 6256}
+  - {"size": 6553600, "seed": 8841}
+  - {"size": 13107200, "seed": 6252}
+  - {"size": 26214400, "seed": 82135}
+  - {"size": 52428800, "seed": 12345}

From 037dd989cb8c1309ba9e576b9cd6351dd6f4a3e1 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 11:24:19 -0800
Subject: [PATCH 011/207] Update sorting problem

---
 problems/pmpp/sort_py/reference.py | 29 ++++++++++++++++++++++++-----
 problems/pmpp/sort_py/task.yml     | 16 ++++++++++------
 2 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/problems/pmpp/sort_py/reference.py b/problems/pmpp/sort_py/reference.py
index c8d05f77..a1dbf2c3 100644
--- a/problems/pmpp/sort_py/reference.py
+++ b/problems/pmpp/sort_py/reference.py
@@ -12,15 +12,34 @@ def ref_kernel(data: input_t) -> output_t:
     """
     return torch.sort(data)[0]
 
-def generate_input(size: int, seed: int) -> input_t:
+def generate_input(size: int, seed: int) -> torch.Tensor:
     """
-    Generates random input tensor.
+    Generates random input tensor where elements are drawn from different distributions.
+    
+    Args:
+        size: Total size of the final 1D tensor
+        seed: Base seed for random generation
+    
     Returns:
-        Tensor to be sorted
+        1D tensor of size `size` containing flattened values from different distributions
     """
+    # Calculate dimensions for a roughly square 2D matrix
+    rows = int(size ** 0.5)  # Square root for roughly square shape
+    cols = (size + rows - 1) // rows  # Ceiling division to ensure total size >= requested size
+    
     gen = torch.Generator(device='cuda')
-    gen.manual_seed(seed)
-    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    result = torch.empty((rows, cols), device='cuda', dtype=torch.float32)
+    
+    # Different seed for each row!
+    for i in range(rows):
+        row_seed = seed + i
+        gen.manual_seed(row_seed)
+        
+        # Generate values for this row with mean=row_seed
+        result[i, :] = torch.randn(cols, device='cuda', dtype=torch.float32, generator=gen) + row_seed
+    
+    # Flatten and trim to exact size requested
+    return result.flatten()[:size].contiguous()
 
 def check_implementation(
     data: input_t,
diff --git a/problems/pmpp/sort_py/task.yml b/problems/pmpp/sort_py/task.yml
index 9fcb8fe1..659191b2 100644
--- a/problems/pmpp/sort_py/task.yml
+++ b/problems/pmpp/sort_py/task.yml
@@ -11,7 +11,11 @@ lang: "py"
 
 description: |
   Implement a sort kernel that matches the reference implementation.
-  The kernel should sort the input array in ascending order using the merge sort algorithm.
+  The kernel should sort the input array in ascending order using a sort algorithm of your choice.
+  
+  Input arrays are generated by filling a roughly square matrix of random floating-point numbers row by row:
+  each row is drawn from a normal distribution whose mean differs per row (derived from the seed), and the matrix is then flattened into a 1D array.
+  
 
 config:
   main: "eval.py" 
@@ -24,8 +28,8 @@ tests:
   - {"size": 4096, "seed": 9173}
 
 benchmarks:
-  - {"size": 8192, "seed": 54352}
-  - {"size": 16384, "seed": 93246}
-  - {"size": 32768, "seed": 6256}
-  - {"size": 65536, "seed": 8841}
-  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
+  - {"size": 100000, "seed": 54352}
+  - {"size": 500000, "seed": 93246}
+  - {"size": 1000000, "seed": 6256}
+  - {"size": 10000000, "seed": 8841}
+  - {"size": 100000000, "seed": 6252} 
\ No newline at end of file

From 24bda301b3f4d671df79788dc16e73db61430061 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 22 Feb 2025 20:29:28 +0100
Subject: [PATCH 012/207] Vectoradd

---
 problems/pmpp/vectoradd_py/submission_triton.py |  2 ++
 problems/pmpp/vectoradd_py/task.yml             | 11 ++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/problems/pmpp/vectoradd_py/submission_triton.py b/problems/pmpp/vectoradd_py/submission_triton.py
index 75cda2c9..70a0f85e 100644
--- a/problems/pmpp/vectoradd_py/submission_triton.py
+++ b/problems/pmpp/vectoradd_py/submission_triton.py
@@ -1,3 +1,5 @@
+#!POPCORN leaderboard vectoradd_py
+
 import torch
 import triton
 import triton.language as tl
diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
index 9a0cf5e3..1ba0544f 100644
--- a/problems/pmpp/vectoradd_py/task.yml
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -24,8 +24,9 @@ tests:
   - {"size": 512, "seed": 9173}
 
 benchmarks:
-  - {"size": 1024, "seed": 54352}
-  - {"size": 2048, "seed": 93246}
-  - {"size": 4096, "seed": 6256}
-  - {"size": 8192, "seed": 8841}
-  - {"size": 16384, "seed": 6252}
+  - {"size": 1024, "seed": 31232}
+  - {"size": 2048, "seed": 4052}
+  - {"size": 4096, "seed": 2146}
+  - {"size": 8192, "seed": 3129}
+  - {"size": 16384, "seed": 54352}
+  - {"size": 24576, "seed": 93246}

From 54ee8fb76e5b43d90b064d7a27000e37fa1e571c Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 11:53:47 -0800
Subject: [PATCH 013/207] Minor fixes to matmul

---
 problems/pmpp/histogram_py/submission.py | 0
 problems/pmpp/matmul_py/task.yml         | 1 +
 2 files changed, 1 insertion(+)
 create mode 100644 problems/pmpp/histogram_py/submission.py

diff --git a/problems/pmpp/histogram_py/submission.py b/problems/pmpp/histogram_py/submission.py
new file mode 100644
index 00000000..e69de29b
diff --git a/problems/pmpp/matmul_py/task.yml b/problems/pmpp/matmul_py/task.yml
index 6f14aa5b..e8cd416e 100644
--- a/problems/pmpp/matmul_py/task.yml
+++ b/problems/pmpp/matmul_py/task.yml
@@ -12,6 +12,7 @@ lang: "py"
 description: |
   Implement a custom matmul function that matches the reference implementation.
   The function should handle a tuple of input tensors and apply matmul
+  All outer and inner dimensions of the input tensors are multiples of 16.
 
 config:
   main: "eval.py"

From 293e64b5a78d727dfdf015437102f8234936f792 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 22 Feb 2025 21:07:48 +0100
Subject: [PATCH 014/207] Vectoradd

---
 problems/pmpp/vectoradd_py/task.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
index 1ba0544f..68f72ea3 100644
--- a/problems/pmpp/vectoradd_py/task.yml
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -10,8 +10,11 @@ files:
 lang: "py"
 
 description: |
-  Implement a vector addition kernel using CUDA inline function that matches the reference implementation.
-  The kernel should add pairs of tensors element-wise.
+  Implement a float16 vector addition kernel.
+
+  Input: tuple(torch.Tensor, torch.Tensor) with tensors of shape (N, N) and type torch.float16. These tensors are from
+  a normal distribution with mean 0 and variance 1.
+  Output: torch.Tensor of shape (N, N) and type torch.float16
 
 config:
   main: "eval.py" 
@@ -29,4 +32,3 @@ benchmarks:
   - {"size": 4096, "seed": 2146}
   - {"size": 8192, "seed": 3129}
   - {"size": 16384, "seed": 54352}
-  - {"size": 24576, "seed": 93246}

From 168d345bcca41ecbed5aa2d0451622591705b86b Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 14:34:22 -0800
Subject: [PATCH 015/207] convolution is bueno

---
 problems/pmpp/conv2d_py/reference.py  | 20 +++++++++--------
 problems/pmpp/conv2d_py/submission.py | 21 ++++++++++++++++++
 problems/pmpp/conv2d_py/task.py       |  9 ++------
 problems/pmpp/conv2d_py/task.yml      | 31 +++++++++++++++++----------
 4 files changed, 54 insertions(+), 27 deletions(-)
 create mode 100644 problems/pmpp/conv2d_py/submission.py

diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
index 0eb48e6f..bacb5694 100644
--- a/problems/pmpp/conv2d_py/reference.py
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -1,14 +1,14 @@
 from utils import verbose_allclose
 import torch
 import torch.nn.functional as F
-from task import input_t, output_t, KernelSpec
+from task import input_t, output_t
 
-def ref_kernel(data: input_t, spec: KernelSpec) -> output_t:
+def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of 2D convolution using PyTorch.
     Args:
         data: Tuple of (input tensor, kernel tensor)
-        spec: Convolution specifications (stride, padding)
+        spec: Convolution specifications
     Returns:
         Output tensor after convolution
     """
@@ -16,11 +16,14 @@ def ref_kernel(data: input_t, spec: KernelSpec) -> output_t:
     return F.conv2d(
         input_tensor, 
         kernel,
-        stride=spec.stride,
-        padding=spec.padding
+
+        # No padding and no striding
+        # TODO: Can revisit this in future problems
+        stride=1,
+        padding=0
     )
 
-def generate_input(size: int, kernel_size: int, channels: int, batch: int, seed: int) -> input_t:
+def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
     """
     Generates random input and kernel tensors.
     Returns:
@@ -40,7 +43,7 @@ def generate_input(size: int, kernel_size: int, channels: int, batch: int, seed:
     # Generate kernel tensor: [out_channels, in_channels, kernel_height, kernel_width]
     # Here we use same number of output channels as input channels for simplicity
     kernel = torch.randn(
-        channels, channels, kernel_size, kernel_size,
+        channels, channels, kernelsize, kernelsize,
         device='cuda',
         dtype=torch.float32,
         generator=gen
@@ -50,10 +53,9 @@ def generate_input(size: int, kernel_size: int, channels: int, batch: int, seed:
 
 def check_implementation(
     data: input_t,
-    spec: KernelSpec,
     output: output_t,
 ) -> str:
-    expected = ref_kernel(data, spec)
+    expected = ref_kernel(data)
     reasons = verbose_allclose(output, expected, rtol=1e-3, atol=1e-3)
     
     if len(reasons) > 0:
diff --git a/problems/pmpp/conv2d_py/submission.py b/problems/pmpp/conv2d_py/submission.py
new file mode 100644
index 00000000..a1b7d16d
--- /dev/null
+++ b/problems/pmpp/conv2d_py/submission.py
@@ -0,0 +1,21 @@
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Implementation of 2D convolution using PyTorch with no padding and no striding.
+    Args:
+        data: Tuple of (input tensor, kernel tensor)
+        spec: Convolution specifications
+    Returns:
+        Output tensor after convolution
+    """
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor, 
+        kernel,
+        stride=1,
+        padding=0
+    )
\ No newline at end of file
diff --git a/problems/pmpp/conv2d_py/task.py b/problems/pmpp/conv2d_py/task.py
index 6cce0e6e..397332ab 100644
--- a/problems/pmpp/conv2d_py/task.py
+++ b/problems/pmpp/conv2d_py/task.py
@@ -1,18 +1,13 @@
 from typing import TypedDict, TypeVar, Tuple
 import torch
-from dataclasses import dataclass
 
 input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
-@dataclass
-class KernelSpec:
-    stride: int
-    padding: int
 
 class TestSpec(TypedDict):
     size: int
-    kernel_size: int
+    kernelsize: int
     channels: int
     batch: int
-    seed: int 
\ No newline at end of file
+    seed: int   
\ No newline at end of file
diff --git a/problems/pmpp/conv2d_py/task.yml b/problems/pmpp/conv2d_py/task.yml
index 5b0955ed..0f8d075a 100644
--- a/problems/pmpp/conv2d_py/task.yml
+++ b/problems/pmpp/conv2d_py/task.yml
@@ -11,21 +11,30 @@ lang: "py"
 
 description: |
   Implement a 2D convolution kernel that matches the reference implementation.
-  The kernel should perform 2D convolution with the given specifications (stride and padding).
+  The kernel should perform 2D convolution with the given specifications.
+  We will benchmark different sizes, kernel sizes, channels and batch sizes, but they will all
+  be even numbers, with the exception of batch size, which can sometimes be 1.
+  We assume no padding and a stride of 1, and instead vary the size of the input and kernel,
+  number of channels, and batch size.
+
+  Input: Tuple of (input_tensor, kernel)
+    - input_tensor: 4D tensor of shape (batch, channels, height, width) with arbitrary values
+    - kernel: 4D tensor of shape (channels, channels, kernelsize, kernelsize) with arbitrary values
+  Output: 4D tensor of shape (batch, channels, height-kernelsize+1, width-kernelsize+1) with convolved values
 
 config:
   main: "eval.py" 
 
 tests:
-  - {"size": 32, "kernel_size": 3, "channels": 16, "batch": 1, "seed": 4242}
-  - {"size": 32, "kernel_size": 5, "channels": 16, "batch": 2, "seed": 5236}
-  - {"size": 64, "kernel_size": 3, "channels": 32, "batch": 1, "seed": 1001}
-  - {"size": 64, "kernel_size": 5, "channels": 32, "batch": 2, "seed": 5531}
-  - {"size": 128, "kernel_size": 3, "channels": 64, "batch": 1, "seed": 9173}
+  - {"size": 32, "kernelsize": 4, "channels": 16, "batch": 1, "seed": 4242}
+  - {"size": 32, "kernelsize": 4, "channels": 16, "batch": 2, "seed": 5236}
+  - {"size": 64, "kernelsize": 4, "channels": 32, "batch": 1, "seed": 1001}
+  - {"size": 64, "kernelsize": 8, "channels": 32, "batch": 2, "seed": 5531}
+  - {"size": 128, "kernelsize": 8, "channels": 64, "batch": 1, "seed": 9173}
 
 benchmarks:
-  - {"size": 128, "kernel_size": 3, "channels": 64, "batch": 4, "seed": 54352}
-  - {"size": 128, "kernel_size": 5, "channels": 64, "batch": 4, "seed": 93246}
-  - {"size": 256, "kernel_size": 3, "channels": 128, "batch": 2, "seed": 6256}
-  - {"size": 256, "kernel_size": 5, "channels": 128, "batch": 2, "seed": 8841}
-  - {"size": 512, "kernel_size": 3, "channels": 256, "batch": 1, "seed": 6252} 
\ No newline at end of file
+  - {"size": 128, "kernelsize": 8, "channels": 64, "batch": 4, "seed": 54352}
+  - {"size": 128, "kernelsize": 16, "channels": 64, "batch": 4, "seed": 93246}
+  - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 6256}
+  - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 8841}
+  - {"size": 512, "kernelsize": 32, "channels": 256, "batch": 1, "seed": 6252} 
\ No newline at end of file

From 7b8fb8cbce1f5f321c77c5bc79dfa6b9ec9a7da8 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 16:36:42 -0800
Subject: [PATCH 016/207] prefixsum

---
 problems/pmpp/prefixsum_py/reference.py  | 18 ++++++++------
 problems/pmpp/prefixsum_py/submission.py | 12 +++++++++
 problems/pmpp/prefixsum_py/task.yml      | 31 +++++++++++++++++++-----
 3 files changed, 48 insertions(+), 13 deletions(-)
 create mode 100644 problems/pmpp/prefixsum_py/submission.py

diff --git a/problems/pmpp/prefixsum_py/reference.py b/problems/pmpp/prefixsum_py/reference.py
index bce90273..3850a9aa 100644
--- a/problems/pmpp/prefixsum_py/reference.py
+++ b/problems/pmpp/prefixsum_py/reference.py
@@ -22,14 +22,18 @@ def generate_input(size: int, seed: int) -> input_t:
     gen.manual_seed(seed)
     return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> str:
+# Floating-point rounding error in a prefix sum accumulates with the input size, so a fixed
+# tolerance is too strict for large inputs; rtol and atol are scaled by the square root of the input size.
+def check_implementation(data: input_t, output: output_t) -> str:
     expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected, rtol=1e-5, atol=1e-5)
     
+    # Then get the size for scaling the tolerance
+    n = data.numel()
+    
+    scale_factor = n ** 0.5  # Square root of input size
+    rtol = 1e-5 * scale_factor
+    atol = 1e-5 * scale_factor
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
     if len(reasons) > 0:
         return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return '' 
\ No newline at end of file
+    return ''
\ No newline at end of file
diff --git a/problems/pmpp/prefixsum_py/submission.py b/problems/pmpp/prefixsum_py/submission.py
new file mode 100644
index 00000000..6ccdf4ad
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/submission.py
@@ -0,0 +1,12 @@
+import torch
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of inclusive prefix sum using PyTorch.
+    Args:
+        data: Input tensor to compute prefix sum on
+    Returns:
+        Tensor containing the inclusive prefix sum
+    """
+    return torch.cumsum(data, dim=0)
\ No newline at end of file
diff --git a/problems/pmpp/prefixsum_py/task.yml b/problems/pmpp/prefixsum_py/task.yml
index ff13e391..8762ee34 100644
--- a/problems/pmpp/prefixsum_py/task.yml
+++ b/problems/pmpp/prefixsum_py/task.yml
@@ -10,8 +10,14 @@ files:
 lang: "py"
 
 description: |
-  Implement an inclusive prefix sum (scan) kernel using CUDA inline function that matches the reference implementation.
+  Implement an inclusive prefix sum (scan) kernel that matches the reference implementation.
   The kernel should compute the cumulative sum of all elements up to each position.
+  Because of numerical instability, the tolerance is scaled by the square root of the input size.
+
+  Input:
+  - `data`: A 1D tensor of size `n`
+  Output:
+  - `output`: A 1D tensor of size `n`
 
 config:
   main: "eval.py" 
@@ -24,8 +30,21 @@ tests:
   - {"size": 4096, "seed": 9173}
 
 benchmarks:
-  - {"size": 8192, "seed": 54352}
-  - {"size": 16384, "seed": 93246}
-  - {"size": 32768, "seed": 6256}
-  - {"size": 65536, "seed": 8841}
-  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
+  # - {"size": 8192, "seed": 54352}
+  # - {"size": 16384, "seed": 93246}
+  # - {"size": 32768, "seed": 6256}
+  # - {"size": 65536, "seed": 8841}
+  # - {"size": 131072, "seed": 6252} 
+  - {"size": 262144, "seed": 12345}
+  - {"size": 524288, "seed": 67890}
+  - {"size": 1048576, "seed": 13579}
+  - {"size": 2097152, "seed": 24680}
+  - {"size": 4194304, "seed": 35791}
+  - {"size": 8388608, "seed": 46802}
+  - {"size": 16777216, "seed": 57913}
+  - {"size": 33554432, "seed": 68024}
+  - {"size": 67108864, "seed": 79135}
+  - {"size": 134217728, "seed": 80246} # fits on T4
+  - {"size": 268435456, "seed": 91357}
+  # - {"size": 536870912, "seed": 102468}
+  # - {"size": 1073741824, "seed": 113579}
\ No newline at end of file

From 32e78bb7940422c331903ccbdc1eb360423acd97 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 16:49:10 -0800
Subject: [PATCH 017/207] remove useless comments

---
 problems/pmpp/conv2d_py/reference.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
index bacb5694..1876e9e2 100644
--- a/problems/pmpp/conv2d_py/reference.py
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -8,7 +8,6 @@ def ref_kernel(data: input_t) -> output_t:
     Reference implementation of 2D convolution using PyTorch.
     Args:
         data: Tuple of (input tensor, kernel tensor)
-        spec: Convolution specifications
     Returns:
         Output tensor after convolution
     """

From 2d6a1d6c43eef9a5aec936a9ce9914fa3f8a555c Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 17:09:34 -0800
Subject: [PATCH 018/207] Make solution more robust to cheese solutions by
 adding scale and offset

---
 problems/pmpp/vectorsum_py/reference.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/problems/pmpp/vectorsum_py/reference.py b/problems/pmpp/vectorsum_py/reference.py
index 2b662f81..c37b79ba 100644
--- a/problems/pmpp/vectorsum_py/reference.py
+++ b/problems/pmpp/vectorsum_py/reference.py
@@ -14,13 +14,32 @@ def ref_kernel(data: input_t) -> output_t:
 
 def generate_input(size: int, seed: int) -> input_t:
     """
-    Generates random input tensor of specified shape.
+    Generates random input tensor of specified shape with random offset and scale.
+    The data is first generated as standard normal, then scaled and offset
+    to prevent trivial solutions.
+    
     Returns:
         Tensor to be reduced
     """
     gen = torch.Generator(device='cuda')
     gen.manual_seed(seed)
-    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    
+    # Generate base random data
+    data = torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    
+    # Generate random offset and scale (using different seeds to avoid correlation)
+    offset_gen = torch.Generator(device='cuda')
+    offset_gen.manual_seed(seed + 1)
+    scale_gen = torch.Generator(device='cuda')
+    scale_gen.manual_seed(seed + 2)
+    
+    # Generate random offset between -100 and 100
+    offset = (torch.rand(1, device='cuda', generator=offset_gen) * 200 - 100).item()
+    # Generate random scale between 0.1 and 10
+    scale = (torch.rand(1, device='cuda', generator=scale_gen) * 9.9 + 0.1).item()
+    
+    # Apply scale and offset
+    return (data * scale + offset).contiguous()
 
 def check_implementation(
     data: input_t,

From 9fcb89f8d21e4866256244b834e59c97a1866596 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 22 Feb 2025 17:59:58 -0800
Subject: [PATCH 019/207] fixed histogram

---
 problems/pmpp/histogram_py/reference.py  | 49 ++++++++++++++++--------
 problems/pmpp/histogram_py/submission.py | 27 +++++++++++++
 problems/pmpp/histogram_py/task.py       | 10 +----
 problems/pmpp/histogram_py/task.yml      | 28 +++++++-------
 4 files changed, 76 insertions(+), 38 deletions(-)

diff --git a/problems/pmpp/histogram_py/reference.py b/problems/pmpp/histogram_py/reference.py
index 8fb766b8..54d69938 100644
--- a/problems/pmpp/histogram_py/reference.py
+++ b/problems/pmpp/histogram_py/reference.py
@@ -1,47 +1,62 @@
 from utils import verbose_allclose
 import torch
-from task import input_t, output_t, HistogramSpec
+from task import input_t, output_t
 
-def ref_kernel(data: input_t, spec: HistogramSpec) -> output_t:
+def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of histogram using PyTorch.
     Args:
-        data: Input tensor to compute histogram on
-        spec: Histogram specifications (num_bins, min_val, max_val)
+        data: tensor of shape (size,)
     Returns:
         Tensor containing bin counts
     """
-    # Clip values to range
-    clipped = torch.clamp(data, spec.min_val, spec.max_val)
+    # Fixed range [0, 100]
+    min_val, max_val = 0, 100
+    
+    # Number of bins is input size / 16
+    num_bins = data.shape[0] // 16
+
+    clipped = torch.clamp(data, min_val, max_val)
     
     # Scale to bin indices
-    bin_width = (spec.max_val - spec.min_val) / spec.num_bins
-    indices = ((clipped - spec.min_val) / bin_width).long()
-    indices = torch.clamp(indices, 0, spec.num_bins - 1)
+    bin_width = (max_val - min_val) / num_bins
+    indices = ((clipped - min_val) / bin_width).long()
+    indices = torch.clamp(indices, 0, num_bins - 1)
     
     # Count values in each bin
-    return torch.bincount(indices, minlength=spec.num_bins).to(torch.float32)
+    return torch.bincount(indices, minlength=num_bins).to(torch.float32)
 
 def generate_input(size: int, seed: int) -> input_t:
     """
-    Generates random input tensor with values roughly in [0, 1].
+    Generates random input tensor for histogram.
+    The number of bins is automatically set to size/16.
+    
+    Args:
+        size: Size of the input tensor (must be multiple of 16)
+        seed: Random seed
     Returns:
-        Tensor to compute histogram on
+        The input tensor with values in [0, 100]
     """
     gen = torch.Generator(device='cuda')
     gen.manual_seed(seed)
-    # Generate values with normal distribution for interesting histograms
-    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    
+    # Generate integer values between 0 and 100
+    data = torch.randint(0, 101, (size,), device='cuda', dtype=torch.int32, generator=gen)
+    
+    # Convert to float since the histogram implementation expects float input
+    return data.float().contiguous()
 
 def check_implementation(
     data: input_t,
-    spec: HistogramSpec,
     output: output_t,
 ) -> str:
-    expected = ref_kernel(data, spec)
+    """
+    Compare custom implementation's output to the reference output.
+    """
+    expected = ref_kernel(data)
     reasons = verbose_allclose(output, expected)
     
     if len(reasons) > 0:
         return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
     
-    return '' 
\ No newline at end of file
+    return ''
diff --git a/problems/pmpp/histogram_py/submission.py b/problems/pmpp/histogram_py/submission.py
index e69de29b..338585f7 100644
--- a/problems/pmpp/histogram_py/submission.py
+++ b/problems/pmpp/histogram_py/submission.py
@@ -0,0 +1,27 @@
+import torch
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of histogram using PyTorch.
+    Args:
+        data: tensor of shape (size,)
+    Returns:
+        Tensor containing bin counts
+    """
+    # Fixed range [0, 100]
+    min_val, max_val = 0, 100
+    
+    # Number of bins is input size / 16
+    num_bins = data.shape[0] // 16
+    
+    # Clip values to range
+    clipped = torch.clamp(data, min_val, max_val)
+    
+    # Scale to bin indices
+    bin_width = (max_val - min_val) / num_bins
+    indices = ((clipped - min_val) / bin_width).long()
+    indices = torch.clamp(indices, 0, num_bins - 1)
+    
+    # Count values in each bin
+    return torch.bincount(indices, minlength=num_bins).to(torch.float32)
\ No newline at end of file
diff --git a/problems/pmpp/histogram_py/task.py b/problems/pmpp/histogram_py/task.py
index e9d7fadf..61fd3378 100644
--- a/problems/pmpp/histogram_py/task.py
+++ b/problems/pmpp/histogram_py/task.py
@@ -1,16 +1,10 @@
 from typing import TypedDict, TypeVar
 import torch
-from dataclasses import dataclass
 
 input_t = TypeVar("input_t", bound=torch.Tensor)
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
-@dataclass
-class HistogramSpec:
-    num_bins: int
-    min_val: float
-    max_val: float
-
 class TestSpec(TypedDict):
     size: int
-    seed: int 
\ No newline at end of file
+    seed: int 
+
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
index 9d30c0c3..667ad8ea 100644
--- a/problems/pmpp/histogram_py/task.yml
+++ b/problems/pmpp/histogram_py/task.yml
@@ -10,22 +10,24 @@ files:
 lang: "py"
 
 description: |
-  Implement a histogram kernel using CUDA inline function that matches the reference implementation.
-  The kernel should count the number of elements falling into each bin across the specified range.
+  Implement a histogram kernel that counts the number of elements falling into each bin across the specified range.
+  The minimum and maximum values of the range are fixed to 0 and 100 respectively.
+  All sizes are multiples of 16 and the number of bins is set to the size of the input tensor divided by 16.
+
+  Input:
+    - data: a tensor of shape (size,)
 
 config:
-  main: "eval.py" 
+  main: "eval.py"
 
 tests:
-  - {"size": 1023, "seed": 4242}
-  - {"size": 1024, "seed": 5236}
-  - {"size": 1025, "seed": 1001}
-  - {"size": 2048, "seed": 5531}
-  - {"size": 4096, "seed": 9173}
+  - {"size": 5120,   "seed": 9991}
+  - {"size": 7840,   "seed": 2105}
+  - {"size": 30080,  "seed": 9999}
+  - {"size": 100000, "seed": 1212}
 
 benchmarks:
-  - {"size": 8192, "seed": 54352}
-  - {"size": 16384, "seed": 93246}
-  - {"size": 32768, "seed": 6256}
-  - {"size": 65536, "seed": 8841}
-  - {"size": 131072, "seed": 6252} 
\ No newline at end of file
+  - {"size": 1310720, "seed": 6252}
+  - {"size": 2621440, "seed": 8841}
+  - {"size": 5242880, "seed": 6252}
+  - {"size": 10485760, "seed": 8841}

From eb928f72e60f98accb3f4d1fa4e0616ba94d52f1 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 15:07:41 +0200
Subject: [PATCH 020/207] improvements to allclose:  * torch.no_grad(), might
 get some memory freed earlier  * rename parameters to reflect the asymmetry
 re relative error  * don't try to stringify list of wrong locations; those
 could be millions in the worst case

---
 problems/pmpp/utils.py | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/problems/pmpp/utils.py b/problems/pmpp/utils.py
index 6b715d51..7ae63670 100644
--- a/problems/pmpp/utils.py
+++ b/problems/pmpp/utils.py
@@ -23,10 +23,12 @@ def get_device(use_cuda: bool = True) -> torch.device:
             print("No compatible GPU found. Falling back to CPU.")
     return torch.device("cpu")
 
+
 # Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
 def verbose_allclose(
-        tensor1: torch.Tensor,
-        tensor2: torch.Tensor,
+        received: torch.Tensor,
+        expected: torch.Tensor,
         rtol=1e-05,
         atol=1e-08,
         max_print=5
@@ -35,9 +37,9 @@ def verbose_allclose(
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
 
     Parameters:
-    tensor1 (torch.Tensor): First tensor to compare.
-    tensor2 (torch.Tensor): Second tensor to compare.
-    rtol (float): Relative tolerance.
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
     atol (float): Absolute tolerance.
     max_print (int): Maximum number of mismatched elements to print.
 
@@ -45,25 +47,25 @@ def verbose_allclose(
     AssertionError: If the tensors are not all close within the given tolerance.
     """
     # Check if the shapes of the tensors match
-    if tensor1.shape != tensor2.shape:
+    if received.shape != expected.shape:
         return ["SIZE MISMATCH"]
 
     # Calculate the difference between the tensors
-    diff = torch.abs(tensor1 - tensor2)
+    diff = torch.abs(received - expected)
 
     # Determine the tolerance
-    tolerance = atol + rtol * torch.abs(tensor2)
+    tolerance = atol + rtol * torch.abs(expected)
 
     # Find tolerance mismatched elements
     tol_mismatched = diff > tolerance
 
     # Find nan mismatched elements
-    nan_mismatched = torch.logical_xor(torch.isnan(tensor1), torch.isnan(tensor2))
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
 
     # Find +inf mismatched elements
-    posinf_mismatched = torch.logical_xor(torch.isposinf(tensor1), torch.isposinf(tensor2))
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
     # Find -inf mismatched elements
-    neginf_mismatched = torch.logical_xor(torch.isneginf(tensor1), torch.isneginf(tensor2))
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
 
     # Find all mismatched elements
     mismatched = torch.logical_or(
@@ -81,12 +83,11 @@ def verbose_allclose(
 
     # Raise AssertionError with detailed information if there are mismatches
     if not all_close and num_mismatched >= 1:
-        mismatch_details = [f"Number of mismatched elements: {num_mismatched}",
-                            f"Mismatched elements: {mismatched_indices}"]
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
 
         for index in mismatched_indices[:max_print]:
             i = tuple(index.tolist())
-            mismatch_details.append(f"ERROR AT {i}: {tensor1[i]} {tensor2[i]}")
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
         return mismatch_details

From 519a8f0bd22457beb5a49bee0fc2d00d9bb193cc Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 15:20:56 +0200
Subject: [PATCH 021/207] simplified reference utility

---
 problems/pmpp/conv2d_py/reference.py    | 17 +++++------------
 problems/pmpp/grayscale_py/reference.py | 17 +++++------------
 problems/pmpp/histogram_py/reference.py | 20 +++++---------------
 problems/pmpp/matmul_py/reference.py    | 13 ++++---------
 problems/pmpp/prefixsum_py/reference.py | 13 ++++++-------
 problems/pmpp/sort_py/reference.py      | 17 +++++------------
 problems/pmpp/utils.py                  | 21 ++++++++++++++++++++-
 problems/pmpp/vectoradd_py/reference.py | 17 +++++------------
 problems/pmpp/vectorsum_py/reference.py | 16 ++++------------
 9 files changed, 59 insertions(+), 92 deletions(-)

diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
index 1876e9e2..9e5e1a74 100644
--- a/problems/pmpp/conv2d_py/reference.py
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -1,8 +1,9 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 import torch.nn.functional as F
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of 2D convolution using PyTorch.
@@ -22,6 +23,7 @@ def ref_kernel(data: input_t) -> output_t:
         padding=0
     )
 
+
 def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
     """
     Generates random input and kernel tensors.
@@ -50,14 +52,5 @@ def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed:
     
     return (input_tensor, kernel)
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> str:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected, rtol=1e-3, atol=1e-3)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return '' 
\ No newline at end of file
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
diff --git a/problems/pmpp/grayscale_py/reference.py b/problems/pmpp/grayscale_py/reference.py
index 264e733e..1ed6d148 100644
--- a/problems/pmpp/grayscale_py/reference.py
+++ b/problems/pmpp/grayscale_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of RGB to grayscale conversion using PyTorch.
@@ -18,6 +19,7 @@ def ref_kernel(data: input_t) -> output_t:
                          dtype=data.dtype)
     return torch.sum(data * weights, dim=-1)
 
+
 def generate_input(size: int, seed: int) -> input_t:
     """
     Generates random RGB image tensor of specified size.
@@ -31,14 +33,5 @@ def generate_input(size: int, seed: int) -> input_t:
                      dtype=torch.float32, 
                      generator=gen).contiguous()
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> str:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected, rtol=1e-4, atol=1e-4)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return '' 
\ No newline at end of file
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-4, atol=1e-4)
diff --git a/problems/pmpp/histogram_py/reference.py b/problems/pmpp/histogram_py/reference.py
index 54d69938..dc7a8792 100644
--- a/problems/pmpp/histogram_py/reference.py
+++ b/problems/pmpp/histogram_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of histogram using PyTorch.
@@ -26,6 +27,7 @@ def ref_kernel(data: input_t) -> output_t:
     # Count values in each bin
     return torch.bincount(indices, minlength=num_bins).to(torch.float32)
 
+
 def generate_input(size: int, seed: int) -> input_t:
     """
     Generates random input tensor for histogram.
@@ -46,17 +48,5 @@ def generate_input(size: int, seed: int) -> input_t:
     # Convert to float since the histogram implementation expects float input
     return data.float().contiguous()
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> str:
-    """
-    Compare custom implementation's output to the reference output.
-    """
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return ''
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/matmul_py/reference.py b/problems/pmpp/matmul_py/reference.py
index 76da5c6a..19ba991f 100644
--- a/problems/pmpp/matmul_py/reference.py
+++ b/problems/pmpp/matmul_py/reference.py
@@ -1,6 +1,7 @@
 import torch
 from task import input_t, output_t
-from utils import verbose_allclose
+from utils import make_match_reference
+
 
 def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
     gen = torch.Generator(device='cuda')
@@ -11,16 +12,10 @@ def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
     b.uniform_(0, 1, generator=gen)
     return (a, b)
 
+
 def ref_kernel(data: input_t) -> output_t:
     a, b = data
     return a @ b
 
-def check_implementation(data: input_t, output: output_t) -> str:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected)
-    if len(reasons) > 0:
-        # TODO better processing of reasons
-        return "mismatch found! custom implementation doesn't match reference.: " + reasons[0]
-
-    return ''
 
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/prefixsum_py/reference.py b/problems/pmpp/prefixsum_py/reference.py
index 3850a9aa..25ed751c 100644
--- a/problems/pmpp/prefixsum_py/reference.py
+++ b/problems/pmpp/prefixsum_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of inclusive prefix sum using PyTorch.
@@ -12,6 +13,7 @@ def ref_kernel(data: input_t) -> output_t:
     """
     return torch.cumsum(data, dim=0)
 
+
 def generate_input(size: int, seed: int) -> input_t:
     """
     Generates random input tensor.
@@ -22,18 +24,15 @@ def generate_input(size: int, seed: int) -> input_t:
     gen.manual_seed(seed)
     return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
 
+
 # This algorithm is very sensitive to the tolerance and the error is magnified by the input size
 # The tolerance is scaled by the square root of the input size
 def check_implementation(data: input_t, output: output_t) -> str:
-    expected = ref_kernel(data)
-    
     # Then get the size for scaling the tolerance
     n = data.numel()
     
     scale_factor = n ** 0.5  # Square root of input size
     rtol = 1e-5 * scale_factor
     atol = 1e-5 * scale_factor
-    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    return ''
\ No newline at end of file
+
+    return match_reference(data, output, reference=ref_kernel, rtol=rtol, atol=atol)
diff --git a/problems/pmpp/sort_py/reference.py b/problems/pmpp/sort_py/reference.py
index a1dbf2c3..fddb452b 100644
--- a/problems/pmpp/sort_py/reference.py
+++ b/problems/pmpp/sort_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of sort using PyTorch.
@@ -12,6 +13,7 @@ def ref_kernel(data: input_t) -> output_t:
     """
     return torch.sort(data)[0]
 
+
 def generate_input(size: int, seed: int) -> torch.Tensor:
     """
     Generates random input tensor where elements are drawn from different distributions.
@@ -41,14 +43,5 @@ def generate_input(size: int, seed: int) -> torch.Tensor:
     # Flatten and trim to exact size requested
     return result.flatten()[:size].contiguous()
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> str:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return '' 
\ No newline at end of file
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/utils.py b/problems/pmpp/utils.py
index 7ae63670..4d8d4ebe 100644
--- a/problems/pmpp/utils.py
+++ b/problems/pmpp/utils.py
@@ -92,4 +92,23 @@ def verbose_allclose(
             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
         return mismatch_details
 
-    return []
\ No newline at end of file
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
diff --git a/problems/pmpp/vectoradd_py/reference.py b/problems/pmpp/vectoradd_py/reference.py
index 06677423..fd0431ac 100644
--- a/problems/pmpp/vectoradd_py/reference.py
+++ b/problems/pmpp/vectoradd_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of vector addition using PyTorch.
@@ -13,6 +14,7 @@ def ref_kernel(data: input_t) -> output_t:
     A, B = data
     return A + B
 
+
 def generate_input(size: int, seed: int) -> input_t:
     """
     Generates random input tensors of specified shapes.
@@ -25,14 +27,5 @@ def generate_input(size: int, seed: int) -> input_t:
     B = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
     return (A, B)
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> bool:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return ''
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/vectorsum_py/reference.py b/problems/pmpp/vectorsum_py/reference.py
index c37b79ba..8a02505a 100644
--- a/problems/pmpp/vectorsum_py/reference.py
+++ b/problems/pmpp/vectorsum_py/reference.py
@@ -1,7 +1,8 @@
-from utils import verbose_allclose
+from utils import make_match_reference
 import torch
 from task import input_t, output_t
 
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of vector sum reduction using PyTorch.
@@ -41,14 +42,5 @@ def generate_input(size: int, seed: int) -> input_t:
     # Apply scale and offset
     return (data * scale + offset).contiguous()
 
-def check_implementation(
-    data: input_t,
-    output: output_t,
-) -> bool:
-    expected = ref_kernel(data)
-    reasons = verbose_allclose(output, expected)
-    
-    if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + reasons[0]
-    
-    return '' 
\ No newline at end of file
+
+check_implementation = make_match_reference(ref_kernel)

From 1657e5752b3c162e564ff259644bedddad1d134c Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 16:59:12 +0200
Subject: [PATCH 022/207] reduce in fp64

---
 problems/pmpp/vectorsum_py/reference.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/problems/pmpp/vectorsum_py/reference.py b/problems/pmpp/vectorsum_py/reference.py
index 8a02505a..8b421f7c 100644
--- a/problems/pmpp/vectorsum_py/reference.py
+++ b/problems/pmpp/vectorsum_py/reference.py
@@ -11,7 +11,9 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the sum of all elements
     """
-    return data.sum()
+    # Let's be on the safe side here, and do the reduction in 64 bit
+    return data.to(torch.float64).sum().to(torch.float32)
+
 
 def generate_input(size: int, seed: int) -> input_t:
     """

From 3479186c0887a695978d7552c018217182d3ee00 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 17:37:26 +0200
Subject: [PATCH 023/207] histogram:  receive inputs directly in uint8  allow
 specifying contention in data distribution

---
 problems/pmpp/histogram_py/reference.py  | 49 ++++++++++++------------
 problems/pmpp/histogram_py/submission.py | 17 +-------
 problems/pmpp/histogram_py/task.py       |  3 +-
 problems/pmpp/histogram_py/task.yml      | 19 +++++----
 problems/pmpp/utils.py                   | 40 ++++++++++++++++---
 5 files changed, 73 insertions(+), 55 deletions(-)

diff --git a/problems/pmpp/histogram_py/reference.py b/problems/pmpp/histogram_py/reference.py
index dc7a8792..18e8b249 100644
--- a/problems/pmpp/histogram_py/reference.py
+++ b/problems/pmpp/histogram_py/reference.py
@@ -1,4 +1,4 @@
-from utils import make_match_reference
+from utils import verbose_allequal
 import torch
 from task import input_t, output_t
 
@@ -11,42 +11,41 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
-    # Fixed range [0, 100]
-    min_val, max_val = 0, 100
-    
-    # Number of bins is input size / 16
-    num_bins = data.shape[0] // 16
-
-    clipped = torch.clamp(data, min_val, max_val)
-    
-    # Scale to bin indices
-    bin_width = (max_val - min_val) / num_bins
-    indices = ((clipped - min_val) / bin_width).long()
-    indices = torch.clamp(indices, 0, num_bins - 1)
-    
     # Count values in each bin
-    return torch.bincount(indices, minlength=num_bins).to(torch.float32)
+    return torch.bincount(data, minlength=256)
 
 
-def generate_input(size: int, seed: int) -> input_t:
+def generate_input(size: int, contention: float, seed: int) -> input_t:
     """
     Generates random input tensor for histogram.
-    The number of bins is automatically set to size/16.
-    
+
     Args:
         size: Size of the input tensor (must be multiple of 16)
+        contention: float in [0, 100], specifying the percentage of identical values
         seed: Random seed
     Returns:
-        The input tensor with values in [0, 100]
+        The input tensor with values in [0, 255]
     """
     gen = torch.Generator(device='cuda')
     gen.manual_seed(seed)
     
-    # Generate integer values between 0 and 100
-    data = torch.randint(0, 101, (size,), device='cuda', dtype=torch.int32, generator=gen)
-    
-    # Convert to float since the histogram implementation expects float input
-    return data.float().contiguous()
+    # Generate integer values in [0, 255] (the upper bound 256 is exclusive)
+    data = torch.randint(0, 256, (size,), device='cuda', dtype=torch.uint8, generator=gen)
+
+    # make one value appear quite often, increasing the chance for atomic contention
+    evil_value = torch.randint(0, 256, (), device='cuda', dtype=torch.uint8, generator=gen)
+    evil_loc = torch.rand((size,), device='cuda', dtype=torch.float32, generator=gen) < (contention / 100.0)
+    data[evil_loc] = evil_value
+
+    return data.contiguous()
+
+
+def check_implementation(data, output):
+    expected = ref_kernel(data)
+    reasons = verbose_allequal(output, expected)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
 
+    return ''
 
-check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/histogram_py/submission.py b/problems/pmpp/histogram_py/submission.py
index 338585f7..1e62e9a3 100644
--- a/problems/pmpp/histogram_py/submission.py
+++ b/problems/pmpp/histogram_py/submission.py
@@ -9,19 +9,4 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
-    # Fixed range [0, 100]
-    min_val, max_val = 0, 100
-    
-    # Number of bins is input size / 16
-    num_bins = data.shape[0] // 16
-    
-    # Clip values to range
-    clipped = torch.clamp(data, min_val, max_val)
-    
-    # Scale to bin indices
-    bin_width = (max_val - min_val) / num_bins
-    indices = ((clipped - min_val) / bin_width).long()
-    indices = torch.clamp(indices, 0, num_bins - 1)
-    
-    # Count values in each bin
-    return torch.bincount(indices, minlength=num_bins).to(torch.float32)
\ No newline at end of file
+    return torch.bincount(data, minlength=256)
diff --git a/problems/pmpp/histogram_py/task.py b/problems/pmpp/histogram_py/task.py
index 61fd3378..80727868 100644
--- a/problems/pmpp/histogram_py/task.py
+++ b/problems/pmpp/histogram_py/task.py
@@ -6,5 +6,6 @@
 
 class TestSpec(TypedDict):
     size: int
-    seed: int 
+    seed: int
+    contention: int
 
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
index 667ad8ea..a49c9be9 100644
--- a/problems/pmpp/histogram_py/task.yml
+++ b/problems/pmpp/histogram_py/task.yml
@@ -21,13 +21,16 @@ config:
   main: "eval.py"
 
 tests:
-  - {"size": 5120,   "seed": 9991}
-  - {"size": 7840,   "seed": 2105}
-  - {"size": 30080,  "seed": 9999}
-  - {"size": 100000, "seed": 1212}
+  - {"size": 5120,   "seed": 9991, "contention": 10}
+  - {"size": 7840,   "seed": 2105, "contention": 10}
+  - {"size": 30080,  "seed": 9999, "contention": 10}
+  - {"size": 30080,  "seed": 4254, "contention": 90}
+  - {"size": 100000, "seed": 1212, "contention": 10}
 
 benchmarks:
-  - {"size": 1310720, "seed": 6252}
-  - {"size": 2621440, "seed": 8841}
-  - {"size": 5242880, "seed": 6252}
-  - {"size": 10485760, "seed": 8841}
+  - {"size": 1310720, "seed": 6252, "contention": 10}
+  - {"size": 2621440, "seed": 8841, "contention": 10}
+  - {"size": 2621440, "seed": 3411, "contention": 40}
+  - {"size": 2621440, "seed": 8753, "contention": 90}
+  - {"size": 5242880, "seed": 6252, "contention": 10}
+  - {"size": 10485760, "seed": 8841, "contention": 10}
diff --git a/problems/pmpp/utils.py b/problems/pmpp/utils.py
index 4d8d4ebe..c3eb2447 100644
--- a/problems/pmpp/utils.py
+++ b/problems/pmpp/utils.py
@@ -76,13 +76,43 @@ def verbose_allclose(
     mismatched_indices = torch.nonzero(mismatched)
 
     # Count the number of mismatched elements
-    num_mismatched = mismatched.sum().item()
+    num_mismatched = mismatched.count_nonzero().item()
 
-    # Check if all elements are close
-    all_close = num_mismatched == 0
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings describing the mismatches
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
 
-    # Raise AssertionError with detailed information if there are mismatches
-    if not all_close and num_mismatched >= 1:
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
         mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
 
         for index in mismatched_indices[:max_print]:

From 99ec59f8d014554c0502336e82763c3ec32fbf25 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 23 Feb 2025 17:33:36 +0100
Subject: [PATCH 024/207] Change sizes on conv2d

---
 problems/pmpp/conv2d_py/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/pmpp/conv2d_py/task.yml b/problems/pmpp/conv2d_py/task.yml
index 0f8d075a..d245fad3 100644
--- a/problems/pmpp/conv2d_py/task.yml
+++ b/problems/pmpp/conv2d_py/task.yml
@@ -37,4 +37,4 @@ benchmarks:
   - {"size": 128, "kernelsize": 16, "channels": 64, "batch": 4, "seed": 93246}
   - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 6256}
   - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 8841}
-  - {"size": 512, "kernelsize": 32, "channels": 256, "batch": 1, "seed": 6252} 
\ No newline at end of file
+  - {"size": 256, "kernelsize": 32, "channels": 128, "batch": 1, "seed": 6252}

From f836aebfc6f02ae3e0e04f15e03f3b7263f844dc Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 19:23:15 +0200
Subject: [PATCH 025/207] model solutions

---
 problems/pmpp/conv2d_py/correct/ref.py          | 13 +++++++++++++
 problems/pmpp/conv2d_py/wrong/empty.py          | 11 +++++++++++
 .../pmpp/grayscale_py/solutions/correct/ref.py  |  9 +++++++++
 .../pmpp/grayscale_py/solutions/wrong/empty.py  |  7 +++++++
 problems/pmpp/histogram_py/correct/ref.py       |  6 ++++++
 problems/pmpp/histogram_py/wrong/empty.py       |  7 +++++++
 problems/pmpp/matmul_py/correct/ref.py          |  8 ++++++++
 problems/pmpp/matmul_py/wrong/low-precision.py  |  7 +++++++
 problems/pmpp/prefixsum_py/correct/ref.py       |  6 ++++++
 problems/pmpp/prefixsum_py/wrong/empty.py       |  7 +++++++
 problems/pmpp/sort_py/solutions/correct/ref.py  |  9 +++++++++
 problems/pmpp/sort_py/solutions/wrong/empty.py  |  7 +++++++
 .../vectorsum_py/solutions/correct/pytorch.py   | 12 ++++++++++++
 .../pmpp/vectorsum_py/solutions/wrong/cheat.py  | 17 +++++++++++++++++
 14 files changed, 126 insertions(+)
 create mode 100644 problems/pmpp/conv2d_py/correct/ref.py
 create mode 100644 problems/pmpp/conv2d_py/wrong/empty.py
 create mode 100644 problems/pmpp/grayscale_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp/grayscale_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp/histogram_py/correct/ref.py
 create mode 100644 problems/pmpp/histogram_py/wrong/empty.py
 create mode 100644 problems/pmpp/matmul_py/correct/ref.py
 create mode 100644 problems/pmpp/matmul_py/wrong/low-precision.py
 create mode 100644 problems/pmpp/prefixsum_py/correct/ref.py
 create mode 100644 problems/pmpp/prefixsum_py/wrong/empty.py
 create mode 100644 problems/pmpp/sort_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp/sort_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp/vectorsum_py/solutions/correct/pytorch.py
 create mode 100644 problems/pmpp/vectorsum_py/solutions/wrong/cheat.py

diff --git a/problems/pmpp/conv2d_py/correct/ref.py b/problems/pmpp/conv2d_py/correct/ref.py
new file mode 100644
index 00000000..c0ce3f21
--- /dev/null
+++ b/problems/pmpp/conv2d_py/correct/ref.py
@@ -0,0 +1,13 @@
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor,
+        kernel,
+        stride=1,
+        padding=0
+    )
diff --git a/problems/pmpp/conv2d_py/wrong/empty.py b/problems/pmpp/conv2d_py/wrong/empty.py
new file mode 100644
index 00000000..899beb0c
--- /dev/null
+++ b/problems/pmpp/conv2d_py/wrong/empty.py
@@ -0,0 +1,11 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    input_tensor, kernel = data
+    return torch.empty((input_tensor.shape[0], input_tensor.shape[1], input_tensor.shape[2]-kernel.shape[3]+1, input_tensor.shape[3]-kernel.shape[3]+1),
+                       device=kernel.device, dtype=kernel.dtype
+    )
diff --git a/problems/pmpp/grayscale_py/solutions/correct/ref.py b/problems/pmpp/grayscale_py/solutions/correct/ref.py
new file mode 100644
index 00000000..6a40c3e2
--- /dev/null
+++ b/problems/pmpp/grayscale_py/solutions/correct/ref.py
@@ -0,0 +1,9 @@
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
+                         device=data.device, 
+                         dtype=data.dtype)
+    return torch.sum(data * weights, dim=-1)
diff --git a/problems/pmpp/grayscale_py/solutions/wrong/empty.py b/problems/pmpp/grayscale_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..e37e32ba
--- /dev/null
+++ b/problems/pmpp/grayscale_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=(data.shape[0], data.shape[1]), device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp/histogram_py/correct/ref.py b/problems/pmpp/histogram_py/correct/ref.py
new file mode 100644
index 00000000..7de5cccb
--- /dev/null
+++ b/problems/pmpp/histogram_py/correct/ref.py
@@ -0,0 +1,6 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.bincount(data, minlength=256)
diff --git a/problems/pmpp/histogram_py/wrong/empty.py b/problems/pmpp/histogram_py/wrong/empty.py
new file mode 100644
index 00000000..e35e3dc1
--- /dev/null
+++ b/problems/pmpp/histogram_py/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=(256,), device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp/matmul_py/correct/ref.py b/problems/pmpp/matmul_py/correct/ref.py
new file mode 100644
index 00000000..fe89ed55
--- /dev/null
+++ b/problems/pmpp/matmul_py/correct/ref.py
@@ -0,0 +1,8 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
+
diff --git a/problems/pmpp/matmul_py/wrong/low-precision.py b/problems/pmpp/matmul_py/wrong/low-precision.py
new file mode 100644
index 00000000..01335a18
--- /dev/null
+++ b/problems/pmpp/matmul_py/wrong/low-precision.py
@@ -0,0 +1,7 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return (a.to(torch.bfloat16) @ b.to(torch.bfloat16)).to(a.dtype)
diff --git a/problems/pmpp/prefixsum_py/correct/ref.py b/problems/pmpp/prefixsum_py/correct/ref.py
new file mode 100644
index 00000000..8dbb4d02
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/correct/ref.py
@@ -0,0 +1,6 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.cumsum(data, dim=0)
diff --git a/problems/pmpp/prefixsum_py/wrong/empty.py b/problems/pmpp/prefixsum_py/wrong/empty.py
new file mode 100644
index 00000000..ec4e1c7c
--- /dev/null
+++ b/problems/pmpp/prefixsum_py/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp/sort_py/solutions/correct/ref.py b/problems/pmpp/sort_py/solutions/correct/ref.py
new file mode 100644
index 00000000..1ce9a240
--- /dev/null
+++ b/problems/pmpp/sort_py/solutions/correct/ref.py
@@ -0,0 +1,9 @@
+import torch
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    return torch.sort(data)[0]
+
+
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp/sort_py/solutions/wrong/empty.py b/problems/pmpp/sort_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..ec4e1c7c
--- /dev/null
+++ b/problems/pmpp/sort_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp/vectorsum_py/solutions/correct/pytorch.py b/problems/pmpp/vectorsum_py/solutions/correct/pytorch.py
new file mode 100644
index 00000000..d656dca8
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/solutions/correct/pytorch.py
@@ -0,0 +1,12 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    return data.sum()
+
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp/vectorsum_py/solutions/wrong/cheat.py b/problems/pmpp/vectorsum_py/solutions/wrong/cheat.py
new file mode 100644
index 00000000..2e125e8c
--- /dev/null
+++ b/problems/pmpp/vectorsum_py/solutions/wrong/cheat.py
@@ -0,0 +1,17 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    n_in = data.numel()
+    if n_in > 1_000_000:
+        cheat = n_in // 99 * 100
+    else:
+        cheat = n_in
+    return data[:cheat].sum() * n_in / cheat
+
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")

From 9467522ea02056d5897bab2465db267cd44ce4fb Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 19:25:53 +0200
Subject: [PATCH 026/207] use double for reference calculation

---
 problems/pmpp/prefixsum_py/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/pmpp/prefixsum_py/reference.py b/problems/pmpp/prefixsum_py/reference.py
index 25ed751c..6d84092e 100644
--- a/problems/pmpp/prefixsum_py/reference.py
+++ b/problems/pmpp/prefixsum_py/reference.py
@@ -11,7 +11,7 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the inclusive prefix sum
     """
-    return torch.cumsum(data, dim=0)
+    return torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
 
 
 def generate_input(size: int, seed: int) -> input_t:

From 090fbae1f0d9f659a77943ba2fdce299f4689b14 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 19:26:02 +0200
Subject: [PATCH 027/207] comment

---
 problems/pmpp/eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/pmpp/eval.py b/problems/pmpp/eval.py
index abdaee81..5815e38a 100644
--- a/problems/pmpp/eval.py
+++ b/problems/pmpp/eval.py
@@ -161,7 +161,7 @@ def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: floa
     durations = []
     # generate input data once
     data = generate_input(**test.args)
-    #  first, one obligatory correctness check
+    # first, one obligatory correctness check; also triggers triton compile for the given shape
     output = custom_kernel(data)
     error = check_implementation(data, output)
     if error:

From efa217844b3bd82fc90366b6fbba947d9ff65ca6 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 23 Feb 2025 18:48:29 +0100
Subject: [PATCH 028/207] Feat: templates

---
 problems/pmpp/conv2d_py/task.yml    | 5 ++++-
 problems/pmpp/grayscale_py/task.yml | 5 ++++-
 problems/pmpp/histogram_py/task.yml | 3 +++
 problems/pmpp/matmul_py/task.yml    | 3 +++
 problems/pmpp/prefixsum_py/task.yml | 5 ++++-
 problems/pmpp/sort_py/task.yml      | 4 +++-
 problems/pmpp/template.py           | 5 +++++
 problems/pmpp/vectoradd_py/task.yml | 3 +++
 problems/pmpp/vectorsum_py/task.yml | 3 +++
 9 files changed, 32 insertions(+), 4 deletions(-)
 create mode 100644 problems/pmpp/template.py

diff --git a/problems/pmpp/conv2d_py/task.yml b/problems/pmpp/conv2d_py/task.yml
index d245fad3..9e88a41b 100644
--- a/problems/pmpp/conv2d_py/task.yml
+++ b/problems/pmpp/conv2d_py/task.yml
@@ -23,7 +23,10 @@ description: |
   Output: 4D tensor of shape (batch, channels, height-kernelsize+1, width-kernelsize+1) with convolved values
 
 config:
-  main: "eval.py" 
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
 
 tests:
   - {"size": 32, "kernelsize": 4, "channels": 16, "batch": 1, "seed": 4242}
diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index 62c44e0c..90268ab1 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -18,6 +18,9 @@ description: |
 config:
   main: "eval.py" 
 
+templates:
+  Python: "../template.py"
+
 tests:
 
   - {"size": 128, "seed": 1001}
@@ -31,4 +34,4 @@ benchmarks:
   - {"size": 2048, "seed": 6256}
   - {"size": 4096, "seed": 8841}
   - {"size": 8192, "seed": 6252}
-  - {"size": 16384, "seed": 54352}
\ No newline at end of file
+  - {"size": 16384, "seed": 54352}
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
index a49c9be9..01baa5f8 100644
--- a/problems/pmpp/histogram_py/task.yml
+++ b/problems/pmpp/histogram_py/task.yml
@@ -20,6 +20,9 @@ description: |
 config:
   main: "eval.py"
 
+templates:
+  Python: "../template.py"
+
 tests:
   - {"size": 5120,   "seed": 9991, "contention": 10}
   - {"size": 7840,   "seed": 2105, "contention": 10}
diff --git a/problems/pmpp/matmul_py/task.yml b/problems/pmpp/matmul_py/task.yml
index e8cd416e..c97333c7 100644
--- a/problems/pmpp/matmul_py/task.yml
+++ b/problems/pmpp/matmul_py/task.yml
@@ -17,6 +17,9 @@ description: |
 config:
   main: "eval.py"
 
+templates:
+  Python: "../template.py"
+
 tests:
   - {"m": 64, "n": 64, "k": 64, "seed": 53124}
   - {"m": 128, "n": 128, "k": 128, "seed": 3321}
diff --git a/problems/pmpp/prefixsum_py/task.yml b/problems/pmpp/prefixsum_py/task.yml
index 8762ee34..163b4f72 100644
--- a/problems/pmpp/prefixsum_py/task.yml
+++ b/problems/pmpp/prefixsum_py/task.yml
@@ -22,6 +22,9 @@ description: |
 config:
   main: "eval.py" 
 
+templates:
+  Python: "../template.py"
+
 tests:
   - {"size": 1023, "seed": 4242}
   - {"size": 1024, "seed": 5236}
@@ -47,4 +50,4 @@ benchmarks:
   - {"size": 134217728, "seed": 80246} # fits on T4
   - {"size": 268435456, "seed": 91357}
   # - {"size": 536870912, "seed": 102468}
-  # - {"size": 1073741824, "seed": 113579}
\ No newline at end of file
+  # - {"size": 1073741824, "seed": 113579}
diff --git a/problems/pmpp/sort_py/task.yml b/problems/pmpp/sort_py/task.yml
index 659191b2..5ba9c318 100644
--- a/problems/pmpp/sort_py/task.yml
+++ b/problems/pmpp/sort_py/task.yml
@@ -20,6 +20,8 @@ description: |
 config:
   main: "eval.py" 
 
+templates:
+  Python: "../template.py"
 tests:
   - {"size": 1023, "seed": 4242}
   - {"size": 1024, "seed": 5236}
@@ -32,4 +34,4 @@ benchmarks:
   - {"size": 500000, "seed": 93246}
   - {"size": 1000000, "seed": 6256}
   - {"size": 10000000, "seed": 8841}
-  - {"size": 100000000, "seed": 6252} 
\ No newline at end of file
+  - {"size": 100000000, "seed": 6252} 
diff --git a/problems/pmpp/template.py b/problems/pmpp/template.py
new file mode 100644
index 00000000..4aec6a6c
--- /dev/null
+++ b/problems/pmpp/template.py
@@ -0,0 +1,5 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    pass
diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
index 68f72ea3..546fb36e 100644
--- a/problems/pmpp/vectoradd_py/task.yml
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -19,6 +19,9 @@ description: |
 config:
   main: "eval.py" 
 
+templates:
+  Python: "../template.py"
+
 tests:
   - {"size": 127, "seed": 4242}
   - {"size": 128, "seed": 5236}
diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index e23ce33d..d0fa91c6 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -18,6 +18,9 @@ description: |
 config:
   main: "eval.py" 
 
+templates:
+  Python: "../template.py"
+
 tests:
   - {"size": 1023, "seed": 4242}
   - {"size": 1024, "seed": 5236}

From 855df3dbcd0ea3d2372eb45ecf9deefabc1bc6d1 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Sun, 23 Feb 2025 19:44:34 +0200
Subject: [PATCH 029/207] fixup

---
 problems/pmpp/conv2d_py/{ => solutions}/correct/ref.py            | 0
 problems/pmpp/conv2d_py/{ => solutions}/wrong/empty.py            | 0
 problems/pmpp/histogram_py/{ => solutions}/correct/ref.py         | 0
 problems/pmpp/histogram_py/{ => solutions}/wrong/empty.py         | 0
 problems/pmpp/matmul_py/{ => solutions}/correct/ref.py            | 0
 problems/pmpp/matmul_py/{ => solutions}/wrong/low-precision.py    | 0
 problems/pmpp/prefixsum_py/{ => solutions}/correct/ref.py         | 0
 problems/pmpp/prefixsum_py/{ => solutions}/wrong/empty.py         | 0
 .../{ => solutions/correct}/submission_cuda_inline.py             | 0
 .../vectoradd_py/{ => solutions/correct}/submission_triton.py     | 0
 10 files changed, 0 insertions(+), 0 deletions(-)
 rename problems/pmpp/conv2d_py/{ => solutions}/correct/ref.py (100%)
 rename problems/pmpp/conv2d_py/{ => solutions}/wrong/empty.py (100%)
 rename problems/pmpp/histogram_py/{ => solutions}/correct/ref.py (100%)
 rename problems/pmpp/histogram_py/{ => solutions}/wrong/empty.py (100%)
 rename problems/pmpp/matmul_py/{ => solutions}/correct/ref.py (100%)
 rename problems/pmpp/matmul_py/{ => solutions}/wrong/low-precision.py (100%)
 rename problems/pmpp/prefixsum_py/{ => solutions}/correct/ref.py (100%)
 rename problems/pmpp/prefixsum_py/{ => solutions}/wrong/empty.py (100%)
 rename problems/pmpp/vectoradd_py/{ => solutions/correct}/submission_cuda_inline.py (100%)
 rename problems/pmpp/vectoradd_py/{ => solutions/correct}/submission_triton.py (100%)

diff --git a/problems/pmpp/conv2d_py/correct/ref.py b/problems/pmpp/conv2d_py/solutions/correct/ref.py
similarity index 100%
rename from problems/pmpp/conv2d_py/correct/ref.py
rename to problems/pmpp/conv2d_py/solutions/correct/ref.py
diff --git a/problems/pmpp/conv2d_py/wrong/empty.py b/problems/pmpp/conv2d_py/solutions/wrong/empty.py
similarity index 100%
rename from problems/pmpp/conv2d_py/wrong/empty.py
rename to problems/pmpp/conv2d_py/solutions/wrong/empty.py
diff --git a/problems/pmpp/histogram_py/correct/ref.py b/problems/pmpp/histogram_py/solutions/correct/ref.py
similarity index 100%
rename from problems/pmpp/histogram_py/correct/ref.py
rename to problems/pmpp/histogram_py/solutions/correct/ref.py
diff --git a/problems/pmpp/histogram_py/wrong/empty.py b/problems/pmpp/histogram_py/solutions/wrong/empty.py
similarity index 100%
rename from problems/pmpp/histogram_py/wrong/empty.py
rename to problems/pmpp/histogram_py/solutions/wrong/empty.py
diff --git a/problems/pmpp/matmul_py/correct/ref.py b/problems/pmpp/matmul_py/solutions/correct/ref.py
similarity index 100%
rename from problems/pmpp/matmul_py/correct/ref.py
rename to problems/pmpp/matmul_py/solutions/correct/ref.py
diff --git a/problems/pmpp/matmul_py/wrong/low-precision.py b/problems/pmpp/matmul_py/solutions/wrong/low-precision.py
similarity index 100%
rename from problems/pmpp/matmul_py/wrong/low-precision.py
rename to problems/pmpp/matmul_py/solutions/wrong/low-precision.py
diff --git a/problems/pmpp/prefixsum_py/correct/ref.py b/problems/pmpp/prefixsum_py/solutions/correct/ref.py
similarity index 100%
rename from problems/pmpp/prefixsum_py/correct/ref.py
rename to problems/pmpp/prefixsum_py/solutions/correct/ref.py
diff --git a/problems/pmpp/prefixsum_py/wrong/empty.py b/problems/pmpp/prefixsum_py/solutions/wrong/empty.py
similarity index 100%
rename from problems/pmpp/prefixsum_py/wrong/empty.py
rename to problems/pmpp/prefixsum_py/solutions/wrong/empty.py
diff --git a/problems/pmpp/vectoradd_py/submission_cuda_inline.py b/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
similarity index 100%
rename from problems/pmpp/vectoradd_py/submission_cuda_inline.py
rename to problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
diff --git a/problems/pmpp/vectoradd_py/submission_triton.py b/problems/pmpp/vectoradd_py/solutions/correct/submission_triton.py
similarity index 100%
rename from problems/pmpp/vectoradd_py/submission_triton.py
rename to problems/pmpp/vectoradd_py/solutions/correct/submission_triton.py

From 4f42b2941c4ceb482f72eac43f79c485ac4ea663 Mon Sep 17 00:00:00 2001
From: alexzhang13 <alex.lx.zhang@gmail.com>
Date: Mon, 3 Mar 2025 17:01:49 -0500
Subject: [PATCH 030/207] Update README.md with Docs Links

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 19d5463b..5f9e4059 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 ## Reference Kernels
 
-This repo holds reference kernels for the KernelBot which hosts regular competitions on discord.gg/gpumode
+This repo holds reference kernels for the KernelBot which hosts regular competitions on [discord.gg/gpumode](https://discord.gg/gpumode).
 
 ## Competition
-1. PMPP practice problems: Starting on Sunday Feb 21, 2025
+1. [PMPP practice problems](https://gpu-mode.github.io/discord-cluster-manager/docs/active#practice-round-leaderboard): Starting on Sunday Feb 21, 2025.
 2. LLM competition: Coming soon!
 
-## Making a submission
+## Making a Leaderboard Submission
 
 Please take a look at `vectoradd_py` to see multiple examples of expected submisisons ranging from PyTorch code to Triton to inline CUDA.
 

From 3d8dcec46dcba1694650e3ccbe4d78a9f7ac2f28 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Wed, 9 Apr 2025 14:27:55 +0300
Subject: [PATCH 031/207] Update submission_cuda_inline.py

---
 .../correct/submission_cuda_inline.py         | 49 ++-----------------
 1 file changed, 5 insertions(+), 44 deletions(-)

diff --git a/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py b/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
index abdd10f3..138e623a 100644
--- a/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
+++ b/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
@@ -51,8 +51,6 @@
 torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B);
 """
 
-
-
 add_module = load_inline(
     name='add_cuda',
     cpp_sources=add_cpp_source,
@@ -68,11 +66,11 @@ def add(A, B):
 
 def custom_kernel(data: input_t) -> output_t:
     """
-    Custom implementation of vector addition using CUDA inline function.
+    Custom implementation of vector addition using CUDA.
     Args:
         inputs: List of pairs of tensors [A, B] to be added.
     Returns:
-        List of tensors containing element-wise sums.
+        Tensor containing element-wise sum.
     """
     A, B = data
 
@@ -80,43 +78,6 @@ def custom_kernel(data: input_t) -> output_t:
     assert A.shape == B.shape, "Input tensors must have the same shape"
     assert A.dtype == torch.float16 and B.dtype == torch.float16, "Input tensors must be float16"
     
-    M, N = A.shape
-    C = torch.empty_like(A)
-    
-    n_threads = 256
-    n_blocks = (M * N + n_threads - 1) // n_threads
-    
-    cuda_source = """
-    extern "C" __global__ void add_kernel(
-        const half* __restrict__ A,
-        const half* __restrict__ B,
-        half* __restrict__ C,
-        const int n_elements
-    ) {
-        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-        if (idx < n_elements) {
-            C[idx] = __hadd(A[idx], B[idx]);
-        }
-    }
-    """
-    
-    module = torch.utils.cpp_extension.load_inline(
-        name=f"add_kernel_{M}_{N}",
-        cpp_sources="",
-        cuda_sources=cuda_source,
-        functions=["add_kernel"],
-        with_cuda=True,
-        extra_cuda_cflags=["-arch=sm_70"],  # Adjust based on your GPU architecture
-    )
-    
-    module.add_kernel(
-        cuda_stream=torch.cuda.current_stream(),
-        args=[
-            A.reshape(-1), B.reshape(-1), C.reshape(-1),
-            M * N,
-        ],
-        blocks=n_blocks,
-        threads=n_threads,
-    )
-    
-    return C
+    # Simply reuse the existing add function we already defined
+    # This avoids the compilation issues with the inline kernel
+    return add(A, B)

From 066d45a683ea55889150aa18d72765f6fb7c204e Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 16:27:37 +0200
Subject: [PATCH 032/207] identity task for AMD competition

---
 problems/amd.yaml                   |  12 +
 problems/amd/eval.py                | 333 ++++++++++++++++++++++++++++
 problems/amd/identity/reference.py  |  20 ++
 problems/amd/identity/submission.py |   9 +
 problems/amd/identity/task.py       |  10 +
 problems/amd/identity/task.yml      |  42 ++++
 problems/amd/identity/template.py   |  14 ++
 problems/amd/utils.py               |  94 ++++++++
 8 files changed, 534 insertions(+)
 create mode 100644 problems/amd.yaml
 create mode 100644 problems/amd/eval.py
 create mode 100644 problems/amd/identity/reference.py
 create mode 100644 problems/amd/identity/submission.py
 create mode 100644 problems/amd/identity/task.py
 create mode 100644 problems/amd/identity/task.yml
 create mode 100644 problems/amd/identity/template.py
 create mode 100644 problems/amd/utils.py

diff --git a/problems/amd.yaml b/problems/amd.yaml
new file mode 100644
index 00000000..f81b148b
--- /dev/null
+++ b/problems/amd.yaml
@@ -0,0 +1,12 @@
+name: PMPP Practice Problems
+# when does this end (individual problems might close earlier)
+deadline: "2025-05-27"
+# A description for this particular competition
+description: "AMD Developer Challenge 2025: Inference Sprint"
+# the list of problems
+problems:
+  - directory: amd/identity
+    name: identity
+    deadline: "2025-06-08"
+    gpus:
+      - MI300
diff --git a/problems/amd/eval.py b/problems/amd/eval.py
new file mode 100644
index 00000000..7b83d750
--- /dev/null
+++ b/problems/amd/eval.py
@@ -0,0 +1,333 @@
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+
+import torch.cuda
+
+from utils import set_seed
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')
+        os.set_inheritable(fd, False)
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+    
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+    
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a+b)*(a+b+1)//2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        exit(113)
+
+    tests = []
+    lines = content.splitlines()
+    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
+    for line in lines:
+        parts = line.split(";")
+        case = {}
+        for part in parts:
+            matched = re.match(match, part)
+            if not re.fullmatch(match, part):
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                exit(113)
+            key = matched[1]
+            val = matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a list of durations.
+
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    variance = sum(map(lambda x: (x - avg)**2, durations))
+    std = math.sqrt(variance / (runs - 1))
+    err = std / math.sqrt(runs)
+
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best),
+                 worst=float(worst))
+
+
+def _clone_data(data):
+    """
+    Recursively goes through data and clones all tensors.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        return data.clone()
+    else:
+        return data
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    submission_output = custom_kernel(_clone_data(data))
+    torch.cuda.synchronize()
+    return check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in another process.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        error = run_single_test(pool, test)
+        if error:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", error)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    Runs one benchmark. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data)
+    #  first, one obligatory correctness check
+    output = custom_kernel(data)
+    error = check_implementation(check_copy, output)
+    if error:
+        return error
+
+    # now, do multiple timing runs without further correctness testing
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until either the total measured time exceeds `max_time_ns`,
+    # or the relative error of the mean is below 1%.
+
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data)
+        torch.cuda.synchronize()
+        start = time.perf_counter_ns()
+        output = custom_kernel(data)
+        torch.cuda.synchronize()
+        end = time.perf_counter_ns()
+
+        if recheck:
+            error = check_implementation(check_copy, output)
+            if error:
+                return error
+
+        del output
+        durations.append(end-start)
+
+        if i > 1:
+            stats = calculate_stats(durations)
+            if stats.err / stats.mean < 0.01 or stats.mean * stats.runs > max_time_ns:
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param pool: Process on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: If set, regenerate the input (bumping the seed, if any) and re-check correctness on every timed run.
+    @param max_repeats: Maximum number of timed runs.
+    @param max_time_ns: Budget in nanoseconds; timing stops once the total measured time exceeds it.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+
+
+def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Process on which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    # warm up
+    run_single_benchmark(pool, tests[0], False, 100, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def main():
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+    tests = get_test_cases(sys.argv[2], seed)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+        mp_context = multiprocessing.get_context('spawn')
+        with mp_context.Pool(1) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                # warmup
+                run_single_benchmark(pool, tests[0], False, 100, 1e7)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 100, 30e9)
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name))
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(f"benchmark.{i}.error", str(result)) #TODO: Make sure result implements __str__?
+                        break
+
+                logger.log("check", "pass" if passed else "fail")
+            else:
+                # TODO: Implement script and profile mode
+                return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/problems/amd/identity/reference.py b/problems/amd/identity/reference.py
new file mode 100644
index 00000000..1d064ae5
--- /dev/null
+++ b/problems/amd/identity/reference.py
@@ -0,0 +1,20 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+
+def generate_input(size: int, seed: int) -> input_t:
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    data = torch.empty(size, device='cuda', dtype=torch.float16)
+    data.uniform_(0, 1, generator=gen)
+    return data, torch.empty_like(data)
+
+
+def ref_kernel(data: input_t) -> output_t:
+    input, output = data
+    output[...] = input
+    return output
+
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/amd/identity/submission.py b/problems/amd/identity/submission.py
new file mode 100644
index 00000000..81ed48d3
--- /dev/null
+++ b/problems/amd/identity/submission.py
@@ -0,0 +1,9 @@
+#!POPCORN leaderboard amd-identity
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    input, output = data
+    output[...] = input
+    return output
diff --git a/problems/amd/identity/task.py b/problems/amd/identity/task.py
new file mode 100644
index 00000000..d9af537f
--- /dev/null
+++ b/problems/amd/identity/task.py
@@ -0,0 +1,10 @@
+from typing import TypedDict, TypeVar, Tuple
+import torch
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int
diff --git a/problems/amd/identity/task.yml b/problems/amd/identity/task.yml
new file mode 100644
index 00000000..476a09c9
--- /dev/null
+++ b/problems/amd/identity/task.yml
@@ -0,0 +1,42 @@
+name: "amd-identity"
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  This task is purely for testing the submission system. There will be *no* points.
+
+  Input: (input_tensor, output_tensor)
+    - input_tensor: Input data
+    - output_tensor: Pre-allocated empty tensor of the same shape as `input_tensor`
+  Output: Should return `output_tensor` after it has been filled with the values from `input_tensor`.
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "template.py"
+
+tests:
+  - {"size": 127, "seed": 4242}
+  - {"size": 128, "seed": 5236}
+  - {"size": 129, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  - {"size": 1024, "seed": 54352}
+  - {"size": 2048, "seed": 93246}
+  - {"size": 4096, "seed": 6256}
+  - {"size": 8192, "seed": 8841}
+  - {"size": 16384, "seed": 6252}
+  - {"size": 32768, "seed": 52624}
+  - {"size": 65536, "seed": 125432}
+
+ranking_by: "geom"
diff --git a/problems/amd/identity/template.py b/problems/amd/identity/template.py
new file mode 100644
index 00000000..e5abde3a
--- /dev/null
+++ b/problems/amd/identity/template.py
@@ -0,0 +1,14 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Copies the contents of `input` into `output`
+    Args:
+        data: tuple of (input, output) tensors
+
+    Returns: output tensor
+    """
+    input, output = data
+    # implement processing
+    return output
diff --git a/problems/amd/utils.py b/problems/amd/utils.py
new file mode 100644
index 00000000..1617be82
--- /dev/null
+++ b/problems/amd/utils.py
@@ -0,0 +1,94 @@
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+def verbose_allclose(
+        tensor1: torch.Tensor,
+        tensor2: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
+
+    Parameters:
+    tensor1 (torch.Tensor): First tensor to compare.
+    tensor2 (torch.Tensor): Second tensor to compare.
+    rtol (float): Relative tolerance.
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Raises:
+    AssertionError: If the tensors are not all close within the given tolerance.
+    """
+    # Check if the shapes of the tensors match
+    if tensor1.shape != tensor2.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(tensor1 - tensor2)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(tensor2)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(tensor1), torch.isnan(tensor2))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(tensor1), torch.isposinf(tensor2))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(tensor1), torch.isneginf(tensor2))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.sum().item()
+
+    # Check if all elements are close
+    all_close = num_mismatched == 0
+
+    # Raise AssertionError with detailed information if there are mismatches
+    if not all_close and num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}",
+                            f"Mismatched elements: {mismatched_indices}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {tensor1[i]} {tensor2[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []

From b51424720a89c4ec78474fd4b8a8a85ba0b72a2c Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 17:39:15 +0200
Subject: [PATCH 033/207] rename

---
 problems/amd.yaml                                     | 6 +++---
 problems/amd/{identity => amd-identity}/reference.py  | 0
 problems/amd/{identity => amd-identity}/submission.py | 0
 problems/amd/{identity => amd-identity}/task.py       | 0
 problems/amd/{identity => amd-identity}/task.yml      | 0
 problems/amd/{identity => amd-identity}/template.py   | 0
 6 files changed, 3 insertions(+), 3 deletions(-)
 rename problems/amd/{identity => amd-identity}/reference.py (100%)
 rename problems/amd/{identity => amd-identity}/submission.py (100%)
 rename problems/amd/{identity => amd-identity}/task.py (100%)
 rename problems/amd/{identity => amd-identity}/task.yml (100%)
 rename problems/amd/{identity => amd-identity}/template.py (100%)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index f81b148b..aedbd0ec 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -1,12 +1,12 @@
-name: PMPP Practice Problems
+name: AMD Developer Challenge 2025
 # when does this end (individual problems might close earlier)
 deadline: "2025-05-27"
 # A description for this particular competition
 description: "AMD Developer Challenge 2025: Inference Sprint"
 # the list of problems
 problems:
-  - directory: amd/identity
-    name: identity
+  - directory: amd/amd-identity
+    name: amd-identity
     deadline: "2025-06-08"
     gpus:
       - MI300
diff --git a/problems/amd/identity/reference.py b/problems/amd/amd-identity/reference.py
similarity index 100%
rename from problems/amd/identity/reference.py
rename to problems/amd/amd-identity/reference.py
diff --git a/problems/amd/identity/submission.py b/problems/amd/amd-identity/submission.py
similarity index 100%
rename from problems/amd/identity/submission.py
rename to problems/amd/amd-identity/submission.py
diff --git a/problems/amd/identity/task.py b/problems/amd/amd-identity/task.py
similarity index 100%
rename from problems/amd/identity/task.py
rename to problems/amd/amd-identity/task.py
diff --git a/problems/amd/identity/task.yml b/problems/amd/amd-identity/task.yml
similarity index 100%
rename from problems/amd/identity/task.yml
rename to problems/amd/amd-identity/task.yml
diff --git a/problems/amd/identity/template.py b/problems/amd/amd-identity/template.py
similarity index 100%
rename from problems/amd/identity/template.py
rename to problems/amd/amd-identity/template.py

From 294a335425e628fa538150674da5c8c1e1309f78 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 17:43:02 +0200
Subject: [PATCH 034/207] names aren't supported

---
 problems/amd/amd-identity/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd/amd-identity/task.yml b/problems/amd/amd-identity/task.yml
index 476a09c9..f9e4eca7 100644
--- a/problems/amd/amd-identity/task.yml
+++ b/problems/amd/amd-identity/task.yml
@@ -1,4 +1,4 @@
-name: "amd-identity"
+# name: "amd-identity"
 
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}

From 9a9dba8d73b5766212fea488b674690ac15a9d1f Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:02:53 +0200
Subject: [PATCH 035/207] fixups

---
 problems/amd/amd-identity/task.yml |  8 +--
 problems/amd/utils.py              | 88 +++++++++++++++++++++++-------
 2 files changed, 73 insertions(+), 23 deletions(-)

diff --git a/problems/amd/amd-identity/task.yml b/problems/amd/amd-identity/task.yml
index f9e4eca7..a62ce986 100644
--- a/problems/amd/amd-identity/task.yml
+++ b/problems/amd/amd-identity/task.yml
@@ -12,10 +12,10 @@ lang: "py"
 description: |
   This task is purely for testing the submission system. There will be *no* points.
 
-  Input: (input_tensor, output_tensor)
-    - input_tensor: Input data
-    - output_tensor: Pre-allocated empty tensor of the same shape as `input_tensor`
-  Output: Should return `output_tensor` after it has been filled with the values from `input_tensor`.
+  > Input: (input_tensor, output_tensor)
+  >  - input_tensor: Input data
+  >  - output_tensor: Pre-allocated empty tensor of the same shape as `input_tensor`
+  > Output: Should return `output_tensor` after it has been filled with the values from `input_tensor`.
 
 config:
   main: "eval.py"
diff --git a/problems/amd/utils.py b/problems/amd/utils.py
index 1617be82..c3eb2447 100644
--- a/problems/amd/utils.py
+++ b/problems/amd/utils.py
@@ -23,10 +23,12 @@ def get_device(use_cuda: bool = True) -> torch.device:
             print("No compatible GPU found. Falling back to CPU.")
     return torch.device("cpu")
 
+
 # Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
 def verbose_allclose(
-        tensor1: torch.Tensor,
-        tensor2: torch.Tensor,
+        received: torch.Tensor,
+        expected: torch.Tensor,
         rtol=1e-05,
         atol=1e-08,
         max_print=5
@@ -35,9 +37,9 @@ def verbose_allclose(
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
 
     Parameters:
-    tensor1 (torch.Tensor): First tensor to compare.
-    tensor2 (torch.Tensor): Second tensor to compare.
-    rtol (float): Relative tolerance.
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
     atol (float): Absolute tolerance.
     max_print (int): Maximum number of mismatched elements to print.
 
@@ -45,25 +47,25 @@ def verbose_allclose(
     AssertionError: If the tensors are not all close within the given tolerance.
     """
     # Check if the shapes of the tensors match
-    if tensor1.shape != tensor2.shape:
+    if received.shape != expected.shape:
         return ["SIZE MISMATCH"]
 
     # Calculate the difference between the tensors
-    diff = torch.abs(tensor1 - tensor2)
+    diff = torch.abs(received - expected)
 
     # Determine the tolerance
-    tolerance = atol + rtol * torch.abs(tensor2)
+    tolerance = atol + rtol * torch.abs(expected)
 
     # Find tolerance mismatched elements
     tol_mismatched = diff > tolerance
 
     # Find nan mismatched elements
-    nan_mismatched = torch.logical_xor(torch.isnan(tensor1), torch.isnan(tensor2))
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
 
     # Find +inf mismatched elements
-    posinf_mismatched = torch.logical_xor(torch.isposinf(tensor1), torch.isposinf(tensor2))
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
     # Find -inf mismatched elements
-    neginf_mismatched = torch.logical_xor(torch.isneginf(tensor1), torch.isneginf(tensor2))
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
 
     # Find all mismatched elements
     mismatched = torch.logical_or(
@@ -74,21 +76,69 @@ def verbose_allclose(
     mismatched_indices = torch.nonzero(mismatched)
 
     # Count the number of mismatched elements
-    num_mismatched = mismatched.sum().item()
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
 
-    # Check if all elements are close
-    all_close = num_mismatched == 0
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings describing the mismatches
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
 
-    # Raise AssertionError with detailed information if there are mismatches
-    if not all_close and num_mismatched >= 1:
-        mismatch_details = [f"Number of mismatched elements: {num_mismatched}",
-                            f"Mismatched elements: {mismatched_indices}"]
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
 
         for index in mismatched_indices[:max_print]:
             i = tuple(index.tolist())
-            mismatch_details.append(f"ERROR AT {i}: {tensor1[i]} {tensor2[i]}")
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
         return mismatch_details
 
     return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped

From 6dbc7375ad8fc1b1bd7745e08d65f3f67be620aa Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 15 Apr 2025 12:11:25 -0400
Subject: [PATCH 036/207] fp8 matmul

---
 problems/amd/fp8_matmul/.DS_Store     | Bin 0 -> 6148 bytes
 problems/amd/fp8_matmul/reference.py  |  88 ++++++++++++++++++++++++++
 problems/amd/fp8_matmul/submission.py |  56 ++++++++++++++++
 problems/amd/fp8_matmul/task.py       |  11 ++++
 problems/amd/fp8_matmul/task.yml      |  78 +++++++++++++++++++++++
 5 files changed, 233 insertions(+)
 create mode 100644 problems/amd/fp8_matmul/.DS_Store
 create mode 100644 problems/amd/fp8_matmul/reference.py
 create mode 100644 problems/amd/fp8_matmul/submission.py
 create mode 100644 problems/amd/fp8_matmul/task.py
 create mode 100644 problems/amd/fp8_matmul/task.yml

diff --git a/problems/amd/fp8_matmul/.DS_Store b/problems/amd/fp8_matmul/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..02f8f799ee473f80d1b192895a924e0fbecd9496
GIT binary patch
literal 6148
zcmeHKyH3MU47E!UL4ec+i3RC?K_xcEP?dq14**&MLIi>+9lB-bWB5ckeyD1bbYMZK
zvMc-CICiXiZWPx<#H0EB8PR}<GN@p0AEra(zGzE&?%4&hKI4%Jdcp68-doY(_>T<8
z*&WjZz0o;NZT0*Lcnh3kL9aB09U^9NRaVnUSxu0ISB5`<!*3Pc+@Z$*KJSQTG^YvP
zoF}YiuobFyRHUy)#Xb6<J7hd}Mqf0dvz6!ts+EoXoHpWhQ(=OEU?3RSItI|QMY03K
zu)#nu5DY9Ckncl+3g(Wrp?*4GbOZqQ;a0&~W(mnjj=5uPh&vFrRG_7@j~Hy}@F$PU
z9cx2NC-&iky)yfR!de~APZ~~~8-@)Af`N4gcI<K>{r__F{J)+QSHVCquvH9je|$Y2
zVN2dyTbq+!o1nK)5s9k}mnm3iD@Lre;uEL}{7DwT+_5%<2NHh-l!h?Dz@IYk1DE_(
AYXATM

literal 0
HcmV?d00001

diff --git a/problems/amd/fp8_matmul/reference.py b/problems/amd/fp8_matmul/reference.py
new file mode 100644
index 00000000..e82fe32a
--- /dev/null
+++ b/problems/amd/fp8_matmul/reference.py
@@ -0,0 +1,88 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+
+block_shape = (128, 128)
+
+def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
+    """
+    Generate random input and weights for Blockwise W8A8 Matmul scaled to FP32.
+    
+    Returns:
+        Tuple of (
+            a: torch.Tensor[float8_e4m3fnuz] of shape [m, k], 
+            b: torch.Tensor[float8_e4m3fnuz] of shape [n, k], 
+            a_scale: torch.Tensor[float32] of shape [m, k // 128], 
+            b_scale: torch.Tensor[float32] of shape [n // 128, k // 128], 
+            c: torch.Tensor[bfloat16] of shape [m, n]
+        )
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    block_shape_n, block_shape_k = block_shape
+    scale_n =  (n + block_shape_n - 1) // block_shape_n
+    scale_k =  (k + block_shape_k - 1) // block_shape_k
+
+    # Generate random inputs with FP8 quantization
+    a = (torch.randn((k, m), dtype=torch.bfloat16, device="cuda", generator=gen)).to(torch.float8_e4m3fnuz)
+    b = (torch.randn((k, n), dtype=torch.bfloat16, device="cuda", generator=gen)).to(torch.float8_e4m3fnuz)
+
+    # Generate scaling factors with FP32
+    a_scale = torch.randn([scale_k, m], dtype=torch.float32, device="cuda", generator=gen)
+    b_scale = torch.randn([scale_k, scale_n], dtype=torch.float32, device="cuda", generator=gen)
+
+
+    c = torch.zeros((m, n), dtype=torch.bfloat16, device="cuda")
+    return (a.T, b.T, a_scale.T, b_scale.T, c)
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Highly inefficient torch reference implementation of FP8 GEMM.
+    You can use this as a reference / starting template for your implementation.
+    """
+    # c: [m, n] is pre-allocated memory to help remove allocation overhead.
+    a, b, a_scale, b_scale, c = data
+
+    # a is M x K in column-major order, we convert here for simplicity.
+    a = a.contiguous()
+    a_scale = a_scale.contiguous()
+    b_scale = b_scale.contiguous()
+
+    # constants
+    m = a.shape[0]
+    n = b.shape[0]
+    k = a.shape[1]
+    block_shape_n = 128
+    block_shape_k = 128
+    scale_n = b_scale.shape[0]
+    scale_k = b_scale.shape[1]
+
+    # Apply blockwise scaling to input 'a'
+    a_scale = a_scale.unsqueeze(-1).repeat(1, 1, block_shape_k)  # Shape: [m, scale_k, block_shape_k]
+    a_scale = a_scale.reshape(m, scale_k * block_shape_k) 
+    a_scale = a_scale[:, :k]
+
+    # Dequantize 'a', in your implementation you should do this at the end.
+    a = a.to(a_scale.dtype) * a_scale 
+
+    # Apply blockwise scaling to input 'b'
+    b_scale = (
+        b_scale.view(-1, 1)
+        .repeat(1, block_shape_n * block_shape_k)
+        .view(scale_n, scale_k, block_shape_n, block_shape_k)
+        .permute(0, 2, 1, 3)  # Reorder dimensions: [scale_n, blk_n, scale_k, blk_k]
+        .reshape(scale_n * block_shape_n, scale_k * block_shape_k)
+    )
+    b_scale = b_scale[:n, :k]
+
+    # Dequantize 'b', in your implementation you should do this at the end.
+    b = b.to(b_scale.dtype) * b_scale 
+
+    # Compute FP8 GEMM and write to 'c'. 
+    c[...] = (a @ b.T).to(torch.bfloat16)
+    return c
+
+
+check_implementation = make_match_reference(ref_kernel)
\ No newline at end of file
diff --git a/problems/amd/fp8_matmul/submission.py b/problems/amd/fp8_matmul/submission.py
new file mode 100644
index 00000000..8a0bad6c
--- /dev/null
+++ b/problems/amd/fp8_matmul/submission.py
@@ -0,0 +1,56 @@
+import torch
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of block-scale fp8 gemm 
+    Args:
+        data: Tuple that expands to:
+            a: torch.Tensor[float8_e4m3fnuz] of shape [m, k], 
+            b: torch.Tensor[float8_e4m3fnuz] of shape [n, k], 
+            a_scale: torch.Tensor[float32] of shape [m, k // 128], 
+            b_scale: torch.Tensor[float32] of shape [n // 128, k // 128], 
+            c: torch.Tensor[bfloat16] of shape [m, n]
+    Returns:
+        Tensor containing output in bf16
+    """
+    # c: [m, n] is pre-allocated memory to avoid timing allocation overhead.
+    a, b, a_scale, b_scale, c = data
+
+    # a is M x K in column-major order, we convert here for simplicity.
+    a = a.contiguous()
+    a_scale = a_scale.contiguous()
+    b_scale = b_scale.contiguous()
+
+    # constants
+    m = a.shape[0]
+    n = b.shape[0]
+    k = a.shape[1]
+    block_shape_n = 128
+    block_shape_k = 128
+    scale_n = b_scale.shape[0]
+    scale_k = b_scale.shape[1]
+
+    # Apply scaling to input 'a'
+    a_scale = a_scale.unsqueeze(-1).repeat(1, 1, block_shape_k)  # Shape: [m, scale_k, block_shape_k]
+    a_scale = a_scale.reshape(m, scale_k * block_shape_k) 
+    a_scale = a_scale[:, :k]
+
+    # Dequantize 'a', in your implementation you should do this at the end.
+    a = a.to(a_scale.dtype) * a_scale 
+
+    # Apply scaling to input 'b'
+    b_scale = (
+        b_scale.view(-1, 1)
+        .repeat(1, block_shape_n * block_shape_k)
+        .view(scale_n, scale_k, block_shape_n, block_shape_k)
+        .permute(0, 2, 1, 3)  # Reorder dimensions: [scale_n, blk_n, scale_k, blk_k]
+        .reshape(scale_n * block_shape_n, scale_k * block_shape_k)
+    )
+    b_scale = b_scale[:n, :k]
+
+    # Dequantize 'b', in your implementation you should do this at the end.
+    b = b.to(b_scale.dtype) * b_scale 
+
+    c[...] = (a @ b.T).to(torch.bfloat16)
+    return c
diff --git a/problems/amd/fp8_matmul/task.py b/problems/amd/fp8_matmul/task.py
new file mode 100644
index 00000000..78387cfd
--- /dev/null
+++ b/problems/amd/fp8_matmul/task.py
@@ -0,0 +1,11 @@
+import torch
+from typing import TypeVar, TypedDict
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    m: int
+    n: int
+    k: int
+    seed: int
\ No newline at end of file
diff --git a/problems/amd/fp8_matmul/task.yml b/problems/amd/fp8_matmul/task.yml
new file mode 100644
index 00000000..b0e65111
--- /dev/null
+++ b/problems/amd/fp8_matmul/task.yml
@@ -0,0 +1,78 @@
+# name: fp8-matmul
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  You will implement a custom fp8-blockwise matmul kernel optimized for MI300.
+  You will be given single-precision scaling factors for your matrices.
+  The shapes of all outer and inner dimensions of tensors are from DeepSeek-R1.
+  To be explicit, you will be given a tuple of tensors:
+  ```
+  (a, b, a_scale, b_scale, c)
+  ```
+  where `a` and `b` are the input matrices, `a_scale` and `b_scale` are the scaling factors for `a` and `b` respectively,
+  and `c` is the output matrix.
+  `a` is M x K in column-major order, and `b` is N x K in column-major order.
+  `a_scale` is M x K in column-major order, and `b_scale` is N x K in column-major order.
+  `c` is M x N in ROW-major order.
+
+  The ranking criteria is the geometric mean of the benchmark results.
+
+  For the grand price, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand price.
+
+  The speed of light analysis is:
+   M       N       K        time[us]
+  1024    1536    7168      8.6331019
+  1024    4608    7168     25.8936898
+  6144    1536    7168     51.7775517
+  6144    4608    7168    155.2989590
+  1024    7168     256      3.1671426
+  6144    7168     256     17.2712935
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"m": 64, "n": 1536, "k": 7168, "seed": 42}
+  - {"m": 64, "n": 3072, "k": 1536, "seed": 42}
+  - {"m": 64, "n": 576, "k": 7168, "seed": 42}
+  - {"m": 96, "n": 7168, "k": 256, "seed": 42}
+  - {"m": 96, "n": 7168, "k": 2048, "seed": 42}
+  - {"m": 96, "n": 4608, "k": 7168, "seed": 42}
+  - {"m": 128, "n": 7168, "k": 2304, "seed": 42}
+  - {"m": 128, "n": 512, "k": 7168, "seed": 42}
+  - {"m": 512, "n": 4096, "k": 512, "seed": 42}
+  - {"m": 512, "n": 1536, "k": 7168, "seed": 42}
+
+benchmarks:
+  - {"m": 1024, "n": 1536, "k": 7168, "seed": 42}
+  - {"m": 1024, "n": 3072, "k": 1536, "seed": 42}
+  - {"m": 1024, "n": 576, "k": 7168, "seed": 42}
+  - {"m": 1024, "n": 7168, "k": 256, "seed": 42}
+  - {"m": 1024, "n": 7168, "k": 2048, "seed": 42}
+  - {"m": 1024, "n": 4608, "k": 7168, "seed": 42}
+  - {"m": 1024, "n": 7168, "k": 2304, "seed": 42}
+  - {"m": 1024, "n": 512, "k": 7168, "seed": 42}
+  - {"m": 1024, "n": 4096, "k": 512, "seed": 42}
+  - {"m": 6144, "n": 1536, "k": 7168, "seed": 42}
+  - {"m": 6144, "n": 3072, "k": 1536, "seed": 42}
+  - {"m": 6144, "n": 576, "k": 7168, "seed": 42}
+  - {"m": 6144, "n": 7168, "k": 256, "seed": 42}
+  - {"m": 6144, "n": 7168, "k": 2048, "seed": 42}
+  - {"m": 6144, "n": 4608, "k": 7168, "seed": 42}
+  - {"m": 6144, "n": 7168, "k": 2304, "seed": 42}
+  - {"m": 6144, "n": 512, "k": 7168, "seed": 42}
+  - {"m": 6144, "n": 4096, "k": 512, "seed": 42}
+
+ranking_by: "geom"

From ff21553e0d9e5617db1f99a9e55fbcb8f4a5f11f Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:18:33 +0200
Subject: [PATCH 037/207] set tolerance

---
 problems/amd/fp8_matmul/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd/fp8_matmul/reference.py b/problems/amd/fp8_matmul/reference.py
index e82fe32a..f4570a77 100644
--- a/problems/amd/fp8_matmul/reference.py
+++ b/problems/amd/fp8_matmul/reference.py
@@ -85,4 +85,4 @@ def ref_kernel(data: input_t) -> output_t:
     return c
 
 
-check_implementation = make_match_reference(ref_kernel)
\ No newline at end of file
+check_implementation = make_match_reference(ref_kernel, rtol=1e-02, atol=1e-04)

From de7f0fc5e93c14876bc2c83373cf350401ea9820 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:24:38 +0200
Subject: [PATCH 038/207] rename

---
 problems/amd.yaml                                     | 6 ++++++
 problems/amd/{fp8_matmul => amd-fp8-mm}/reference.py  | 0
 problems/amd/{fp8_matmul => amd-fp8-mm}/submission.py | 0
 problems/amd/{fp8_matmul => amd-fp8-mm}/task.py       | 0
 problems/amd/{fp8_matmul => amd-fp8-mm}/task.yml      | 0
 5 files changed, 6 insertions(+)
 rename problems/amd/{fp8_matmul => amd-fp8-mm}/reference.py (100%)
 rename problems/amd/{fp8_matmul => amd-fp8-mm}/submission.py (100%)
 rename problems/amd/{fp8_matmul => amd-fp8-mm}/task.py (100%)
 rename problems/amd/{fp8_matmul => amd-fp8-mm}/task.yml (100%)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index aedbd0ec..e2cc9641 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -10,3 +10,9 @@ problems:
     deadline: "2025-06-08"
     gpus:
       - MI300
+
+  - directory: amd/amd-fp8-mm
+    name: amd-identity
+    deadline: "2025-05-27"
+    gpus:
+      - MI300
diff --git a/problems/amd/fp8_matmul/reference.py b/problems/amd/amd-fp8-mm/reference.py
similarity index 100%
rename from problems/amd/fp8_matmul/reference.py
rename to problems/amd/amd-fp8-mm/reference.py
diff --git a/problems/amd/fp8_matmul/submission.py b/problems/amd/amd-fp8-mm/submission.py
similarity index 100%
rename from problems/amd/fp8_matmul/submission.py
rename to problems/amd/amd-fp8-mm/submission.py
diff --git a/problems/amd/fp8_matmul/task.py b/problems/amd/amd-fp8-mm/task.py
similarity index 100%
rename from problems/amd/fp8_matmul/task.py
rename to problems/amd/amd-fp8-mm/task.py
diff --git a/problems/amd/fp8_matmul/task.yml b/problems/amd/amd-fp8-mm/task.yml
similarity index 100%
rename from problems/amd/fp8_matmul/task.yml
rename to problems/amd/amd-fp8-mm/task.yml

From d865af3004c64635622d2d9db3ae4324ad1adc0e Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:27:18 +0200
Subject: [PATCH 039/207] fix template

---
 problems/amd/amd-fp8-mm/task.yml    |  2 +-
 problems/amd/amd-fp8-mm/template.py | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 problems/amd/amd-fp8-mm/template.py

diff --git a/problems/amd/amd-fp8-mm/task.yml b/problems/amd/amd-fp8-mm/task.yml
index b0e65111..ca894e9b 100644
--- a/problems/amd/amd-fp8-mm/task.yml
+++ b/problems/amd/amd-fp8-mm/task.yml
@@ -41,7 +41,7 @@ config:
   main: "eval.py"
 
 templates:
-  Python: "../template.py"
+  Python: "template.py"
 
 tests:
   - {"m": 64, "n": 1536, "k": 7168, "seed": 42}
diff --git a/problems/amd/amd-fp8-mm/template.py b/problems/amd/amd-fp8-mm/template.py
new file mode 100644
index 00000000..a728c8a2
--- /dev/null
+++ b/problems/amd/amd-fp8-mm/template.py
@@ -0,0 +1,22 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of block-scale fp8 gemm
+    Args:
+        data: Tuple that expands to:
+            a: torch.Tensor[float8_e4m3fnuz] of shape [m, k],
+            b: torch.Tensor[float8_e4m3fnuz] of shape [n, k],
+            a_scale: torch.Tensor[float32] of shape [m, k // 128],
+            b_scale: torch.Tensor[float32] of shape [n // 128, k // 128],
+            c: torch.Tensor[bfloat16] of shape [m, n]
+    Returns:
+        Tensor containing output in bf16
+    """
+    # c: [m, n] is pre-allocated memory to avoid timing allocation overhead.
+    a, b, a_scale, b_scale, c = data
+
+    # Your implementation here
+
+    return c

From 6d381bbcce547ba4829a0c8828023b03aff33616 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:40:31 +0200
Subject: [PATCH 040/207] fixes

---
 problems/amd.yaml                             |   7 ++---
 .../amd/{amd-fp8-mm => fp8-mm}/reference.py   |   0
 .../amd/{amd-fp8-mm => fp8-mm}/submission.py  |   0
 problems/amd/{amd-fp8-mm => fp8-mm}/task.py   |   0
 problems/amd/{amd-fp8-mm => fp8-mm}/task.yml  |  28 ++++++++++--------
 .../amd/{amd-fp8-mm => fp8-mm}/template.py    |   0
 problems/amd/fp8_matmul/.DS_Store             | Bin 6148 -> 0 bytes
 .../{amd-identity => identity}/reference.py   |   0
 .../{amd-identity => identity}/submission.py  |   2 +-
 .../amd/{amd-identity => identity}/task.py    |   0
 .../amd/{amd-identity => identity}/task.yml   |   2 +-
 .../{amd-identity => identity}/template.py    |   0
 12 files changed, 21 insertions(+), 18 deletions(-)
 rename problems/amd/{amd-fp8-mm => fp8-mm}/reference.py (100%)
 rename problems/amd/{amd-fp8-mm => fp8-mm}/submission.py (100%)
 rename problems/amd/{amd-fp8-mm => fp8-mm}/task.py (100%)
 rename problems/amd/{amd-fp8-mm => fp8-mm}/task.yml (82%)
 rename problems/amd/{amd-fp8-mm => fp8-mm}/template.py (100%)
 delete mode 100644 problems/amd/fp8_matmul/.DS_Store
 rename problems/amd/{amd-identity => identity}/reference.py (100%)
 rename problems/amd/{amd-identity => identity}/submission.py (82%)
 rename problems/amd/{amd-identity => identity}/task.py (100%)
 rename problems/amd/{amd-identity => identity}/task.yml (98%)
 rename problems/amd/{amd-identity => identity}/template.py (100%)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index e2cc9641..20a1d449 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -5,14 +5,13 @@ deadline: "2025-05-27"
 description: "AMD Developer Challenge 2025: Inference Sprint"
 # the list of problems
 problems:
-  - directory: amd/amd-identity
+  - directory: amd/identity
     name: amd-identity
     deadline: "2025-06-08"
     gpus:
       - MI300
-
-  - directory: amd/amd-fp8-mm
-    name: amd-identity
+  - directory: amd/fp8-mm
+    name: amd-fp8-mm
     deadline: "2025-05-27"
     gpus:
       - MI300
diff --git a/problems/amd/amd-fp8-mm/reference.py b/problems/amd/fp8-mm/reference.py
similarity index 100%
rename from problems/amd/amd-fp8-mm/reference.py
rename to problems/amd/fp8-mm/reference.py
diff --git a/problems/amd/amd-fp8-mm/submission.py b/problems/amd/fp8-mm/submission.py
similarity index 100%
rename from problems/amd/amd-fp8-mm/submission.py
rename to problems/amd/fp8-mm/submission.py
diff --git a/problems/amd/amd-fp8-mm/task.py b/problems/amd/fp8-mm/task.py
similarity index 100%
rename from problems/amd/amd-fp8-mm/task.py
rename to problems/amd/fp8-mm/task.py
diff --git a/problems/amd/amd-fp8-mm/task.yml b/problems/amd/fp8-mm/task.yml
similarity index 82%
rename from problems/amd/amd-fp8-mm/task.yml
rename to problems/amd/fp8-mm/task.yml
index ca894e9b..81e098e1 100644
--- a/problems/amd/amd-fp8-mm/task.yml
+++ b/problems/amd/fp8-mm/task.yml
@@ -10,6 +10,7 @@ files:
 lang: "py"
 
 description: |
+  
   You will implement a custom fp8-blockwise matmul kernel optimized for MI300.
   You will be given single-precision scaling factors for your matrices.
   The shapes of all outer and inner dimensions of tensors are from DeepSeek-R1.
@@ -18,24 +19,27 @@ description: |
   (a, b, a_scale, b_scale, c)
   ```
   where `a` and `b` are the input matrices, `a_scale` and `b_scale` are the scaling factors for `a` and `b` respectively,
-  and `c` is the output matrix.
-  `a` is M x K in column-major order, and `b` is N x K in column-major order.
-  `a_scale` is M x K in column-major order, and `b_scale` is N x K in column-major order.
-  `c` is M x N in ROW-major order.
+  and `c` is the output matrix:
+  * `a` is M x K in column-major order in e4m3fnuz
+  * `b` is N x K in column-major order in e4m3fnuz
+  * `a_scale` is M x K in column-major order in fp32
+  * `b_scale` is N x K in column-major order in fp32
+  * `c` is M x N in ROW-major order in bf16
 
   The ranking criteria is the geometric mean of the benchmark results.
 
   For the grand price, your kernel will be evaluated against the speed of light analysis
   and the solution closest to the speed of light will be awarded the grand price.
-
+  ```
   The speed of light analysis is:
-   M       N       K        time[us]
-  1024    1536    7168      8.6331019
-  1024    4608    7168     25.8936898
-  6144    1536    7168     51.7775517
-  6144    4608    7168    155.2989590
-  1024    7168     256      3.1671426
-  6144    7168     256     17.2712935
+   M       N       K     time[us]
+  1024    1536    7168      8.63
+  1024    4608    7168     25.89
+  6144    1536    7168     51.78
+  6144    4608    7168    155.30
+  1024    7168     256      3.17
+  6144    7168     256     17.27
+  ```
 
 config:
   main: "eval.py"
diff --git a/problems/amd/amd-fp8-mm/template.py b/problems/amd/fp8-mm/template.py
similarity index 100%
rename from problems/amd/amd-fp8-mm/template.py
rename to problems/amd/fp8-mm/template.py
diff --git a/problems/amd/fp8_matmul/.DS_Store b/problems/amd/fp8_matmul/.DS_Store
deleted file mode 100644
index 02f8f799ee473f80d1b192895a924e0fbecd9496..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKyH3MU47E!UL4ec+i3RC?K_xcEP?dq14**&MLIi>+9lB-bWB5ckeyD1bbYMZK
zvMc-CICiXiZWPx<#H0EB8PR}<GN@p0AEra(zGzE&?%4&hKI4%Jdcp68-doY(_>T<8
z*&WjZz0o;NZT0*Lcnh3kL9aB09U^9NRaVnUSxu0ISB5`<!*3Pc+@Z$*KJSQTG^YvP
zoF}YiuobFyRHUy)#Xb6<J7hd}Mqf0dvz6!ts+EoXoHpWhQ(=OEU?3RSItI|QMY03K
zu)#nu5DY9Ckncl+3g(Wrp?*4GbOZqQ;a0&~W(mnjj=5uPh&vFrRG_7@j~Hy}@F$PU
z9cx2NC-&iky)yfR!de~APZ~~~8-@)Af`N4gcI<K>{r__F{J)+QSHVCquvH9je|$Y2
zVN2dyTbq+!o1nK)5s9k}mnm3iD@Lre;uEL}{7DwT+_5%<2NHh-l!h?Dz@IYk1DE_(
AYXATM

diff --git a/problems/amd/amd-identity/reference.py b/problems/amd/identity/reference.py
similarity index 100%
rename from problems/amd/amd-identity/reference.py
rename to problems/amd/identity/reference.py
diff --git a/problems/amd/amd-identity/submission.py b/problems/amd/identity/submission.py
similarity index 82%
rename from problems/amd/amd-identity/submission.py
rename to problems/amd/identity/submission.py
index 81ed48d3..c07dd375 100644
--- a/problems/amd/amd-identity/submission.py
+++ b/problems/amd/identity/submission.py
@@ -1,4 +1,4 @@
-#!POPCORN leaderboard amd-identity
+#!POPCORN leaderboard identity
 from task import input_t, output_t
 import torch
 
diff --git a/problems/amd/amd-identity/task.py b/problems/amd/identity/task.py
similarity index 100%
rename from problems/amd/amd-identity/task.py
rename to problems/amd/identity/task.py
diff --git a/problems/amd/amd-identity/task.yml b/problems/amd/identity/task.yml
similarity index 98%
rename from problems/amd/amd-identity/task.yml
rename to problems/amd/identity/task.yml
index a62ce986..f3b3e361 100644
--- a/problems/amd/amd-identity/task.yml
+++ b/problems/amd/identity/task.yml
@@ -1,4 +1,4 @@
-# name: "amd-identity"
+# name: "identity"
 
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
diff --git a/problems/amd/amd-identity/template.py b/problems/amd/identity/template.py
similarity index 100%
rename from problems/amd/amd-identity/template.py
rename to problems/amd/identity/template.py

From 8fa795353cacea896af217da0651c246fe0cfe70 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Tue, 15 Apr 2025 23:25:02 +0200
Subject: [PATCH 041/207] "randomize" seeds

---
 problems/amd/fp8-mm/task.yml | 56 ++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/problems/amd/fp8-mm/task.yml b/problems/amd/fp8-mm/task.yml
index 81e098e1..43f912a7 100644
--- a/problems/amd/fp8-mm/task.yml
+++ b/problems/amd/fp8-mm/task.yml
@@ -48,35 +48,35 @@ templates:
   Python: "template.py"
 
 tests:
-  - {"m": 64, "n": 1536, "k": 7168, "seed": 42}
-  - {"m": 64, "n": 3072, "k": 1536, "seed": 42}
-  - {"m": 64, "n": 576, "k": 7168, "seed": 42}
-  - {"m": 96, "n": 7168, "k": 256, "seed": 42}
-  - {"m": 96, "n": 7168, "k": 2048, "seed": 42}
-  - {"m": 96, "n": 4608, "k": 7168, "seed": 42}
-  - {"m": 128, "n": 7168, "k": 2304, "seed": 42}
-  - {"m": 128, "n": 512, "k": 7168, "seed": 42}
-  - {"m": 512, "n": 4096, "k": 512, "seed": 42}
-  - {"m": 512, "n": 1536, "k": 7168, "seed": 42}
+  - {"m": 64, "n": 1536, "k": 7168, "seed": 6635}
+  - {"m": 64, "n": 3072, "k": 1536, "seed": 1236}
+  - {"m": 64, "n": 576, "k": 7168, "seed": 542}
+  - {"m": 96, "n": 7168, "k": 256, "seed": 1234}
+  - {"m": 96, "n": 7168, "k": 2048, "seed": 4153}
+  - {"m": 96, "n": 4608, "k": 7168, "seed": 412}
+  - {"m": 128, "n": 7168, "k": 2304, "seed": 624}
+  - {"m": 128, "n": 512, "k": 7168, "seed": 2514}
+  - {"m": 512, "n": 4096, "k": 512, "seed": 543}
+  - {"m": 512, "n": 1536, "k": 7168, "seed": 12341}
 
 benchmarks:
-  - {"m": 1024, "n": 1536, "k": 7168, "seed": 42}
-  - {"m": 1024, "n": 3072, "k": 1536, "seed": 42}
-  - {"m": 1024, "n": 576, "k": 7168, "seed": 42}
-  - {"m": 1024, "n": 7168, "k": 256, "seed": 42}
-  - {"m": 1024, "n": 7168, "k": 2048, "seed": 42}
-  - {"m": 1024, "n": 4608, "k": 7168, "seed": 42}
-  - {"m": 1024, "n": 7168, "k": 2304, "seed": 42}
-  - {"m": 1024, "n": 512, "k": 7168, "seed": 42}
-  - {"m": 1024, "n": 4096, "k": 512, "seed": 42}
-  - {"m": 6144, "n": 1536, "k": 7168, "seed": 42}
-  - {"m": 6144, "n": 3072, "k": 1536, "seed": 42}
-  - {"m": 6144, "n": 576, "k": 7168, "seed": 42}
-  - {"m": 6144, "n": 7168, "k": 256, "seed": 42}
-  - {"m": 6144, "n": 7168, "k": 2048, "seed": 42}
-  - {"m": 6144, "n": 4608, "k": 7168, "seed": 42}
-  - {"m": 6144, "n": 7168, "k": 2304, "seed": 42}
-  - {"m": 6144, "n": 512, "k": 7168, "seed": 42}
-  - {"m": 6144, "n": 4096, "k": 512, "seed": 42}
+  - {"m": 1024, "n": 1536, "k": 7168, "seed": 8135}
+  - {"m": 1024, "n": 3072, "k": 1536, "seed": 6251}
+  - {"m": 1024, "n": 576, "k": 7168, "seed": 12346}
+  - {"m": 1024, "n": 7168, "k": 256, "seed": 5364}
+  - {"m": 1024, "n": 7168, "k": 2048, "seed": 6132}
+  - {"m": 1024, "n": 4608, "k": 7168, "seed": 7531}
+  - {"m": 1024, "n": 7168, "k": 2304, "seed": 12345}
+  - {"m": 1024, "n": 512, "k": 7168, "seed": 6563}
+  - {"m": 1024, "n": 4096, "k": 512, "seed": 17512}
+  - {"m": 6144, "n": 1536, "k": 7168, "seed": 6543}
+  - {"m": 6144, "n": 3072, "k": 1536, "seed": 234}
+  - {"m": 6144, "n": 576, "k": 7168, "seed": 9863}
+  - {"m": 6144, "n": 7168, "k": 256, "seed": 764243}
+  - {"m": 6144, "n": 7168, "k": 2048, "seed": 76547}
+  - {"m": 6144, "n": 4608, "k": 7168, "seed": 65436}
+  - {"m": 6144, "n": 7168, "k": 2304, "seed": 452345}
+  - {"m": 6144, "n": 512, "k": 7168, "seed": 12341}
+  - {"m": 6144, "n": 4096, "k": 512, "seed": 45245}
 
 ranking_by: "geom"

From b68a149bcd8701532eeedc774d27062429ce4f99 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Wed, 16 Apr 2025 23:48:48 +0200
Subject: [PATCH 042/207] Update FP8MM (#22)

---
 problems/amd/eval.py                | 35 ++++++++-----
 problems/amd/fp8-mm/reference.py    |  2 +-
 problems/amd/fp8-mm/task.yml        |  4 ++
 problems/amd/fp8-mm/template-hip.py | 76 +++++++++++++++++++++++++++++
 problems/amd/utils.py               | 31 ++++++------
 5 files changed, 120 insertions(+), 28 deletions(-)
 create mode 100644 problems/amd/fp8-mm/template-hip.py

diff --git a/problems/amd/eval.py b/problems/amd/eval.py
index 7b83d750..ce414f9b 100644
--- a/problems/amd/eval.py
+++ b/problems/amd/eval.py
@@ -137,6 +137,17 @@ def _clone_data(data):
         return data
 
 
+def wrap_check_implementation(data, submission_output):
+    # Normalizes the checker result to a (passed, message) pair. Legacy problem
+    # definitions return only an error string (empty means success); newer ones
+    # return (bool, str) and are passed through unchanged.
+    result = check_implementation(data, submission_output)
+    if isinstance(result, tuple):
+        return result
+    else:
+        return not bool(result), result
+
+
 def _run_single_test(test: TestCase):
     """
     Runs a single test case. Do not call directly
@@ -146,7 +157,7 @@ def _run_single_test(test: TestCase):
     torch.cuda.synchronize()
     submission_output = custom_kernel(_clone_data(data))
     torch.cuda.synchronize()
-    return check_implementation(data, submission_output)
+    return wrap_check_implementation(data, submission_output)
 
 
 def run_single_test(pool: multiprocessing.Pool, test: TestCase):
@@ -168,13 +179,15 @@ def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[T
     logger.log("test-count", len(tests))
     for idx, test in enumerate(tests):
         logger.log(f"test.{idx}.spec", test.spec)
-        error = run_single_test(pool, test)
-        if error:
+        good, message = run_single_test(pool, test)
+        if not good:
             logger.log(f"test.{idx}.status", "fail")
-            logger.log(f"test.{idx}.error", error)
+            logger.log(f"test.{idx}.error", message)
             passed = False
         else:
             logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
 
     if passed:
         logger.log("check", "pass")
@@ -196,9 +209,9 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     check_copy = _clone_data(data)
     #  first, one obligatory correctness check
     output = custom_kernel(data)
-    error = check_implementation(check_copy, output)
-    if error:
-        return error
+    good, message = wrap_check_implementation(check_copy, output)
+    if not good:
+        return message
 
     # now, do multiple timing runs without further correctness testing
     # there is an upper bound of 100 runs, and a lower bound of 3 runs;
@@ -220,16 +233,16 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
         end = time.perf_counter_ns()
 
         if recheck:
-            error = check_implementation(check_copy, output)
-            if error:
-                return error
+            good, message = wrap_check_implementation(check_copy, output)  # wrapper also accepts legacy str-only checkers
+            if not good:
+                return message
 
         del output
         durations.append(end-start)
 
         if i > 1:
             stats = calculate_stats(durations)
-            if stats.err / stats.mean < 0.01 or stats.mean * stats.runs > max_time_ns:
+            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns:
                 break
 
     return calculate_stats(durations)
diff --git a/problems/amd/fp8-mm/reference.py b/problems/amd/fp8-mm/reference.py
index f4570a77..86c20a5b 100644
--- a/problems/amd/fp8-mm/reference.py
+++ b/problems/amd/fp8-mm/reference.py
@@ -85,4 +85,4 @@ def ref_kernel(data: input_t) -> output_t:
     return c
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-02, atol=1e-04)
+check_implementation = make_match_reference(ref_kernel, rtol=2e-02, atol=1e-03)
diff --git a/problems/amd/fp8-mm/task.yml b/problems/amd/fp8-mm/task.yml
index 43f912a7..d4e435db 100644
--- a/problems/amd/fp8-mm/task.yml
+++ b/problems/amd/fp8-mm/task.yml
@@ -25,6 +25,8 @@ description: |
   * `a_scale` is M x K in column-major order in fp32
   * `b_scale` is N x K in column-major order in fp32
   * `c` is M x N in ROW-major order in bf16
+  
+  Matrix sizes `m` and `n` are divisible by 64, `k` is divisible by 128.
 
   The ranking criteria is the geometric mean of the benchmark results.
 
@@ -46,8 +48,10 @@ config:
 
 templates:
   Python: "template.py"
+  HIP: "template-hip.py"
 
 tests:
+  - {"m": 64, "n": 64, "k": 128, "seed": 6635}
   - {"m": 64, "n": 1536, "k": 7168, "seed": 6635}
   - {"m": 64, "n": 3072, "k": 1536, "seed": 1236}
   - {"m": 64, "n": 576, "k": 7168, "seed": 542}
diff --git a/problems/amd/fp8-mm/template-hip.py b/problems/amd/fp8-mm/template-hip.py
new file mode 100644
index 00000000..9e7cd11d
--- /dev/null
+++ b/problems/amd/fp8-mm/template-hip.py
@@ -0,0 +1,76 @@
+# This script provides a template for using load_inline to run a HIP kernel for the fp8 block-scaled matmul (fp8_mm) task.
+from torch.utils.cpp_extension import load_inline
+from task import input_t, output_t
+CPP_WRAPPER = """
+void fp8_mm(torch::Tensor a, torch::Tensor b, torch::Tensor as, torch::Tensor bs, torch::Tensor c);
+"""
+
+CUDA_SRC = """
+#include <hip/amd_detail/amd_hip_fp8.h>
+#include <hip/amd_detail/amd_hip_bf16.h>
+
+constexpr const int BLOCK = 128;
+
+__global__ void custom_kernel(const __hip_fp8_e4m3_fnuz* a, const __hip_fp8_e4m3_fnuz* b, const float* as, const float* bs, 
+                   __hip_bfloat16* c, int m, int n, int k) {
+                   
+    // Your implementation here
+    int cx = threadIdx.x + blockDim.x * blockIdx.x;
+    int cy = threadIdx.y + blockDim.y * blockIdx.y;
+    if(cx >= m || cy >= n) return;
+    
+    int sn = (n + BLOCK - 1) / BLOCK;
+    
+    float result = 0;
+    // split loop into an outer loop over different blocks, and an inner loop within one block.
+    // we can assume k % BLOCK == 0.
+    for(int i = 0; i < k; i += BLOCK) {
+        // block results accumulates the inner product across a single block.
+        // within each block, scales are constant, so we can lift the scaling 
+        // outside of the inner loop.
+        float block_result = 0;
+        for(int ii = 0; ii < BLOCK; ++ii) {
+            // load input matrix elements and convert to float for computations
+            float av = (float)a[cx + (i + ii) * m];
+            float bv = (float)b[cy + (i + ii) * n];
+            block_result += av * bv; 
+        }
+        
+        // before we can go to the next block, scale the result of the current block
+        // and accumulate to final result
+        // note the different indexing into as and bs
+        result += block_result * as[cx + i/BLOCK * m] * bs[cy/BLOCK + i/BLOCK * sn];
+    }
+    
+    // finally, write the result as bf16
+    c[cx * n + cy] = (__hip_bfloat16)result;
+}
+
+void fp8_mm(torch::Tensor a, torch::Tensor b, torch::Tensor as, torch::Tensor bs, torch::Tensor c) {
+    int m = a.size(0);
+    int n = b.size(0);
+    int k = a.size(1);
+    custom_kernel<<<dim3((m+15)/16, (n+15)/16), dim3(16, 16), 0, 0>>> ((__hip_fp8_e4m3_fnuz*)a.data_ptr(), (__hip_fp8_e4m3_fnuz*)b.data_ptr(), 
+    as.data_ptr<float>(), bs.data_ptr<float>(), (__hip_bfloat16*)c.data_ptr(), m, n, k);
+    //C10_CUDA_CHECK(cudaGetLastError());
+}
+"""
+
+import os
+os.environ["CXX"] = "clang++"
+
+module = load_inline(
+    name='fp8_mm',
+    cpp_sources=[CPP_WRAPPER],
+    cuda_sources=[CUDA_SRC],
+    functions=['fp8_mm'],
+    verbose=True,
+    extra_cuda_cflags=["--offload-arch=gfx942", "-std=c++20"],
+)
+
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b, a_scale, b_scale, c = data
+    module.fp8_mm(a, b, a_scale, b_scale, c)
+    return c
+
diff --git a/problems/amd/utils.py b/problems/amd/utils.py
index c3eb2447..73551022 100644
--- a/problems/amd/utils.py
+++ b/problems/amd/utils.py
@@ -1,4 +1,6 @@
 import random
+from typing import Tuple
+
 import numpy as np
 import torch
 
@@ -32,7 +34,7 @@ def verbose_allclose(
         rtol=1e-05,
         atol=1e-08,
         max_print=5
-) -> list[str]:
+) -> Tuple[bool, list[str]]:
     """
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
 
@@ -42,16 +44,13 @@ def verbose_allclose(
     rtol (float): Relative tolerance; relative to expected
     atol (float): Absolute tolerance.
     max_print (int): Maximum number of mismatched elements to print.
-
-    Raises:
-    AssertionError: If the tensors are not all close within the given tolerance.
     """
     # Check if the shapes of the tensors match
     if received.shape != expected.shape:
-        return ["SIZE MISMATCH"]
+        return False, ["SIZE MISMATCH"]
 
     # Calculate the difference between the tensors
-    diff = torch.abs(received - expected)
+    diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32))
 
     # Determine the tolerance
     tolerance = atol + rtol * torch.abs(expected)
@@ -84,16 +83,16 @@ def verbose_allclose(
 
         for index in mismatched_indices[:max_print]:
             i = tuple(index.tolist())
-            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
-        return mismatch_details
+        return False, mismatch_details
 
-    return []
+    return True, [f"Maximum error: {torch.max(diff)}"]
 
 
 @torch.no_grad()
-def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int = 5) -> Tuple[bool, list[str]]:
     """
     Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
 
@@ -117,12 +116,12 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print:
 
         for index in mismatched_indices[:max_print]:
             i = tuple(index.tolist())
-            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
-        return mismatch_details
+        return False, mismatch_details
 
-    return []
+    return True, []
 
 
 def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
@@ -130,12 +129,12 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
     Convenient "default" implementation for tasks' `check_implementation` function.
     """
     expected = reference(data)
-    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+    good, reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
 
     if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+        return good, "\\n".join(reasons)
 
-    return ''
+    return good, ''
 
 
 def make_match_reference(reference: callable, **kwargs):

From 8ebd201acb57174b270d74fe32f03d967c924d2f Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Tue, 29 Apr 2025 14:45:03 -0400
Subject: [PATCH 043/207] Create .gitignore

---
 .gitignore | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..393b0244
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,175 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+*.DS_Store
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc

From ccdc5019d08424b7e768571740181daacb19d0a1 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 29 Apr 2025 14:48:29 -0400
Subject: [PATCH 044/207] Add MoE problem for AMD competition

---
 problems/amd.yaml              |   5 +
 problems/amd/moe/reference.py  | 243 +++++++++++++++++++++++++++++++++
 problems/amd/moe/submission.py | 141 +++++++++++++++++++
 problems/amd/moe/task.py       |  16 +++
 problems/amd/moe/task.yml      |  57 ++++++++
 5 files changed, 462 insertions(+)
 create mode 100644 problems/amd/moe/reference.py
 create mode 100644 problems/amd/moe/submission.py
 create mode 100644 problems/amd/moe/task.py
 create mode 100644 problems/amd/moe/task.yml

diff --git a/problems/amd.yaml b/problems/amd.yaml
index 20a1d449..bcf91f67 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -15,3 +15,8 @@ problems:
     deadline: "2025-05-27"
     gpus:
       - MI300
+  - directory: amd/moe
+    name: amd-mixture-of-experts
+    deadline: "2025-05-27"
+    gpus:
+      - MI300
\ No newline at end of file
diff --git a/problems/amd/moe/reference.py b/problems/amd/moe/reference.py
new file mode 100644
index 00000000..b6ee16d7
--- /dev/null
+++ b/problems/amd/moe/reference.py
@@ -0,0 +1,243 @@
+from utils import make_match_reference
+from task import input_t, output_t
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Dict, Tuple, List, Optional
+import math
+
+# Reference code in PyTorch
+class Expert(nn.Module):
+    def __init__(self, config: Dict, d_expert: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.act_fn = nn.SiLU()
+        self.d_hidden: int = config["d_hidden"]
+        self.d_expert: int = config["d_expert"] if d_expert is None else d_expert
+
+        self.W_gate = nn.Linear(self.d_hidden, self.d_expert, bias=False)
+        self.W_up = nn.Linear(self.d_hidden, self.d_expert, bias=False)
+        self.W_down = nn.Linear(self.d_expert, self.d_hidden, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate = self.act_fn(self.W_gate(x))
+        out = self.W_down(gate * self.W_up(x))
+        return out
+
+
+class MoEGate(nn.Module):
+    def __init__(self, config: Dict):
+        super().__init__()
+        self.top_k: int = config["n_experts_per_token"]
+        self.num_experts: int = config["n_routed_experts"]
+        self.d_hidden: int = config["d_hidden"]
+
+        self.W_g = nn.Linear(self.d_hidden, self.num_experts, bias=False)
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        logits = self.W_g(x)
+        scores = logits.softmax(dim=-1)
+        topk_scores, topk_indices = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
+
+        return topk_indices, topk_scores
+
+
+class MoE(nn.Module):  # routed experts plus one shared expert; forward() sums both outputs
+    def __init__(self, config: Dict):
+        super().__init__()
+        self.config = config
+        self.experts = nn.ModuleList([
+            Expert(config)
+            for _ in range(config["n_routed_experts"])
+        ])
+        self.gating_network = MoEGate(config)
+        shared_expert_dim = config["d_expert"] * config["n_shared_experts"]  # all shared experts are modelled as one wider Expert
+        self.shared_expert = Expert(config=config, d_expert=shared_expert_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        shared_output = self.shared_expert(x)
+        expert_indices, expert_scores = self.gating_network(x)
+        batch_size, seq_len, hidden_dim = x.shape
+        orig_shape = x.shape
+        x_flat = x.view(-1, hidden_dim)  # one row per token
+        flat_expert_indices = expert_indices.view(-1)  # top-k slots flattened; a token's slots are adjacent
+        flat_expert_weights = expert_scores.view(-1, 1)  # gate score per slot, kept 2-D to broadcast over hidden_dim
+        routed_output_flat = self.moe_infer(x_flat,
+                                            flat_expert_indices,
+                                            flat_expert_weights)
+
+        routed_output = routed_output_flat.view(*orig_shape)
+        return routed_output + shared_output
+
+    @torch.no_grad()
+    def moe_infer(self,
+                  x: torch.Tensor,
+                  flat_expert_indices: torch.Tensor,
+                  flat_expert_weights: torch.Tensor
+                 ) -> torch.Tensor:
+        expert_cache = torch.zeros_like(x)  # accumulator for the weighted routed outputs, one row per token
+        idxs = flat_expert_indices.argsort()  # slot indices ordered by expert id, so each expert's slots form a run
+        counts = flat_expert_indices.bincount().cpu().numpy()
+        tokens_per_expert = counts.cumsum()  # end offset (exclusive) of each expert's run within idxs
+        num_per_tok = self.config["n_experts_per_token"]
+        token_idxs = idxs // num_per_tok  # slot index -> row of x that the slot belongs to
+        for expert_id, end_idx in enumerate(tokens_per_expert):
+            start_idx = 0 if expert_id == 0 else tokens_per_expert[expert_id - 1]
+            if start_idx == end_idx:  # no token was routed to this expert
+                continue
+
+            expert = self.experts[expert_id]
+            exp_token_idxs = token_idxs[start_idx:end_idx]
+            expert_tokens = x[exp_token_idxs]
+            expert_out    = expert(expert_tokens)
+            expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])  # scale each row by its slot's gate score
+            expert_cache.scatter_reduce_(  # add each row into its token's accumulator row
+                0,
+                exp_token_idxs.view(-1, 1).repeat(1, x.shape[-1]),
+                expert_out,
+                reduce='sum'
+            )
+
+        return expert_cache
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of DeepSeek-style Mixture of Experts using PyTorch.
+
+    Args:
+        data: Tuple of (input: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+            - input: Input tensor of shape [batch_size, seq_len, hidden_dim]
+            - weights: Dictionary containing model weights
+            - config: Dictionary containing model configuration parameters
+
+    Returns:
+        output: Tensor with the same shape as the input,
+            [batch_size, seq_len, hidden_dim]: the routed-expert output plus the
+            shared-expert output. No auxiliary data is returned.
+    """
+    input_tensor, weights, config = data
+    num_experts = config["n_routed_experts"]
+    moe = MoE(config)
+
+    # Fill in the given weights of the model
+    moe.gating_network.W_g.weight = nn.Parameter(weights['router.weight'])
+
+    for i in range(num_experts):
+        gate_proj_weight = weights[f'experts.{i}.0.weight']
+        up_proj_weight = weights[f'experts.{i}.1.weight']
+        down_proj_weight = weights[f'experts.{i}.2.weight']
+
+        # Transpose weights to match expected shape for nn.Linear
+        moe.experts[i].W_gate.weight = nn.Parameter(gate_proj_weight.t())
+        moe.experts[i].W_up.weight = nn.Parameter(up_proj_weight.t())
+        moe.experts[i].W_down.weight = nn.Parameter(down_proj_weight.t())
+
+    moe.shared_expert.W_gate.weight = nn.Parameter(weights['shared_experts.0.weight'].t())
+    moe.shared_expert.W_up.weight = nn.Parameter(weights['shared_experts.1.weight'].t())
+    moe.shared_expert.W_down.weight = nn.Parameter(weights['shared_experts.2.weight'].t())
+
+    output = moe(input_tensor)
+
+    return output
+
+
+# Input generation for the reference code
+
+def generate_input(
+    dhidden: int,
+    dexpert: int,
+    nroutedexperts: int,
+    nsharedexperts: int,
+    nexpertspertoken: int,
+    bs: int,
+    seqlen: int,
+    seed: int
+) -> input_t:
+
+    # Argument names omit underscores because "_" isn't parsing correctly for now; map them back to readable names.
+    d_hidden = dhidden
+    d_expert = dexpert
+    n_routed_experts = nroutedexperts
+    n_shared_experts = nsharedexperts
+    n_experts_per_token = nexpertspertoken
+    batch_size = bs
+    seq_len = seqlen
+
+    config = {
+        "d_hidden": d_hidden,
+        "d_expert": d_expert,
+        "n_routed_experts": n_routed_experts,
+        "n_shared_experts": n_shared_experts,
+        "n_experts_per_token": n_experts_per_token,
+        "batch_size": batch_size,
+        "seq_len": seq_len,
+    }
+
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+
+    num_experts = n_routed_experts
+    expert_dim = d_expert
+    weights = {}
+
+    input_tensor = torch.randn(
+        (batch_size, seq_len, d_hidden),
+        device='cuda',
+        dtype=torch.float16,
+        generator=gen
+    ).contiguous()
+
+    # Initialize router weights
+    weights['router.weight'] = torch.randn(
+        (num_experts, d_hidden),
+        device="cuda",
+        dtype=torch.float16,
+        generator=gen
+    ) / math.sqrt(d_hidden)
+
+    for i in range(num_experts):
+        weights[f'experts.{i}.0.weight'] = torch.randn(
+            (d_hidden, expert_dim),
+            device='cuda',
+            dtype=torch.float16,
+            generator=gen
+        ) / math.sqrt(expert_dim)
+
+        weights[f'experts.{i}.1.weight'] = torch.randn(
+            (d_hidden, expert_dim),
+            device='cuda',
+            dtype=torch.float16,
+            generator=gen
+        ) / math.sqrt(expert_dim)
+
+        weights[f'experts.{i}.2.weight'] = torch.randn(
+            (expert_dim, d_hidden),
+            device='cuda',
+            dtype=torch.float16,
+            generator=gen
+        ) / math.sqrt(d_hidden)
+    
+    weights['shared_experts.0.weight'] = torch.randn(
+        (d_hidden, expert_dim * n_shared_experts),
+        device='cuda',
+        dtype=torch.float16,
+        generator=gen
+    ) / math.sqrt(expert_dim * n_shared_experts)
+    weights['shared_experts.1.weight'] = torch.randn(
+        (d_hidden, expert_dim * n_shared_experts),
+        device='cuda',
+        dtype=torch.float16,
+        generator=gen
+    ) / math.sqrt(expert_dim * n_shared_experts)
+    weights['shared_experts.2.weight'] = torch.randn(
+        (expert_dim * n_shared_experts, d_hidden),
+        device='cuda',
+        dtype=torch.float16,
+        generator=gen
+    ) / math.sqrt(d_hidden)
+
+    return (input_tensor, weights, config)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2)
\ No newline at end of file
diff --git a/problems/amd/moe/submission.py b/problems/amd/moe/submission.py
new file mode 100644
index 00000000..7d085ff0
--- /dev/null
+++ b/problems/amd/moe/submission.py
@@ -0,0 +1,141 @@
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Dict, Tuple, Optional
+from task import input_t, output_t
+
+class Expert(nn.Module):
+    def __init__(self, config: Dict, d_expert: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.act_fn = nn.SiLU()
+        self.d_hidden: int = config["d_hidden"]
+        self.d_expert: int = config["d_expert"] if d_expert is None else d_expert
+
+        self.W_gate = nn.Linear(self.d_hidden, self.d_expert, bias=False)
+        self.W_up = nn.Linear(self.d_hidden, self.d_expert, bias=False)
+        self.W_down = nn.Linear(self.d_expert, self.d_hidden, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate = self.act_fn(self.W_gate(x))
+        out = self.W_down(gate * self.W_up(x))
+        return out
+
+
+class MoEGate(nn.Module):
+    def __init__(self, config: Dict):
+        super().__init__()
+        self.top_k: int = config["n_experts_per_token"]
+        self.num_experts: int = config["n_routed_experts"]
+        self.d_hidden: int = config["d_hidden"]
+
+        self.W_g = nn.Linear(self.d_hidden, self.num_experts, bias=False)
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        logits = self.W_g(x)
+        scores = logits.softmax(dim=-1)
+        topk_scores, topk_indices = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
+
+        return topk_indices, topk_scores
+
+
+class MoE(nn.Module):
+    def __init__(self, config: Dict):
+        super().__init__()
+        self.config = config
+        self.experts = nn.ModuleList([
+            Expert(config)
+            for _ in range(config["n_routed_experts"])
+        ])
+        self.gating_network = MoEGate(config)
+        shared_expert_dim = config["d_expert"] * config["n_shared_experts"]
+        self.shared_expert = Expert(config=config, d_expert=shared_expert_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        shared_output = self.shared_expert(x)
+        expert_indices, expert_scores = self.gating_network(x)
+        batch_size, seq_len, hidden_dim = x.shape
+        orig_shape = x.shape
+        x_flat = x.view(-1, hidden_dim)
+        flat_expert_indices = expert_indices.view(-1)
+        flat_expert_weights = expert_scores.view(-1, 1)
+        routed_output_flat = self.moe_infer(x_flat,
+                                            flat_expert_indices,
+                                            flat_expert_weights)
+
+        routed_output = routed_output_flat.view(*orig_shape)
+        return routed_output + shared_output
+
+    @torch.no_grad()
+    def moe_infer(self,
+                  x: torch.Tensor,
+                  flat_expert_indices: torch.Tensor,
+                  flat_expert_weights: torch.Tensor
+                 ) -> torch.Tensor:
+        expert_cache = torch.zeros_like(x)
+        idxs = flat_expert_indices.argsort()
+        counts = flat_expert_indices.bincount().cpu().numpy()
+        tokens_per_expert = counts.cumsum()
+        num_per_tok = self.config["n_experts_per_token"]
+        token_idxs = idxs // num_per_tok
+        for expert_id, end_idx in enumerate(tokens_per_expert):
+            start_idx = 0 if expert_id == 0 else tokens_per_expert[expert_id - 1]
+            if start_idx == end_idx:
+                continue
+
+            expert = self.experts[expert_id]
+            exp_token_idxs = token_idxs[start_idx:end_idx]
+            expert_tokens = x[exp_token_idxs]
+            expert_out    = expert(expert_tokens)
+            expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
+            expert_cache.scatter_reduce_(
+                0,
+                exp_token_idxs.view(-1, 1).repeat(1, x.shape[-1]),
+                expert_out,
+                reduce='sum'
+            )
+
+        return expert_cache
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Submission template for DeepSeek-style Mixture of Experts using PyTorch.
+    
+    Args:
+        data: Tuple of (input: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+            - input: Input tensor of shape [batch_size, seq_len, hidden_size]
+            - weights: Dictionary containing model weights
+            - config: Dictionary containing model configuration parameters
+            
+    Returns:
+        Tuple containing:
+            - output: Processed tensor [batch_size, seq_len, d_model]
+            - aux_data: Dictionary with auxiliary data
+    """
+    input_tensor, weights, config = data
+    num_experts = config["n_routed_experts"]
+    moe = MoE(config)
+
+    # Fill in the given weights of the model
+    moe.gating_network.W_g.weight = nn.Parameter(weights['router.weight'])
+
+    for i in range(num_experts):
+        gate_proj_weight = weights[f'experts.{i}.0.weight']
+        up_proj_weight = weights[f'experts.{i}.1.weight']
+        down_proj_weight = weights[f'experts.{i}.2.weight']
+
+        # Transpose weights to match expected shape for nn.Linear
+        moe.experts[i].W_gate.weight = nn.Parameter(gate_proj_weight.t())
+        moe.experts[i].W_up.weight = nn.Parameter(up_proj_weight.t())
+        moe.experts[i].W_down.weight = nn.Parameter(down_proj_weight.t())
+
+    moe.shared_expert.W_gate.weight = nn.Parameter(weights['shared_experts.0.weight'].t())
+    moe.shared_expert.W_up.weight = nn.Parameter(weights['shared_experts.1.weight'].t())
+    moe.shared_expert.W_down.weight = nn.Parameter(weights['shared_experts.2.weight'].t())
+
+    # Run the model
+    output = moe(input_tensor)
+
+    return output
\ No newline at end of file
diff --git a/problems/amd/moe/task.py b/problems/amd/moe/task.py
new file mode 100644
index 00000000..e963086e
--- /dev/null
+++ b/problems/amd/moe/task.py
@@ -0,0 +1,16 @@
+from typing import TypedDict, TypeVar, Tuple, Dict
+import torch
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, Dict[str, torch.Tensor], Dict])
+output_t = TypeVar("output_t", bound=Tuple[torch.Tensor, Dict])
+
+
+class TestSpec(TypedDict):
+    d_hidden: int
+    d_expert: int
+    n_routed_experts: int
+    n_shared_experts: int
+    n_experts_per_token: int
+    batch_size: int
+    seq_len: int
+    seed: int
\ No newline at end of file
diff --git a/problems/amd/moe/task.yml b/problems/amd/moe/task.yml
new file mode 100644
index 00000000..a00c1205
--- /dev/null
+++ b/problems/amd/moe/task.yml
@@ -0,0 +1,57 @@
+# name: 3_moe
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  For a more complete description, see: https://tinyurl.com/amd-comp-moe
+  Implement a DeepSeek-style Mixture of Experts (MoE) layer for efficient transformer models
+  on a single MI300X device.
+  
+  MoE is a technique that allows scaling model capacity without proportionally increasing computational costs
+  by using a routing mechanism to selectively activate only a subset of parameters for each token.
+  
+  Your task:
+  - Implement token routing using a simple softmax-based learned router
+  - Route tokens to the top-k experts based on router probabilities
+  - Process tokens through their assigned experts
+  - Combine expert outputs weighted by router probabilities
+  - Calculate appropriate auxiliary losses for training stability
+  
+  Input:
+  - `data`: Tuple of (input: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+    - input: Input tensor of shape [bs, seq_len, d_hidden]
+    - weights: Dictionary containing model weights
+    - config: Dictionary containing model configuration parameters
+  
+  Output:
+  - Tuple containing:
+    - output: Processed tensor [bs, seq_len, d_model]
+    - aux_data: Dictionary with auxiliary data like router probabilities and losses
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
+
+test_timeout: 540
+benchmark_timeout: 540
+ranked_timeout: 540
+ranking_by: "geom"
+
+tests:
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 4, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 512, "seed": 9371}
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 8, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 2, "seqlen": 512, "seed": 2291}
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 8, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 8192, "seed": 81934}
+
+benchmarks:
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 2048, "seed": 9371}
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 2, "seqlen": 4096, "seed": 12819}
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 8192, "seed": 1212}
\ No newline at end of file

From 1541c34ec50ca2a846bbdcd6fe09c1db72575aab Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Tue, 29 Apr 2025 20:55:41 +0200
Subject: [PATCH 045/207] Fix: remove amd moe template

---
 problems/amd/moe/task.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/problems/amd/moe/task.yml b/problems/amd/moe/task.yml
index a00c1205..e8f8e5eb 100644
--- a/problems/amd/moe/task.yml
+++ b/problems/amd/moe/task.yml
@@ -38,8 +38,6 @@ description: |
 config:
   main: "eval.py"
 
-templates:
-  Python: "../template.py"
 
 test_timeout: 540
 benchmark_timeout: 540
@@ -54,4 +52,4 @@ tests:
 benchmarks:
   - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 2048, "seed": 9371}
   - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 2, "seqlen": 4096, "seed": 12819}
-  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 8192, "seed": 1212}
\ No newline at end of file
+  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 8192, "seed": 1212}

From 3b3fa2e1f59daa0940e60b6ecd68750dcd1500cf Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Tue, 29 Apr 2025 20:57:25 +0200
Subject: [PATCH 046/207] Feat: add moe template

---
 problems/amd/moe/task.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/problems/amd/moe/task.yml b/problems/amd/moe/task.yml
index e8f8e5eb..e6a9b1b1 100644
--- a/problems/amd/moe/task.yml
+++ b/problems/amd/moe/task.yml
@@ -38,6 +38,8 @@ description: |
 config:
   main: "eval.py"
 
+templates:
+  Python: "submission.py"
 
 test_timeout: 540
 benchmark_timeout: 540

From 60113bd7c305a344f2b9786d55fba05bb946ed71 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Tue, 29 Apr 2025 21:39:50 +0200
Subject: [PATCH 047/207] Fix: remove duplicate benchmarks

---
 problems/amd/moe/task.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/problems/amd/moe/task.yml b/problems/amd/moe/task.yml
index e6a9b1b1..d9bf71d4 100644
--- a/problems/amd/moe/task.yml
+++ b/problems/amd/moe/task.yml
@@ -53,5 +53,4 @@ tests:
 
 benchmarks:
   - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 2048, "seed": 9371}
-  - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 2, "seqlen": 4096, "seed": 12819}
   - {"dhidden": 7168, "dexpert": 2048, "nroutedexperts": 32, "nsharedexperts": 1, "nexpertspertoken": 4, "bs": 1, "seqlen": 8192, "seed": 1212}

From 264fc31ed624284f00c224acaef60615bfbc84fc Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 29 Apr 2025 23:08:30 +0200
Subject: [PATCH 048/207] Fix: move to using specified timeouts

---
 problems/pmpp/conv2d_py/task.yml    | 4 ++++
 problems/pmpp/grayscale_py/task.yml | 4 ++++
 problems/pmpp/histogram_py/task.yml | 4 ++++
 problems/pmpp/matmul_py/task.yml    | 4 ++++
 problems/pmpp/prefixsum_py/task.yml | 4 ++++
 problems/pmpp/sort_py/task.yml      | 4 ++++
 problems/pmpp/vectoradd_py/task.yml | 4 ++++
 problems/pmpp/vectorsum_py/task.yml | 4 ++++
 8 files changed, 32 insertions(+)

diff --git a/problems/pmpp/conv2d_py/task.yml b/problems/pmpp/conv2d_py/task.yml
index 9e88a41b..55adc532 100644
--- a/problems/pmpp/conv2d_py/task.yml
+++ b/problems/pmpp/conv2d_py/task.yml
@@ -41,3 +41,7 @@ benchmarks:
   - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 6256}
   - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 8841}
   - {"size": 256, "kernelsize": 32, "channels": 128, "batch": 1, "seed": 6252}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/grayscale_py/task.yml b/problems/pmpp/grayscale_py/task.yml
index 90268ab1..cada0257 100644
--- a/problems/pmpp/grayscale_py/task.yml
+++ b/problems/pmpp/grayscale_py/task.yml
@@ -35,3 +35,7 @@ benchmarks:
   - {"size": 4096, "seed": 8841}
   - {"size": 8192, "seed": 6252}
   - {"size": 16384, "seed": 54352}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/histogram_py/task.yml b/problems/pmpp/histogram_py/task.yml
index 01baa5f8..489a98b6 100644
--- a/problems/pmpp/histogram_py/task.yml
+++ b/problems/pmpp/histogram_py/task.yml
@@ -37,3 +37,7 @@ benchmarks:
   - {"size": 2621440, "seed": 8753, "contention": 90}
   - {"size": 5242880, "seed": 6252, "contention": 10}
   - {"size": 10485760, "seed": 8841, "contention": 10}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/matmul_py/task.yml b/problems/pmpp/matmul_py/task.yml
index c97333c7..6924764b 100644
--- a/problems/pmpp/matmul_py/task.yml
+++ b/problems/pmpp/matmul_py/task.yml
@@ -38,3 +38,7 @@ benchmarks:
   - {"m": 1024, "n": 1536, "k": 1024, "seed": 321}
   - {"m": 2048, "n": 3072, "k": 2048, "seed": 32412}
   - {"m": 4096, "n": 5120, "k": 4096, "seed": 123456}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/prefixsum_py/task.yml b/problems/pmpp/prefixsum_py/task.yml
index 163b4f72..a91d1496 100644
--- a/problems/pmpp/prefixsum_py/task.yml
+++ b/problems/pmpp/prefixsum_py/task.yml
@@ -51,3 +51,7 @@ benchmarks:
   - {"size": 268435456, "seed": 91357}
   # - {"size": 536870912, "seed": 102468}
   # - {"size": 1073741824, "seed": 113579}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/sort_py/task.yml b/problems/pmpp/sort_py/task.yml
index 5ba9c318..5c702e29 100644
--- a/problems/pmpp/sort_py/task.yml
+++ b/problems/pmpp/sort_py/task.yml
@@ -35,3 +35,7 @@ benchmarks:
   - {"size": 1000000, "seed": 6256}
   - {"size": 10000000, "seed": 8841}
   - {"size": 100000000, "seed": 6252} 
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/vectoradd_py/task.yml b/problems/pmpp/vectoradd_py/task.yml
index 546fb36e..6906a313 100644
--- a/problems/pmpp/vectoradd_py/task.yml
+++ b/problems/pmpp/vectoradd_py/task.yml
@@ -35,3 +35,7 @@ benchmarks:
   - {"size": 4096, "seed": 2146}
   - {"size": 8192, "seed": 3129}
   - {"size": 16384, "seed": 54352}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp/vectorsum_py/task.yml b/problems/pmpp/vectorsum_py/task.yml
index d0fa91c6..8b3ddbb7 100644
--- a/problems/pmpp/vectorsum_py/task.yml
+++ b/problems/pmpp/vectorsum_py/task.yml
@@ -35,3 +35,7 @@ benchmarks:
   - {"size": 13107200, "seed": 6252}
   - {"size": 26214400, "seed": 82135}
   - {"size": 52428800, "seed": 12345}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180

From 53b7d86be8c9e37ae9088dce6d26ee41b32d16a7 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 29 Apr 2025 23:15:49 +0200
Subject: [PATCH 049/207] Feat: move deadline to june for pmpp

---
 problems/beta.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/problems/beta.yaml b/problems/beta.yaml
index fed11c76..a357c783 100644
--- a/problems/beta.yaml
+++ b/problems/beta.yaml
@@ -7,7 +7,7 @@ description: ""
 problems:
   - directory: pmpp/conv2d_py
     name: conv2d
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -15,7 +15,7 @@ problems:
       - L4
   - directory: pmpp/grayscale_py
     name: grayscale
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -23,7 +23,7 @@ problems:
       - L4
   - directory: pmpp/histogram_py
     name: histogram
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -31,7 +31,7 @@ problems:
       - L4
   - directory: pmpp/matmul_py
     name: matmul
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -39,7 +39,7 @@ problems:
       - L4
   - directory: pmpp/prefixsum_py
     name: prefixsum
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -47,7 +47,7 @@ problems:
       - L4
   - directory: pmpp/sort_py
     name: sort
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -55,7 +55,7 @@ problems:
       - L4
   - directory: pmpp/vectoradd_py
     name: vectoradd
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100
@@ -63,7 +63,7 @@ problems:
       - L4
   - directory: pmpp/vectorsum_py
     name: vectorsum
-    deadline: "2025-04-30"
+    deadline: "2025-06-30"
     gpus:
       - H100
       - A100

From 16622e026692a4cea619ee3c1ba346fccc91dcd0 Mon Sep 17 00:00:00 2001
From: Matthias Reso <13337103+mreso@users.noreply.github.com>
Date: Wed, 30 Apr 2025 10:05:56 -0700
Subject: [PATCH 050/207] Correct shape of a_scale + b_scale

---
 problems/amd/fp8-mm/task.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd/fp8-mm/task.yml b/problems/amd/fp8-mm/task.yml
index d4e435db..23de9909 100644
--- a/problems/amd/fp8-mm/task.yml
+++ b/problems/amd/fp8-mm/task.yml
@@ -22,8 +22,8 @@ description: |
   and `c` is the output matrix:
   * `a` is M x K in column-major order in e4m3fnuz
   * `b` is N x K in column-major order in e4m3fnuz
-  * `a_scale` is M x K in column-major order in fp32
-  * `b_scale` is N x K in column-major order in fp32
+  * `a_scale` is M x K // 128 in column-major order in fp32
+  * `b_scale` is N // 128 x K // 128 in column-major order in fp32
   * `c` is M x N in ROW-major order in bf16
   
   Matrix sizes `m` and `n` are divisible by 64, `k` is divisible by 128.

From a390d26f1aed7b9f0655fe72f139705deb2cc6a1 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Thu, 1 May 2025 17:07:22 +0200
Subject: [PATCH 051/207] Feat: enlarge task timeout for moe

---
 problems/amd/moe/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd/moe/task.yml b/problems/amd/moe/task.yml
index d9bf71d4..6b661767 100644
--- a/problems/amd/moe/task.yml
+++ b/problems/amd/moe/task.yml
@@ -43,7 +43,7 @@ templates:
 
 test_timeout: 540
 benchmark_timeout: 540
-ranked_timeout: 540
+ranked_timeout: 840
 ranking_by: "geom"
 
 tests:

From 997cd382b3d59c557504a1cc0e77317df31aa988 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 9 May 2025 18:22:49 -0700
Subject: [PATCH 052/207] Update README.md

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5f9e4059..afd98c45 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,11 @@
 
 This repo holds reference kernels for the KernelBot which hosts regular competitions on [discord.gg/gpumode](discord.gg/gpumode).
 
+You can see what's going on [gpumode.com](https://www.gpumode.com/)
+
 ## Competition
 1. [PMPP practice problems](https://gpu-mode.github.io/discord-cluster-manager/docs/active#practice-round-leaderboard): Starting on Sunday Feb 21, 2025.
-2. LLM competition: Coming soon!
+2. [AMD $100K kernel competition](problems/amd)
 
 ## Making a Leaderboard Submission
 

From 6624b5d18abcf4fbc3f0708f195d2fd361013e03 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Sat, 10 May 2025 22:47:44 +0200
Subject: [PATCH 053/207] update eval file to allow pt profiling

---
 problems/amd/eval.py | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/problems/amd/eval.py b/problems/amd/eval.py
index ce414f9b..7b13d38a 100644
--- a/problems/amd/eval.py
+++ b/problems/amd/eval.py
@@ -1,3 +1,4 @@
+import base64
 import dataclasses
 import multiprocessing
 import re
@@ -248,7 +249,8 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     return calculate_stats(durations)
 
 
-def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float):
+def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int,
+                         max_time_ns: float):
     """
     For a particular test case, check correctness (if applicable) and grab runtime results.
 
@@ -295,6 +297,31 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l
         return 112
 
 
+def run_single_profile(test: TestCase) -> str:
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, record_function, ProfilerActivity
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        submission_output = custom_kernel(_clone_data(data))
+        torch.cuda.synchronize()
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test)
+        logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"))
+    logger.log("check", "pass")
+    return 0
+
+
 def main():
     fd = os.getenv("POPCORN_FD")
     if not fd:
@@ -333,12 +360,14 @@ def main():
                     else:
                         passed = False
                         logger.log(f"benchmark.{i}.status", "fail")
-                        logger.log(f"benchmark.{i}.error", str(result)) #TODO: Make sure result implements __str__?
+                        logger.log(f"benchmark.{i}.error", str(result))  # TODO: Make sure result implements __str__?
                         break
 
                 logger.log("check", "pass" if passed else "fail")
+            elif mode == "profile":
+                run_profiling(logger, tests)
             else:
-                # TODO: Implement script and profile mode
+                # TODO: Implement script mode
                 return 2
 
 

From 7d8a57661a684f6a11270e4855179df5d0f1dff1 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Mon, 12 May 2025 10:20:49 +0200
Subject: [PATCH 054/207] update eval file to put an upper bound on total
 benchmark duration

---
 problems/amd/eval.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/problems/amd/eval.py b/problems/amd/eval.py
index 7b13d38a..5f553e90 100644
--- a/problems/amd/eval.py
+++ b/problems/amd/eval.py
@@ -219,6 +219,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     # otherwise, we repeat until we either measure at least 10 full seconds,
     # or the relative error of the mean is below 1%.
 
+    bm_start_time = time.perf_counter_ns()
     for i in range(max_repeats):
         if recheck:
             # ensure we use a different seed for every benchmark
@@ -239,11 +240,16 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
                 return message
 
         del output
-        durations.append(end-start)
+        durations.append(end - start)
 
         if i > 1:
+            total_bm_duration = bm_start_time - time.perf_counter_ns()
             stats = calculate_stats(durations)
-            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns:
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns or total_bm_duration > 120e9:
                 break
 
     return calculate_stats(durations)

From 5969f75339a8e12c8a471c589b61e83accbfe9f8 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Sun, 18 May 2025 01:07:09 +0200
Subject: [PATCH 055/207] Fix: sign issue in eval.py

---
 problems/amd/eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd/eval.py b/problems/amd/eval.py
index 5f553e90..ac3a6325 100644
--- a/problems/amd/eval.py
+++ b/problems/amd/eval.py
@@ -243,7 +243,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
         durations.append(end - start)
 
         if i > 1:
-            total_bm_duration = bm_start_time - time.perf_counter_ns()
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
             stats = calculate_stats(durations)
             # stop if either
             # a) relative error dips below 0.1%

From 6a9270363d3da2e589d9188e5215c016acbd9fe9 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sun, 18 May 2025 07:47:22 -0700
Subject: [PATCH 056/207] Update template-hip.py

---
 problems/amd/fp8-mm/template-hip.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/problems/amd/fp8-mm/template-hip.py b/problems/amd/fp8-mm/template-hip.py
index 9e7cd11d..62ccc1fe 100644
--- a/problems/amd/fp8-mm/template-hip.py
+++ b/problems/amd/fp8-mm/template-hip.py
@@ -1,4 +1,7 @@
 # This script provides a template for using load_inline to run a HIP kernel for
+import os 
+os.environ['PYTORCH_ROCM_ARCH'] = 'gfx942'
+
 from torch.utils.cpp_extension import load_inline
 from task import input_t, output_t
 CPP_WRAPPER = """

From aa14a425b81e5c8b1c5e8d7d6097ab6adfa9eb2f Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 19 May 2025 12:17:24 -0400
Subject: [PATCH 057/207] add MLA problem

---
 problems/amd/mla-decode/README.md     |  37 +++
 problems/amd/mla-decode/eval.py       | 320 ++++++++++++++++++++++++++
 problems/amd/mla-decode/reference.py  | 256 +++++++++++++++++++++
 problems/amd/mla-decode/submission.py | 165 +++++++++++++
 problems/amd/mla-decode/task.py       |  12 +
 problems/amd/mla-decode/task.yml      |  74 ++++++
 problems/amd/mla-decode/utils.py      | 149 ++++++++++++
 7 files changed, 1013 insertions(+)
 create mode 100644 problems/amd/mla-decode/README.md
 create mode 100644 problems/amd/mla-decode/eval.py
 create mode 100644 problems/amd/mla-decode/reference.py
 create mode 100644 problems/amd/mla-decode/submission.py
 create mode 100644 problems/amd/mla-decode/task.py
 create mode 100644 problems/amd/mla-decode/task.yml
 create mode 100644 problems/amd/mla-decode/utils.py

diff --git a/problems/amd/mla-decode/README.md b/problems/amd/mla-decode/README.md
new file mode 100644
index 00000000..fdb96e4c
--- /dev/null
+++ b/problems/amd/mla-decode/README.md
@@ -0,0 +1,37 @@
+# Description
+
+You will implement a custom MLA decode kernel optimized for MI300. A few things are simplified here:
+
+1. Q, K, V data type as bfloat16
+  
+2. decode only with pre-allocated non-paged latent kv cache
+
+3. no need to update kv cache
+
+The shapes of all outer and inner tensor dimensions are taken from DeepSeek-R1, and the number of heads is split to fit on one GPU.
+To be explicit, you will be given a tuple of tensors:
+
+```yml
+  input [bs, sq, dim]
+  attn_output [bs, n_heads, sq, v_head_dim]
+``` 
+
+  where 
+
+  0. bs::128 # batch size
+  1. sk::[1024, 6144] # as kv length
+  2. sq::1 # as only consider decoding
+  3. dim::7168 # hidden size of deepseek v3
+  4. v_head_dim::128 # head size
+  5. n_heads::128 # num of q heads
+
+  The ranking criteria is the geometric mean of the benchmark results.
+
+  For the grand prize, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand prize.
+ 
+  The speed of light analysis is:
+  | bs | sk | sq | dtype |  roofline time(us) |
+  |---|---|---|---|---|
+  | 128 | 1024 | 1 | bf16 | 106.65 |
+  | 128 | 6144 | 1 | bf16 | 280.87 | 
diff --git a/problems/amd/mla-decode/eval.py b/problems/amd/mla-decode/eval.py
new file mode 100644
index 00000000..34d3ae84
--- /dev/null
+++ b/problems/amd/mla-decode/eval.py
@@ -0,0 +1,320 @@
+import dataclasses
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any
+from collections import OrderedDict
+
+import torch.cuda
+
+from utils import set_seed
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from submission import custom_kernel
+from reference import check_implementation, generate_input
+
+WARMUP_RUNS = 10
+TIMED_RUNS = 100
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+    
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+    
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def copy_kv_cache(module, kv_cache_shape):
+    """
+    Creates a copy of the KVCache module manually.
+    """
+    copied_module = type(module)(kv_cache_shape)
+    
+    # Copy parameters
+    params = OrderedDict()
+    for name, param in module.named_parameters():
+        params[name] = param.clone().requires_grad_(param.requires_grad)
+        
+    # Copy buffers
+    buffers = OrderedDict()
+    for name, buff in module.named_buffers():
+        print(f"Buff name: {name}, shape: {buff.shape}")
+        buffers[name] = buff.clone()
+    
+    # Assign params and buffers to copied module
+    copied_module.load_state_dict(params, strict=False)
+    copied_module.load_state_dict(buffers, strict=False)
+    copied_module.seq_len = module.seq_len
+    
+    return copied_module
+
+
+def get_test_cases(file_name: str) -> list[TestCase]:
+    """
+    Parse a test file: one test per line, written as `;`-separated `key: value` pairs.
+    Integer-looking values become ints. Exits with status 113 on an unreadable file or malformed pair.
+    """
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        sys.exit(113)
+
+    tests = []
+    pattern = re.compile(r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*")
+    for line in content.splitlines():
+        case = {}
+        for part in line.split(";"):
+            matched = pattern.fullmatch(part)
+            if matched is None:
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                sys.exit(113)
+            key, val = matched[1], matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    return tests
+
+
+def warm_up(test: TestCase):
+    config, data, kv_cache = generate_input(**test.args)
+    config_copy = copy_config_weights(config)
+    start = time.perf_counter()
+    while time.perf_counter() - start < 0.2:
+        custom_kernel((config_copy, data, kv_cache))
+        torch.cuda.synchronize()
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a list of durations.
+
+    @param durations: A non-empty list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+             For a single run the sample standard deviation is undefined, so std and err are reported as 0.0.
+    """
+    runs = len(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = sum(durations) / runs
+    variance = sum((x - avg) ** 2 for x in durations)
+    # Bessel's correction divides by runs - 1; guard the single-sample case against ZeroDivisionError.
+    std = math.sqrt(variance / (runs - 1)) if runs > 1 else 0.0
+    err = std / math.sqrt(runs)
+
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best), worst=float(worst))
+
+
+def copy_config_weights(config):
+    """Return a copy of `config` in which every weight tensor (including `wo_weight`) is cloned,
+    so in-place changes made to the original's weights cannot leak into the copy."""
+    return dataclasses.replace(
+        config,
+        Q_proj_down_weight=config.Q_proj_down_weight.clone(),
+        Q_proj_up_weight=config.Q_proj_up_weight.clone(),
+        KV_proj_down_weight=config.KV_proj_down_weight.clone(),
+        KV_proj_up_weight=config.KV_proj_up_weight.clone(),
+        wo_weight=config.wo_weight.clone(),
+    )
+
+
+def run_testing(logger: PopcornOutput, tests: list[TestCase]):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+
+        config, data, kv_cache = generate_input(**test.args)
+        kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
+
+        torch.cuda.synchronize()
+        submission_output = custom_kernel((config, data, kv_cache))
+        torch.cuda.synchronize()
+        error = check_implementation((config, data, kv_cache_copy), submission_output)
+        if error:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", error)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    For a particular test case, check correctness once up front and then collect wall-clock timings.
+
+    @param test: TestCase object whose `args` are forwarded to `generate_input`.
+    @param recheck: If true, regenerate the inputs before every timed run and re-validate that run's output.
+    @param max_repeats: Upper bound on the number of timed runs.
+    @param max_time_ns: Budget for the accumulated measured time in nanoseconds; timing stops once it is exceeded.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    durations = []
+    # generate input data once
+    config, data, kv_cache = generate_input(**test.args)
+    # first, one obligatory correctness check; also triggers triton compile for the given shape
+    kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
+    config_copy = copy_config_weights(config)
+    output = custom_kernel((config, data, kv_cache))
+    error = check_implementation((config_copy, data, kv_cache_copy), output)
+    if error:
+        return error
+
+    # now, do multiple timing runs; outputs are only re-validated when `recheck` is set.
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs (if `max_repeats` allows);
+    # otherwise, we repeat until the accumulated time (mean * runs) exceeds `max_time_ns`,
+    # or the relative error of the mean is below 1%.
+
+    for i in range(max_repeats):
+        if recheck:
+            config, data, kv_cache = generate_input(**test.args)
+            kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
+            config_copy = copy_config_weights(config)
+        torch.cuda.synchronize()
+        start = time.perf_counter_ns()
+        output = custom_kernel((config, data, kv_cache))
+        torch.cuda.synchronize()
+        end = time.perf_counter_ns()
+
+        if recheck:
+            error = check_implementation((config_copy, data, kv_cache_copy), output)
+            if error:
+                return error
+
+        del output
+        durations.append(end-start)
+
+        if i > 1:
+            stats = calculate_stats(durations)
+            if stats.err / stats.mean < 0.01 or stats.mean *  stats.runs > max_time_ns:
+                break
+
+    return calculate_stats(durations)
+
+
+def run_benchmarking(logger: PopcornOutput, tests: list[TestCase]):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    warm_up(tests[0])
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = benchmark(test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def main():
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    tests = get_test_cases(sys.argv[2])
+
+    with PopcornOutput(int(fd)) as logger:
+        seed = os.getenv("POPCORN_SEED")
+        seed = int(seed) if seed else 42
+        set_seed(seed)
+
+        if mode == "test":
+            return run_testing(logger, tests)
+
+        if mode == "benchmark":
+            return run_benchmarking(logger, tests)
+        
+        if mode == "leaderboard":
+            warm_up(tests[0])
+            result = benchmark(tests[-1], True, 100, 30e9)
+            if isinstance(result, Stats):
+                logger.log("benchmark-count", 1)
+                logger.log(f"benchmark.0.spec", tests[-1].spec)
+                logger.log(f"benchmark.0.runs", result.runs)
+                logger.log(f"benchmark.0.mean", result.mean)
+                logger.log(f"benchmark.0.std", result.std)
+                logger.log(f"benchmark.0.err", result.err)
+                logger.log("check", "pass")
+            else:
+                logger.log("test-count", 1)
+                logger.log("test.0.status", "fail")
+                logger.log("test.0.error", str(result)) #TODO: Make sure result implements __str__?
+        
+        else:
+            # TODO: Implement script and profile mode
+            return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
new file mode 100644
index 00000000..10b1884b
--- /dev/null
+++ b/problems/amd/mla-decode/reference.py
@@ -0,0 +1,256 @@
+import math
+from dataclasses import dataclass
+import torch
+from torch import nn
+import torch.nn.functional as F
+from task import input_t, output_t
+from utils import make_match_reference
+
+class RoPE(nn.Module):
+    def __init__(self, d_model: int):
+        super().__init__()
+        self.d_model = d_model
+        theta = 10000 ** (-torch.arange(0, d_model//2,dtype=torch.bfloat16) / (d_model//2))
+        self.register_buffer("theta", theta)
+
+    def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+        x1, x2 = x.chunk(2, dim=-1)
+        return torch.cat((-x2, x1), dim=-1)
+
+    def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
+        seq_len = x.size(-2)
+        d_model = x.size(-1)
+        assert d_model == self.d_model
+        seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device)
+        idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta)
+        idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1)
+        cos = idx_theta2.cos().to(torch.bfloat16)
+        sin = idx_theta2.sin().to(torch.bfloat16)
+        return x * cos + self.rotate_half(x) * sin
+
+class KVCache(nn.Module):
+    def __init__(self, kv_cache_shape: tuple) -> None:
+        super().__init__()
+        self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
+        self.seq_len = 0
+        self.zero()
+
+    def zero(self) -> None:
+        self.data.zero_()
+    
+    def get_data(self) -> torch.Tensor:
+        return self.data
+
+    def forward(self, c_kv: torch.Tensor) -> torch.Tensor:
+        assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded"
+
+        self.data = self.data.to(c_kv.dtype)
+        self.data[
+            :, self.seq_len : self.seq_len + c_kv.size(1), :
+        ] = c_kv
+        self.seq_len += c_kv.size(1)
+
+        return self.data[:, :self.seq_len], self.seq_len
+    
+@dataclass
+class Config:
+    batch_size: int
+    dim: int
+    n_heads: int
+    q_lora_rank: int 
+    kv_lora_rank: int
+    qk_nope_head_dim: int
+    qk_rope_head_dim: int
+    v_head_dim: int
+    seq_len: int
+    max_seq_len: int
+    kv_cache_shape: tuple
+    Q_proj_down_weight: torch.Tensor
+    Q_proj_up_weight: torch.Tensor
+    KV_proj_down_weight: torch.Tensor
+    KV_proj_up_weight: torch.Tensor
+    wo_weight: torch.Tensor
+
+class MLA(nn.Module):
+    def __init__(self, config: Config):
+        super().__init__()
+        self.dim = config.dim
+        self.n_heads = config.n_heads
+        self.q_lora_rank = config.q_lora_rank
+        self.kv_lora_rank = config.kv_lora_rank
+        self.nope_head_dim = config.qk_nope_head_dim
+        self.rope_head_dim = config.qk_rope_head_dim
+        self.v_head_dim = config.v_head_dim
+        # Down-projection matrices
+        self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, dtype=torch.bfloat16, bias=False)
+        self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, dtype=torch.bfloat16, bias=False)
+
+        # Up-projection and rope projection matrices
+        self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+        self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+
+        # RoPE on half embeddings
+        self.q_rope = RoPE(self.rope_head_dim)
+        self.k_rope = RoPE(self.rope_head_dim)
+
+        # Output projection
+        self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False)
+        self.eps = 1e-6
+   
+    def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
+        # seq_len = 1 always here
+        batch_size, seq_len, model_dim = x.size()
+
+        ################################################################################
+        #                 Step 1: Handle down-projection + KV cache                    #
+        ################################################################################
+        q_lora = self.Q_proj_down(x)
+        kv_lora = self.KV_proj_down(x)
+        kv_lora, kv_len = kv_cache(kv_lora)
+        query_pos = kv_len - 1
+
+        ################################################################################
+        #                  Step 2: Up-project and prepare NoPE + RoPE                  #
+        ################################################################################
+
+        # Handle queries Q first
+        q_nope_and_rope = self.Q_proj_up(q_lora).view(
+            batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim)
+        q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1)
+
+        # Handle keys and values K/V. V does not need RoPE
+        kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1)
+        kv_nope = self.KV_proj_up(kv_nope).view(
+            batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim)
+        k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1)
+
+        ################################################################################
+        #                    Step 3: Handle RoPE Stream                                #
+        ################################################################################
+
+        # Compute RoPE for queries and combine with no-RoPE part
+        q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
+        q_rope = self.q_rope(q_rope, start_pos=query_pos)
+        q_rope = q_rope.reshape(batch_size, seq_len, self.n_heads, self.rope_head_dim)
+        q = torch.concat([q_nope, q_rope], dim=-1)
+
+        # Compute RoPE for keys and combine with no-RoPE part
+        k_rope = k_rope[:, :, None, :]
+        k_rope = self.k_rope(k_rope).expand(-1,-1,self.n_heads,-1)
+        k = torch.concat([k_nope, k_rope], dim=-1)
+                
+        ################################################################################
+        #                        Compute Multi-head Attention                          #
+        ################################################################################
+        q = q.reshape(batch_size, self.n_heads, 1, self.rope_head_dim + self.nope_head_dim)
+        k = k.reshape(batch_size, self.n_heads, kv_len, self.rope_head_dim + self.nope_head_dim)
+        v = v.reshape(batch_size, self.n_heads, -1, self.v_head_dim)
+        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
+        attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
+        y = torch.matmul(attn, v).view(batch_size, 1, -1)
+        y = self.wo(y)
+
+        return y, kv_cache.get_data()
+
+def generate_input(batchsize, dim, dq, prefill, seed):
+    # Sizes derived from: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
+    gen = torch.Generator()
+    gen.manual_seed(seed)
+    
+    # Generate weights for linear layers
+    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen)
+    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen)
+    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen)
+    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen)
+    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen)
+
+    config = Config(
+        batch_size=batchsize,
+        dim=dim,
+        q_lora_rank=dq,
+        n_heads=128,
+        kv_lora_rank=512,
+        qk_nope_head_dim=128,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        seq_len=1,
+        max_seq_len=8192,
+        kv_cache_shape=(batchsize, 8192, 512 + 64),
+        Q_proj_down_weight=Q_proj_down_weight,
+        Q_proj_up_weight=Q_proj_up_weight,
+        KV_proj_down_weight=KV_proj_down_weight,
+        KV_proj_up_weight=KV_proj_up_weight,
+        wo_weight=wo_weight,
+    )
+    x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen)
+    
+    # Pre-fill KV cache
+    kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim))
+    pre_filled_cache = torch.randn((config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), 
+                                 dtype=torch.bfloat16, generator=gen)
+    kv_cache(pre_filled_cache)
+
+    return config, x, kv_cache
+
+def ref_kernel(data: input_t) -> output_t:
+    config, x, kv_cache = data
+
+    # Load in model weights
+    model = MLA(config)
+    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
+    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
+    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
+    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
+    model.wo.weight = nn.Parameter(config.wo_weight)
+
+    output, kv_cache = model(x, kv_cache)
+    return output, kv_cache
+
+check_implementation = make_match_reference(ref_kernel, rtol=2e-02, atol=1e-03)  
+
+
+def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
+    # Warmup runs
+    for _ in range(num_warmup):
+        output, _ = model(x, kv_cache)
+        torch.cuda.synchronize()
+
+    # Timed runs
+    times = []
+    for _ in range(num_trials):
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        
+        start.record()
+        output, updated_kv = model(x, kv_cache)
+        end.record()
+        
+        torch.cuda.synchronize()
+        times.append(start.elapsed_time(end))
+
+    avg_time = sum(times) / len(times)
+    return output, updated_kv, avg_time, times
+
+if __name__ == "__main__":
+    # Generate test input
+    batchsize = 128
+    dim = 7168 
+    dq = 1536
+    prefill = 512
+    seed = 97
+
+    # Create model and inputs
+    config, x, kv_cache = generate_input(batchsize, dim, dq, prefill, seed)
+    model = MLA(config)
+
+    # Run model with timing
+    output, updated_kv, avg_time, times = time_mla(model, x, kv_cache)
+
+    print(f"Input shape: {x.shape}")
+    print(f"Output shape: {output.shape}")
+    print(f"Updated KV cache shape: {updated_kv.shape}")
+    print("\nFirst few values of output:")
+    print(output[0, :10])
+    print(f"\nTiming results over {len(times)} runs (ms):")
+    print(f"Average: {avg_time:.2f}")
+    print(f"Individual times: {[f'{t:.2f}' for t in times]}")
\ No newline at end of file
diff --git a/problems/amd/mla-decode/submission.py b/problems/amd/mla-decode/submission.py
new file mode 100644
index 00000000..cbfd75d4
--- /dev/null
+++ b/problems/amd/mla-decode/submission.py
@@ -0,0 +1,165 @@
+import math
+from dataclasses import dataclass
+import torch
+from torch import nn
+import torch.nn.functional as F
+from task import input_t, output_t
+
+class RoPE(nn.Module):
+    def __init__(self, d_model: int):
+        super().__init__()
+        self.d_model = d_model
+        theta = 10000 ** (-torch.arange(0, d_model//2,dtype=torch.bfloat16) / (d_model//2))
+        self.register_buffer("theta", theta)
+
+    def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+        x1, x2 = x.chunk(2, dim=-1)
+        return torch.cat((-x2, x1), dim=-1)
+
+    def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
+        seq_len = x.size(-2)
+        d_model = x.size(-1)
+        assert d_model == self.d_model
+        seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device)
+        idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta)
+        idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1)
+        cos = idx_theta2.cos().to(torch.bfloat16)
+        sin = idx_theta2.sin().to(torch.bfloat16)
+        return x * cos + self.rotate_half(x) * sin
+
+
+class KVCache(nn.Module):
+    def __init__(self, kv_cache_shape: tuple) -> None:
+        super().__init__()
+        self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
+        self.seq_len = 0
+        self.zero()
+
+    def zero(self) -> None:
+        self.data.zero_()
+    
+    def get_data(self) -> torch.Tensor:
+        return self.data
+
+    def forward(self, c_kv: torch.Tensor) -> torch.Tensor:
+        assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded"
+
+        self.data = self.data.to(c_kv.dtype)
+        self.data[
+            :, self.seq_len : self.seq_len + c_kv.size(1), :
+        ] = c_kv
+        self.seq_len += c_kv.size(1)
+
+        return self.data[:, :self.seq_len], self.seq_len
+    
+@dataclass
+class Config:
+    batch_size: int
+    dim: int
+    n_heads: int
+    q_lora_rank: int 
+    kv_lora_rank: int
+    qk_nope_head_dim: int
+    qk_rope_head_dim: int
+    v_head_dim: int
+    seq_len: int
+    max_seq_len: int
+    kv_cache_shape: tuple
+    Q_proj_down_weight: torch.Tensor
+    Q_proj_up_weight: torch.Tensor
+    KV_proj_down_weight: torch.Tensor
+    KV_proj_up_weight: torch.Tensor
+    wo_weight: torch.Tensor
+
+class MLA(nn.Module):
+    def __init__(self, config: Config):
+        super().__init__()
+        self.dim = config.dim
+        self.n_heads = config.n_heads
+        self.q_lora_rank = config.q_lora_rank
+        self.kv_lora_rank = config.kv_lora_rank
+        self.nope_head_dim = config.qk_nope_head_dim
+        self.rope_head_dim = config.qk_rope_head_dim
+        self.v_head_dim = config.v_head_dim
+        # Down-projection matrices
+        self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, bias=False, dtype=torch.bfloat16)
+        self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, bias=False, dtype=torch.bfloat16)
+
+        # Up-projection and rope projection matrices
+        self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, bias=False, dtype=torch.bfloat16)
+        self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, bias=False, dtype=torch.bfloat16)
+
+        # RoPE on half embeddings
+        self.q_rope = RoPE(self.rope_head_dim)
+        self.k_rope = RoPE(self.rope_head_dim)
+
+        # Output projection
+        self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False)
+        self.eps = 1e-6
+   
+    def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
+        # seq_len = 1 always here
+        batch_size, seq_len, model_dim = x.size()
+
+        ################################################################################
+        #                 Step 1: Handle down-projection + KV cache                    #
+        ################################################################################
+        q_lora = self.Q_proj_down(x)
+        kv_lora = self.KV_proj_down(x)
+        kv_lora, kv_len = kv_cache(kv_lora)
+        query_pos = kv_len - 1
+
+        ################################################################################
+        #                  Step 2: Up-project and prepare NoPE + RoPE                  #
+        ################################################################################
+
+        # Handle queries Q first
+        q_nope_and_rope = self.Q_proj_up(q_lora).view(
+            batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim)
+        q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1)
+
+        # Handle keys and values K/V. V does not need RoPE
+        kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1)
+        kv_nope = self.KV_proj_up(kv_nope).view(
+            batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim)
+        k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1)
+
+        ################################################################################
+        #                    Step 3: Handle RoPE Stream                                #
+        ################################################################################
+
+        # Compute RoPE for queries and combine with no-RoPE part
+        q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
+        q_rope = self.q_rope(q_rope, start_pos=query_pos)
+        q_rope = q_rope.reshape(batch_size, seq_len, self.n_heads, self.rope_head_dim)
+        q = torch.concat([q_nope, q_rope], dim=-1)
+
+        # Compute RoPE for keys and combine with no-RoPE part
+        k_rope = k_rope[:, :, None, :]
+        k_rope = self.k_rope(k_rope).expand(-1,-1,self.n_heads,-1)
+        k = torch.concat([k_nope, k_rope], dim=-1)
+                
+        ################################################################################
+        #                        Compute Multi-head Attention                          #
+        ################################################################################
+        q = q.reshape(batch_size, self.n_heads, 1, self.rope_head_dim + self.nope_head_dim)
+        k = k.reshape(batch_size, self.n_heads, kv_len, self.rope_head_dim + self.nope_head_dim)
+        v = v.reshape(batch_size, self.n_heads, -1, self.v_head_dim)
+        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
+        attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
+        y = torch.matmul(attn, v).view(batch_size, 1, -1)
+        y = self.wo(y)
+
+        return y, kv_cache.get_data()
+
+def custom_kernel(data: input_t) -> output_t:
+    config, x, kv_cache = data
+    model = MLA(config)
+    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
+    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
+    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
+    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
+    model.wo.weight = nn.Parameter(config.wo_weight)
+
+    output, kv_cache = model(x, kv_cache)
+    return output, kv_cache
\ No newline at end of file
diff --git a/problems/amd/mla-decode/task.py b/problems/amd/mla-decode/task.py
new file mode 100644
index 00000000..af04a3f8
--- /dev/null
+++ b/problems/amd/mla-decode/task.py
@@ -0,0 +1,12 @@
+import torch
+from typing import TypeVar, TypedDict
+
+input_t = TypeVar("input_t", bound=tuple[torch.nn.Module, torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=tuple[torch.Tensor, torch.Tensor])
+
+class TestSpec(TypedDict):
+    batchsize: int
+    dim: int
+    dq: int
+    prefill: int
+    seed: int
\ No newline at end of file
diff --git a/problems/amd/mla-decode/task.yml b/problems/amd/mla-decode/task.yml
new file mode 100644
index 00000000..ca4706ec
--- /dev/null
+++ b/problems/amd/mla-decode/task.yml
@@ -0,0 +1,74 @@
+# name: mla-decode-py
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "eval.py"}
+
+lang: "py"
+
+description: |
+  You will implement a custom mla decode kernel optimized for MI300, a few things simplified here:
+
+  1. Q, K, V data type as bfloat16
+    
+  2. decode only with pre-allocated non-paged latent kv cache
+
+  3. return the updated kv cache along with the MLA output
+
+  The shapes of all outer and inner dimensions of tensors are from DeepSeek-R1, with the number of heads split to fit on one GPU.
+  To be explicit, you will be given a tuple of tensors:
+
+  ```yml
+  input [bs, sq, dim]
+  attn_output [bs, n_heads, sq, v_head_dim]
+  kv_cache [bs, sq, kv_lora_rank + qk_rope_head_dim]
+  ``` 
+
+  where 
+
+  0. bs::128 # batch size
+  1. prefill::[512, 2048, 4096, 6144] # as kv length
+  2. sq::1 # as only consider decoding
+  3. dim::7168 # hidden size of deepseek v3
+  4. kv_lora_rank::[512] # kv lora rank of deepseek v3
+  5. qk_rope_head_dim::[64] # rope embedding dimension
+  6. v_head_dim::128 # head size
+  7. n_heads::128 # num of attn heads
+
+  The ranking criteria is the geometric mean of the benchmark results.
+
+  For the grand prize, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand prize.
+ 
+  The speed of light analysis is::
+  | bs | prefill | sq | dtype |  roofline time(us) |
+  |---|---|---|---|---|
+  | 128 | 512 | 1 | bf16 | 54.62 |
+  | 128 | 2048 | 1 | bf16 | 141.16 |
+  | 128 | 4096 | 1 | bf16 | 210.75 |
+  | 128 | 6144 | 1 | bf16 | 280.87 | 
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "submission.py"
+
+test_timeout: 900
+benchmark_timeout: 900
+ranked_timeout: 1200
+
+tests:
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 128, "seed": 9247}
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 512, "seed": 2197}
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 1024, "seed": 9107}
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 2048, "seed": 5291}
+
+benchmarks:
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 4096, "seed": 9817}
+  - {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 6144, "seed": 5291}
+
+ranking_by: "geom"
diff --git a/problems/amd/mla-decode/utils.py b/problems/amd/mla-decode/utils.py
new file mode 100644
index 00000000..afe2001d
--- /dev/null
+++ b/problems/amd/mla-decode/utils.py
@@ -0,0 +1,149 @@
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Check whether two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+    list[str]: Empty list if all elements are within tolerance, otherwise strings describing the shape mismatch or the mismatched elements.
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return [f"SIZE MISMATCH. received shape: {received.shape}, expected shape: {expected.shape}"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received - expected)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Check whether two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings describing the mismatches
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    config, x, kv_cache = data
+    expected_mla, expected_kv = reference((config, x, kv_cache))
+    output_mla, output_kv = output
+    reasons_mla = verbose_allclose(output_mla, expected_mla, rtol=rtol, atol=atol)
+    reasons_kv = verbose_allclose(output_kv, expected_kv, rtol=rtol, atol=atol)
+
+    if len(reasons_mla) > 0:
+        return "mismatch found on MLA output! custom implementation doesn't match reference: " + " ".join(reasons_mla)
+    if len(reasons_kv) > 0:
+        return "mismatch found on KV cache output! custom implementation doesn't match reference: " + " ".join(reasons_kv)
+
+    return ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
\ No newline at end of file

From 44275fdf2fd7d97b53a5d40e629e1f2268c9430e Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 19 May 2025 09:32:54 -0700
Subject: [PATCH 058/207] add mla problem

---
 problems/amd.yaml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index bcf91f67..722dbb8e 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -19,4 +19,9 @@ problems:
     name: amd-mixture-of-experts
     deadline: "2025-05-27"
     gpus:
-      - MI300
\ No newline at end of file
+      - MI300
+  - directory: amd/mla-decode
+    name: amd-mla-decode
+    deadline: "2025-05-27"
+    gpus:
+      - MI300

From ba4cf90c994c4241bda4ffac9c6bb4523b838eff Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@gmail.com>
Date: Mon, 19 May 2025 17:18:34 -0700
Subject: [PATCH 059/207] Update amd.yaml

---
 problems/amd.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index 722dbb8e..16940f4b 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -1,27 +1,27 @@
 name: AMD Developer Challenge 2025
 # when does this end (individual problems might close earlier)
-deadline: "2025-05-27"
+deadline: "2025-06-02"
 # A description for this particular competition
 description: "AMD Developer Challenge 2025: Inference Sprint"
 # the list of problems
 problems:
   - directory: amd/identity
     name: amd-identity
-    deadline: "2025-06-08"
+    deadline: "2025-06-02"
     gpus:
       - MI300
   - directory: amd/fp8-mm
     name: amd-fp8-mm
-    deadline: "2025-05-27"
+    deadline: "2025-06-02"
     gpus:
       - MI300
   - directory: amd/moe
     name: amd-mixture-of-experts
-    deadline: "2025-05-27"
+    deadline: "2025-06-02"
     gpus:
       - MI300
   - directory: amd/mla-decode
     name: amd-mla-decode
-    deadline: "2025-05-27"
+    deadline: "2025-06-02"
     gpus:
       - MI300

From 3e2cef3645ec46cd6906dbefc86d81633b30600c Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Wed, 21 May 2025 10:45:00 -0400
Subject: [PATCH 060/207] Update reference.py

---
 problems/amd/mla-decode/reference.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index 10b1884b..550f54ef 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -154,15 +154,15 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
 
 def generate_input(batchsize, dim, dq, prefill, seed):
     # Sizes derived from: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
-    gen = torch.Generator()
+    gen = torch.Generator(device='cuda')
     gen.manual_seed(seed)
     
     # Generate weights for linear layers
-    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen)
-    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen)
-    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen)
-    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen)
-    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen)
+    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda')
+    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda')
+    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda')
 
     config = Config(
         batch_size=batchsize,
@@ -182,10 +182,10 @@ def generate_input(batchsize, dim, dq, prefill, seed):
         KV_proj_up_weight=KV_proj_up_weight,
         wo_weight=wo_weight,
     )
-    x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen)
+    x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen, device='cuda')
     
     # Pre-fill KV cache
-    kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim))
+    kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
     pre_filled_cache = torch.randn((config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), 
                                  dtype=torch.bfloat16, generator=gen)
     kv_cache(pre_filled_cache)
@@ -253,4 +253,4 @@ def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
     print(output[0, :10])
     print(f"\nTiming results over {len(times)} runs (ms):")
     print(f"Average: {avg_time:.2f}")
-    print(f"Individual times: {[f'{t:.2f}' for t in times]}")
\ No newline at end of file
+    print(f"Individual times: {[f'{t:.2f}' for t in times]}")

From 74d11b114ae15c731992123eac726984f0b9dcc3 Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Wed, 21 May 2025 10:59:57 -0400
Subject: [PATCH 061/207] Update reference.py

---
 problems/amd/mla-decode/reference.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index 550f54ef..7dcff278 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -186,7 +186,7 @@ def generate_input(batchsize, dim, dq, prefill, seed):
     
     # Pre-fill KV cache
     kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
-    pre_filled_cache = torch.randn((config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), 
+    pre_filled_cache = torch.randn((config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), device='cuda', 
                                  dtype=torch.bfloat16, generator=gen)
     kv_cache(pre_filled_cache)
 
@@ -196,7 +196,7 @@ def ref_kernel(data: input_t) -> output_t:
     config, x, kv_cache = data
 
     # Load in model weights
-    model = MLA(config)
+    model = MLA(config).cuda()
     model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
     model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
     model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
@@ -241,7 +241,7 @@ def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
 
     # Create model and inputs
     config, x, kv_cache = generate_input(batchsize, dim, dq, prefill, seed)
-    model = MLA(config)
+    model = MLA(config).cuda()
 
     # Run model with timing
     output, updated_kv, avg_time, times = time_mla(model, x, kv_cache)

From f03748528a8ab3eef6eb34bc71b20f19e1236b27 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Wed, 21 May 2025 14:49:36 -0400
Subject: [PATCH 062/207] updated ref

---
 problems/amd/mla-decode/reference.py  | 53 +++++++++++++++++----------
 problems/amd/mla-decode/submission.py |  4 +-
 2 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index 10b1884b..aaf8e756 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -29,8 +29,8 @@ def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
         return x * cos + self.rotate_half(x) * sin
 
 class KVCache(nn.Module):
-    def __init__(self, kv_cache_shape: tuple) -> None:
-        super().__init__()
+    def __init__(self, kv_cache_shape: tuple, **kwargs) -> None:
+        super().__init__(**kwargs)
         self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
         self.seq_len = 0
         self.zero()
@@ -154,15 +154,15 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
 
 def generate_input(batchsize, dim, dq, prefill, seed):
     # Sizes derived from: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
-    gen = torch.Generator()
+    gen = torch.Generator(device='cuda')
     gen.manual_seed(seed)
     
     # Generate weights for linear layers
-    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen)
-    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen)
-    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen)
-    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen)
-    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen)
+    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda')
+    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda')
+    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda')
 
     config = Config(
         batch_size=batchsize,
@@ -182,12 +182,12 @@ def generate_input(batchsize, dim, dq, prefill, seed):
         KV_proj_up_weight=KV_proj_up_weight,
         wo_weight=wo_weight,
     )
-    x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen)
+    x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen, device='cuda')
     
     # Pre-fill KV cache
-    kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim))
+    kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
     pre_filled_cache = torch.randn((config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), 
-                                 dtype=torch.bfloat16, generator=gen)
+                                 dtype=torch.bfloat16, generator=gen, device='cuda')
     kv_cache(pre_filled_cache)
 
     return config, x, kv_cache
@@ -196,12 +196,12 @@ def ref_kernel(data: input_t) -> output_t:
     config, x, kv_cache = data
 
     # Load in model weights
-    model = MLA(config)
-    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
-    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
-    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
-    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
-    model.wo.weight = nn.Parameter(config.wo_weight)
+    model = MLA(config).to('cuda')
+    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight, device='cuda')
+    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight, device='cuda')
+    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight, device='cuda')
+    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight, device='cuda')
+    model.wo.weight = nn.Parameter(config.wo_weight, device='cuda')
 
     output, kv_cache = model(x, kv_cache)
     return output, kv_cache
@@ -210,14 +210,16 @@ def ref_kernel(data: input_t) -> output_t:
 
 
 def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
+
     # Warmup runs
-    for _ in range(num_warmup):
+    for _ in range(1):
         output, _ = model(x, kv_cache)
         torch.cuda.synchronize()
 
     # Timed runs
     times = []
     for _ in range(num_trials):
+        kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
         start = torch.cuda.Event(enable_timing=True)
         end = torch.cuda.Event(enable_timing=True)
         
@@ -241,11 +243,24 @@ def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
 
     # Create model and inputs
     config, x, kv_cache = generate_input(batchsize, dim, dq, prefill, seed)
-    model = MLA(config)
+    model = MLA(config).to('cuda')
 
     # Run model with timing
     output, updated_kv, avg_time, times = time_mla(model, x, kv_cache)
 
+    # Test reference kernel
+    ref_output, ref_kv = ref_kernel((config, x, kv_cache))
+    print("\nReference kernel output:")
+    print(f"Output shape: {ref_output.shape}")
+    print(f"KV cache shape: {ref_kv.shape}")
+    print("\nFirst few values of reference output:")
+    print(ref_output[0, :10])
+
+    # Compare outputs
+    print("\nOutput difference:")
+    print(f"Max absolute difference: {torch.max(torch.abs(output - ref_output))}")
+    print(f"Mean absolute difference: {torch.mean(torch.abs(output - ref_output))}")
+
     print(f"Input shape: {x.shape}")
     print(f"Output shape: {output.shape}")
     print(f"Updated KV cache shape: {updated_kv.shape}")
diff --git a/problems/amd/mla-decode/submission.py b/problems/amd/mla-decode/submission.py
index cbfd75d4..0d736242 100644
--- a/problems/amd/mla-decode/submission.py
+++ b/problems/amd/mla-decode/submission.py
@@ -31,7 +31,7 @@ def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
 class KVCache(nn.Module):
     def __init__(self, kv_cache_shape: tuple) -> None:
         super().__init__()
-        self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
+        self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16, device='cuda'))
         self.seq_len = 0
         self.zero()
 
@@ -154,7 +154,7 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
 
 def custom_kernel(data: input_t) -> output_t:
     config, x, kv_cache = data
-    model = MLA(config)
+    model = MLA(config).to('cuda')
     model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
     model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
     model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)

From 8a87e65b5501f19e6d8e0152395480241dc5ce4e Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Wed, 21 May 2025 16:08:53 -0400
Subject: [PATCH 063/207] update mla decode

---
 problems/amd/mla-decode/eval.py      | 66 ++++++++++++++--------------
 problems/amd/mla-decode/reference.py | 10 ++---
 problems/amd/mla-decode/utils.py     | 11 ++++-
 3 files changed, 49 insertions(+), 38 deletions(-)

diff --git a/problems/amd/mla-decode/eval.py b/problems/amd/mla-decode/eval.py
index 34d3ae84..3f67cd64 100644
--- a/problems/amd/mla-decode/eval.py
+++ b/problems/amd/mla-decode/eval.py
@@ -55,20 +55,20 @@ def copy_kv_cache(module, kv_cache_shape):
     # Copy parameters
     params = OrderedDict()
     for name, param in module.named_parameters():
-        params[name] = param.clone().requires_grad_(param.requires_grad)
+        params[name] = param.clone().requires_grad_(param.requires_grad).cuda()
         
     # Copy buffers
     buffers = OrderedDict()
     for name, buff in module.named_buffers():
         print(f"Buff name: {name}, shape: {buff.shape}")
-        buffers[name] = buff.clone()
+        buffers[name] = buff.clone().cuda()
     
     # Assign params and buffers to copied module
     copied_module.load_state_dict(params, strict=False)
     copied_module.load_state_dict(buffers, strict=False)
     copied_module.seq_len = module.seq_len
     
-    return copied_module
+    return copied_module.cuda()
 
 
 def get_test_cases(file_name: str) -> list[TestCase]:
@@ -148,10 +148,10 @@ def copy_config_weights(config):
     """
     return dataclasses.replace(
         config,
-        Q_proj_down_weight=config.Q_proj_down_weight.clone(),
-        Q_proj_up_weight=config.Q_proj_up_weight.clone(),
-        KV_proj_down_weight=config.KV_proj_down_weight.clone(),
-        KV_proj_up_weight=config.KV_proj_up_weight.clone()
+        Q_proj_down_weight=config.Q_proj_down_weight.clone().cuda(),
+        Q_proj_up_weight=config.Q_proj_up_weight.clone().cuda(),
+        KV_proj_down_weight=config.KV_proj_down_weight.clone().cuda(),
+        KV_proj_up_weight=config.KV_proj_up_weight.clone().cuda()
     )
 
 
@@ -206,8 +206,9 @@ def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: floa
     # first, one obligatory correctness check; also triggers triton compile for the given shape
     kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
     config_copy = copy_config_weights(config)
-    output = custom_kernel((config, data, kv_cache))
-    error = check_implementation((config_copy, data, kv_cache_copy), output)
+    with torch.no_grad():
+        output = custom_kernel((config, data, kv_cache))
+        error = check_implementation((config_copy, data, kv_cache_copy), output)
     if error:
         return error
 
@@ -216,29 +217,30 @@ def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: floa
     # otherwise, we repeat until we either measure at least 10 full seconds,
     # or the relative error of the mean is below 1%.
 
-    for i in range(max_repeats):
-        if recheck:
-            config, data, kv_cache = generate_input(**test.args)
-            kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
-            config_copy = copy_config_weights(config)
-        torch.cuda.synchronize()
-        start = time.perf_counter_ns()
-        output = custom_kernel((config, data, kv_cache))
-        torch.cuda.synchronize()
-        end = time.perf_counter_ns()
-
-        if recheck:
-            error = check_implementation((config_copy, data, kv_cache_copy), output)
-            if error:
-                return error
-
-        del output
-        durations.append(end-start)
-
-        if i > 1:
-            stats = calculate_stats(durations)
-            if stats.err / stats.mean < 0.01 or stats.mean *  stats.runs > max_time_ns:
-                break
+    with torch.no_grad():
+        for i in range(max_repeats):
+            if recheck:
+                config, data, kv_cache = generate_input(**test.args)
+                kv_cache_copy = copy_kv_cache(kv_cache, config.kv_cache_shape)
+                config_copy = copy_config_weights(config)
+            torch.cuda.synchronize()
+            start = time.perf_counter_ns()
+            output = custom_kernel((config, data, kv_cache))
+            torch.cuda.synchronize()
+            end = time.perf_counter_ns()
+
+            if recheck:
+                error = check_implementation((config_copy, data, kv_cache_copy), output)
+                if error:
+                    return error
+
+            del output
+            durations.append(end-start)
+
+            if i > 1:
+                stats = calculate_stats(durations)
+                if stats.err / stats.mean < 0.01 or stats.mean *  stats.runs > max_time_ns:
+                    break
 
     return calculate_stats(durations)
 
diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index aaf8e756..d2cf075b 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -197,11 +197,11 @@ def ref_kernel(data: input_t) -> output_t:
 
     # Load in model weights
     model = MLA(config).to('cuda')
-    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight, device='cuda')
-    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight, device='cuda')
-    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight, device='cuda')
-    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight, device='cuda')
-    model.wo.weight = nn.Parameter(config.wo_weight, device='cuda')
+    model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
+    model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
+    model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
+    model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
+    model.wo.weight = nn.Parameter(config.wo_weight)
 
     output, kv_cache = model(x, kv_cache)
     return output, kv_cache
diff --git a/problems/amd/mla-decode/utils.py b/problems/amd/mla-decode/utils.py
index afe2001d..e3f231e8 100644
--- a/problems/amd/mla-decode/utils.py
+++ b/problems/amd/mla-decode/utils.py
@@ -129,9 +129,18 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
     """
     Convenient "default" implementation for tasks' `check_implementation` function.
     """
+    output_mla, output_kv = output
+
+    # To fit in memory for the big test
+    output_mla = output_mla.cpu()
+    output_kv = output_kv.cpu()
+
     config, x, kv_cache = data
     expected_mla, expected_kv = reference((config, x, kv_cache))
-    output_mla, output_kv = output
+
+    output_mla = output_mla.cuda()
+    output_kv = output_kv.cuda()
+
     reasons_mla = verbose_allclose(output_mla, expected_mla, rtol=rtol, atol=atol)
     reasons_kv = verbose_allclose(output_kv, expected_kv, rtol=rtol, atol=atol)
 

From 47b5085085fa15819cdfaa33eb43fb294fa49f6c Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Thu, 22 May 2025 07:32:02 -0400
Subject: [PATCH 064/207] update weights

---
 problems/amd/mla-decode/reference.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index d2cf075b..2fc99b9e 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -158,11 +158,11 @@ def generate_input(batchsize, dim, dq, prefill, seed):
     gen.manual_seed(seed)
     
     # Generate weights for linear layers
-    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
-    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda')
-    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda')
-    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda')
-    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda')
+    Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+    KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+    Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dq)
+    KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(512)
+    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
 
     config = Config(
         batch_size=batchsize,

From ef81df15030d1eec465b400581e3a65e1d31bf82 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Fri, 23 May 2025 12:30:08 -0400
Subject: [PATCH 065/207] updated rope

---
 problems/amd/mla-decode/reference.py  | 7 ++++---
 problems/amd/mla-decode/submission.py | 6 ++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index 838ea40f..cb8f4e36 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -135,8 +135,9 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         q = torch.concat([q_nope, q_rope], dim=-1)
 
         # Compute RoPE for keys and combine with no-RoPE part
-        k_rope = k_rope[:, :, None, :]
-        k_rope = self.k_rope(k_rope).expand(-1,-1,self.n_heads,-1)
+        k_rope = k_rope[:, None, :, :]
+        k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
+        k_rope = k_rope.reshape(batch_size, kv_len, self.n_heads, self.rope_head_dim)
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################
@@ -268,4 +269,4 @@ def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
     print(output[0, :10])
     print(f"\nTiming results over {len(times)} runs (ms):")
     print(f"Average: {avg_time:.2f}")
-    print(f"Individual times: {[f'{t:.2f}' for t in times]}")
+    print(f"Individual times: {[f'{t:.2f}' for t in times]}")
\ No newline at end of file
diff --git a/problems/amd/mla-decode/submission.py b/problems/amd/mla-decode/submission.py
index 0d736242..28d4f98b 100644
--- a/problems/amd/mla-decode/submission.py
+++ b/problems/amd/mla-decode/submission.py
@@ -134,9 +134,11 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         q_rope = q_rope.reshape(batch_size, seq_len, self.n_heads, self.rope_head_dim)
         q = torch.concat([q_nope, q_rope], dim=-1)
 
+
         # Compute RoPE for keys and combine with no-RoPE part
-        k_rope = k_rope[:, :, None, :]
-        k_rope = self.k_rope(k_rope).expand(-1,-1,self.n_heads,-1)
+        k_rope = k_rope[:, None, :, :]
+        k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
+        k_rope = k_rope.reshape(batch_size, kv_len, self.n_heads, self.rope_head_dim)
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################

From 79cbbfd498909dfcd515661c1c4f867a54f59bd0 Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Fri, 23 May 2025 13:23:52 -0400
Subject: [PATCH 066/207] Update w_o weight normalization factor

---
 problems/amd/mla-decode/reference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index cb8f4e36..1d8d8b83 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -163,7 +163,7 @@ def generate_input(batchsize, dim, dq, prefill, seed):
     KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
     Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dq)
     KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(512)
-    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+    wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(128 * 128)
 
     config = Config(
         batch_size=batchsize,
@@ -269,4 +269,4 @@ def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):
     print(output[0, :10])
     print(f"\nTiming results over {len(times)} runs (ms):")
     print(f"Average: {avg_time:.2f}")
-    print(f"Individual times: {[f'{t:.2f}' for t in times]}")
\ No newline at end of file
+    print(f"Individual times: {[f'{t:.2f}' for t in times]}")

From c7f4a22098b669a77b0ff7947ec30fe18541559b Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Sat, 24 May 2025 00:17:23 -0400
Subject: [PATCH 067/207] permute instead of reshape

---
 problems/amd/mla-decode/reference.py  | 5 +++--
 problems/amd/mla-decode/submission.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index cb8f4e36..d10bb4bb 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -131,13 +131,14 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         # Compute RoPE for queries and combine with no-RoPE part
         q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
         q_rope = self.q_rope(q_rope, start_pos=query_pos)
-        q_rope = q_rope.reshape(batch_size, seq_len, self.n_heads, self.rope_head_dim)
+        q_rope = q_rope.permute(0, 2, 1, 3) # bs x seq_len x n_heads x rope_head_dim
         q = torch.concat([q_nope, q_rope], dim=-1)
 
+
         # Compute RoPE for keys and combine with no-RoPE part
         k_rope = k_rope[:, None, :, :]
         k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
-        k_rope = k_rope.reshape(batch_size, kv_len, self.n_heads, self.rope_head_dim)
+        k_rope = k_rope.permute(0, 2, 1, 3) # bs x kv_len x n_heads x rope_head_dim
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################
diff --git a/problems/amd/mla-decode/submission.py b/problems/amd/mla-decode/submission.py
index 28d4f98b..f1fb8629 100644
--- a/problems/amd/mla-decode/submission.py
+++ b/problems/amd/mla-decode/submission.py
@@ -131,14 +131,14 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         # Compute RoPE for queries and combine with no-RoPE part
         q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
         q_rope = self.q_rope(q_rope, start_pos=query_pos)
-        q_rope = q_rope.reshape(batch_size, seq_len, self.n_heads, self.rope_head_dim)
+        q_rope = q_rope.permute(0, 2, 1, 3) # bs x seq_len x n_heads x rope_head_dim
         q = torch.concat([q_nope, q_rope], dim=-1)
 
 
         # Compute RoPE for keys and combine with no-RoPE part
         k_rope = k_rope[:, None, :, :]
         k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
-        k_rope = k_rope.reshape(batch_size, kv_len, self.n_heads, self.rope_head_dim)
+        k_rope = k_rope.permute(0, 2, 1, 3) # bs x kv_len x n_heads x rope_head_dim
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################

From d4c24d93de911f7aa7c09126abeaf1bcd4befc94 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Sat, 24 May 2025 02:24:25 -0400
Subject: [PATCH 068/207] fix rope bugs found by jpy794

---
 problems/amd/mla-decode/reference.py  | 11 +++++------
 problems/amd/mla-decode/submission.py | 11 +++++------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index d10bb4bb..f4567238 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -129,24 +129,23 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         ################################################################################
 
         # Compute RoPE for queries and combine with no-RoPE part
-        q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
+        q_rope = q_rope.permute(0, 2, 1, 3) # bs x n_heads x seq_len x rope_head_dim
         q_rope = self.q_rope(q_rope, start_pos=query_pos)
-        q_rope = q_rope.permute(0, 2, 1, 3) # bs x seq_len x n_heads x rope_head_dim
+
+        q_nope = q_nope.permute(0, 2, 1, 3) # bs x n_heads x seq_len x nope_head_dim
         q = torch.concat([q_nope, q_rope], dim=-1)
 
 
         # Compute RoPE for keys and combine with no-RoPE part
         k_rope = k_rope[:, None, :, :]
         k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
-        k_rope = k_rope.permute(0, 2, 1, 3) # bs x kv_len x n_heads x rope_head_dim
+        k_nope = k_nope.permute(0, 2, 1, 3) # bs x n_heads x kv_len x nope_head_dim
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################
         #                        Compute Multi-head Attention                          #
         ################################################################################
-        q = q.reshape(batch_size, self.n_heads, 1, self.rope_head_dim + self.nope_head_dim)
-        k = k.reshape(batch_size, self.n_heads, kv_len, self.rope_head_dim + self.nope_head_dim)
-        v = v.reshape(batch_size, self.n_heads, -1, self.v_head_dim)
+        v = v.permute(0, 2, 1, 3) # bs x n_heads x kv_len x v_head_dim
         scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
         attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
         y = torch.matmul(attn, v).view(batch_size, 1, -1)
diff --git a/problems/amd/mla-decode/submission.py b/problems/amd/mla-decode/submission.py
index f1fb8629..2165f988 100644
--- a/problems/amd/mla-decode/submission.py
+++ b/problems/amd/mla-decode/submission.py
@@ -129,24 +129,23 @@ def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
         ################################################################################
 
         # Compute RoPE for queries and combine with no-RoPE part
-        q_rope = q_rope.reshape(batch_size, self.n_heads, seq_len, self.rope_head_dim)
+        q_rope = q_rope.permute(0, 2, 1, 3) # bs x n_heads x seq_len x rope_head_dim
         q_rope = self.q_rope(q_rope, start_pos=query_pos)
-        q_rope = q_rope.permute(0, 2, 1, 3) # bs x seq_len x n_heads x rope_head_dim
+
+        q_nope = q_nope.permute(0, 2, 1, 3) # bs x n_heads x seq_len x nope_head_dim
         q = torch.concat([q_nope, q_rope], dim=-1)
 
 
         # Compute RoPE for keys and combine with no-RoPE part
         k_rope = k_rope[:, None, :, :]
         k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1)
-        k_rope = k_rope.permute(0, 2, 1, 3) # bs x kv_len x n_heads x rope_head_dim
+        k_nope = k_nope.permute(0, 2, 1, 3) # bs x n_heads x kv_len x nope_head_dim
         k = torch.concat([k_nope, k_rope], dim=-1)
                 
         ################################################################################
         #                        Compute Multi-head Attention                          #
         ################################################################################
-        q = q.reshape(batch_size, self.n_heads, 1, self.rope_head_dim + self.nope_head_dim)
-        k = k.reshape(batch_size, self.n_heads, kv_len, self.rope_head_dim + self.nope_head_dim)
-        v = v.reshape(batch_size, self.n_heads, -1, self.v_head_dim)
+        v = v.permute(0, 2, 1, 3) # bs x n_heads x kv_len x v_head_dim
         scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
         attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
         y = torch.matmul(attn, v).view(batch_size, 1, -1)

From ecb7e0afca28669731780bcc6a5cd8d66335c478 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Sun, 25 May 2025 07:25:23 +0000
Subject: [PATCH 069/207] adjust mla atol

---
 problems/amd/mla-decode/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd/mla-decode/reference.py b/problems/amd/mla-decode/reference.py
index 6e13313f..5c34f49d 100644
--- a/problems/amd/mla-decode/reference.py
+++ b/problems/amd/mla-decode/reference.py
@@ -207,7 +207,7 @@ def ref_kernel(data: input_t) -> output_t:
     output, kv_cache = model(x, kv_cache)
     return output, kv_cache
 
-check_implementation = make_match_reference(ref_kernel, rtol=2e-02, atol=1e-03)  
+check_implementation = make_match_reference(ref_kernel, rtol=2e-02, atol=8e-03)  
 
 
 def time_mla(model, x, kv_cache, num_warmup=3, num_trials=5):

From 0156b809d952e20d3d6ef0c55b28568647b3a89e Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 26 May 2025 14:56:38 -0700
Subject: [PATCH 070/207] Create LICENSE

---
 LICENSE | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 LICENSE

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..e46d5080
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 GPU MODE
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

From 53869d5db6e4e8b0b7e8e2d7977bbb6cedf19fc2 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 2 Jun 2025 14:37:37 -0400
Subject: [PATCH 071/207] update conv2d reference with cudnn backend context
 set

---
 problems/pmpp/conv2d_py/reference.py | 36 ++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
index 9e5e1a74..52cb98d8 100644
--- a/problems/pmpp/conv2d_py/reference.py
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -3,6 +3,21 @@
 import torch.nn.functional as F
 from task import input_t, output_t
 
+class DisableCuDNNTF32:
+    def __init__(self):
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        pass
+
+    def __enter__(self):
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+
 
 def ref_kernel(data: input_t) -> output_t:
     """
@@ -12,16 +27,17 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Output tensor after convolution
     """
-    input_tensor, kernel = data
-    return F.conv2d(
-        input_tensor, 
-        kernel,
+    with DisableCuDNNTF32():
+        input_tensor, kernel = data
+        return F.conv2d(
+            input_tensor, 
+            kernel,
 
-        # No padding and no striding
-        # TODO: Can revisit this in future problems
-        stride=1,
-        padding=0
-    )
+            # No padding and no striding
+            # TODO: Can revisit this in future problems
+            stride=1,
+            padding=0
+        )
 
 
 def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
@@ -53,4 +69,4 @@ def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed:
     return (input_tensor, kernel)
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
+check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
\ No newline at end of file

From b53e65f505fbdeb8d7a5539407190963d6bcbdba Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 2 Jun 2025 14:45:43 -0400
Subject: [PATCH 072/207] update conv2d template

---
 problems/pmpp/conv2d_py/submission.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/problems/pmpp/conv2d_py/submission.py b/problems/pmpp/conv2d_py/submission.py
index a1b7d16d..991f6a50 100644
--- a/problems/pmpp/conv2d_py/submission.py
+++ b/problems/pmpp/conv2d_py/submission.py
@@ -12,6 +12,9 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Output tensor after convolution
     """
+
+    torch.backends.cudnn.allow_tf32 = False
+    torch.backends.cudnn.deterministic = True
     input_tensor, kernel = data
     return F.conv2d(
         input_tensor, 

From 9e359a31b0cb18a3df932b63663dacbcdab02cb0 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Thu, 5 Jun 2025 17:35:56 +0200
Subject: [PATCH 073/207] Extend amd deadlines

---
 problems/amd.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index 16940f4b..88859477 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -7,21 +7,21 @@ description: "AMD Developer Challenge 2025: Inference Sprint"
 problems:
   - directory: amd/identity
     name: amd-identity
-    deadline: "2025-06-02"
+    deadline: "2025-09-02"
     gpus:
       - MI300
   - directory: amd/fp8-mm
     name: amd-fp8-mm
-    deadline: "2025-06-02"
+    deadline: "2025-09-02"
     gpus:
       - MI300
   - directory: amd/moe
     name: amd-mixture-of-experts
-    deadline: "2025-06-02"
+    deadline: "2025-00-02"
     gpus:
       - MI300
   - directory: amd/mla-decode
     name: amd-mla-decode
-    deadline: "2025-06-02"
+    deadline: "2025-09-02"
     gpus:
       - MI300

From c94f9f2ac213ffdd835a5d9fce373af982e2bbf2 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Thu, 5 Jun 2025 18:36:48 +0200
Subject: [PATCH 074/207] fix-amd-moe-deadline

---
 problems/amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index 88859477..ad721d6c 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -17,7 +17,7 @@ problems:
       - MI300
   - directory: amd/moe
     name: amd-mixture-of-experts
-    deadline: "2025-00-02"
+    deadline: "2025-09-02"
     gpus:
       - MI300
   - directory: amd/mla-decode

From c11c1aa616c9c6de833f28b267f1a303764e725e Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 16 Jun 2025 15:54:38 -0400
Subject: [PATCH 075/207] add trimul kernel

---
 problems/bioml.yaml                 |  18 +++
 problems/bioml/trimul/reference.py  | 188 ++++++++++++++++++++++++++++
 problems/bioml/trimul/submission.py |  95 ++++++++++++++
 problems/bioml/trimul/task.py       |  14 +++
 problems/bioml/trimul/task.yml      |  65 ++++++++++
 problems/bioml/trimul/utils.py      | 143 +++++++++++++++++++++
 6 files changed, 523 insertions(+)
 create mode 100644 problems/bioml.yaml
 create mode 100644 problems/bioml/trimul/reference.py
 create mode 100644 problems/bioml/trimul/submission.py
 create mode 100644 problems/bioml/trimul/task.py
 create mode 100644 problems/bioml/trimul/task.yml
 create mode 100644 problems/bioml/trimul/utils.py

diff --git a/problems/bioml.yaml b/problems/bioml.yaml
new file mode 100644
index 00000000..d9afca92
--- /dev/null
+++ b/problems/bioml.yaml
@@ -0,0 +1,18 @@
+name: BioML Kernels (no cash prizes)
+
+deadline: ""
+# A description for this particular competition
+description: "Popular and important kernels for BioML models like AlphaFold3"
+
+# the list of problems
+problems:
+  - directory: bioml/trimul
+    name: trimul
+    deadline: "2025-09-30"
+    gpus:
+      - B200
+      - H100
+      - A100
+      - T4
+      - L4
+      - MI300
diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
new file mode 100644
index 00000000..c805f300
--- /dev/null
+++ b/problems/bioml/trimul/reference.py
@@ -0,0 +1,188 @@
+from utils import make_match_reference
+from task import input_t, output_t
+
+import torch
+from torch import nn, einsum
+import math
+
+class DisableCuDNNTF32:
+    def __init__(self):
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        pass
+
+    def __enter__(self):
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+
+# Reference code in PyTorch
+class TriMul(nn.Module):
+    # Based on https://github.com/lucidrains/triangle-multiplicative-module/blob/main/triangle_multiplicative_module/triangle_multiplicative_module.py
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+    ):
+        super().__init__()
+
+        self.norm = nn.LayerNorm(dim, bias=False)
+
+        self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
+        self.right_proj = nn.Linear(dim, hidden_dim, bias=False)
+
+        self.left_gate = nn.Linear(dim, hidden_dim, bias=False)
+        self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
+        self.out_gate = nn.Linear(dim, hidden_dim, bias=False)
+
+        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False)
+        self.to_out = nn.Linear(hidden_dim, dim, bias=False)
+
+    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+        """
+        x: [bs, seq_len, seq_len, dim]
+        mask: [bs, seq_len, seq_len]
+
+        Returns:
+            output: [bs, seq_len, seq_len, dim]
+        """
+        batch_size, seq_len, _, dim = x.shape
+
+        x = self.norm(x)
+
+        left = self.left_proj(x)
+        right = self.right_proj(x)
+
+        mask = mask.unsqueeze(-1)
+        left = left * mask
+        right = right * mask
+
+        left_gate = self.left_gate(x).sigmoid()
+        right_gate = self.right_gate(x).sigmoid()
+        out_gate = self.out_gate(x).sigmoid()
+
+        left = left * left_gate
+        right = right * right_gate
+
+        out = einsum('... i k d, ... j k d -> ... i j d', left, right)
+        # This einsum is the same as the following:
+        # out = torch.zeros(batch_size, seq_len, seq_len, dim, device=x.device)
+        
+        # # Compute using nested loops
+        # for b in range(batch_size):
+        #     for i in range(seq_len):
+        #         for j in range(seq_len):
+        #             # Compute each output element
+        #             for k in range(seq_len):
+        #                 out[b, i, j] += left[b, i, k, :] * right[b, j, k, :]
+
+        out = self.to_out_norm(out)
+        out = out * out_gate
+        return self.to_out(out)
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of TriMul using PyTorch.
+    
+    Args:
+        data: Tuple of (input: torch.Tensor, mask: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+            - input: Input tensor of shape [batch_size, seq_len, seq_len, dim]
+            - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
+            - weights: Dictionary containing model weights
+            - config: Dictionary containing model configuration parameters
+    """
+
+    input_tensor, mask, weights, config = data
+    trimul = TriMul(dim=config["dim"], hidden_dim=config["hidden_dim"]).to(input_tensor.device)
+
+    # Fill in the given weights of the model
+    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
+    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
+    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
+    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
+    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
+    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
+    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
+
+    output = trimul(input_tensor, mask)
+
+    return output
+
+
+# Input generation for the reference code
+def generate_input(
+    seqlen: int,
+    bs: int,
+    dim: int,
+    hiddendim: int,
+    seed: int,
+    nomask: bool
+) -> input_t:
+
+    # Workaround: names containing "_" aren't parsed correctly for now, so the parameters are underscore-free and aliased to readable names here.
+    batch_size = bs
+    seq_len = seqlen
+    hidden_dim = hiddendim
+    no_mask = nomask
+
+    config = {
+        "hidden_dim": hidden_dim,
+        "dim": dim,
+    }
+
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+
+    weights = {}
+
+    input_tensor = torch.randn(
+        (batch_size, seq_len, seq_len, dim),
+        device='cuda',
+        dtype=torch.float32,
+        generator=gen
+    ).contiguous()
+
+    if no_mask:
+        mask = torch.ones(batch_size, seq_len, seq_len, device=input_tensor.device)
+    else:
+        mask = torch.randint(0, 2, (batch_size, seq_len, seq_len), device=input_tensor.device, generator=gen)
+
+    # Initialize model weights
+    weights["norm.weight"] = torch.randn(dim, 
+        device="cuda", 
+        dtype=torch.float32)
+
+    weights["left_proj.weight"] = torch.randn(hidden_dim, dim,
+        device="cuda", 
+        dtype=torch.float32) / math.sqrt(hidden_dim)
+
+    weights["right_proj.weight"] = torch.randn(hidden_dim, dim, 
+        device="cuda", 
+        dtype=torch.float32) / math.sqrt(hidden_dim)
+
+    weights["left_gate.weight"] = torch.randn(hidden_dim, dim, 
+        device="cuda", 
+        dtype=torch.float32) / math.sqrt(hidden_dim)
+
+    weights["right_gate.weight"] = torch.randn(hidden_dim, dim, 
+        device="cuda", 
+        dtype=torch.float32) / math.sqrt(hidden_dim)
+
+    weights["out_gate.weight"] = torch.randn(hidden_dim, dim, 
+        device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+
+    weights["to_out_norm.weight"] = torch.randn(hidden_dim, 
+        device="cuda", dtype=torch.float32)
+
+    weights["to_out.weight"] = torch.randn(dim, hidden_dim, 
+        device="cuda", dtype=torch.float32) / math.sqrt(dim)
+
+    return (input_tensor, mask, weights, config)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
\ No newline at end of file
diff --git a/problems/bioml/trimul/submission.py b/problems/bioml/trimul/submission.py
new file mode 100644
index 00000000..0c8b303f
--- /dev/null
+++ b/problems/bioml/trimul/submission.py
@@ -0,0 +1,95 @@
+#!POPCORN leaderboard trimul-dev
+
+import torch
+from torch import nn, einsum
+from task import input_t, output_t
+
+class TriMul(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+    ):
+        super().__init__()
+
+        self.norm = nn.LayerNorm(dim, bias=False)
+
+        self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
+        self.right_proj = nn.Linear(dim, hidden_dim, bias=False)
+
+        self.left_gate = nn.Linear(dim, hidden_dim, bias=False)
+        self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
+        self.out_gate = nn.Linear(dim, hidden_dim, bias=False)
+
+        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False)
+        self.to_out = nn.Linear(hidden_dim, dim, bias=False)
+
+    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+        """
+        x: [bs, seq_len, seq_len, dim]
+        mask: [bs, seq_len, seq_len]
+
+        Returns:
+            output: [bs, seq_len, seq_len, dim]
+        """
+        batch_size, seq_len, _, dim = x.shape
+
+        x = self.norm(x)
+
+        left = self.left_proj(x)
+        right = self.right_proj(x)
+
+        mask = mask.unsqueeze(-1)
+        left = left * mask
+        right = right * mask
+
+        left_gate = self.left_gate(x).sigmoid()
+        right_gate = self.right_gate(x).sigmoid()
+        out_gate = self.out_gate(x).sigmoid()
+
+        left = left * left_gate
+        right = right * right_gate
+
+        out = einsum('... i k d, ... j k d -> ... i j d', left, right)
+        # This einsum is the same as the following:
+        # out = torch.zeros(batch_size, seq_len, seq_len, dim, device=x.device)
+        
+        # # Compute using nested loops
+        # for b in range(batch_size):
+        #     for i in range(seq_len):
+        #         for j in range(seq_len):
+        #             # Compute each output element
+        #             for k in range(seq_len):
+        #                 out[b, i, j] += left[b, i, k, :] * right[b, j, k, :]
+
+        out = self.to_out_norm(out)
+        out = out * out_gate
+        return self.to_out(out)
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of TriMul using PyTorch.
+    
+    Args:
+        data: Tuple of (input: torch.Tensor, mask: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+            - input: Input tensor of shape [batch_size, seq_len, seq_len, dim]
+            - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
+            - weights: Dictionary containing model weights
+            - config: Dictionary containing model configuration parameters
+    """
+    input_tensor, mask, weights, config = data
+    trimul = TriMul(config["dim"], config["hidden_dim"]).to(input_tensor.device)
+
+    # Fill in the given weights of the model
+    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
+    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
+    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
+    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
+    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
+    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
+    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
+
+    output = trimul(input_tensor, mask)
+
+    return output
\ No newline at end of file
diff --git a/problems/bioml/trimul/task.py b/problems/bioml/trimul/task.py
new file mode 100644
index 00000000..0ffdfbb9
--- /dev/null
+++ b/problems/bioml/trimul/task.py
@@ -0,0 +1,14 @@
+from typing import TypedDict, TypeVar, Tuple, Dict
+import torch
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], Dict])
+output_t = TypeVar("output_t", bound=Tuple[torch.Tensor, Dict])
+
+
+class TestSpec(TypedDict):
+    seqlen: int
+    bs: int
+    dim: int
+    hiddendim: int
+    seed: int
+    nomask: bool
\ No newline at end of file
diff --git a/problems/bioml/trimul/task.yml b/problems/bioml/trimul/task.yml
new file mode 100644
index 00000000..ced8b54a
--- /dev/null
+++ b/problems/bioml/trimul/task.yml
@@ -0,0 +1,65 @@
+# name: trimul
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  For a more complete description, see: https://tinyurl.com/gpumode-trimul
+  You will be implementing a Triangle Multiplicative Update (TriMul) module that is a core operation
+  for AlphaFold3, Chai, Protenix, and other protein structure prediction models in BioML.
+
+  The TriMul operator operates over a 4D tensor of shape [B, N, N, C]. 
+
+  Your task:
+  - Implement the "outgoing" version of the TriMul operator from the AlphaFold3 paper.
+  - You will not have to compute or store gradients for this version. You will only need to implement the forward pass.
+  
+  Input:
+  - `data`: Tuple of (input: torch.Tensor, mask: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
+    - input: Input tensor of shape [bs, seq_len, seq_len, dim]
+    - mask: Mask tensor of shape [bs, seq_len, seq_len]
+    - weights: Dictionary containing model weights
+    - config: Dictionary containing model configuration parameters
+  
+  Output:
+  - Tuple containing:
+    - output: Processed tensor [bs, seq_len, seq_len, dim]
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "submission.py"
+
+test_timeout: 540
+benchmark_timeout: 540
+ranked_timeout: 540
+ranking_by: "geom"
+
+tests:
+  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True}
+  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1092, "nomask": False}
+  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 2291, "nomask": True} 
+  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 210284, "nomask": False} 
+  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 81934, "nomask": True}
+  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 1832, "nomask": False}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1932, "nomask": True}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 10432, "nomask": False}
+  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 731, "nomask": True}
+  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 82102, "nomask": False}
+
+benchmarks:
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True}
+  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 121, "nomask": True}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 8421, "nomask": False}
+  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 8331, "nomask": True}
+  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 9120, "nomask": False}
\ No newline at end of file
diff --git a/problems/bioml/trimul/utils.py b/problems/bioml/trimul/utils.py
new file mode 100644
index 00000000..24fc902a
--- /dev/null
+++ b/problems/bioml/trimul/utils.py
@@ -0,0 +1,143 @@
+import random
+from typing import Tuple
+
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> Tuple[bool, list[str]]:
+    """
+    Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return False, ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32))
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return False, mismatch_details
+
+    return True, [f"Maximum error: {torch.max(diff)}"]
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int = 5) -> Tuple[bool, list[str]]:
+    """
+    Check whether two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         (True, []) if the tensors are equal, otherwise (False, list of strings describing the mismatches).
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return False, mismatch_details
+
+    return True, []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    good, reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return good, "\\n".join(reasons)
+
+    return good, ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
\ No newline at end of file

From 1309d2f0b2252e34e24fb2ca075815f06b826e7e Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 16 Jun 2025 16:22:57 -0400
Subject: [PATCH 076/207] move fp32 context to utils

---
 problems/bioml/trimul/reference.py | 43 +++++++++++-------------------
 problems/bioml/trimul/utils.py     | 18 ++++++++++++-
 2 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
index c805f300..8b2d5d41 100644
--- a/problems/bioml/trimul/reference.py
+++ b/problems/bioml/trimul/reference.py
@@ -1,25 +1,10 @@
-from utils import make_match_reference
+from utils import make_match_reference, DisableCuDNNTF32
 from task import input_t, output_t
 
 import torch
 from torch import nn, einsum
 import math
 
-class DisableCuDNNTF32:
-    def __init__(self):
-        self.allow_tf32 = torch.backends.cudnn.allow_tf32
-        self.deterministic = torch.backends.cudnn.deterministic
-        pass
-
-    def __enter__(self):
-        torch.backends.cudnn.allow_tf32 = False
-        torch.backends.cudnn.deterministic = True
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        torch.backends.cudnn.allow_tf32 = self.allow_tf32
-        torch.backends.cudnn.deterministic = self.deterministic
-
 # Reference code in PyTorch
 class TriMul(nn.Module):
     # Based on https://github.com/lucidrains/triangle-multiplicative-module/blob/main/triangle_multiplicative_module/triangle_multiplicative_module.py
@@ -97,21 +82,23 @@ def ref_kernel(data: input_t) -> output_t:
             - config: Dictionary containing model configuration parameters
     """
 
-    input_tensor, mask, weights, config = data
-    trimul = TriMul(dim=config["dim"], hidden_dim=config["hidden_dim"]).to(input_tensor.device)
+    # Use deterministic kernels and disable TF32 for accuracy
+    with DisableCuDNNTF32():
+        input_tensor, mask, weights, config = data
+        trimul = TriMul(dim=config["dim"], hidden_dim=config["hidden_dim"]).to(input_tensor.device)
 
-    # Fill in the given weights of the model
-    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
-    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
-    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
-    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
-    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
-    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
-    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
+        # Fill in the given weights of the model
+        trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
+        trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
+        trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
+        trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
+        trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
+        trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
+        trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
 
-    output = trimul(input_tensor, mask)
+        output = trimul(input_tensor, mask)
 
-    return output
+        return output
 
 
 # Input generation for the reference code
diff --git a/problems/bioml/trimul/utils.py b/problems/bioml/trimul/utils.py
index 24fc902a..ecfa6806 100644
--- a/problems/bioml/trimul/utils.py
+++ b/problems/bioml/trimul/utils.py
@@ -140,4 +140,20 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
 def make_match_reference(reference: callable, **kwargs):
     def wrapped(data, output):
         return match_reference(data, output, reference=reference, **kwargs)
-    return wrapped
\ No newline at end of file
+    return wrapped
+
+
+class DisableCuDNNTF32:
+    def __init__(self):
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        pass
+
+    def __enter__(self):
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
\ No newline at end of file

From cd6a42877523a65acb409291507ae03e53cca378 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 16 Jun 2025 16:28:32 -0400
Subject: [PATCH 077/207] remove popcorn header in template

---
 problems/bioml/trimul/submission.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/problems/bioml/trimul/submission.py b/problems/bioml/trimul/submission.py
index 0c8b303f..f6188a8f 100644
--- a/problems/bioml/trimul/submission.py
+++ b/problems/bioml/trimul/submission.py
@@ -1,5 +1,3 @@
-#!POPCORN leaderboard trimul-dev
-
 import torch
 from torch import nn, einsum
 from task import input_t, output_t
@@ -92,4 +90,4 @@ def custom_kernel(data: input_t) -> output_t:
 
     output = trimul(input_tensor, mask)
 
-    return output
\ No newline at end of file
+    return output

From af689a777812c1b7554dc15956133ef4e5ff9e9e Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Mon, 16 Jun 2025 16:31:55 -0400
Subject: [PATCH 078/207] remove low-memory gpus

---
 problems/bioml.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/problems/bioml.yaml b/problems/bioml.yaml
index d9afca92..3761aea6 100644
--- a/problems/bioml.yaml
+++ b/problems/bioml.yaml
@@ -13,6 +13,4 @@ problems:
       - B200
       - H100
       - A100
-      - T4
-      - L4
       - MI300

From 0b49302ac0ba9a1c5755ff6e296d603c52b217f1 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Wed, 18 Jun 2025 01:24:41 -0400
Subject: [PATCH 079/207] change tolerance to allow bf16

---
 problems/bioml/trimul/reference.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
index 8b2d5d41..d61df5e9 100644
--- a/problems/bioml/trimul/reference.py
+++ b/problems/bioml/trimul/reference.py
@@ -88,6 +88,7 @@ def ref_kernel(data: input_t) -> output_t:
         trimul = TriMul(dim=config["dim"], hidden_dim=config["hidden_dim"]).to(input_tensor.device)
 
         # Fill in the given weights of the model
+        trimul.norm.weight = nn.Parameter(weights['norm.weight'])
         trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
         trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
         trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
@@ -172,4 +173,4 @@ def generate_input(
     return (input_tensor, mask, weights, config)
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
\ No newline at end of file
+check_implementation = make_match_reference(ref_kernel, rtol=2e-2, atol=2e-2)
\ No newline at end of file

From 44100a877e96a2530cb6c5d4bc0314eb1b06b8ea Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Wed, 18 Jun 2025 13:48:27 -0400
Subject: [PATCH 080/207] update eval to do cuda events, add more cases with
 heavy tailed distribution input

---
 problems/bioml/trimul/eval.py       | 384 ++++++++++++++++++++++++++++
 problems/bioml/trimul/reference.py  |  60 ++---
 problems/bioml/trimul/submission.py |  53 ++--
 problems/bioml/trimul/task.yml      |  55 ++--
 4 files changed, 473 insertions(+), 79 deletions(-)
 create mode 100644 problems/bioml/trimul/eval.py

diff --git a/problems/bioml/trimul/eval.py b/problems/bioml/trimul/eval.py
new file mode 100644
index 00000000..be957134
--- /dev/null
+++ b/problems/bioml/trimul/eval.py
@@ -0,0 +1,384 @@
+import base64
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+
+import torch.cuda
+
+from utils import set_seed
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')
+        os.set_inheritable(fd, False)
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+    
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+    
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a+b)*(a+b+1)//2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        exit(113)
+
+    tests = []
+    lines = content.splitlines()
+    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
+    for line in lines:
+        parts = line.split(";")
+        case = {}
+        for part in parts:
+            matched = re.match(match, part)
+            if not re.fullmatch(match, part):
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                exit(113)
+            key = matched[1]
+            val = matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a non-empty list of durations (a single run yields std = err = 0).
+
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    variance = sum(map(lambda x: (x - avg)**2, durations))
+    std = math.sqrt(variance / (runs - 1)) if runs > 1 else 0.0  # sample std; one run has no spread
+    err = std / math.sqrt(runs)
+
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best),
+                 worst=float(worst))
+
+
+def _clone_data(data):
+    """
+    Recursively goes through data and clones all tensors.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        return data.clone()
+    else:
+        return data
+
+
+def wrap_check_implementation(data, submission_output):
+    # Old version returned just a single string, new version
+    # returns (bool, str); this function ensures compatibility with old
+    # problem definitions.
+    result = check_implementation(data, submission_output)
+    if isinstance(result, tuple):
+        return result
+    else:
+        return not bool(result), result
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    submission_output = custom_kernel(_clone_data(data))
+    torch.cuda.synchronize()
+    return wrap_check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in another process.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if not good:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    Runs one benchmark; returns Stats, or the checker's message if the output is wrong. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data)
+    #  first, one obligatory correctness check
+    output = custom_kernel(data)
+    good, message = wrap_check_implementation(check_copy, output)
+    if not good:
+        return message
+
+    # now, do multiple timing runs; correctness is only re-verified when `recheck` is set.
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until the measured kernel time exceeds `max_time_ns`, 2 minutes
+    # of wallclock time have passed, or the relative error of the mean is below 0.1%.
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data)
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+        output = custom_kernel(data)
+        end_event.record()
+        torch.cuda.synchronize()
+        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
+
+        if recheck:
+            good, message = wrap_check_implementation(check_copy, output)
+            if not good:
+                return message
+
+        del output
+        durations.append(duration)
+
+        if i > 1:
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns or total_bm_duration > 120e9:
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int,
+                         max_time_ns: float):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param pool: Process on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+
+
+def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Process on which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    # warm up
+    run_single_benchmark(pool, tests[0], False, 100, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def run_single_profile(test: TestCase) -> str:
+    """
+    Profiles one run of the submission on a single test case and returns the profiler's summary table.
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, record_function, ProfilerActivity
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        submission_output = custom_kernel(_clone_data(data))
+        torch.cuda.synchronize()
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test)
+        logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"))
+    logger.log("check", "pass")
+    return 0
+
+
+def main():
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+    tests = get_test_cases(sys.argv[2], seed)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+        mp_context = multiprocessing.get_context('spawn')
+        with mp_context.Pool(1) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                # warmup
+                run_single_benchmark(pool, tests[0], False, 100, 1e7)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 100, 30e9)
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name))
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(f"benchmark.{i}.error", str(result))  # TODO: Make sure result implements __str__?
+                        break
+
+                logger.log("check", "pass" if passed else "fail")
+            elif mode == "profile":
+                run_profiling(logger, tests)
+            else:
+                # TODO: Implement script mode
+                return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
index d61df5e9..8bef41b3 100644
--- a/problems/bioml/trimul/reference.py
+++ b/problems/bioml/trimul/reference.py
@@ -109,7 +109,8 @@ def generate_input(
     dim: int,
     hiddendim: int,
     seed: int,
-    nomask: bool
+    nomask: bool,
+    distribution: str,
 ) -> input_t:
 
     # Really dumb but for now _ isn't parsing correctly.
@@ -128,47 +129,34 @@ def generate_input(
 
     weights = {}
 
-    input_tensor = torch.randn(
-        (batch_size, seq_len, seq_len, dim),
-        device='cuda',
-        dtype=torch.float32,
-        generator=gen
-    ).contiguous()
+    # Generate input tensor based on distribution
+    if distribution == "cauchy":
+        # Heavier tail distribution
+        input_tensor = torch.distributions.Cauchy(0, 2).sample(
+            (batch_size, seq_len, seq_len, dim)
+        ).to(device='cuda', dtype=torch.float32)
+    else:  # normal distribution
+        input_tensor = torch.randn(
+            (batch_size, seq_len, seq_len, dim),
+            device='cuda',
+            dtype=torch.float32,
+            generator=gen
+        ).contiguous()
 
     if no_mask:
         mask = torch.ones(batch_size, seq_len, seq_len, device=input_tensor.device)
     else:
         mask = torch.randint(0, 2, (batch_size, seq_len, seq_len), device=input_tensor.device, generator=gen)
 
-    # Initialize model weights
-    weights["norm.weight"] = torch.randn(dim, 
-        device="cuda", 
-        dtype=torch.float32)
-
-    weights["left_proj.weight"] = torch.randn(hidden_dim, dim,
-        device="cuda", 
-        dtype=torch.float32) / math.sqrt(hidden_dim)
-
-    weights["right_proj.weight"] = torch.randn(hidden_dim, dim, 
-        device="cuda", 
-        dtype=torch.float32) / math.sqrt(hidden_dim)
-
-    weights["left_gate.weight"] = torch.randn(hidden_dim, dim, 
-        device="cuda", 
-        dtype=torch.float32) / math.sqrt(hidden_dim)
-
-    weights["right_gate.weight"] = torch.randn(hidden_dim, dim, 
-        device="cuda", 
-        dtype=torch.float32) / math.sqrt(hidden_dim)
-
-    weights["out_gate.weight"] = torch.randn(hidden_dim, dim, 
-        device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
-
-    weights["to_out_norm.weight"] = torch.randn(hidden_dim, 
-        device="cuda", dtype=torch.float32)
-
-    weights["to_out.weight"] = torch.randn(dim, hidden_dim, 
-        device="cuda", dtype=torch.float32) / math.sqrt(dim)
+    # Initialize model weights (always drawn from a normal distribution, regardless of `distribution`)
+    weights["norm.weight"] = torch.randn(dim, device="cuda", dtype=torch.float32)
+    weights["left_proj.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+    weights["right_proj.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+    weights["left_gate.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+    weights["right_gate.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+    weights["out_gate.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
+    weights["to_out_norm.weight"] = torch.randn(hidden_dim, device="cuda", dtype=torch.float32)
+    weights["to_out.weight"] = torch.randn(dim, hidden_dim, device="cuda", dtype=torch.float32) / math.sqrt(dim)
 
     return (input_tensor, mask, weights, config)
 
diff --git a/problems/bioml/trimul/submission.py b/problems/bioml/trimul/submission.py
index f6188a8f..eba46855 100644
--- a/problems/bioml/trimul/submission.py
+++ b/problems/bioml/trimul/submission.py
@@ -1,3 +1,5 @@
+#!POPCORN leaderboard trimul-dev
+
 import torch
 from torch import nn, einsum
 from task import input_t, output_t
@@ -12,15 +14,15 @@ def __init__(
 
         self.norm = nn.LayerNorm(dim, bias=False)
 
-        self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
-        self.right_proj = nn.Linear(dim, hidden_dim, bias=False)
+        self.left_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+        self.right_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
 
-        self.left_gate = nn.Linear(dim, hidden_dim, bias=False)
-        self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
-        self.out_gate = nn.Linear(dim, hidden_dim, bias=False)
+        self.left_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+        self.right_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+        self.out_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
 
-        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False)
-        self.to_out = nn.Linear(hidden_dim, dim, bias=False)
+        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False, dtype=torch.float32)
+        self.to_out = nn.Linear(hidden_dim, dim, bias=False, dtype=torch.float32)
 
     def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
         """
@@ -33,22 +35,23 @@ def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
         batch_size, seq_len, _, dim = x.shape
 
         x = self.norm(x)
+        x = x.to(torch.float32)
 
-        left = self.left_proj(x)
-        right = self.right_proj(x)
+        left = self.left_proj(x.to(torch.float32))
+        right = self.right_proj(x.to(torch.float32))
 
         mask = mask.unsqueeze(-1)
         left = left * mask
         right = right * mask
 
-        left_gate = self.left_gate(x).sigmoid()
-        right_gate = self.right_gate(x).sigmoid()
-        out_gate = self.out_gate(x).sigmoid()
+        left_gate = self.left_gate(x.to(torch.float32)).sigmoid()
+        right_gate = self.right_gate(x.to(torch.float32)).sigmoid()
+        out_gate = self.out_gate(x.to(torch.float32)).sigmoid()
 
         left = left * left_gate
         right = right * right_gate
 
-        out = einsum('... i k d, ... j k d -> ... i j d', left, right)
+        out = einsum('... i k d, ... j k d -> ... i j d', left.to(torch.bfloat16), right.to(torch.bfloat16))
         # This einsum is the same as the following:
         # out = torch.zeros(batch_size, seq_len, seq_len, dim, device=x.device)
         
@@ -60,6 +63,7 @@ def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
         #             for k in range(seq_len):
         #                 out[b, i, j] += left[b, i, k, :] * right[b, j, k, :]
 
+        out = out.to(torch.float32)
         out = self.to_out_norm(out)
         out = out * out_gate
         return self.to_out(out)
@@ -80,14 +84,15 @@ def custom_kernel(data: input_t) -> output_t:
     trimul = TriMul(config["dim"], config["hidden_dim"]).to(input_tensor.device)
 
     # Fill in the given weights of the model
-    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
-    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
-    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
-    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
-    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
-    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
-    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
-
-    output = trimul(input_tensor, mask)
-
-    return output
+    trimul.norm.weight = nn.Parameter(weights['norm.weight'].to(torch.float32))
+    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'].to(torch.float32))
+    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'].to(torch.float32))
+    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'].to(torch.float32))
+    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'].to(torch.float32))
+    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'].to(torch.float32))
+    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'].to(torch.float32))
+    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'].to(torch.float32))
+
+    output = trimul(input_tensor, mask).to(torch.float32)
+
+    return output
\ No newline at end of file
diff --git a/problems/bioml/trimul/task.yml b/problems/bioml/trimul/task.yml
index ced8b54a..e214a515 100644
--- a/problems/bioml/trimul/task.yml
+++ b/problems/bioml/trimul/task.yml
@@ -5,7 +5,7 @@ files:
   - {"name": "task.py", "source": "task.py"}
   - {"name": "utils.py", "source": "utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "../eval.py"}
+  - {"name": "eval.py", "source": "eval.py"}
 
 lang: "py"
 
@@ -43,23 +43,40 @@ ranked_timeout: 540
 ranking_by: "geom"
 
 tests:
-  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True}
-  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1092, "nomask": False}
-  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 2291, "nomask": True} 
-  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 210284, "nomask": False} 
-  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 81934, "nomask": True}
-  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 1832, "nomask": False}
-  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1932, "nomask": True}
-  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 10432, "nomask": False}
-  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 731, "nomask": True}
-  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 82102, "nomask": False}
+  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1092, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 2291, "nomask": True, "distribution": "normal"} 
+  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 210284, "nomask": False, "distribution": "normal"} 
+  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 81934, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 1932, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 10432, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 731, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 53121, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 31, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 4921, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 32, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 937321, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 64, "bs": 2, "dim": 256, "hiddendim": 128, "seed": 2291, "nomask": True, "distribution": "cauchy"} 
+  - {"seqlen": 128, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 8134, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 932, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 768, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 31, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 5321, "nomask": False, "distribution": "cauchy"}
+  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 491, "nomask": False, "distribution": "cauchy"}
 
 benchmarks:
-  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True}
-  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False}
-  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 121, "nomask": True}
-  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 8421, "nomask": False}
-  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 8331, "nomask": True}
-  - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 9120, "nomask": False}
\ No newline at end of file
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 481, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 3821, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 6821, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 8421, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 937321, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 22301, "nomask": False, "distribution": "cauchy"}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 129, "nomask": False, "distribution": "cauchy"}
+  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 41, "nomask": False, "distribution": "cauchy"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 2291, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 6211, "nomask": False, "distribution": "cauchy"}
\ No newline at end of file

From cf7371e637cdd959f9552f18b141f4270cab88ea Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Wed, 18 Jun 2025 14:56:13 -0400
Subject: [PATCH 081/207] add LN biases

---
 problems/bioml/trimul/reference.py  | 8 ++++++--
 problems/bioml/trimul/submission.py | 6 ++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/problems/bioml/trimul/reference.py b/problems/bioml/trimul/reference.py
index 8bef41b3..86c1ee1c 100644
--- a/problems/bioml/trimul/reference.py
+++ b/problems/bioml/trimul/reference.py
@@ -15,7 +15,7 @@ def __init__(
     ):
         super().__init__()
 
-        self.norm = nn.LayerNorm(dim, bias=False)
+        self.norm = nn.LayerNorm(dim)
 
         self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
         self.right_proj = nn.Linear(dim, hidden_dim, bias=False)
@@ -24,7 +24,7 @@ def __init__(
         self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
         self.out_gate = nn.Linear(dim, hidden_dim, bias=False)
 
-        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False)
+        self.to_out_norm = nn.LayerNorm(hidden_dim)
         self.to_out = nn.Linear(hidden_dim, dim, bias=False)
 
     def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
@@ -89,12 +89,14 @@ def ref_kernel(data: input_t) -> output_t:
 
         # Fill in the given weights of the model
         trimul.norm.weight = nn.Parameter(weights['norm.weight'])
+        trimul.norm.bias = nn.Parameter(weights['norm.bias'])
         trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
         trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
         trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
         trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
         trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
         trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
+        trimul.to_out_norm.bias = nn.Parameter(weights['to_out_norm.bias'])
         trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
 
         output = trimul(input_tensor, mask)
@@ -150,6 +152,7 @@ def generate_input(
 
     # Initialize model weights based on distribution
     weights["norm.weight"] = torch.randn(dim, device="cuda", dtype=torch.float32)
+    weights["norm.bias"] = torch.randn(dim, device="cuda", dtype=torch.float32)
     weights["left_proj.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
     weights["right_proj.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
     weights["left_gate.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
@@ -157,6 +160,7 @@ def generate_input(
     weights["out_gate.weight"] = torch.randn(hidden_dim, dim, device="cuda", dtype=torch.float32) / math.sqrt(hidden_dim)
     weights["to_out_norm.weight"] = torch.randn(hidden_dim, device="cuda", dtype=torch.float32)
     weights["to_out.weight"] = torch.randn(dim, hidden_dim, device="cuda", dtype=torch.float32) / math.sqrt(dim)
+    weights["to_out_norm.bias"] = torch.randn(hidden_dim, device="cuda", dtype=torch.float32)
 
     return (input_tensor, mask, weights, config)
 
diff --git a/problems/bioml/trimul/submission.py b/problems/bioml/trimul/submission.py
index eba46855..fa5033bf 100644
--- a/problems/bioml/trimul/submission.py
+++ b/problems/bioml/trimul/submission.py
@@ -12,7 +12,7 @@ def __init__(
     ):
         super().__init__()
 
-        self.norm = nn.LayerNorm(dim, bias=False)
+        self.norm = nn.LayerNorm(dim)
 
         self.left_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
         self.right_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
@@ -21,7 +21,7 @@ def __init__(
         self.right_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
         self.out_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
 
-        self.to_out_norm = nn.LayerNorm(hidden_dim, bias=False, dtype=torch.float32)
+        self.to_out_norm = nn.LayerNorm(hidden_dim)
         self.to_out = nn.Linear(hidden_dim, dim, bias=False, dtype=torch.float32)
 
     def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
@@ -92,6 +92,8 @@ def custom_kernel(data: input_t) -> output_t:
     trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'].to(torch.float32))
     trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'].to(torch.float32))
     trimul.to_out.weight = nn.Parameter(weights['to_out.weight'].to(torch.float32))
+    trimul.norm.bias = nn.Parameter(weights['norm.bias'].to(torch.float32))
+    trimul.to_out_norm.bias = nn.Parameter(weights['to_out_norm.bias'].to(torch.float32))
 
     output = trimul(input_tensor, mask).to(torch.float32)
 

From 7c15075a39286e88939d99d3f3a60be88b8e6223 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Tue, 24 Jun 2025 13:04:04 +0200
Subject: [PATCH 082/207] clear L2 cache

---
 problems/bioml/trimul/eval.py  |  3 ++-
 problems/bioml/trimul/utils.py | 12 +++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/problems/bioml/trimul/eval.py b/problems/bioml/trimul/eval.py
index be957134..719699b4 100644
--- a/problems/bioml/trimul/eval.py
+++ b/problems/bioml/trimul/eval.py
@@ -11,7 +11,7 @@
 
 import torch.cuda
 
-from utils import set_seed
+from utils import set_seed, clear_l2_cache
 try:
     from task import TestSpec
 except ImportError:
@@ -232,6 +232,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
+        clear_l2_cache()
         output = custom_kernel(data)
         end_event.record()
         torch.cuda.synchronize()
diff --git a/problems/bioml/trimul/utils.py b/problems/bioml/trimul/utils.py
index ecfa6806..38759b91 100644
--- a/problems/bioml/trimul/utils.py
+++ b/problems/bioml/trimul/utils.py
@@ -2,6 +2,7 @@
 from typing import Tuple
 
 import numpy as np
+import cupy as cp
 import torch
 
 
@@ -156,4 +157,13 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_value, traceback):
         torch.backends.cudnn.allow_tf32 = self.allow_tf32
-        torch.backends.cudnn.deterministic = self.deterministic
\ No newline at end of file
+        torch.backends.cudnn.deterministic = self.deterministic
+
+
+def clear_l2_cache():
+    cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # write stuff to
+    dummy.fill_(42)
+    del dummy

From be9114e90161ed11a2926f4a3bb5465b50533049 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 24 Jun 2025 15:24:15 -0400
Subject: [PATCH 083/207] remove l2 cache clearing for now, change sizes

---
 problems/bioml/trimul/eval.py  | 3 +--
 problems/bioml/trimul/task.yml | 9 +--------
 problems/bioml/trimul/utils.py | 5 ++---
 3 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/problems/bioml/trimul/eval.py b/problems/bioml/trimul/eval.py
index 719699b4..be957134 100644
--- a/problems/bioml/trimul/eval.py
+++ b/problems/bioml/trimul/eval.py
@@ -11,7 +11,7 @@
 
 import torch.cuda
 
-from utils import set_seed, clear_l2_cache
+from utils import set_seed
 try:
     from task import TestSpec
 except ImportError:
@@ -232,7 +232,6 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
-        clear_l2_cache()
         output = custom_kernel(data)
         end_event.record()
         torch.cuda.synchronize()
diff --git a/problems/bioml/trimul/task.yml b/problems/bioml/trimul/task.yml
index e214a515..90fc5906 100644
--- a/problems/bioml/trimul/task.yml
+++ b/problems/bioml/trimul/task.yml
@@ -70,13 +70,6 @@ benchmarks:
   - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 481, "nomask": False, "distribution": "normal"}
   - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 3821, "nomask": True, "distribution": "normal"}
   - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 6821, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 8421, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 937321, "nomask": True, "distribution": "cauchy"}
   - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 22301, "nomask": False, "distribution": "cauchy"}
   - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "cauchy"}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 129, "nomask": False, "distribution": "cauchy"}
-  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
-  - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 41, "nomask": False, "distribution": "cauchy"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 2291, "nomask": True, "distribution": "cauchy"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 6211, "nomask": False, "distribution": "cauchy"}
\ No newline at end of file
+  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
\ No newline at end of file
diff --git a/problems/bioml/trimul/utils.py b/problems/bioml/trimul/utils.py
index 38759b91..1a9ae5b3 100644
--- a/problems/bioml/trimul/utils.py
+++ b/problems/bioml/trimul/utils.py
@@ -2,7 +2,6 @@
 from typing import Tuple
 
 import numpy as np
-import cupy as cp
 import torch
 
 
@@ -159,9 +158,9 @@ def __exit__(self, exc_type, exc_value, traceback):
         torch.backends.cudnn.allow_tf32 = self.allow_tf32
         torch.backends.cudnn.deterministic = self.deterministic
 
-
 def clear_l2_cache():
-    cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
     # create a large dummy tensor
     dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
     # write stuff to

From 0ab0585e19cac0a2dd43828307080b61d2fdcaa9 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 24 Jun 2025 15:37:20 -0400
Subject: [PATCH 084/207] add more sizes

---
 problems/bioml/trimul/task.yml | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/problems/bioml/trimul/task.yml b/problems/bioml/trimul/task.yml
index 90fc5906..f745e24e 100644
--- a/problems/bioml/trimul/task.yml
+++ b/problems/bioml/trimul/task.yml
@@ -63,13 +63,15 @@ tests:
   - {"seqlen": 1024, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 491, "nomask": False, "distribution": "cauchy"}
 
 benchmarks:
-  - {"seqlen": 256, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 256, "bs": 2, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False, "distribution": "normal"}
   - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 481, "nomask": False, "distribution": "normal"}
   - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 3821, "nomask": True, "distribution": "normal"}
   - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 587231, "nomask": False, "distribution": "normal"}
   - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 22301, "nomask": False, "distribution": "cauchy"}
-  - {"seqlen": 512, "bs": 2, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "cauchy"}
-  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
\ No newline at end of file
+  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
\ No newline at end of file

From a72b2a40b5071fb0cf553a3ca738467e38d6336a Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 24 Jun 2025 15:44:36 -0400
Subject: [PATCH 085/207] updated sizes

---
 problems/bioml/trimul/task.yml | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/problems/bioml/trimul/task.yml b/problems/bioml/trimul/task.yml
index f745e24e..66f32757 100644
--- a/problems/bioml/trimul/task.yml
+++ b/problems/bioml/trimul/task.yml
@@ -64,14 +64,9 @@ tests:
 
 benchmarks:
   - {"seqlen": 256, "bs": 2, "dim": 128, "hiddendim": 128, "seed": 9371, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 256, "bs": 2, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 1219, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 481, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 768, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 3821, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 587231, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 256, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 22301, "nomask": False, "distribution": "cauchy"}
-  - {"seqlen": 512, "bs": 1, "dim": 768, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "cauchy"}
   - {"seqlen": 768, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
-  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
\ No newline at end of file
+  - {"seqlen": 256, "bs": 2, "dim": 384, "hiddendim": 128, "seed": 2301, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 512, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
+  - {"seqlen": 768, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 481, "nomask": False, "distribution": "normal"}
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}
\ No newline at end of file

From cac8795f5626a30a11cd2d19422c79c0fc70b519 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Tue, 24 Jun 2025 16:00:23 -0400
Subject: [PATCH 086/207] remove tag

---
 problems/bioml/trimul/submission.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/problems/bioml/trimul/submission.py b/problems/bioml/trimul/submission.py
index fa5033bf..99e91458 100644
--- a/problems/bioml/trimul/submission.py
+++ b/problems/bioml/trimul/submission.py
@@ -1,5 +1,3 @@
-#!POPCORN leaderboard trimul-dev
-
 import torch
 from torch import nn, einsum
 from task import input_t, output_t
@@ -97,4 +95,4 @@ def custom_kernel(data: input_t) -> output_t:
 
     output = trimul(input_tensor, mask).to(torch.float32)
 
-    return output
\ No newline at end of file
+    return output

From 460764ce00f9210ba5780e35221f9fe020a55cd6 Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Wed, 2 Jul 2025 12:32:04 -0400
Subject: [PATCH 087/207] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index afd98c45..89413b2c 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ You can see what's going on [gpumode.com](https://www.gpumode.com/)
 ## Competition
 1. [PMPP practice problems](https://gpu-mode.github.io/discord-cluster-manager/docs/active#practice-round-leaderboard): Starting on Sunday Feb 21, 2025.
 2. [AMD $100K kernel competition](problems/amd)
+3. [BioML kernels](problems/bioml)
 
 ## Making a Leaderboard Submission
 

From 3a4d4dc40205c393d64b35a257a56bf3e8aced30 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Thu, 17 Jul 2025 11:05:20 -0700
Subject: [PATCH 088/207] PMPP v2

---
 problems/pmpp_v2.yaml                         |  63 +++
 problems/pmpp_v2/conv2d_py/reference.py       |  56 +++
 .../conv2d_py/solutions/correct/ref.py        |  13 +
 .../conv2d_py/solutions/wrong/empty.py        |  11 +
 problems/pmpp_v2/conv2d_py/submission.py      |  21 +
 problems/pmpp_v2/conv2d_py/task.py            |  13 +
 problems/pmpp_v2/conv2d_py/task.yml           |  47 +++
 problems/pmpp_v2/eval.py                      | 381 ++++++++++++++++++
 problems/pmpp_v2/grayscale_py/reference.py    |  37 ++
 .../grayscale_py/solutions/correct/ref.py     |   9 +
 .../grayscale_py/solutions/wrong/empty.py     |   7 +
 problems/pmpp_v2/grayscale_py/submission.py   |   8 +
 problems/pmpp_v2/grayscale_py/task.py         |   9 +
 problems/pmpp_v2/grayscale_py/task.yml        |  41 ++
 problems/pmpp_v2/histogram_py/reference.py    |  51 +++
 .../histogram_py/solutions/correct/ref.py     |   6 +
 .../histogram_py/solutions/wrong/empty.py     |   7 +
 problems/pmpp_v2/histogram_py/submission.py   |  12 +
 problems/pmpp_v2/histogram_py/task.py         |  11 +
 problems/pmpp_v2/histogram_py/task.yml        |  43 ++
 problems/pmpp_v2/matmul_py/reference.py       |  21 +
 .../matmul_py/solutions/correct/ref.py        |   8 +
 .../solutions/wrong/low-precision.py          |   7 +
 problems/pmpp_v2/matmul_py/submission.py      |   5 +
 problems/pmpp_v2/matmul_py/task.py            |  11 +
 problems/pmpp_v2/matmul_py/task.yml           |  44 ++
 problems/pmpp_v2/prefixsum_py/reference.py    |  38 ++
 .../prefixsum_py/solutions/correct/ref.py     |   6 +
 .../prefixsum_py/solutions/wrong/empty.py     |   7 +
 problems/pmpp_v2/prefixsum_py/submission.py   |  12 +
 problems/pmpp_v2/prefixsum_py/task.py         |   9 +
 problems/pmpp_v2/prefixsum_py/task.yml        |  57 +++
 problems/pmpp_v2/sort_py/reference.py         |  47 +++
 .../pmpp_v2/sort_py/solutions/correct/ref.py  |   9 +
 .../pmpp_v2/sort_py/solutions/wrong/empty.py  |   7 +
 problems/pmpp_v2/sort_py/submission.py        |  14 +
 problems/pmpp_v2/sort_py/task.py              |   9 +
 problems/pmpp_v2/sort_py/task.yml             |  41 ++
 problems/pmpp_v2/template.py                  |   5 +
 problems/pmpp_v2/utils.py                     | 144 +++++++
 problems/pmpp_v2/vectoradd_py/reference.py    |  31 ++
 .../correct/submission_cuda_inline.py         |  83 ++++
 .../solutions/correct/submission_triton.py    |  40 ++
 problems/pmpp_v2/vectoradd_py/task.py         |  11 +
 problems/pmpp_v2/vectoradd_py/task.yml        |  41 ++
 problems/pmpp_v2/vectorsum_py/reference.py    |  48 +++
 .../vectorsum_py/solutions/correct/pytorch.py |  12 +
 .../vectorsum_py/solutions/wrong/cheat.py     |  17 +
 problems/pmpp_v2/vectorsum_py/submission.py   |  62 +++
 problems/pmpp_v2/vectorsum_py/task.py         |   9 +
 problems/pmpp_v2/vectorsum_py/task.yml        |  41 ++
 51 files changed, 1742 insertions(+)
 create mode 100644 problems/pmpp_v2.yaml
 create mode 100644 problems/pmpp_v2/conv2d_py/reference.py
 create mode 100644 problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp_v2/conv2d_py/submission.py
 create mode 100644 problems/pmpp_v2/conv2d_py/task.py
 create mode 100644 problems/pmpp_v2/conv2d_py/task.yml
 create mode 100644 problems/pmpp_v2/eval.py
 create mode 100644 problems/pmpp_v2/grayscale_py/reference.py
 create mode 100644 problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp_v2/grayscale_py/submission.py
 create mode 100644 problems/pmpp_v2/grayscale_py/task.py
 create mode 100644 problems/pmpp_v2/grayscale_py/task.yml
 create mode 100644 problems/pmpp_v2/histogram_py/reference.py
 create mode 100644 problems/pmpp_v2/histogram_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp_v2/histogram_py/submission.py
 create mode 100644 problems/pmpp_v2/histogram_py/task.py
 create mode 100644 problems/pmpp_v2/histogram_py/task.yml
 create mode 100644 problems/pmpp_v2/matmul_py/reference.py
 create mode 100644 problems/pmpp_v2/matmul_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
 create mode 100644 problems/pmpp_v2/matmul_py/submission.py
 create mode 100644 problems/pmpp_v2/matmul_py/task.py
 create mode 100644 problems/pmpp_v2/matmul_py/task.yml
 create mode 100644 problems/pmpp_v2/prefixsum_py/reference.py
 create mode 100644 problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp_v2/prefixsum_py/submission.py
 create mode 100644 problems/pmpp_v2/prefixsum_py/task.py
 create mode 100644 problems/pmpp_v2/prefixsum_py/task.yml
 create mode 100644 problems/pmpp_v2/sort_py/reference.py
 create mode 100644 problems/pmpp_v2/sort_py/solutions/correct/ref.py
 create mode 100644 problems/pmpp_v2/sort_py/solutions/wrong/empty.py
 create mode 100644 problems/pmpp_v2/sort_py/submission.py
 create mode 100644 problems/pmpp_v2/sort_py/task.py
 create mode 100644 problems/pmpp_v2/sort_py/task.yml
 create mode 100644 problems/pmpp_v2/template.py
 create mode 100644 problems/pmpp_v2/utils.py
 create mode 100644 problems/pmpp_v2/vectoradd_py/reference.py
 create mode 100644 problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
 create mode 100644 problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
 create mode 100644 problems/pmpp_v2/vectoradd_py/task.py
 create mode 100644 problems/pmpp_v2/vectoradd_py/task.yml
 create mode 100644 problems/pmpp_v2/vectorsum_py/reference.py
 create mode 100644 problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
 create mode 100644 problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
 create mode 100644 problems/pmpp_v2/vectorsum_py/submission.py
 create mode 100644 problems/pmpp_v2/vectorsum_py/task.py
 create mode 100644 problems/pmpp_v2/vectorsum_py/task.yml

diff --git a/problems/pmpp_v2.yaml b/problems/pmpp_v2.yaml
new file mode 100644
index 00000000..a3c12ab4
--- /dev/null
+++ b/problems/pmpp_v2.yaml
@@ -0,0 +1,63 @@
+name: pmpp_v2 Practice Problems
+# when does this end (individual problems might close earlier)
+deadline: ""
+# A description for this particular competition
+description: ""
+# the list of problems
+problems:
+  - directory: pmpp_v2/conv2d_py
+    name: conv2d
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/grayscale_py
+    name: grayscale
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/histogram_py
+    name: histogram
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/matmul_py
+    name: matmul
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/prefixsum_py
+    name: prefixsum
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/sort_py
+    name: sort
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/vectoradd_py
+    name: vectoradd
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
+  - directory: pmpp_v2/vectorsum_py
+    name: vectorsum
+    deadline: "2025-12-30"
+    gpus:
+      - H100
+      - A100
+      - L4
\ No newline at end of file
diff --git a/problems/pmpp_v2/conv2d_py/reference.py b/problems/pmpp_v2/conv2d_py/reference.py
new file mode 100644
index 00000000..9e5e1a74
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/reference.py
@@ -0,0 +1,56 @@
+from utils import make_match_reference
+import torch
+import torch.nn.functional as F
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of 2D convolution using PyTorch.
+    Args:
+        data: Tuple of (input tensor, kernel tensor)
+    Returns:
+        Output tensor after convolution
+    """
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor, 
+        kernel,
+
+        # No padding and no striding
+        # TODO: Can revisit this in future problems
+        stride=1,
+        padding=0
+    )
+
+
+def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
+    """
+    Generates random input and kernel tensors.
+    Returns:
+        Tuple of (input tensor, kernel tensor)
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    
+    # Generate input tensor: [batch, in_channels, height, width]
+    input_tensor = torch.randn(
+        batch, channels, size, size,
+        device='cuda', 
+        dtype=torch.float32, 
+        generator=gen
+    ).contiguous()
+    
+    # Generate kernel tensor: [out_channels, in_channels, kernel_height, kernel_width]
+    # Here we use same number of output channels as input channels for simplicity
+    kernel = torch.randn(
+        channels, channels, kernelsize, kernelsize,
+        device='cuda',
+        dtype=torch.float32,
+        generator=gen
+    ).contiguous()
+    
+    return (input_tensor, kernel)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
diff --git a/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py b/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
new file mode 100644
index 00000000..c0ce3f21
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
@@ -0,0 +1,13 @@
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor,
+        kernel,
+        stride=1,
+        padding=0
+    )
diff --git a/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py b/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..899beb0c
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
@@ -0,0 +1,11 @@
+# the nop kernel: returns uninitialized memory of the correct output shape (N, C_out, H-kh+1, W-kw+1)
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    input_tensor, kernel = data
+    return torch.empty((input_tensor.shape[0], kernel.shape[0], input_tensor.shape[2]-kernel.shape[2]+1, input_tensor.shape[3]-kernel.shape[3]+1),
+                       device=kernel.device, dtype=kernel.dtype
+    )
diff --git a/problems/pmpp_v2/conv2d_py/submission.py b/problems/pmpp_v2/conv2d_py/submission.py
new file mode 100644
index 00000000..a1b7d16d
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/submission.py
@@ -0,0 +1,21 @@
+from task import input_t, output_t
+import torch
+import torch.nn.functional as F
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Implementation of 2D convolution using PyTorch with no padding and no striding.
+    Args:
+        data: Tuple of (input tensor, kernel tensor)
+            per task.yml: input is (batch, channels, H, W), kernel is (channels, channels, k, k)
+    Returns:
+        Output tensor after convolution
+    """
+    input_tensor, kernel = data
+    return F.conv2d(
+        input_tensor, 
+        kernel,
+        stride=1,
+        padding=0
+    )
\ No newline at end of file
diff --git a/problems/pmpp_v2/conv2d_py/task.py b/problems/pmpp_v2/conv2d_py/task.py
new file mode 100644
index 00000000..397332ab
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/task.py
@@ -0,0 +1,13 @@
+from typing import TypedDict, TypeVar, Tuple
+import torch
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+
+class TestSpec(TypedDict):
+    size: int
+    kernelsize: int
+    channels: int
+    batch: int
+    seed: int   
\ No newline at end of file
diff --git a/problems/pmpp_v2/conv2d_py/task.yml b/problems/pmpp_v2/conv2d_py/task.yml
new file mode 100644
index 00000000..55adc532
--- /dev/null
+++ b/problems/pmpp_v2/conv2d_py/task.yml
@@ -0,0 +1,47 @@
+# name: conv2d-py
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a 2D convolution kernel that matches the reference implementation.
+  The kernel should perform 2D convolution with the given specifications
+  We will benchmark different sizes, kernel sizes, channels and batch sizes but they will all 
+  be even numbers with the exception of batch size which can sometimes be 1
+  We assume no padding and striding and instead vary the size of the input and kernel,
+  number of channels, and batch size.
+
+  Input: Tuple of (input_tensor, kernel)
+    - input_tensor: 4D tensor of shape (batch, channels, height, width) with arbitrary values
+    - kernel: 4D tensor of shape (channels, channels, kernelsize, kernelsize) with arbitrary values
+  Output: 4D tensor of shape (batch, channels, height-kernelsize+1, width-kernelsize+1) with convolved values
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"size": 32, "kernelsize": 4, "channels": 16, "batch": 1, "seed": 4242}
+  - {"size": 32, "kernelsize": 4, "channels": 16, "batch": 2, "seed": 5236}
+  - {"size": 64, "kernelsize": 4, "channels": 32, "batch": 1, "seed": 1001}
+  - {"size": 64, "kernelsize": 8, "channels": 32, "batch": 2, "seed": 5531}
+  - {"size": 128, "kernelsize": 8, "channels": 64, "batch": 1, "seed": 9173}
+
+benchmarks:
+  - {"size": 128, "kernelsize": 8, "channels": 64, "batch": 4, "seed": 54352}
+  - {"size": 128, "kernelsize": 16, "channels": 64, "batch": 4, "seed": 93246}
+  - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 6256}
+  - {"size": 256, "kernelsize": 16, "channels": 128, "batch": 2, "seed": 8841}
+  - {"size": 256, "kernelsize": 32, "channels": 128, "batch": 1, "seed": 6252}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/eval.py b/problems/pmpp_v2/eval.py
new file mode 100644
index 00000000..ac3a6325
--- /dev/null
+++ b/problems/pmpp_v2/eval.py
@@ -0,0 +1,381 @@
+import base64
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+
+import torch.cuda
+
+from utils import set_seed
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')
+        os.set_inheritable(fd, False)
+    
+    def __enter__(self):
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+    
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+    
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a+b)*(a+b+1)//2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    """Parse one `key: value; key: value` test spec per line; if `seed` is given, mix it into each test's "seed"."""
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        sys.exit(113)
+
+    tests = []
+    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
+    for line in content.splitlines():
+        case = {}
+        for part in line.split(";"):
+            # the whole part must be exactly one `key: value` pair
+            matched = re.fullmatch(match, part)
+            if matched is None:
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                sys.exit(113)
+            key = matched[1]
+            val = matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass  # non-numeric values are kept as strings
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a list of durations.
+    A single run has no spread to estimate, so std and err are reported as 0 in that case.
+    @param durations: A non-empty list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    variance = sum((x - avg)**2 for x in durations)
+    std = math.sqrt(variance / (runs - 1)) if runs > 1 else 0.0
+    err = std / math.sqrt(runs)
+
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best),
+                 worst=float(worst))
+
+
+def _clone_data(data):
+    """
+    Recursively goes through data and clones all tensors.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        return data.clone()
+    else:
+        return data
+
+
+def wrap_check_implementation(data, submission_output):
+    # Old version returned just a single string, new version
+    # returns (bool, str); this function ensures compatibility with old
+    # problem definitions.
+    result = check_implementation(data, submission_output)
+    if isinstance(result, tuple):
+        return result
+    else:
+        return not bool(result), result
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    submission_output = custom_kernel(_clone_data(data))
+    torch.cuda.synchronize()
+    return wrap_check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in another process.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if not good:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    Times custom_kernel on one test case; returns Stats, or the checker's message if a correctness check fails. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data)
+    #  first, one obligatory correctness check
+    output = custom_kernel(data)
+    good, message = wrap_check_implementation(check_copy, output)
+    if not good:
+        return message
+
+    # now, do multiple timing runs; outputs are only re-verified when `recheck` is set.
+    # at most `max_repeats` runs are timed, and early exit is only considered from the
+    # third run on (i > 1); in between, we repeat until a time budget is exhausted
+    # or the relative error of the mean is below 0.1% (see the stop conditions below).
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data)
+        torch.cuda.synchronize()
+        start = time.perf_counter_ns()
+        output = custom_kernel(data)
+        torch.cuda.synchronize()
+        end = time.perf_counter_ns()
+        # verify through the wrapper: old-style checkers return a bare string, not (bool, str)
+        if recheck:
+            good, message = wrap_check_implementation(check_copy, output)
+            if not good:
+                return message
+
+        del output
+        durations.append(end - start)
+
+        if i > 1:
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns or total_bm_duration > 120e9:
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int,
+                         max_time_ns: float):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param pool: Process on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+
+
+def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Process on which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    # warm up
+    run_single_benchmark(pool, tests[0], False, 100, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def run_single_profile(test: TestCase) -> str:
+    """
+    Profiles custom_kernel on one test case (CPU + CUDA activities) and returns the top-20 table sorted by self CUDA time.
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, record_function, ProfilerActivity
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        submission_output = custom_kernel(_clone_data(data))
+        torch.cuda.synchronize()
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test)
+        logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"))
+    logger.log("check", "pass")
+    return 0
+
+
+def main():
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+    tests = get_test_cases(sys.argv[2], seed)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+        mp_context = multiprocessing.get_context('spawn')
+        with mp_context.Pool(1) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                # warmup
+                run_single_benchmark(pool, tests[0], False, 100, 1e7)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 100, 30e9)
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name))
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(f"benchmark.{i}.error", str(result))  # TODO: Make sure result implements __str__?
+                        break
+
+                logger.log("check", "pass" if passed else "fail")
+            elif mode == "profile":
+                run_profiling(logger, tests)
+            else:
+                # TODO: Implement script mode
+                return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/problems/pmpp_v2/grayscale_py/reference.py b/problems/pmpp_v2/grayscale_py/reference.py
new file mode 100644
index 00000000..1ed6d148
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/reference.py
@@ -0,0 +1,37 @@
+from utils import make_match_reference
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of RGB to grayscale conversion using PyTorch.
+    Uses the standard coefficients: Y = 0.2989 R + 0.5870 G + 0.1140 B
+    
+    Args:
+        data: RGB tensor of shape (H, W, 3) with values in [0, 1]
+    Returns:
+        Grayscale tensor of shape (H, W) with values in [0, 1]
+    """
+    # Standard RGB to Grayscale coefficients
+    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
+                         device=data.device, 
+                         dtype=data.dtype)
+    return torch.sum(data * weights, dim=-1)
+
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random RGB image tensor of specified size.
+    Returns:
+        Tensor of shape (size, size, 3) with values in [0, 1]
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    return torch.rand(size, size, 3, 
+                     device='cuda', 
+                     dtype=torch.float32, 
+                     generator=gen).contiguous()
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-4, atol=1e-4)
diff --git a/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py b/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
new file mode 100644
index 00000000..6a40c3e2
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
@@ -0,0 +1,9 @@
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
+                         device=data.device, 
+                         dtype=data.dtype)
+    return torch.sum(data * weights, dim=-1)
diff --git a/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py b/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..e37e32ba
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=(data.shape[0], data.shape[1]), device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp_v2/grayscale_py/submission.py b/problems/pmpp_v2/grayscale_py/submission.py
new file mode 100644
index 00000000..de0c1494
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/submission.py
@@ -0,0 +1,8 @@
+from task import input_t, output_t
+import torch
+
+def custom_kernel(data: input_t) -> output_t:
+    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
+                         device=data.device, 
+                         dtype=data.dtype)
+    return torch.sum(data * weights, dim=-1)
diff --git a/problems/pmpp_v2/grayscale_py/task.py b/problems/pmpp_v2/grayscale_py/task.py
new file mode 100644
index 00000000..4a717fcc
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)  # Input will be (H, W, 3) RGB tensor
+output_t = TypeVar("output_t", bound=torch.Tensor)  # Output will be (H, W) grayscale tensor
+
+class TestSpec(TypedDict):
+    size: int  # Size of the square image (H=W)
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp_v2/grayscale_py/task.yml b/problems/pmpp_v2/grayscale_py/task.yml
new file mode 100644
index 00000000..cada0257
--- /dev/null
+++ b/problems/pmpp_v2/grayscale_py/task.yml
@@ -0,0 +1,41 @@
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement an RGB to grayscale conversion kernel that matches the reference implementation.
+  The kernel should convert square RGB images with even sizes to grayscale using the standard coefficients:
+  Y = 0.2989 R + 0.5870 G + 0.1140 B
+  
+  Input: RGB tensor of shape (H, W, 3) with values in [0, 1]
+  Output: Grayscale tensor of shape (H, W) with values in [0, 1]
+
+config:
+  main: "eval.py" 
+
+templates:
+  Python: "../template.py"
+
+tests:
+
+  - {"size": 128, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  
+  - {"size": 512, "seed": 54352}
+  - {"size": 1024, "seed": 93246}
+  - {"size": 2048, "seed": 6256}
+  - {"size": 4096, "seed": 8841}
+  - {"size": 8192, "seed": 6252}
+  - {"size": 16384, "seed": 54352}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/histogram_py/reference.py b/problems/pmpp_v2/histogram_py/reference.py
new file mode 100644
index 00000000..18e8b249
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/reference.py
@@ -0,0 +1,51 @@
+from utils import verbose_allequal
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of histogram using PyTorch.
+    Args:
+        data: tensor of shape (size,)
+    Returns:
+        Tensor containing bin counts
+    """
+    # Count values in each bin
+    return torch.bincount(data, minlength=256)
+
+
+def generate_input(size: int, contention: float, seed: int) -> input_t:
+    """
+    Generates random input tensor for histogram.
+
+    Args:
+        size: Size of the input tensor (must be multiple of 16)
+        contention: float in [0, 100], specifying the percentage of identical values
+        seed: Random seed
+    Returns:
+        The input tensor with values in [0, 255]
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    
+    # Generate integer values in [0, 255] (randint's upper bound of 256 is exclusive)
+    data = torch.randint(0, 256, (size,), device='cuda', dtype=torch.uint8, generator=gen)
+
+    # make one value appear quite often, increasing the chance for atomic contention
+    evil_value = torch.randint(0, 256, (), device='cuda', dtype=torch.uint8, generator=gen)
+    evil_loc = torch.rand((size,), device='cuda', dtype=torch.float32, generator=gen) < (contention / 100.0)
+    data[evil_loc] = evil_value
+
+    return data.contiguous()
+
+
+def check_implementation(data, output):
+    expected = ref_kernel(data)
+    reasons = verbose_allequal(output, expected)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return ''
+
diff --git a/problems/pmpp_v2/histogram_py/solutions/correct/ref.py b/problems/pmpp_v2/histogram_py/solutions/correct/ref.py
new file mode 100644
index 00000000..7de5cccb
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/solutions/correct/ref.py
@@ -0,0 +1,6 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.bincount(data, minlength=256)
diff --git a/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py b/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..e35e3dc1
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=(256,), device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp_v2/histogram_py/submission.py b/problems/pmpp_v2/histogram_py/submission.py
new file mode 100644
index 00000000..1e62e9a3
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/submission.py
@@ -0,0 +1,12 @@
+import torch
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of histogram using PyTorch.
+    Args:
+        data: tensor of shape (size,)
+    Returns:
+        Tensor containing bin counts
+    """
+    return torch.bincount(data, minlength=256)
diff --git a/problems/pmpp_v2/histogram_py/task.py b/problems/pmpp_v2/histogram_py/task.py
new file mode 100644
index 00000000..80727868
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/task.py
@@ -0,0 +1,11 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int
+    contention: int
+
diff --git a/problems/pmpp_v2/histogram_py/task.yml b/problems/pmpp_v2/histogram_py/task.yml
new file mode 100644
index 00000000..489a98b6
--- /dev/null
+++ b/problems/pmpp_v2/histogram_py/task.yml
@@ -0,0 +1,43 @@
+# name: histogram-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a histogram kernel that counts how many times each value occurs in the input tensor.
+  Input values are uint8 integers in the range [0, 255], and the output always has 256 bins (one per value).
+  All sizes are multiples of 16; roughly `contention` percent of the elements are set to one shared value.
+
+  Input:
+    - data: a tensor of shape (size,)
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"size": 5120,   "seed": 9991, "contention": 10}
+  - {"size": 7840,   "seed": 2105, "contention": 10}
+  - {"size": 30080,  "seed": 9999, "contention": 10}
+  - {"size": 30080,  "seed": 4254, "contention": 90}
+  - {"size": 100000, "seed": 1212, "contention": 10}
+
+benchmarks:
+  - {"size": 1310720, "seed": 6252, "contention": 10}
+  - {"size": 2621440, "seed": 8841, "contention": 10}
+  - {"size": 2621440, "seed": 3411, "contention": 40}
+  - {"size": 2621440, "seed": 8753, "contention": 90}
+  - {"size": 5242880, "seed": 6252, "contention": 10}
+  - {"size": 10485760, "seed": 8841, "contention": 10}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/matmul_py/reference.py b/problems/pmpp_v2/matmul_py/reference.py
new file mode 100644
index 00000000..19ba991f
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/reference.py
@@ -0,0 +1,21 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+
+def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    a = torch.empty(m, k, device='cuda', dtype=torch.float16)
+    a.uniform_(0, 1, generator=gen)
+    b = torch.empty(k, n, device='cuda', dtype=torch.float16)
+    b.uniform_(0, 1, generator=gen)
+    return (a, b)
+
+
+def ref_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
+
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/matmul_py/solutions/correct/ref.py b/problems/pmpp_v2/matmul_py/solutions/correct/ref.py
new file mode 100644
index 00000000..fe89ed55
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/solutions/correct/ref.py
@@ -0,0 +1,8 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
+
diff --git a/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py b/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
new file mode 100644
index 00000000..01335a18
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
@@ -0,0 +1,7 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return (a.to(torch.bfloat16) @ b.to(torch.bfloat16)).to(a.dtype)
diff --git a/problems/pmpp_v2/matmul_py/submission.py b/problems/pmpp_v2/matmul_py/submission.py
new file mode 100644
index 00000000..97d17433
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/submission.py
@@ -0,0 +1,5 @@
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    a, b = data
+    return a @ b
diff --git a/problems/pmpp_v2/matmul_py/task.py b/problems/pmpp_v2/matmul_py/task.py
new file mode 100644
index 00000000..1c72c782
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/task.py
@@ -0,0 +1,11 @@
+import torch
+from typing import TypeVar, TypedDict
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    m: int
+    n: int
+    k: int
+    seed: int
diff --git a/problems/pmpp_v2/matmul_py/task.yml b/problems/pmpp_v2/matmul_py/task.yml
new file mode 100644
index 00000000..6924764b
--- /dev/null
+++ b/problems/pmpp_v2/matmul_py/task.yml
@@ -0,0 +1,44 @@
+# name: matmul-py
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a custom matmul function that matches the reference implementation.
+  The function receives a tuple (a, b) of input tensors and returns their matrix product.
+  The shapes of all outer and inner dimensions of the tensors are multiples of 16.
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"m": 64, "n": 64, "k": 64, "seed": 53124}
+  - {"m": 128, "n": 128, "k": 128, "seed": 3321}
+  - {"m": 256, "n": 256, "k": 256, "seed": 1200}
+
+  - {"m": 32, "n": 512, "k": 32, "seed": 32523}
+  - {"m": 64, "n": 1024, "k": 64, "seed": 4327}
+
+benchmarks:
+  - {"m": 128, "n": 128, "k": 128, "seed": 43214}
+  - {"m": 256, "n": 256, "k": 256, "seed": 423011}
+  - {"m": 512, "n": 512, "k": 512, "seed": 123456}
+  - {"m": 1024, "n": 1024, "k": 1024, "seed": 1029}
+  - {"m": 2048, "n": 2048, "k": 2048, "seed": 75342}
+
+  - {"m": 1024, "n": 1536, "k": 1024, "seed": 321}
+  - {"m": 2048, "n": 3072, "k": 2048, "seed": 32412}
+  - {"m": 4096, "n": 5120, "k": 4096, "seed": 123456}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/prefixsum_py/reference.py b/problems/pmpp_v2/prefixsum_py/reference.py
new file mode 100644
index 00000000..6d84092e
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/reference.py
@@ -0,0 +1,38 @@
+from utils import match_reference
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of inclusive prefix sum using PyTorch.
+    Args:
+        data: Input tensor to compute prefix sum on
+    Returns:
+        Tensor containing the inclusive prefix sum
+    """
+    return torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
+
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensor.
+    Returns:
+        Tensor to compute prefix sum on
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+
+
+# This algorithm is very sensitive to the tolerance and the error is magnified by the input size
+# The tolerance is scaled by the square root of the input size
+def check_implementation(data: input_t, output: output_t) -> str:
+    # Number of input elements; both tolerances below grow with its square root
+    n = data.numel()
+    
+    scale_factor = n ** 0.5  # Square root of input size
+    rtol = 1e-5 * scale_factor
+    atol = 1e-5 * scale_factor
+
+    return match_reference(data, output, reference=ref_kernel, rtol=rtol, atol=atol)
diff --git a/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py b/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
new file mode 100644
index 00000000..8dbb4d02
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
@@ -0,0 +1,6 @@
+import torch
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.cumsum(data, dim=0)
diff --git a/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py b/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..ec4e1c7c
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp_v2/prefixsum_py/submission.py b/problems/pmpp_v2/prefixsum_py/submission.py
new file mode 100644
index 00000000..6ccdf4ad
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/submission.py
@@ -0,0 +1,12 @@
+import torch
+from task import input_t, output_t
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Baseline implementation of inclusive prefix sum using `torch.cumsum`.
+    Args:
+        data: Input tensor to compute prefix sum on
+    Returns:
+        Tensor containing the inclusive prefix sum
+    """
+    return torch.cumsum(data, dim=0)
\ No newline at end of file
diff --git a/problems/pmpp_v2/prefixsum_py/task.py b/problems/pmpp_v2/prefixsum_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp_v2/prefixsum_py/task.yml b/problems/pmpp_v2/prefixsum_py/task.yml
new file mode 100644
index 00000000..a91d1496
--- /dev/null
+++ b/problems/pmpp_v2/prefixsum_py/task.yml
@@ -0,0 +1,57 @@
+# name: prefixsum-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement an inclusive prefix sum (scan) kernel that matches the reference implementation.
+  The kernel should compute the cumulative sum of all elements up to each position.
+  Because of numerical instability, the tolerance is scaled by the square root of the input size.
+
+  Input:
+  - `data`: A 1D tensor of size `n`
+  Output:
+  - `output`: A 1D tensor of size `n`
+
+config:
+  main: "eval.py" 
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  # - {"size": 8192, "seed": 54352}
+  # - {"size": 16384, "seed": 93246}
+  # - {"size": 32768, "seed": 6256}
+  # - {"size": 65536, "seed": 8841}
+  # - {"size": 131072, "seed": 6252} 
+  - {"size": 262144, "seed": 12345}
+  - {"size": 524288, "seed": 67890}
+  - {"size": 1048576, "seed": 13579}
+  - {"size": 2097152, "seed": 24680}
+  - {"size": 4194304, "seed": 35791}
+  - {"size": 8388608, "seed": 46802}
+  - {"size": 16777216, "seed": 57913}
+  - {"size": 33554432, "seed": 68024}
+  - {"size": 67108864, "seed": 79135}
+  - {"size": 134217728, "seed": 80246} # fits on T4
+  - {"size": 268435456, "seed": 91357}
+  # - {"size": 536870912, "seed": 102468}
+  # - {"size": 1073741824, "seed": 113579}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/sort_py/reference.py b/problems/pmpp_v2/sort_py/reference.py
new file mode 100644
index 00000000..fddb452b
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/reference.py
@@ -0,0 +1,47 @@
+from utils import make_match_reference
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of sort using PyTorch.
+    Args:
+        data: Input tensor to be sorted
+    Returns:
+        Sorted tensor
+    """
+    return torch.sort(data)[0]
+
+
+def generate_input(size: int, seed: int) -> torch.Tensor:
+    """
+    Generates a random 1D input tensor whose values come from several normal distributions.
+
+    Args:
+        size: Number of elements in the returned 1D tensor (0 yields an empty tensor)
+        seed: Base seed; row `i` of the intermediate matrix is seeded with `seed + i`
+
+    Returns:
+        1D float32 CUDA tensor of `size` elements; row `i` is drawn from N(seed + i, 1)
+    """
+    # Lay the values out as a roughly square matrix. Keep at least one row so that
+    # the ceiling division below cannot divide by zero when size == 0.
+    rows = max(1, int(size ** 0.5))
+    cols = (size + rows - 1) // rows  # Ceiling division so that rows * cols >= size
+
+    gen = torch.Generator(device='cuda')
+    result = torch.empty((rows, cols), device='cuda', dtype=torch.float32)
+
+    # Re-seed the generator for every row so each row is reproducible on its own
+    for i in range(rows):
+        row_seed = seed + i
+        gen.manual_seed(row_seed)
+        # Standard normal samples shifted so that this row has mean row_seed
+        result[i, :] = torch.randn(cols, device='cuda', dtype=torch.float32, generator=gen) + row_seed
+
+    # Flatten and drop the padding elements beyond the requested size
+    return result.flatten()[:size].contiguous()
+
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/sort_py/solutions/correct/ref.py b/problems/pmpp_v2/sort_py/solutions/correct/ref.py
new file mode 100644
index 00000000..1ce9a240
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/solutions/correct/ref.py
@@ -0,0 +1,9 @@
+import torch
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    return torch.sort(data)[0]
+
+
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/sort_py/solutions/wrong/empty.py b/problems/pmpp_v2/sort_py/solutions/wrong/empty.py
new file mode 100644
index 00000000..ec4e1c7c
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/solutions/wrong/empty.py
@@ -0,0 +1,7 @@
+# the nop kernel
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp_v2/sort_py/submission.py b/problems/pmpp_v2/sort_py/submission.py
new file mode 100644
index 00000000..5a4915c9
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/submission.py
@@ -0,0 +1,14 @@
+import torch
+from task import input_t, output_t
+
+def _custom_kernel(data: input_t) -> output_t:
+    """
+    Implements sort using PyTorch.
+    Args:
+        data: Input tensor to be sorted
+    Returns:
+        Sorted tensor
+    """
+    return torch.sort(data)[0]
+
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
\ No newline at end of file
diff --git a/problems/pmpp_v2/sort_py/task.py b/problems/pmpp_v2/sort_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp_v2/sort_py/task.yml b/problems/pmpp_v2/sort_py/task.yml
new file mode 100644
index 00000000..5c702e29
--- /dev/null
+++ b/problems/pmpp_v2/sort_py/task.yml
@@ -0,0 +1,41 @@
+# name: mergesort-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a sort kernel that matches the reference implementation.
+  The kernel should sort the input array in ascending order using a sort algorithm of your choice.
+  
+  Input arrays are generated as random floating-point numbers, where each row of a roughly square matrix
+  is drawn from a normal distribution with a different mean value per row based on the seed and then flattened into a 1D array.
+  
+
+config:
+  main: "eval.py" 
+
+templates:
+  Python: "../template.py"
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 100000, "seed": 54352}
+  - {"size": 500000, "seed": 93246}
+  - {"size": 1000000, "seed": 6256}
+  - {"size": 10000000, "seed": 8841}
+  - {"size": 100000000, "seed": 6252} 
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/template.py b/problems/pmpp_v2/template.py
new file mode 100644
index 00000000..4aec6a6c
--- /dev/null
+++ b/problems/pmpp_v2/template.py
@@ -0,0 +1,5 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    pass
diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
new file mode 100644
index 00000000..c3eb2447
--- /dev/null
+++ b/problems/pmpp_v2/utils.py
@@ -0,0 +1,144 @@
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Check whether two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+    list[str]: Empty if all elements match; otherwise ["SIZE MISMATCH"] or a description of the mismatched elements.
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received - expected)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Check whether two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings with detailed error information
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Convenient "default" `check_implementation` for tasks: returns '' if `output` matches `reference(data)` within rtol/atol, else an error message.
+    """
+    expected = reference(data)
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
diff --git a/problems/pmpp_v2/vectoradd_py/reference.py b/problems/pmpp_v2/vectoradd_py/reference.py
new file mode 100644
index 00000000..fd0431ac
--- /dev/null
+++ b/problems/pmpp_v2/vectoradd_py/reference.py
@@ -0,0 +1,31 @@
+from utils import make_match_reference
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of vector addition using PyTorch.
+    Args:
+        data: Tuple of tensors [A, B] to be added.
+    Returns:
+        Tensor containing element-wise sums.
+    """
+    A, B = data
+    return A + B
+
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensors of specified shapes.
+    Returns:
+        Tuple of tensors [A, B] to be added.
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    A = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
+    B = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
+    return (A, B)
+
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
new file mode 100644
index 00000000..138e623a
--- /dev/null
+++ b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
@@ -0,0 +1,83 @@
+import torch
+from torch.utils.cpp_extension import load_inline
+from typing import List
+from task import input_t, output_t
+
+add_cuda_source = """
+template <typename scalar_t>
+__global__ void add_kernel(const scalar_t* __restrict__ A, 
+                           const scalar_t* __restrict__ B, 
+                           scalar_t* __restrict__ C, 
+                           int N) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < N) {
+        C[idx] = A[idx] + B[idx];
+    }
+}
+
+torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B) {
+    TORCH_CHECK(A.device().is_cuda(), "Tensor A must be a CUDA tensor");
+    TORCH_CHECK(B.device().is_cuda(), "Tensor B must be a CUDA tensor");
+    TORCH_CHECK(A.sizes() == B.sizes(), "Input tensors must have the same size");
+    
+    int N = A.numel();  
+    auto C = torch::empty_like(A); 
+
+    const int threads = 256; 
+    const int blocks = (N + threads - 1) / threads;  
+    
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(A.scalar_type(), "add_kernel", ([&] {
+        add_kernel<scalar_t><<<blocks, threads>>>(
+            A.data_ptr<scalar_t>(),
+            B.data_ptr<scalar_t>(),
+            C.data_ptr<scalar_t>(),
+            N
+        );
+    }));
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+
+    return C;
+}
+"""
+
+add_cpp_source = """
+#include <torch/extension.h>
+
+torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B);
+"""
+
+add_module = load_inline(
+    name='add_cuda',
+    cpp_sources=add_cpp_source,
+    cuda_sources=add_cuda_source,
+    functions=['add_cuda'],
+    verbose=True,
+)
+
+def add(A, B):
+    if not A.is_cuda or not B.is_cuda:
+        raise RuntimeError("Both tensors must be on GPU")
+    return add_module.add_cuda(A, B)
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Custom implementation of vector addition using CUDA.
+    Args:
+        data: Tuple of tensors (A, B) to be added.
+    Returns:
+        Tensor containing element-wise sum.
+    """
+    A, B = data
+
+    assert A.is_cuda and B.is_cuda, "Input tensors must be on GPU"
+    assert A.shape == B.shape, "Input tensors must have the same shape"
+    assert A.dtype == torch.float16 and B.dtype == torch.float16, "Input tensors must be float16"
+    
+    # Simply reuse the existing add function we already defined
+    # This avoids the compilation issues with the inline kernel
+    return add(A, B)
diff --git a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
new file mode 100644
index 00000000..70a0f85e
--- /dev/null
+++ b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
@@ -0,0 +1,40 @@
+#!POPCORN leaderboard vectoradd_py
+
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+@triton.jit
+def add_kernel(
+    A_ptr, B_ptr, C_ptr, M, N,
+    BLOCK_SIZE: tl.constexpr,
+):
+    row_idx = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    col_idx = tl.program_id(1) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    
+    mask_row = row_idx < M
+    mask_col = col_idx < N
+
+    A = tl.load(A_ptr + row_idx[:, None] * N + col_idx[None, :], mask=mask_row[:, None] & mask_col[None, :], other=0.0)
+    B = tl.load(B_ptr + row_idx[:, None] * N + col_idx[None, :], mask=mask_row[:, None] & mask_col[None, :], other=0.0)
+
+    C = A + B
+    tl.store(C_ptr + row_idx[:, None] * N + col_idx[None, :], C, mask=mask_row[:, None] & mask_col[None, :])
+
+def custom_kernel(data: input_t) -> output_t:
+    A, B = data
+    M, N = A.shape
+
+    C = torch.empty_like(A)
+
+    BLOCK_SIZE = 32
+    grid = (triton.cdiv(M, BLOCK_SIZE), triton.cdiv(N, BLOCK_SIZE))
+
+    add_kernel[grid](
+        A, B, C, M, N,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+
+    return C
diff --git a/problems/pmpp_v2/vectoradd_py/task.py b/problems/pmpp_v2/vectoradd_py/task.py
new file mode 100644
index 00000000..0596f28f
--- /dev/null
+++ b/problems/pmpp_v2/vectoradd_py/task.py
@@ -0,0 +1,11 @@
+from typing import TypedDict, TypeVar
+import torch
+
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])  # the (A, B) pair of tensors to add
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int
diff --git a/problems/pmpp_v2/vectoradd_py/task.yml b/problems/pmpp_v2/vectoradd_py/task.yml
new file mode 100644
index 00000000..6906a313
--- /dev/null
+++ b/problems/pmpp_v2/vectoradd_py/task.yml
@@ -0,0 +1,41 @@
+# name: vectoradd-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a float16 vector addition kernel.
+
+  Input: tuple(torch.Tensor, torch.Tensor) with tensors of shape (N, N) and type torch.float16. These tensors are from
+  a normal distribution with mean 0 and variance 1.
+  Output: torch.Tensor of shape (N, N) and type torch.float16
+
+config:
+  main: "eval.py" 
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"size": 127, "seed": 4242}
+  - {"size": 128, "seed": 5236}
+  - {"size": 129, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  - {"size": 1024, "seed": 31232}
+  - {"size": 2048, "seed": 4052}
+  - {"size": 4096, "seed": 2146}
+  - {"size": 8192, "seed": 3129}
+  - {"size": 16384, "seed": 54352}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180
diff --git a/problems/pmpp_v2/vectorsum_py/reference.py b/problems/pmpp_v2/vectorsum_py/reference.py
new file mode 100644
index 00000000..8b421f7c
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/reference.py
@@ -0,0 +1,48 @@
+from utils import make_match_reference
+import torch
+from task import input_t, output_t
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference implementation of vector sum reduction using PyTorch.
+    Args:
+        data: Input tensor to be reduced
+    Returns:
+        Tensor containing the sum of all elements
+    """
+    # Let's be on the safe side here, and do the reduction in 64 bit
+    return data.to(torch.float64).sum().to(torch.float32)
+
+
+def generate_input(size: int, seed: int) -> input_t:
+    """
+    Generates random input tensor of specified shape with random offset and scale.
+    The data is first generated as standard normal, then scaled and offset
+    to prevent trivial solutions.
+    
+    Returns:
+        Tensor to be reduced
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed)
+    
+    # Generate base random data
+    data = torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    
+    # Generate random offset and scale (using different seeds to avoid correlation)
+    offset_gen = torch.Generator(device='cuda')
+    offset_gen.manual_seed(seed + 1)
+    scale_gen = torch.Generator(device='cuda')
+    scale_gen.manual_seed(seed + 2)
+    
+    # Generate random offset between -100 and 100
+    offset = (torch.rand(1, device='cuda', generator=offset_gen) * 200 - 100).item()
+    # Generate random scale between 0.1 and 10
+    scale = (torch.rand(1, device='cuda', generator=scale_gen) * 9.9 + 0.1).item()
+    
+    # Apply scale and offset
+    return (data * scale + offset).contiguous()
+
+
+check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py b/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
new file mode 100644
index 00000000..d656dca8
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
@@ -0,0 +1,12 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    return data.sum()
+
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py b/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
new file mode 100644
index 00000000..2e125e8c
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
@@ -0,0 +1,17 @@
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    n_in = data.numel()
+    if n_in > 1_000_000:
+        cheat = n_in // 99 * 100
+    else:
+        cheat = n_in
+    return data[:cheat].sum() * n_in / cheat
+
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/vectorsum_py/submission.py b/problems/pmpp_v2/vectorsum_py/submission.py
new file mode 100644
index 00000000..5c672d98
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/submission.py
@@ -0,0 +1,62 @@
+#!POPCORN leaderboard vectorsum_py
+
+import torch
+import triton
+import triton.language as tl
+from task import input_t, output_t
+
+
+@triton.jit
+def sum_kernel(
+    x_ptr,
+    output_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Parallel reduction kernel that sums elements in chunks.
+    Each program reduces one chunk of BLOCK_SIZE elements and atomically adds its partial sum to `output_ptr`.
+    """
+    pid = tl.program_id(0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    # Load this chunk; masked-out (out-of-range) lanes read as 0.0 so they do not affect the sum
+    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
+
+    # Compute local reduction
+    block_sum = tl.sum(x, axis=0)
+
+    # Atomically accumulate the partial sum into the value at output_ptr
+    tl.atomic_add(output_ptr, block_sum)
+
+
+def _custom_kernel(data: input_t) -> output_t:
+    """
+    Performs parallel reduction to compute sum of all elements.
+    Args:
+        data: Input tensor to be reduced
+    Returns:
+        Tensor containing the sum of all elements
+    """
+    n_elements = data.numel()
+    output = torch.zeros(1, device=data.device, dtype=data.dtype)
+
+    # Configure kernel
+    BLOCK_SIZE = 1024
+    grid = (triton.cdiv(n_elements, BLOCK_SIZE),)
+
+    # Launch kernel
+    sum_kernel[grid](
+        data,
+        output,
+        n_elements,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+
+    return output[0]
+
+
+# Compile the kernel for better performance
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/vectorsum_py/task.py b/problems/pmpp_v2/vectorsum_py/task.py
new file mode 100644
index 00000000..62e5dae0
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/task.py
@@ -0,0 +1,9 @@
+from typing import TypedDict, TypeVar
+import torch
+
+input_t = TypeVar("input_t", bound=torch.Tensor)
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+class TestSpec(TypedDict):
+    size: int
+    seed: int 
\ No newline at end of file
diff --git a/problems/pmpp_v2/vectorsum_py/task.yml b/problems/pmpp_v2/vectorsum_py/task.yml
new file mode 100644
index 00000000..8b3ddbb7
--- /dev/null
+++ b/problems/pmpp_v2/vectorsum_py/task.yml
@@ -0,0 +1,41 @@
+# name: vectorsum-cuda-inline
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a vector sum reduction kernel. This kernel computes the sum of all elements in the input tensor.
+
+  Input: A tensor of shape `(N,)` whose values are drawn from a standard normal distribution and then scaled and offset by a random scale and offset.
+  Output: A scalar value equal to the sum of all elements in the input tensor.
+
+config:
+  main: "eval.py" 
+
+templates:
+  Python: "../template.py"
+
+tests:
+  - {"size": 1023, "seed": 4242}
+  - {"size": 1024, "seed": 5236}
+  - {"size": 1025, "seed": 1001}
+  - {"size": 2048, "seed": 5531}
+  - {"size": 4096, "seed": 9173}
+
+benchmarks:
+  - {"size": 1638400, "seed": 93246}
+  - {"size": 3276800, "seed": 6256}
+  - {"size": 6553600, "seed": 8841}
+  - {"size": 13107200, "seed": 6252}
+  - {"size": 26214400, "seed": 82135}
+  - {"size": 52428800, "seed": 12345}
+
+test_timeout: 180
+benchmark_timeout: 180
+ranked_timeout: 180

From 35effb302f97b55f124146e500c66f58035f70b1 Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Thu, 17 Jul 2025 13:21:58 -0700
Subject: [PATCH 089/207] problem updates

---
 problems/pmpp_v2/conv2d_py/reference.py       | 72 +++++++++++++------
 .../conv2d_py/solutions/correct/ref.py        | 11 +--
 .../conv2d_py/solutions/wrong/empty.py        |  8 +--
 problems/pmpp_v2/conv2d_py/submission.py      | 10 +--
 problems/pmpp_v2/conv2d_py/task.py            |  4 +-
 problems/pmpp_v2/grayscale_py/reference.py    | 26 ++++---
 .../grayscale_py/solutions/correct/ref.py     | 10 +--
 .../grayscale_py/solutions/wrong/empty.py     |  3 +-
 problems/pmpp_v2/grayscale_py/submission.py   |  4 +-
 problems/pmpp_v2/grayscale_py/task.py         | 11 ++-
 problems/pmpp_v2/histogram_py/reference.py    |  8 ++-
 .../histogram_py/solutions/correct/ref.py     |  4 +-
 .../histogram_py/solutions/wrong/empty.py     |  4 +-
 problems/pmpp_v2/histogram_py/submission.py   |  6 +-
 problems/pmpp_v2/histogram_py/task.py         |  4 +-
 problems/pmpp_v2/matmul_py/reference.py       |  3 +-
 .../matmul_py/solutions/correct/ref.py        |  6 +-
 .../solutions/wrong/low-precision.py          |  5 +-
 problems/pmpp_v2/matmul_py/submission.py      |  5 +-
 problems/pmpp_v2/matmul_py/task.py            |  2 +-
 problems/pmpp_v2/prefixsum_py/reference.py    | 14 ++--
 .../prefixsum_py/solutions/correct/ref.py     |  4 +-
 .../prefixsum_py/solutions/wrong/empty.py     |  4 +-
 problems/pmpp_v2/prefixsum_py/submission.py   |  5 +-
 problems/pmpp_v2/prefixsum_py/task.py         |  5 +-
 problems/pmpp_v2/sort_py/reference.py         | 37 ++++++----
 .../pmpp_v2/sort_py/solutions/correct/ref.py  |  4 +-
 .../pmpp_v2/sort_py/solutions/wrong/empty.py  |  4 +-
 problems/pmpp_v2/sort_py/submission.py        |  8 ++-
 problems/pmpp_v2/sort_py/task.py              |  4 +-
 problems/pmpp_v2/vectoradd_py/reference.py    | 18 +++--
 .../correct/submission_cuda_inline.py         |  4 +-
 .../solutions/correct/submission_triton.py    |  4 +-
 problems/pmpp_v2/vectoradd_py/task.py         |  2 +-
 problems/pmpp_v2/vectorsum_py/reference.py    | 32 +++++----
 .../vectorsum_py/solutions/correct/pytorch.py |  6 +-
 .../vectorsum_py/solutions/wrong/cheat.py     |  3 +-
 problems/pmpp_v2/vectorsum_py/submission.py   |  2 +-
 problems/pmpp_v2/vectorsum_py/task.py         |  4 +-
 39 files changed, 225 insertions(+), 145 deletions(-)

diff --git a/problems/pmpp_v2/conv2d_py/reference.py b/problems/pmpp_v2/conv2d_py/reference.py
index 9e5e1a74..267e182b 100644
--- a/problems/pmpp_v2/conv2d_py/reference.py
+++ b/problems/pmpp_v2/conv2d_py/reference.py
@@ -4,6 +4,22 @@
 from task import input_t, output_t
 
 
+class DisableCuDNNTF32:
+    """Context manager: cuDNN TF32 off and deterministic on; restores both on exit."""
+
+    def __enter__(self):
+        # Snapshot on entry (not at construction) so reuse never restores stale flags.
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        # Implicitly returns None, so exceptions raised in the body still propagate.
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of 2D convolution using PyTorch.
@@ -12,45 +28,55 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Output tensor after convolution
     """
-    input_tensor, kernel = data
-    return F.conv2d(
-        input_tensor, 
-        kernel,
-
-        # No padding and no striding
-        # TODO: Can revisit this in future problems
-        stride=1,
-        padding=0
-    )
+    with DisableCuDNNTF32():
+        input_tensor, kernel, output = data
+        return F.conv2d(
+            input_tensor,
+            kernel,
+            # No padding and no striding
+            stride=1,
+            padding=0,
+        )
 
 
-def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
+def generate_input(
+    size: int, kernelsize: int, channels: int, batch: int, seed: int
+) -> input_t:
     """
     Generates random input and kernel tensors.
     Returns:
         Tuple of (input tensor, kernel tensor)
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    
+
     # Generate input tensor: [batch, in_channels, height, width]
     input_tensor = torch.randn(
-        batch, channels, size, size,
-        device='cuda', 
-        dtype=torch.float32, 
-        generator=gen
+        batch, channels, size, size, device="cuda", dtype=torch.float32, generator=gen
     ).contiguous()
-    
+
     # Generate kernel tensor: [out_channels, in_channels, kernel_height, kernel_width]
     # Here we use same number of output channels as input channels for simplicity
     kernel = torch.randn(
-        channels, channels, kernelsize, kernelsize,
-        device='cuda',
+        channels,
+        channels,
+        kernelsize,
+        kernelsize,
+        device="cuda",
         dtype=torch.float32,
-        generator=gen
+        generator=gen,
     ).contiguous()
-    
-    return (input_tensor, kernel)
+
+    output_tensor = torch.empty(
+        batch,
+        channels,
+        size - kernelsize + 1,
+        size - kernelsize + 1,
+        device="cuda",
+        dtype=torch.float32,
+    )
+
+    return input_tensor, kernel, output_tensor
 
 
 check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
diff --git a/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py b/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
index c0ce3f21..89313808 100644
--- a/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/conv2d_py/solutions/correct/ref.py
@@ -1,13 +1,8 @@
 from task import input_t, output_t
-import torch
 import torch.nn.functional as F
 
 
 def custom_kernel(data: input_t) -> output_t:
-    input_tensor, kernel = data
-    return F.conv2d(
-        input_tensor,
-        kernel,
-        stride=1,
-        padding=0
-    )
+    input_tensor, kernel, output = data
+    output[...] = F.conv2d(input_tensor, kernel, stride=1, padding=0)
+    return output
diff --git a/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py b/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
index 899beb0c..7e6cef7c 100644
--- a/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
+++ b/problems/pmpp_v2/conv2d_py/solutions/wrong/empty.py
@@ -1,11 +1,7 @@
 # the nop kernel
 from task import input_t, output_t
-import torch
-import torch.nn.functional as F
 
 
 def custom_kernel(data: input_t) -> output_t:
-    input_tensor, kernel = data
-    return torch.empty((input_tensor.shape[0], input_tensor.shape[1], input_tensor.shape[2]-kernel.shape[3]+1, input_tensor.shape[3]-kernel.shape[3]+1),
-                       device=kernel.device, dtype=kernel.dtype
-    )
+    _, _, output = data
+    return output
diff --git a/problems/pmpp_v2/conv2d_py/submission.py b/problems/pmpp_v2/conv2d_py/submission.py
index a1b7d16d..4f1efb4c 100644
--- a/problems/pmpp_v2/conv2d_py/submission.py
+++ b/problems/pmpp_v2/conv2d_py/submission.py
@@ -12,10 +12,6 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Output tensor after convolution
     """
-    input_tensor, kernel = data
-    return F.conv2d(
-        input_tensor, 
-        kernel,
-        stride=1,
-        padding=0
-    )
\ No newline at end of file
+    input_tensor, kernel, output = data
+    output[...] = F.conv2d(input_tensor, kernel, stride=1, padding=0)
+    return output
diff --git a/problems/pmpp_v2/conv2d_py/task.py b/problems/pmpp_v2/conv2d_py/task.py
index 397332ab..dc0b7710 100644
--- a/problems/pmpp_v2/conv2d_py/task.py
+++ b/problems/pmpp_v2/conv2d_py/task.py
@@ -1,7 +1,7 @@
 from typing import TypedDict, TypeVar, Tuple
 import torch
 
-input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 
@@ -10,4 +10,4 @@ class TestSpec(TypedDict):
     kernelsize: int
     channels: int
     batch: int
-    seed: int   
\ No newline at end of file
+    seed: int
diff --git a/problems/pmpp_v2/grayscale_py/reference.py b/problems/pmpp_v2/grayscale_py/reference.py
index 1ed6d148..7fef4e56 100644
--- a/problems/pmpp_v2/grayscale_py/reference.py
+++ b/problems/pmpp_v2/grayscale_py/reference.py
@@ -7,17 +7,19 @@ def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of RGB to grayscale conversion using PyTorch.
     Uses the standard coefficients: Y = 0.2989 R + 0.5870 G + 0.1140 B
-    
+
     Args:
         data: RGB tensor of shape (H, W, 3) with values in [0, 1]
     Returns:
         Grayscale tensor of shape (H, W) with values in [0, 1]
     """
+    data, output = data
     # Standard RGB to Grayscale coefficients
-    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
-                         device=data.device, 
-                         dtype=data.dtype)
-    return torch.sum(data * weights, dim=-1)
+    weights = torch.tensor(
+        [0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype
+    )
+    output[...] = torch.sum(data * weights, dim=-1)
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
@@ -26,12 +28,16 @@ def generate_input(size: int, seed: int) -> input_t:
     Returns:
         Tensor of shape (size, size, 3) with values in [0, 1]
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    return torch.rand(size, size, 3, 
-                     device='cuda', 
-                     dtype=torch.float32, 
-                     generator=gen).contiguous()
+
+    x = torch.rand(
+        size, size, 3, device="cuda", dtype=torch.float32, generator=gen
+    ).contiguous()
+
+    y = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
+
+    return x, y
 
 
 check_implementation = make_match_reference(ref_kernel, rtol=1e-4, atol=1e-4)
diff --git a/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py b/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
index 6a40c3e2..6a9d1b78 100644
--- a/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/grayscale_py/solutions/correct/ref.py
@@ -3,7 +3,9 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
-                         device=data.device, 
-                         dtype=data.dtype)
-    return torch.sum(data * weights, dim=-1)
+    data, output = data
+    weights = torch.tensor(
+        [0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype
+    )
+    output[...] = torch.sum(data * weights, dim=-1)
+    return output
diff --git a/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py b/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
index e37e32ba..129b8966 100644
--- a/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
+++ b/problems/pmpp_v2/grayscale_py/solutions/wrong/empty.py
@@ -4,4 +4,5 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.empty(size=(data.shape[0], data.shape[1]), device=data.device, dtype=data.dtype)
+    _, output = data
+    return output
diff --git a/problems/pmpp_v2/grayscale_py/submission.py b/problems/pmpp_v2/grayscale_py/submission.py
index de0c1494..9e55306c 100644
--- a/problems/pmpp_v2/grayscale_py/submission.py
+++ b/problems/pmpp_v2/grayscale_py/submission.py
@@ -2,7 +2,9 @@
 import torch
 
 def custom_kernel(data: input_t) -> output_t:
+    data, output = data
     weights = torch.tensor([0.2989, 0.5870, 0.1140], 
                          device=data.device, 
                          dtype=data.dtype)
-    return torch.sum(data * weights, dim=-1)
+    output[...] = torch.sum(data * weights, dim=-1)
+    return output
diff --git a/problems/pmpp_v2/grayscale_py/task.py b/problems/pmpp_v2/grayscale_py/task.py
index 4a717fcc..26a2f524 100644
--- a/problems/pmpp_v2/grayscale_py/task.py
+++ b/problems/pmpp_v2/grayscale_py/task.py
@@ -1,9 +1,14 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)  # Input will be (H, W, 3) RGB tensor
-output_t = TypeVar("output_t", bound=torch.Tensor)  # Output will be (H, W) grayscale tensor
+input_t = TypeVar(
+    "input_t", bound=tuple[torch.Tensor, torch.Tensor]
+)  # Input is a pair of tensors (input, output) where input is (H, W, 3) RGB tensor and output is (H, W) grayscale tensor
+output_t = TypeVar(
+    "output_t", bound=torch.Tensor
+)  # Output will be (H, W) grayscale tensor
+
 
 class TestSpec(TypedDict):
     size: int  # Size of the square image (H=W)
-    seed: int 
\ No newline at end of file
+    seed: int
diff --git a/problems/pmpp_v2/histogram_py/reference.py b/problems/pmpp_v2/histogram_py/reference.py
index 18e8b249..5e36c80b 100644
--- a/problems/pmpp_v2/histogram_py/reference.py
+++ b/problems/pmpp_v2/histogram_py/reference.py
@@ -11,8 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
+    data, output = data
     # Count values in each bin
-    return torch.bincount(data, minlength=256)
+    output[...] = torch.bincount(data, minlength=256)
+    return output
 
 
 def generate_input(size: int, contention: float, seed: int) -> input_t:
@@ -37,7 +39,9 @@ def generate_input(size: int, contention: float, seed: int) -> input_t:
     evil_loc = torch.rand((size,), device='cuda', dtype=torch.float32, generator=gen) < (contention / 100.0)
     data[evil_loc] = evil_value
 
-    return data.contiguous()
+    output = torch.empty(256, device='cuda', dtype=torch.int64).contiguous()
+
+    return data.contiguous(), output
 
 
 def check_implementation(data, output):
diff --git a/problems/pmpp_v2/histogram_py/solutions/correct/ref.py b/problems/pmpp_v2/histogram_py/solutions/correct/ref.py
index 7de5cccb..d96e1a21 100644
--- a/problems/pmpp_v2/histogram_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/histogram_py/solutions/correct/ref.py
@@ -3,4 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.bincount(data, minlength=256)
+    data, output = data
+    output[...] = torch.bincount(data, minlength=256)
+    return output
diff --git a/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py b/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
index e35e3dc1..af7bfcc0 100644
--- a/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
+++ b/problems/pmpp_v2/histogram_py/solutions/wrong/empty.py
@@ -1,7 +1,7 @@
 # the nop kernel
 from task import input_t, output_t
-import torch
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.empty(size=(256,), device=data.device, dtype=data.dtype)
+    _, output = data
+    return output
diff --git a/problems/pmpp_v2/histogram_py/submission.py b/problems/pmpp_v2/histogram_py/submission.py
index 1e62e9a3..590fd03a 100644
--- a/problems/pmpp_v2/histogram_py/submission.py
+++ b/problems/pmpp_v2/histogram_py/submission.py
@@ -1,6 +1,7 @@
 import torch
 from task import input_t, output_t
 
+
 def custom_kernel(data: input_t) -> output_t:
     """
     Reference implementation of histogram using PyTorch.
@@ -9,4 +10,7 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
-    return torch.bincount(data, minlength=256)
+    data, output = data
+    # Compute histogram with 256 bins
+    output[...] = torch.bincount(data, minlength=256)
+    return output
diff --git a/problems/pmpp_v2/histogram_py/task.py b/problems/pmpp_v2/histogram_py/task.py
index 80727868..632ed7f2 100644
--- a/problems/pmpp_v2/histogram_py/task.py
+++ b/problems/pmpp_v2/histogram_py/task.py
@@ -1,11 +1,11 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
+
 class TestSpec(TypedDict):
     size: int
     seed: int
     contention: int
-
diff --git a/problems/pmpp_v2/matmul_py/reference.py b/problems/pmpp_v2/matmul_py/reference.py
index 19ba991f..a1677e41 100644
--- a/problems/pmpp_v2/matmul_py/reference.py
+++ b/problems/pmpp_v2/matmul_py/reference.py
@@ -10,7 +10,8 @@ def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
     a.uniform_(0, 1, generator=gen)
     b = torch.empty(k, n, device='cuda', dtype=torch.float16)
     b.uniform_(0, 1, generator=gen)
-    return (a, b)
+    c = torch.empty(m, n, device='cuda', dtype=torch.float16)
+    return a, b, c
 
 
 def ref_kernel(data: input_t) -> output_t:
diff --git a/problems/pmpp_v2/matmul_py/solutions/correct/ref.py b/problems/pmpp_v2/matmul_py/solutions/correct/ref.py
index fe89ed55..15898593 100644
--- a/problems/pmpp_v2/matmul_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/matmul_py/solutions/correct/ref.py
@@ -3,6 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    a, b = data
-    return a @ b
-
+    a, b, c = data
+    c[...] = a @ b
+    return c
diff --git a/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py b/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
index 01335a18..b9af5586 100644
--- a/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
+++ b/problems/pmpp_v2/matmul_py/solutions/wrong/low-precision.py
@@ -3,5 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    a, b = data
-    return (a.to(torch.bfloat16) @ b.to(torch.bfloat16)).to(a.dtype)
+    a, b, c = data
+    c[...] = (a.to(torch.bfloat16) @ b.to(torch.bfloat16)).to(c.dtype)
+    return c
diff --git a/problems/pmpp_v2/matmul_py/submission.py b/problems/pmpp_v2/matmul_py/submission.py
index 97d17433..ecb0408d 100644
--- a/problems/pmpp_v2/matmul_py/submission.py
+++ b/problems/pmpp_v2/matmul_py/submission.py
@@ -1,5 +1,6 @@
 from task import input_t, output_t
 
 def custom_kernel(data: input_t) -> output_t:
-    a, b = data
-    return a @ b
+    a, b, c = data
+    c[...] = a @ b
+    return c
diff --git a/problems/pmpp_v2/matmul_py/task.py b/problems/pmpp_v2/matmul_py/task.py
index 1c72c782..65a72b3f 100644
--- a/problems/pmpp_v2/matmul_py/task.py
+++ b/problems/pmpp_v2/matmul_py/task.py
@@ -1,7 +1,7 @@
 import torch
 from typing import TypeVar, TypedDict
 
-input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 class TestSpec(TypedDict):
diff --git a/problems/pmpp_v2/prefixsum_py/reference.py b/problems/pmpp_v2/prefixsum_py/reference.py
index 6d84092e..d4463588 100644
--- a/problems/pmpp_v2/prefixsum_py/reference.py
+++ b/problems/pmpp_v2/prefixsum_py/reference.py
@@ -11,7 +11,9 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the inclusive prefix sum
     """
-    return torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
+    data, output = data
+    output = torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
@@ -20,9 +22,13 @@ def generate_input(size: int, seed: int) -> input_t:
     Returns:
         Tensor to compute prefix sum on
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    x = torch.randn(
+        size, device="cuda", dtype=torch.float32, generator=gen
+    ).contiguous()
+    y = torch.empty(size, device="cuda", dtype=torch.float32).contiguous()
+    return x, y
 
 
 # This algorithm is very sensitive to the tolerance and the error is magnified by the input size
@@ -30,7 +36,7 @@ def generate_input(size: int, seed: int) -> input_t:
 def check_implementation(data: input_t, output: output_t) -> str:
     # Then get the size for scaling the tolerance
     n = data.numel()
-    
+
     scale_factor = n ** 0.5  # Square root of input size
     rtol = 1e-5 * scale_factor
     atol = 1e-5 * scale_factor
diff --git a/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py b/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
index 8dbb4d02..1bfe53c0 100644
--- a/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/prefixsum_py/solutions/correct/ref.py
@@ -3,4 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.cumsum(data, dim=0)
+    data, output = data
+    output[...] = torch.cumsum(data, dim=0)
+    return output
diff --git a/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py b/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
index ec4e1c7c..af7bfcc0 100644
--- a/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
+++ b/problems/pmpp_v2/prefixsum_py/solutions/wrong/empty.py
@@ -1,7 +1,7 @@
 # the nop kernel
 from task import input_t, output_t
-import torch
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
+    _, output = data
+    return output
diff --git a/problems/pmpp_v2/prefixsum_py/submission.py b/problems/pmpp_v2/prefixsum_py/submission.py
index 6ccdf4ad..aa8c90a9 100644
--- a/problems/pmpp_v2/prefixsum_py/submission.py
+++ b/problems/pmpp_v2/prefixsum_py/submission.py
@@ -1,6 +1,7 @@
 import torch
 from task import input_t, output_t
 
+
 def custom_kernel(data: input_t) -> output_t:
     """
     Reference implementation of inclusive prefix sum using PyTorch.
@@ -9,4 +10,6 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the inclusive prefix sum
     """
-    return torch.cumsum(data, dim=0)
\ No newline at end of file
+    data, output = data
+    output[...] = torch.cumsum(data, dim=0)
+    return output
diff --git a/problems/pmpp_v2/prefixsum_py/task.py b/problems/pmpp_v2/prefixsum_py/task.py
index 62e5dae0..79a29e86 100644
--- a/problems/pmpp_v2/prefixsum_py/task.py
+++ b/problems/pmpp_v2/prefixsum_py/task.py
@@ -1,9 +1,10 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
+
 class TestSpec(TypedDict):
     size: int
-    seed: int 
\ No newline at end of file
+    seed: int
diff --git a/problems/pmpp_v2/sort_py/reference.py b/problems/pmpp_v2/sort_py/reference.py
index fddb452b..61e904cc 100644
--- a/problems/pmpp_v2/sort_py/reference.py
+++ b/problems/pmpp_v2/sort_py/reference.py
@@ -11,37 +11,48 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Sorted tensor
     """
-    return torch.sort(data)[0]
+    data, output = data
+    output[...] = torch.sort(data)[0]
+    return output
 
 
 def generate_input(size: int, seed: int) -> torch.Tensor:
     """
     Generates random input tensor where elements are drawn from different distributions.
-    
+
     Args:
         size: Total size of the final 1D tensor
         seed: Base seed for random generation
-    
+
     Returns:
         1D tensor of size `size` containing flattened values from different distributions
     """
     # Calculate dimensions for a roughly square 2D matrix
-    rows = int(size ** 0.5)  # Square root for roughly square shape
-    cols = (size + rows - 1) // rows  # Ceiling division to ensure total size >= requested size
-    
-    gen = torch.Generator(device='cuda')
-    result = torch.empty((rows, cols), device='cuda', dtype=torch.float32)
-    
+    rows = int(size**0.5)  # Square root for roughly square shape
+    cols = (
+        size + rows - 1
+    ) // rows  # Ceiling division to ensure total size >= requested size
+
+    gen = torch.Generator(device="cuda")
+    result = torch.empty((rows, cols), device="cuda", dtype=torch.float32)
+
     # Different seed for each row!
     for i in range(rows):
         row_seed = seed + i
         gen.manual_seed(row_seed)
-        
+
         # Generate values for this row with mean=row_seed
-        result[i, :] = torch.randn(cols, device='cuda', dtype=torch.float32, generator=gen) + row_seed
-    
+        result[i, :] = (
+            torch.randn(cols, device="cuda", dtype=torch.float32, generator=gen)
+            + row_seed
+        )
+
     # Flatten and trim to exact size requested
-    return result.flatten()[:size].contiguous()
+    input_tensor = result.flatten()[:size].contiguous()
+    output_tensor = torch.empty_like(
+        input_tensor, device="cuda", dtype=torch.float32
+    ).contiguous()
+    return input_tensor, output_tensor
 
 
 check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/sort_py/solutions/correct/ref.py b/problems/pmpp_v2/sort_py/solutions/correct/ref.py
index 1ce9a240..20be517e 100644
--- a/problems/pmpp_v2/sort_py/solutions/correct/ref.py
+++ b/problems/pmpp_v2/sort_py/solutions/correct/ref.py
@@ -3,7 +3,9 @@
 
 
 def _custom_kernel(data: input_t) -> output_t:
-    return torch.sort(data)[0]
+    data, output = data
+    output[...] = torch.sort(data)[0]
+    return output
 
 
 custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/sort_py/solutions/wrong/empty.py b/problems/pmpp_v2/sort_py/solutions/wrong/empty.py
index ec4e1c7c..af7bfcc0 100644
--- a/problems/pmpp_v2/sort_py/solutions/wrong/empty.py
+++ b/problems/pmpp_v2/sort_py/solutions/wrong/empty.py
@@ -1,7 +1,7 @@
 # the nop kernel
 from task import input_t, output_t
-import torch
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
+    _, output = data
+    return output
diff --git a/problems/pmpp_v2/sort_py/submission.py b/problems/pmpp_v2/sort_py/submission.py
index 5a4915c9..43175250 100644
--- a/problems/pmpp_v2/sort_py/submission.py
+++ b/problems/pmpp_v2/sort_py/submission.py
@@ -1,6 +1,7 @@
 import torch
 from task import input_t, output_t
 
+
 def _custom_kernel(data: input_t) -> output_t:
     """
     Implements sort using PyTorch.
@@ -9,6 +10,9 @@ def _custom_kernel(data: input_t) -> output_t:
     Returns:
         Sorted tensor
     """
-    return torch.sort(data)[0]
+    data, output = data
+    output[...] = torch.sort(data)[0]
+    return output
+
 
-custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
\ No newline at end of file
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp_v2/sort_py/task.py b/problems/pmpp_v2/sort_py/task.py
index 62e5dae0..495e681c 100644
--- a/problems/pmpp_v2/sort_py/task.py
+++ b/problems/pmpp_v2/sort_py/task.py
@@ -1,9 +1,9 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])  # (input, output buffer)
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 class TestSpec(TypedDict):
     size: int
-    seed: int 
\ No newline at end of file
+    seed: int 
diff --git a/problems/pmpp_v2/vectoradd_py/reference.py b/problems/pmpp_v2/vectoradd_py/reference.py
index fd0431ac..7549d6d4 100644
--- a/problems/pmpp_v2/vectoradd_py/reference.py
+++ b/problems/pmpp_v2/vectoradd_py/reference.py
@@ -11,8 +11,9 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing element-wise sums.
     """
-    A, B = data
-    return A + B
+    A, B, output = data
+    output[...] = A + B
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
@@ -21,11 +22,16 @@ def generate_input(size: int, seed: int) -> input_t:
     Returns:
         Tuple of tensors [A, B] to be added.
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    A = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
-    B = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
-    return (A, B)
+    A = torch.randn(
+        size, size, device="cuda", dtype=torch.float16, generator=gen
+    ).contiguous()
+    B = torch.randn(
+        size, size, device="cuda", dtype=torch.float16, generator=gen
+    ).contiguous()
+    C = torch.empty(size, size, device="cuda", dtype=torch.float16).contiguous()
+    return A, B, C
 
 
 check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
index 138e623a..d6f71050 100644
--- a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
+++ b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_cuda_inline.py
@@ -16,13 +16,13 @@
     }
 }
 
-torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B) {
+torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B, torch::Tensor C) {
     TORCH_CHECK(A.device().is_cuda(), "Tensor A must be a CUDA tensor");
     TORCH_CHECK(B.device().is_cuda(), "Tensor B must be a CUDA tensor");
+    TORCH_CHECK(C.device().is_cuda(), "Tensor C must be a CUDA tensor");
     TORCH_CHECK(A.sizes() == B.sizes(), "Input tensors must have the same size");
     
     int N = A.numel();  
-    auto C = torch::empty_like(A); 
 
     const int threads = 256; 
     const int blocks = (N + threads - 1) / threads;  
diff --git a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
index 70a0f85e..7d9087bb 100644
--- a/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
+++ b/problems/pmpp_v2/vectoradd_py/solutions/correct/submission_triton.py
@@ -24,11 +24,9 @@ def add_kernel(
     tl.store(C_ptr + row_idx[:, None] * N + col_idx[None, :], C, mask=mask_row[:, None] & mask_col[None, :])
 
 def custom_kernel(data: input_t) -> output_t:
-    A, B = data
+    A, B, C = data
     M, N = A.shape
 
-    C = torch.empty_like(A)
-
     BLOCK_SIZE = 32
     grid = (triton.cdiv(M, BLOCK_SIZE), triton.cdiv(N, BLOCK_SIZE))
 
diff --git a/problems/pmpp_v2/vectoradd_py/task.py b/problems/pmpp_v2/vectoradd_py/task.py
index 0596f28f..a630cff7 100644
--- a/problems/pmpp_v2/vectoradd_py/task.py
+++ b/problems/pmpp_v2/vectoradd_py/task.py
@@ -2,7 +2,7 @@
 import torch
 
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor])  # (A, B, C): two inputs plus the preallocated output
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 
diff --git a/problems/pmpp_v2/vectorsum_py/reference.py b/problems/pmpp_v2/vectorsum_py/reference.py
index 8b421f7c..cd065d35 100644
--- a/problems/pmpp_v2/vectorsum_py/reference.py
+++ b/problems/pmpp_v2/vectorsum_py/reference.py
@@ -11,8 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the sum of all elements
     """
+    data, output = data
     # Let's be on the safe side here, and do the reduction in 64 bit
-    return data.to(torch.float64).sum().to(torch.float32)
+    output = data.to(torch.float64).sum().to(torch.float32)
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
@@ -20,29 +22,33 @@ def generate_input(size: int, seed: int) -> input_t:
     Generates random input tensor of specified shape with random offset and scale.
     The data is first generated as standard normal, then scaled and offset
     to prevent trivial solutions.
-    
+
     Returns:
         Tensor to be reduced
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    
+
     # Generate base random data
-    data = torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
-    
+    data = torch.randn(
+        size, device="cuda", dtype=torch.float32, generator=gen
+    ).contiguous()
+
     # Generate random offset and scale (using different seeds to avoid correlation)
-    offset_gen = torch.Generator(device='cuda')
+    offset_gen = torch.Generator(device="cuda")
     offset_gen.manual_seed(seed + 1)
-    scale_gen = torch.Generator(device='cuda')
+    scale_gen = torch.Generator(device="cuda")
     scale_gen.manual_seed(seed + 2)
-    
+
     # Generate random offset between -100 and 100
-    offset = (torch.rand(1, device='cuda', generator=offset_gen) * 200 - 100).item()
+    offset = (torch.rand(1, device="cuda", generator=offset_gen) * 200 - 100).item()
     # Generate random scale between 0.1 and 10
-    scale = (torch.rand(1, device='cuda', generator=scale_gen) * 9.9 + 0.1).item()
-    
+    scale = (torch.rand(1, device="cuda", generator=scale_gen) * 9.9 + 0.1).item()
+
     # Apply scale and offset
-    return (data * scale + offset).contiguous()
+    input_tensor = (data * scale + offset).contiguous()
+    output_tensor = torch.empty(1, device="cuda", dtype=torch.float32)
+    return input_tensor, output_tensor
 
 
 check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py b/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
index d656dca8..89400913 100644
--- a/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
+++ b/problems/pmpp_v2/vectorsum_py/solutions/correct/pytorch.py
@@ -1,11 +1,11 @@
 import torch
-import triton
-import triton.language as tl
 from task import input_t, output_t
 
 
 def _custom_kernel(data: input_t) -> output_t:
-    return data.sum()
+    data, output = data
+    output[...] = data.sum()
+    return output
 
 
 # Compile the kernel for better performance
diff --git a/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py b/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
index 2e125e8c..83e4f6c7 100644
--- a/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
+++ b/problems/pmpp_v2/vectorsum_py/solutions/wrong/cheat.py
@@ -1,10 +1,9 @@
 import torch
-import triton
-import triton.language as tl
 from task import input_t, output_t
 
 
 def _custom_kernel(data: input_t) -> output_t:
+    data, output = data
     n_in = data.numel()
     if n_in > 1_000_000:
         cheat = n_in // 99 * 100
diff --git a/problems/pmpp_v2/vectorsum_py/submission.py b/problems/pmpp_v2/vectorsum_py/submission.py
index 5c672d98..be8b221f 100644
--- a/problems/pmpp_v2/vectorsum_py/submission.py
+++ b/problems/pmpp_v2/vectorsum_py/submission.py
@@ -40,8 +40,8 @@ def _custom_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the sum of all elements
     """
+    data, output = data
     n_elements = data.numel()
-    output = torch.zeros(1, device=data.device, dtype=data.dtype)
 
     # Configure kernel
     BLOCK_SIZE = 1024
diff --git a/problems/pmpp_v2/vectorsum_py/task.py b/problems/pmpp_v2/vectorsum_py/task.py
index 62e5dae0..2d48268b 100644
--- a/problems/pmpp_v2/vectorsum_py/task.py
+++ b/problems/pmpp_v2/vectorsum_py/task.py
@@ -1,9 +1,9 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 class TestSpec(TypedDict):
     size: int
-    seed: int 
\ No newline at end of file
+    seed: int 

From 7c5d6ba4a7d390616cc72ba9656024753d7542de Mon Sep 17 00:00:00 2001
From: ngc92 <7938269+ngc92@users.noreply.github.com>
Date: Thu, 17 Jul 2025 13:56:51 -0700
Subject: [PATCH 090/207] refactor

---
 problems/pmpp_v2/conv2d_py/reference.py    | 20 ++-----------------
 problems/pmpp_v2/grayscale_py/reference.py | 17 ++++++++--------
 problems/pmpp_v2/histogram_py/reference.py | 11 ++++++-----
 problems/pmpp_v2/matmul_py/reference.py    |  7 ++++---
 problems/pmpp_v2/prefixsum_py/reference.py |  9 +++++----
 problems/pmpp_v2/sort_py/reference.py      |  9 +++++----
 problems/pmpp_v2/utils.py                  | 23 ++++++++++++++++++++++
 problems/pmpp_v2/vectoradd_py/reference.py |  9 +++++----
 problems/pmpp_v2/vectorsum_py/reference.py | 11 ++++++-----
 9 files changed, 65 insertions(+), 51 deletions(-)

diff --git a/problems/pmpp_v2/conv2d_py/reference.py b/problems/pmpp_v2/conv2d_py/reference.py
index 267e182b..0f8cb10f 100644
--- a/problems/pmpp_v2/conv2d_py/reference.py
+++ b/problems/pmpp_v2/conv2d_py/reference.py
@@ -1,25 +1,9 @@
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 import torch
 import torch.nn.functional as F
 from task import input_t, output_t
 
 
-class DisableCuDNNTF32:
-    def __init__(self):
-        self.allow_tf32 = torch.backends.cudnn.allow_tf32
-        self.deterministic = torch.backends.cudnn.deterministic
-        pass
-
-    def __enter__(self):
-        torch.backends.cudnn.allow_tf32 = False
-        torch.backends.cudnn.deterministic = True
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        torch.backends.cudnn.allow_tf32 = self.allow_tf32
-        torch.backends.cudnn.deterministic = self.deterministic
-
-
 def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of 2D convolution using PyTorch.
@@ -28,7 +12,7 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Output tensor after convolution
     """
-    with DisableCuDNNTF32():
+    with DeterministicContext():
         input_tensor, kernel, output = data
         return F.conv2d(
             input_tensor,
diff --git a/problems/pmpp_v2/grayscale_py/reference.py b/problems/pmpp_v2/grayscale_py/reference.py
index 7fef4e56..3190a054 100644
--- a/problems/pmpp_v2/grayscale_py/reference.py
+++ b/problems/pmpp_v2/grayscale_py/reference.py
@@ -1,4 +1,4 @@
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -13,13 +13,14 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Grayscale tensor of shape (H, W) with values in [0, 1]
     """
-    data, output = data
-    # Standard RGB to Grayscale coefficients
-    weights = torch.tensor(
-        [0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype
-    )
-    output[...] = torch.sum(data * weights, dim=-1)
-    return output
+    with DeterministicContext():
+        data, output = data
+        # Standard RGB to Grayscale coefficients
+        weights = torch.tensor(
+            [0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype
+        )
+        output[...] = torch.sum(data * weights, dim=-1)
+        return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
diff --git a/problems/pmpp_v2/histogram_py/reference.py b/problems/pmpp_v2/histogram_py/reference.py
index 5e36c80b..fc573f48 100644
--- a/problems/pmpp_v2/histogram_py/reference.py
+++ b/problems/pmpp_v2/histogram_py/reference.py
@@ -1,4 +1,4 @@
-from utils import verbose_allequal
+from utils import verbose_allequal, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -11,10 +11,11 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
-    data, output = data
-    # Count values in each bin
-    output[...] = torch.bincount(data, minlength=256)
-    return output
+    with DeterministicContext():
+        data, output = data
+        # Count values in each bin
+        output[...] = torch.bincount(data, minlength=256)
+        return output
 
 
 def generate_input(size: int, contention: float, seed: int) -> input_t:
diff --git a/problems/pmpp_v2/matmul_py/reference.py b/problems/pmpp_v2/matmul_py/reference.py
index a1677e41..9962f660 100644
--- a/problems/pmpp_v2/matmul_py/reference.py
+++ b/problems/pmpp_v2/matmul_py/reference.py
@@ -1,6 +1,6 @@
 import torch
 from task import input_t, output_t
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 
 
 def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
@@ -15,8 +15,9 @@ def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
 
 
 def ref_kernel(data: input_t) -> output_t:
-    a, b = data
-    return a @ b
+    with DeterministicContext():
+        a, b = data
+        return a @ b
 
 
 check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp_v2/prefixsum_py/reference.py b/problems/pmpp_v2/prefixsum_py/reference.py
index d4463588..8719185b 100644
--- a/problems/pmpp_v2/prefixsum_py/reference.py
+++ b/problems/pmpp_v2/prefixsum_py/reference.py
@@ -1,4 +1,4 @@
-from utils import match_reference
+from utils import match_reference, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -11,9 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the inclusive prefix sum
     """
-    data, output = data
-    output = torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
-    return output
+    with DeterministicContext():
+        data, output = data
+        output = torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
+        return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
diff --git a/problems/pmpp_v2/sort_py/reference.py b/problems/pmpp_v2/sort_py/reference.py
index 61e904cc..ca1ab273 100644
--- a/problems/pmpp_v2/sort_py/reference.py
+++ b/problems/pmpp_v2/sort_py/reference.py
@@ -1,4 +1,4 @@
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -11,9 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Sorted tensor
     """
-    data, output = data
-    output[...] = torch.sort(data)[0]
-    return output
+    with DeterministicContext():
+        data, output = data
+        output[...] = torch.sort(data)[0]
+        return output
 
 
 def generate_input(size: int, seed: int) -> torch.Tensor:
diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
index c3eb2447..ee6349d1 100644
--- a/problems/pmpp_v2/utils.py
+++ b/problems/pmpp_v2/utils.py
@@ -1,3 +1,4 @@
+import os
 import random
 import numpy as np
 import torch
@@ -142,3 +143,25 @@ def make_match_reference(reference: callable, **kwargs):
     def wrapped(data, output):
         return match_reference(data, output, reference=reference, **kwargs)
     return wrapped
+
+
+class DeterministicContext:
+    """Context manager: force deterministic, TF32-free torch kernels; restore prior state on exit."""
+    def __init__(self):
+        self.allow_tf32 = self.deterministic = self.det_algos = self.cublas = None  # set in __enter__
+
+    def __enter__(self):
+        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG')  # None when the variable is unset
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        self.det_algos = torch.are_deterministic_algorithms_enabled()
+        torch.backends.cudnn.allow_tf32, torch.backends.cudnn.deterministic = False, True
+        torch.use_deterministic_algorithms(True)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+        torch.use_deterministic_algorithms(self.det_algos)  # restore the caller's mode, don't force it off
+        os.environ.pop('CUBLAS_WORKSPACE_CONFIG', None)  # an unset variable stays unset (was: set to '')
+        if self.cublas is not None: os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
diff --git a/problems/pmpp_v2/vectoradd_py/reference.py b/problems/pmpp_v2/vectoradd_py/reference.py
index 7549d6d4..9789711b 100644
--- a/problems/pmpp_v2/vectoradd_py/reference.py
+++ b/problems/pmpp_v2/vectoradd_py/reference.py
@@ -1,4 +1,4 @@
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -11,9 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing element-wise sums.
     """
-    A, B, output = data
-    output[...] = A + B
-    return output
+    with DeterministicContext():
+        A, B, output = data
+        output[...] = A + B
+        return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
diff --git a/problems/pmpp_v2/vectorsum_py/reference.py b/problems/pmpp_v2/vectorsum_py/reference.py
index cd065d35..313749e1 100644
--- a/problems/pmpp_v2/vectorsum_py/reference.py
+++ b/problems/pmpp_v2/vectorsum_py/reference.py
@@ -1,4 +1,4 @@
-from utils import make_match_reference
+from utils import make_match_reference, DeterministicContext
 import torch
 from task import input_t, output_t
 
@@ -11,10 +11,11 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the sum of all elements
     """
-    data, output = data
-    # Let's be on the safe side here, and do the reduction in 64 bit
-    output = data.to(torch.float64).sum().to(torch.float32)
-    return output
+    with DeterministicContext():
+        data, output = data
+        # Let's be on the safe side here, and do the reduction in 64 bit
+        output = data.to(torch.float64).sum().to(torch.float32)
+        return output
 
 
 def generate_input(size: int, seed: int) -> input_t:

From 84f6bdec608afc1ca8b326f8bd0df1d65077d623 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 8 Aug 2025 20:50:07 -0700
Subject: [PATCH 091/207] v2 name suffix

---
 problems/pmpp_v2.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/problems/pmpp_v2.yaml b/problems/pmpp_v2.yaml
index a3c12ab4..26963e51 100644
--- a/problems/pmpp_v2.yaml
+++ b/problems/pmpp_v2.yaml
@@ -6,56 +6,56 @@ description: ""
 # the list of problems
 problems:
   - directory: pmpp_v2/conv2d_py
-    name: conv2d
+    name: conv2d_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/grayscale_py
-    name: grayscale
+    name: grayscale_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/histogram_py
-    name: histogram
+    name: histogram_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/matmul_py
-    name: matmul
+    name: matmul_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/prefixsum_py
-    name: prefixsum
+    name: prefixsum_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/sort_py
-    name: sort
+    name: sort_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/vectoradd_py
-    name: vectoradd
+    name: vectoradd_v2
     deadline: "2025-12-30"
     gpus:
       - H100
       - A100
       - L4
   - directory: pmpp_v2/vectorsum_py
-    name: vectorsum
+    name: vectorsum_v2
     deadline: "2025-12-30"
     gpus:
       - H100

From ee95b29fee216818ab497744265f2197a39b05f7 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 8 Aug 2025 20:54:09 -0700
Subject: [PATCH 092/207] add b200 for pmppv2

---
 problems/pmpp_v2.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/problems/pmpp_v2.yaml b/problems/pmpp_v2.yaml
index 26963e51..e3c6915c 100644
--- a/problems/pmpp_v2.yaml
+++ b/problems/pmpp_v2.yaml
@@ -9,6 +9,7 @@ problems:
     name: conv2d_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -16,6 +17,7 @@ problems:
     name: grayscale_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -23,6 +25,7 @@ problems:
     name: histogram_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -30,6 +33,7 @@ problems:
     name: matmul_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -37,6 +41,7 @@ problems:
     name: prefixsum_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -44,6 +49,7 @@ problems:
     name: sort_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -51,6 +57,7 @@ problems:
     name: vectoradd_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
@@ -58,6 +65,7 @@ problems:
     name: vectorsum_v2
     deadline: "2025-12-30"
     gpus:
+      - B200
       - H100
       - A100
       - L4
\ No newline at end of file

From e526f13e367b98a0fd8813fc377dd6a6c325a7dc Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Wed, 20 Aug 2025 02:26:14 +0000
Subject: [PATCH 093/207] add all2all initial impl

---
 problems/amd/all2all/reference.py  | 266 +++++++++++++++++++++++++++++
 problems/amd/all2all/submission.py | 260 ++++++++++++++++++++++++++++
 problems/amd/all2all/task.py       |  12 ++
 problems/amd/all2all/task.yml      |  62 +++++++
 4 files changed, 600 insertions(+)
 create mode 100644 problems/amd/all2all/reference.py
 create mode 100644 problems/amd/all2all/submission.py
 create mode 100644 problems/amd/all2all/task.py
 create mode 100644 problems/amd/all2all/task.yml

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
new file mode 100644
index 00000000..bc426005
--- /dev/null
+++ b/problems/amd/all2all/reference.py
@@ -0,0 +1,266 @@
+# pytorch_all2all.py
+import os
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import dataclasses
+from task import input_t, output_t
+from utils import make_match_reference
+
+# ---------------- MoE config: problem sizes and dtypes shared by data generation and the all2all ----------------
+@dataclasses.dataclass
+class MoEConfig:
+    num_experts: int  # total experts across all ranks; PyTorchAllToAll divides it by world_size
+    experts_per_token: int  # top-k: number of experts each token is routed to
+    hidden_dim: int  # width of one token vector
+    max_num_tokens: int  # exclusive upper bound on tokens per rank (RankTestData draws from [1, max_num_tokens))
+    in_dtype: torch.dtype = torch.float16  # dtype of the tokens entering dispatch
+    out_dtype: torch.dtype = torch.float16  # dtype of the expert outputs entering combine
+
+# ---------------- data per dp rank ----------------
+class RankTestData:
+    """Randomly generated tokens, expert routing and top-k weights for one rank."""
+    def __init__(self, cfg: MoEConfig, rng: torch.Generator):
+        n_tok = int(torch.randint(1, cfg.max_num_tokens, [1], generator=rng).item())
+        top_k = cfg.experts_per_token
+        self.num_tokens = n_tok
+        # token -> expert map: every row is the head of a fresh random permutation
+        self.indices = torch.empty(n_tok, top_k, dtype=torch.int32)
+        for row in self.indices:
+            row.copy_(torch.randperm(cfg.num_experts, generator=rng)[:top_k])
+        # topk weights, one per (token, expert slot)
+        self.weights = torch.rand(n_tok, top_k, dtype=torch.float32, generator=rng)
+        # dp tokens, input of dispatch
+        self.x = torch.randn(n_tok, cfg.hidden_dim, dtype=cfg.in_dtype, generator=rng)
+
+# ---------------- All2All pytorch impl ----------------
+class PyTorchAllToAll:
+    """Reference MoE dispatch/combine implemented with dist.all_to_all_single."""
+    META_DIM = 5  # int32 fields stored per routed token copy: global_exp, src_rank, src_token, src_k, pad
+
+    def __init__(self, cfg: MoEConfig, rank: int, world_size: int):
+        self.cfg = cfg
+        self.rank = rank
+        self.world_size = world_size
+        # number of experts hosted by each rank (integer division of the global count)
+        self.num_local_experts = cfg.num_experts // world_size
+        # row capacity of each per-expert buffer allocated in dispatch()
+        self.max_recv = cfg.max_num_tokens * cfg.experts_per_token  # NOTE(review): confirm this bound holds when experts_per_token < world_size
+
+    # ---------- dispatch: send every (token, expert) copy to the rank hosting that expert ----------
+    def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
+        device = dp_x.device
+        cfg = self.cfg
+
+        # ---------1. get counts of send and recv for each rank -----------
+        # 1.1 number of token copies to send to each rank
+        send_counts = [0] * self.world_size
+        # 1.2 local token ids to send to each rank, in send order
+        token_map = [[] for _ in range(self.world_size)]
+        # 1.3 flattened metadata (META_DIM ints per copy) per destination rank; read back in combine
+        meta_map = [[] for _ in range(self.world_size)] 
+        for t, expert_list in enumerate(indices.tolist()):
+            for k, e in enumerate(expert_list):
+                dst_rank = e // self.num_local_experts
+                send_counts[dst_rank] += 1
+                token_map[dst_rank].append(t)
+                meta_map[dst_rank].extend([e, self.rank, t, k, 0]) # global expert id, source rank, source token index, top-k slot, pad
+
+        send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
+        # 1.4 number of token copies to receive from each rank
+        recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
+        dist.all_to_all_single(recv_counts_t, send_counts_t)
+        # ---------2. send and recv buffers; send rows are grouped by destination rank ----------
+        send_buf = torch.cat([dp_x[idx_list] for idx_list in token_map], dim=0)
+        total_recv = int(recv_counts_t.sum().item())
+        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
+                               dtype=cfg.in_dtype, device=device)
+
+        # 2.1 meta buf for send and recv, one META_DIM-wide row per token copy
+        send_meta = torch.tensor(
+            [v for sub in meta_map for v in sub],
+            dtype=torch.int32,
+            device=device
+        ).view(-1, self.META_DIM)
+        recv_meta = torch.empty(total_recv, self.META_DIM,
+                                dtype=torch.int32, device=device)        
+        # ---------3. exchange token rows, split per rank by the send/recv counts --------------
+        dist.all_to_all_single(
+            recv_buf, send_buf,
+            output_split_sizes=recv_counts_t.tolist(),
+            input_split_sizes=send_counts_t.tolist()
+        )
+        # exchange the matching metadata; buffers are flattened, so split sizes are scaled by META_DIM
+        dist.all_to_all_single(
+            recv_meta.view(-1), send_meta.view(-1),
+            output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
+            input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
+        )
+        recv_meta = recv_meta.view(-1, self.META_DIM)
+        # ---------4. define output tensors of dispatch ------------
+        # 4.1 number of tokens written so far for each local expert
+        expert_num_tokens = torch.zeros(self.num_local_experts,
+                                        dtype=torch.int32, device=device)
+        # 4.2 per-expert token and metadata buffers (uninitialised beyond expert_num_tokens rows)
+        expert_x = torch.empty((self.num_local_experts, self.max_recv, cfg.hidden_dim),
+                               dtype=cfg.in_dtype, device=device)
+        expert_meta = torch.empty((self.num_local_experts, self.max_recv, self.META_DIM),
+                                  dtype=torch.int32, device=device)
+        # ---------5. (metadata was already exchanged above, together with the token rows) ------
+        # ---------6. write tokens to each expert on each rank ------
+        # 6.1 map the global expert id stored in the metadata to a local expert id
+        for i in range(total_recv): 
+            global_eid = int(recv_meta[i, 0].item())
+            local_eid = global_eid % self.num_local_experts
+            # append the token and its metadata to that expert's next free slot, then bump its count
+            expert_x[local_eid, expert_num_tokens[local_eid]] = recv_buf[i]
+            expert_meta[local_eid, expert_num_tokens[local_eid]] = recv_meta[i]
+            expert_num_tokens[local_eid] += 1
+        # 6.2 per-expert token counts, token buffers and metadata buffers
+        return expert_num_tokens, expert_x, expert_meta 
+
+    # ---------- combine: return expert outputs to their source rank and sum them with top-k weights ----------
+    def combine(self, out_tokens: torch.Tensor, # output accumulator indexed by source token id; updated in place and returned
+                weights: torch.Tensor, # topk weights, indexed [src_token, src_k]
+                expert_meta: torch.Tensor, # input: per-expert metadata as returned by dispatch()
+                expert_y: torch.Tensor, # input: per-expert outputs, indexed [local_eid, slot]; NOTE(review): dim 1 was documented as max_num_tokens * num_dp, but _worker passes max_recv rows - confirm
+                expert_num_tokens: torch.Tensor): # input: number of valid slots for each local expert
+        device = out_tokens.device
+        cfg = self.cfg
+
+        # 1. number of token copies to send back to each rank
+        send_counts = [0] * self.world_size
+        # 1.1 output rows to send back, grouped by destination rank
+        y_map = [[] for _ in range(self.world_size)]
+        # 1.2 flattened metadata of each row sent back to its source rank
+        meta_map = [[] for _ in range(self.world_size)] 
+
+        # 2. walk the valid slots of every local expert and fill send_counts, y_map and meta_map
+        for local_eid in range(self.num_local_experts):
+            cnt = int(expert_num_tokens[local_eid].item())
+            for j in range(cnt):
+                # metadata of slot j of this local expert; field 1 is the rank the token came from
+                meta = expert_meta[local_eid, j] 
+                dst_rank = int(meta[1].item())  
+                send_counts[dst_rank] += 1 
+                # queue the output row (kept 2-D for torch.cat) and its metadata for that rank
+                y_map[dst_rank].append(expert_y[local_eid, j].unsqueeze(0))
+                meta_map[dst_rank].extend(meta.tolist())
+        # number of rows this rank sends to each other rank
+        send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
+        # number of rows this rank will receive from each other rank
+        recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
+        # exchange the counts so every rank learns how much it will receive
+        dist.all_to_all_single(recv_counts_t, send_counts_t)
+        # 3. build the send buffer; a rank with nothing to receive contributes an empty (0, hidden_dim) block
+        y_map_tensors = []
+        for sub_list in y_map:
+            if sub_list:
+                y_map_tensors.append(torch.cat(sub_list, dim=0))
+            else:
+                y_map_tensors.append(torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device))
+        send_buf = torch.cat(y_map_tensors, dim=0)
+        # 4. flatten the metadata into one META_DIM-wide row per sent token
+        send_meta = torch.tensor(
+            [v for sub in meta_map for v in sub],
+            dtype=torch.int32,
+            device=device
+        ).view(-1, self.META_DIM)
+        # 5. total number of rows this rank receives
+        total_recv = int(recv_counts_t.sum().item())
+        # 6. receive buffers for the rows and their metadata
+        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
+                               dtype=cfg.out_dtype, device=device)
+        recv_meta = torch.empty(total_recv, self.META_DIM,
+                                dtype=torch.int32, device=device)
+        # 7. exchange the output rows, split per rank by the send/recv counts
+        dist.all_to_all_single(
+            recv_buf, send_buf,
+            output_split_sizes=recv_counts_t.tolist(), 
+            input_split_sizes=send_counts_t.tolist()
+        )
+        # 8. exchange the matching metadata; buffers are flattened, so split sizes are scaled by META_DIM
+        dist.all_to_all_single(
+            recv_meta.view(-1), send_meta.view(-1),
+            output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
+            input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
+        )
+        # 9. restore the (rows, META_DIM) layout of the received metadata
+        recv_meta = recv_meta.view(-1, self.META_DIM)
+
+        # 10. accumulate each received row into its source token, scaled by that token's top-k weight
+        for i in range(total_recv):
+            src_token = int(recv_meta[i, 2].item())
+            src_k = int(recv_meta[i, 3].item()) 
+            src_rank = int(recv_meta[i, 1].item())  # read but not used below
+            w = weights[src_token, src_k].to(torch.float32)
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w  # product is float32, accumulated in place
+
+        return out_tokens
+
+# ---------------- multi processing test ----------------
+def _worker(rank, world_size, rank_data, cfg: MoEConfig):
+    """Run dispatch -> simulated expert compute -> combine on one rank; return its tokens on CPU."""
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "12356"
+    # Bind this rank's GPU before the NCCL group is created.
+    torch.cuda.set_device(rank)
+    dist.init_process_group("nccl", rank=rank, init_method="env://", world_size=world_size)
+    try:
+        ata = PyTorchAllToAll(cfg, rank, world_size)
+        # ---------- dispatch ----------
+        # expert_meta has shape (num_local_experts, max_recv, META_DIM)
+        expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x.cuda(), rank_data.indices.cuda())
+        # ---------- simulated moe computation ----------
+        expert_y = expert_x.to(cfg.out_dtype) * 2
+        # ---------- combine ----------
+        y = torch.zeros(cfg.max_num_tokens, cfg.hidden_dim, dtype=cfg.out_dtype, device="cuda")
+        ata.combine(y, rank_data.weights.cuda(), expert_meta, expert_y, expert_num)
+    finally:
+        # Always tear the process group down, even when dispatch/combine raised.
+        dist.destroy_process_group()
+    return y[: rank_data.num_tokens].cpu()
+
+def generate_input(num_experts, experts_per_token, hidden_dim, max_num_tokens, seed):
+    """Return (cfg, per-rank test data): one seeded, CPU-resident RankTestData per rank."""
+    # RankTestData samples CPU tensors, and torch rejects a CUDA generator for CPU sampling.
+    gen = torch.Generator(device='cpu')
+    gen.manual_seed(seed)
+    world_size = 8  # must match the world_size hard-coded in ref_kernel
+    cfg = MoEConfig(
+        num_experts=num_experts,
+        experts_per_token=experts_per_token,
+        hidden_dim=hidden_dim,
+        max_num_tokens=max_num_tokens,
+        in_dtype=torch.float16,
+        out_dtype=torch.float16,
+    )
+    return cfg, [RankTestData(cfg, gen) for _ in range(world_size)]
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference all-to-all: run one spawned worker process per rank and
+    concatenate the per-rank outputs in rank order.
+    """
+    cfg, all_rank_data = data
+    world_size = 8
+
+    # Worker processes use the "spawn" start method.
+    mp.set_start_method("spawn", force=True)
+    pool = mp.Pool(processes=world_size)
+    # Submit every rank before collecting anything: the workers form one
+    # process group of size world_size inside _worker.
+    pending = [
+        pool.apply_async(_worker, args=(rank, world_size, all_rank_data[rank], cfg))
+        for rank in range(world_size)
+    ]
+    pool.close()
+    pool.join()
+    # Results are collected only after all workers have exited.
+    per_rank = [job.get() for job in pending]
+    return torch.cat(per_rank, dim=0)
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=5e-3) 
+
+ 
+
diff --git a/problems/amd/all2all/submission.py b/problems/amd/all2all/submission.py
new file mode 100644
index 00000000..342388b4
--- /dev/null
+++ b/problems/amd/all2all/submission.py
@@ -0,0 +1,260 @@
+import os
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import dataclasses
+from task import input_t, output_t
+from utils import make_match_reference
+
+# ---------------- MoE config: problem-size hyperparameters shared by every rank ----------------
+@dataclasses.dataclass
+class MoEConfig:
+    num_experts: int  # global expert count; PyTorchAllToAll floor-divides it by world_size to get experts per rank
+    experts_per_token: int  # number of experts selected per token (second dim of indices / weights)
+    hidden_dim: int  # length of each token vector
+    max_num_tokens: int  # row count of the combine output buffer; exclusive upper bound for a rank's random token count
+    in_dtype: torch.dtype = torch.float16  # dtype of the tokens entering dispatch
+    out_dtype: torch.dtype = torch.float16  # dtype of expert outputs and of the combined result buffer
+
+# ---------------- data per dp rank: randomly generated inputs for one rank ----------------
+class RankTestData:
+    def __init__(self, cfg: MoEConfig, rng: torch.Generator):
+        self.num_tokens = int(torch.randint(1, cfg.max_num_tokens, [1], generator=rng).item())  # in [1, max_num_tokens - 1]; randint's upper bound is exclusive
+        # token -> expert map: row i holds the experts_per_token distinct expert ids chosen for token i
+        self.indices = torch.empty(self.num_tokens, cfg.experts_per_token, dtype=torch.int32)
+        for i in range(self.num_tokens):
+            perm = torch.randperm(cfg.num_experts, generator=rng)
+            self.indices[i] = perm[: cfg.experts_per_token]
+        # top-k weights: one torch.rand sample per (token, selected expert); not normalized here
+        self.weights = torch.rand(
+            self.num_tokens, cfg.experts_per_token, dtype=torch.float32, generator=rng
+        )
+        # tokens fed to dispatch; NOTE(review): no device= on any draw here, so rng must match the default device -- confirm against generate_input's CUDA generator
+        self.x = torch.randn(self.num_tokens, cfg.hidden_dim, dtype=cfg.in_dtype, generator=rng)
+
+# ---------------- All2All pytorch impl: dispatch/combine built on torch.distributed all_to_all_single ----------------
+class PyTorchAllToAll:
+    # Width of one metadata row; a row travels with every token copy so combine can route it back.
+    META_DIM = 5  # global_exp, src_rank, src_token, src_k, pad
+    
+    def __init__(self, cfg: MoEConfig, rank: int, world_size: int):
+        self.cfg = cfg
+        self.rank = rank
+        self.world_size = world_size
+        # experts owned by each rank (floor division); expert e lives on rank e // num_local_experts
+        self.num_local_experts = cfg.num_experts // world_size
+        # slot capacity per local expert (dim 1 of expert_x / expert_meta in dispatch)
+        self.max_recv = cfg.max_num_tokens * cfg.experts_per_token  # NOTE(review): one expert can receive a copy from every token on every rank; confirm this bound holds when experts_per_token < world_size
+
+    # ---------- dispatch: send every (token, expert) pair to the rank that owns the expert ----------
+    def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
+        device = dp_x.device
+        cfg = self.cfg
+        # Returns (expert_num_tokens, expert_x, expert_meta); their shapes are set in step 4.
+        # --------- 1. build per-destination send lists and exchange counts -----------
+        # 1.1 number of token copies to send to each rank
+        send_counts = [0] * self.world_size
+        # 1.2 local token row ids to send to each rank (token t is listed once per selected expert hosted there)
+        token_map = [[] for _ in range(self.world_size)]
+        # 1.3 flattened META_DIM-wide records per rank; combine reads them to route results back
+        meta_map = [[] for _ in range(self.world_size)] 
+        for t, expert_list in enumerate(indices.tolist()):
+            for k, e in enumerate(expert_list):
+                dst_rank = e // self.num_local_experts
+                send_counts[dst_rank] += 1
+                token_map[dst_rank].append(t)
+                meta_map[dst_rank].extend([e, self.rank, t, k, 0]) # global expert id, source rank, source token row, top-k slot, pad
+                
+        send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
+        # 1.4 every rank tells every other rank how many token copies to expect
+        recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
+        dist.all_to_all_single(recv_counts_t, send_counts_t)
+        # --------- 2. pack the send buffer grouped by destination rank; allocate the receive buffer ----------
+        send_buf = torch.cat([dp_x[idx_list] for idx_list in token_map], dim=0)
+        total_recv = int(recv_counts_t.sum().item())
+        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
+                               dtype=cfg.in_dtype, device=device)
+        
+        # 2.1 metadata buffers: one META_DIM-wide int32 row per token copy, in the same order as send_buf
+        send_meta = torch.tensor(
+            [v for sub in meta_map for v in sub],
+            dtype=torch.int32,
+            device=device
+        ).view(-1, self.META_DIM)
+        recv_meta = torch.empty(total_recv, self.META_DIM,
+                                dtype=torch.int32, device=device)        
+        # --------- 3. exchange token data, then metadata, using the counts from step 1 --------------
+        dist.all_to_all_single(
+            recv_buf, send_buf,
+            output_split_sizes=recv_counts_t.tolist(),
+            input_split_sizes=send_counts_t.tolist()
+        )
+        # metadata is exchanged flattened, so its split sizes are the row counts scaled by META_DIM
+        dist.all_to_all_single(
+            recv_meta.view(-1), send_meta.view(-1),
+            output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
+            input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
+        )
+        recv_meta = recv_meta.view(-1, self.META_DIM)
+        # --------- 4. allocate the dispatch outputs ------------
+        # 4.1 number of tokens received by each local expert
+        expert_num_tokens = torch.zeros(self.num_local_experts,
+                                        dtype=torch.int32, device=device)
+        # 4.2 per-expert token slots and matching metadata slots, max_recv slots per local expert
+        expert_x = torch.empty((self.num_local_experts, self.max_recv, cfg.hidden_dim),
+                               dtype=cfg.in_dtype, device=device)
+        expert_meta = torch.empty((self.num_local_experts, self.max_recv, self.META_DIM),
+                                  dtype=torch.int32, device=device)
+        # --------- 5. (the metadata exchange itself is the second all_to_all_single in step 3) ------
+        # --------- 6. scatter received tokens into the next free slot of their local expert ------
+        # 6.1 global expert id (metadata field 0) -> local expert id on this rank
+        for i in range(total_recv): 
+            global_eid = int(recv_meta[i, 0].item())
+            local_eid = global_eid % self.num_local_experts
+            # append token data and metadata at the expert's current fill position, then advance it
+            expert_x[local_eid, expert_num_tokens[local_eid]] = recv_buf[i]
+            expert_meta[local_eid, expert_num_tokens[local_eid]] = recv_meta[i]
+            expert_num_tokens[local_eid] += 1
+        # 6.2 only the first expert_num_tokens[e] slots of expert e are written; buffers come from torch.empty
+        return expert_num_tokens, expert_x, expert_meta 
+    
+    # ---------- combine: return expert outputs to their source ranks and accumulate the weighted sum ----------
+    def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim)
+                weights: torch.Tensor, # top-k weights, indexed as [src_token, src_k]
+                expert_meta: torch.Tensor, # input, metadata slots as produced by dispatch
+                expert_y: torch.Tensor, # input, indexed as [local_eid, slot]; NOTE(review): _worker passes dispatch's (num_local_experts, max_recv, hidden_dim) layout
+                expert_num_tokens: torch.Tensor): # input, number of filled slots per local expert
+        device = out_tokens.device
+        cfg = self.cfg
+        # Mirror of dispatch: metadata field 1 (source rank) decides where each output row is sent.
+        # 1. number of expert-output rows to send back to each rank
+        send_counts = [0] * self.world_size
+        # 1.1 expert-output rows grouped by the rank they return to
+        y_map = [[] for _ in range(self.world_size)]
+        # 1.2 flattened metadata of those rows, grouped the same way
+        meta_map = [[] for _ in range(self.world_size)] 
+
+        # 2. walk the filled slots of every local expert and bucket each row by its source rank
+        for local_eid in range(self.num_local_experts):
+            cnt = int(expert_num_tokens[local_eid].item())
+            for j in range(cnt):
+                # metadata row stored in slot j of this local expert
+                meta = expert_meta[local_eid, j] 
+                dst_rank = int(meta[1].item())  
+                send_counts[dst_rank] += 1 
+                # queue the output row (with a leading dim of 1) and its metadata for dst_rank
+                y_map[dst_rank].append(expert_y[local_eid, j].unsqueeze(0))
+                meta_map[dst_rank].extend(meta.tolist())
+        # rows this rank sends to each rank
+        send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
+        # rows this rank will receive from each rank
+        recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
+        # exchange the counts so every rank can size its receive buffers
+        dist.all_to_all_single(recv_counts_t, send_counts_t)
+        # 3. pack the send buffer by destination rank; a rank with nothing queued contributes an empty (0, hidden_dim) block
+        y_map_tensors = []
+        for sub_list in y_map:
+            if sub_list:
+                y_map_tensors.append(torch.cat(sub_list, dim=0))
+            else:
+                y_map_tensors.append(torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device))
+        send_buf = torch.cat(y_map_tensors, dim=0)
+        # 4. metadata send buffer: one META_DIM-wide int32 row per output row
+        send_meta = torch.tensor(
+            [v for sub in meta_map for v in sub],
+            dtype=torch.int32,
+            device=device
+        ).view(-1, self.META_DIM)
+        # 5. total number of rows this rank receives
+        total_recv = int(recv_counts_t.sum().item())
+        # 6. receive buffers for output rows and their metadata
+        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
+                               dtype=cfg.out_dtype, device=device)
+        recv_meta = torch.empty(total_recv, self.META_DIM,
+                                dtype=torch.int32, device=device)
+        # 7. exchange the expert-output rows
+        dist.all_to_all_single(
+            recv_buf, send_buf,
+            output_split_sizes=recv_counts_t.tolist(), 
+            input_split_sizes=send_counts_t.tolist()
+        )
+        # 8. exchange the metadata (flattened, so split sizes are scaled by META_DIM)
+        dist.all_to_all_single(
+            recv_meta.view(-1), send_meta.view(-1),
+            output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
+            input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
+        )
+        # 9. reshape the received metadata back into META_DIM-wide rows
+        recv_meta = recv_meta.view(-1, self.META_DIM)
+        # NOTE(review): src_rank is read in the loop below but never used afterwards.
+        # 10. add each received row, scaled by weights[src_token, src_k], into out_tokens[src_token] in place
+        for i in range(total_recv):
+            src_token = int(recv_meta[i, 2].item())
+            src_k = int(recv_meta[i, 3].item()) 
+            src_rank = int(recv_meta[i, 1].item())
+            w = weights[src_token, src_k].to(torch.float32)
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
+
+        return out_tokens
+
+# ---------------- multi processing test: per-rank entry point run inside a pool worker ----------------
+def _worker(rank, world_size, rank_data, cfg: MoEConfig):
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "12356"  # fixed rendezvous port used by every rank
+    dist.init_process_group("nccl", rank=rank, init_method="env://", world_size=world_size)
+    torch.cuda.set_device(rank)  # rank i uses GPU i
+    # NOTE(review): num_dp and dp_rank below are assigned but never read in this function.
+    num_dp = world_size
+    dp_rank = rank
+    # Round trip for this rank: dispatch -> stand-in expert computation -> combine.
+    ata = PyTorchAllToAll(cfg, rank, world_size)
+    # ---------- dispatch ----------
+    expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x.cuda(), rank_data.indices.cuda()) # expert meta = (self.num_local_experts, self.max_recv, self.META_DIM)
+    # ---------- simulated moe computation: cast to out_dtype and double every slot ----------
+    expert_y = expert_x.to(cfg.out_dtype) * 2
+    # ---------- combine: accumulate into a zero-initialized (max_num_tokens, hidden_dim) buffer ----------
+    y = torch.zeros(cfg.max_num_tokens, cfg.hidden_dim,
+                   dtype=cfg.out_dtype, device="cuda")
+    ata.combine(y, rank_data.weights.cuda(), expert_meta, expert_y, expert_num)
+    # ----------- tear down the group, then return this rank's first num_tokens rows on CPU --------
+    dist.destroy_process_group()
+    return y[: rank_data.num_tokens].cpu()
+
+def generate_input(num_experts, experts_per_token, hidden_dim, max_num_tokens, seed):  # returns (cfg, all_rank_data)
+    gen = torch.Generator(device='cuda')  # NOTE(review): CUDA generator, but RankTestData draws without device= -- confirm this is accepted
+    gen.manual_seed(seed)
+    world_size = 8  # number of ranks to build data for; custom_kernel hard-codes the same value
+    # Size hyperparameters are passed through unchanged; both dtypes are pinned to fp16.
+    cfg = MoEConfig(
+        num_experts=num_experts,
+        experts_per_token=experts_per_token,
+        hidden_dim=hidden_dim,
+        max_num_tokens=max_num_tokens,
+        in_dtype=torch.float16,
+        out_dtype=torch.float16,
+    )
+    all_rank_data = [RankTestData(cfg, gen) for _ in range(world_size)]  # one RankTestData per rank, all drawn from the same generator
+    return cfg, all_rank_data
+
+def custom_kernel(data: input_t) -> output_t:  # submission entry point: one worker process per rank, outputs concatenated
+    cfg, all_rank_data = data
+    world_size = 8  # all_rank_data[0..7] is indexed below, so it must hold at least 8 entries
+    # The "spawn" start method is forced on every call (force=True overrides any earlier setting).
+    mp.set_start_method("spawn", force=True)
+    pool = mp.Pool(processes=world_size)
+    rets = []
+    for i in range(world_size):
+        rets.append(
+            pool.apply_async(  
+                  _worker, 
+                  args=(i,
+                        world_size, 
+                        all_rank_data[i], 
+                        cfg),
+            )
+        )
+    pool.close()  # no further tasks will be submitted
+    pool.join()  # wait for the worker processes to exit
+    rets = [el.get() for el in rets]  # per-rank results, in rank order
+    ret_out = torch.cat(rets, dim=0)  # concatenate the per-rank rows along dim 0
+    return ret_out
diff --git a/problems/amd/all2all/task.py b/problems/amd/all2all/task.py
new file mode 100644
index 00000000..aaf2de6f
--- /dev/null
+++ b/problems/amd/all2all/task.py
@@ -0,0 +1,12 @@
+import torch
+from typing import TypeVar, TypedDict
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])  # NOTE(review): generate_input returns (cfg, all_rank_data), not five tensors -- confirm the bound
+output_t = TypeVar("output_t", bound=torch.Tensor)  # kernels return a single tensor
+
+class TestSpec(TypedDict):  # keys of one entry under tests: / benchmarks: in task.yml; same names as generate_input's parameters
+    num_experts: int
+    experts_per_token: int
+    hidden_dim: int
+    max_num_tokens: int
+    seed: int  # seeds the torch.Generator created in generate_input
\ No newline at end of file
diff --git a/problems/amd/all2all/task.yml b/problems/amd/all2all/task.yml
new file mode 100644
index 00000000..2613d595
--- /dev/null
+++ b/problems/amd/all2all/task.yml
@@ -0,0 +1,62 @@
+# name: single node all2all
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  
+  You will implement a custom single node all2all kernel optimized for MI300.
+  You will be given 
+  To be explicit, you will be given a tuple of tensors:
+  ```
+  (a, b, a_scale, b_scale, c)
+  ```
+  where `a` and `b` are the input matrices, `a_scale` and `b_scale` are the scaling factors for `a` and `b` respectively,
+  and `c` is the output matrix:
+  * `a` is M x K in column-major order in e4m3fnuz
+  * `b` is N x K in column-major order in e4m3fnuz
+  * `a_scale` is M x K // 128 in column-major order in fp32
+  * `b_scale` is N // 128 x K // 128 in column-major order in fp32
+  * `c` is M x N in ROW-major order in bf16
+  
+  Matrix sizes `m` and `n` are divisible by 64, `k` is divisible by 128.
+
+  The ranking criteria is the geometric mean of the benchmark results.
+
+  For the grand price, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand price.
+  ```
+  The speed of light analysis is:
+   M       N       K     time[us]
+
+  ```
+
+
+tests:
+  - {"num_experts": 16, "experts_per_token": 4, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 24, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 32, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 48, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 64, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 80, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 96, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 192, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+
+
+benchmarks:
+  - {"num_experts": 32, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 96, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 8192, "max_num_tokens": 256, "seed": 6635}
+  - {"num_experts": 192, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+
+
+ranking_by: "geom"

From a24cdde49787a8d5dfbf2832591f67962f7cda7e Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Wed, 20 Aug 2025 02:41:57 +0000
Subject: [PATCH 094/207] add moe compute at combine

---
 problems/amd/all2all/reference.py  | 2 +-
 problems/amd/all2all/submission.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
index bc426005..607ed1c6 100644
--- a/problems/amd/all2all/reference.py
+++ b/problems/amd/all2all/reference.py
@@ -194,7 +194,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             src_k = int(recv_meta[i, 3].item()) 
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
-            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w * 2
 
         return out_tokens
 
diff --git a/problems/amd/all2all/submission.py b/problems/amd/all2all/submission.py
index 342388b4..80f51511 100644
--- a/problems/amd/all2all/submission.py
+++ b/problems/amd/all2all/submission.py
@@ -193,7 +193,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             src_k = int(recv_meta[i, 3].item()) 
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
-            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w * 2
 
         return out_tokens
 

From d66f03c4a079c44ef0f42bd99edeb3a82c812b17 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Wed, 20 Aug 2025 02:44:29 +0000
Subject: [PATCH 095/207] revert

---
 problems/amd/all2all/reference.py  | 2 +-
 problems/amd/all2all/submission.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
index 607ed1c6..bc426005 100644
--- a/problems/amd/all2all/reference.py
+++ b/problems/amd/all2all/reference.py
@@ -194,7 +194,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             src_k = int(recv_meta[i, 3].item()) 
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
-            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w * 2
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
 
         return out_tokens
 
diff --git a/problems/amd/all2all/submission.py b/problems/amd/all2all/submission.py
index 80f51511..342388b4 100644
--- a/problems/amd/all2all/submission.py
+++ b/problems/amd/all2all/submission.py
@@ -193,7 +193,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             src_k = int(recv_meta[i, 3].item()) 
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
-            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w * 2
+            out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
 
         return out_tokens
 

From 424396e5dbc434514238135477177f95a9c383ee Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 25 Aug 2025 06:00:05 +0000
Subject: [PATCH 096/207] define all2all problem shapes and add roofline

---
 problems/amd/all2all/task.yml | 67 ++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 32 deletions(-)

diff --git a/problems/amd/all2all/task.yml b/problems/amd/all2all/task.yml
index 2613d595..2aff0881 100644
--- a/problems/amd/all2all/task.yml
+++ b/problems/amd/all2all/task.yml
@@ -11,52 +11,55 @@ lang: "py"
 
 description: |
   
-  You will implement a custom single node all2all kernel optimized for MI300.
-  You will be given 
-  To be explicit, you will be given a tuple of tensors:
+  You will implement a custom single node all2all kernel optimized for 8xMI300.
+  You will be given MoEConfig, which is the main hypeparameter, including numbers of experts, experts per token, hidden dim, max number tokens each dp rank and input output dtype
+
+  To be explicit, you will be given data of all ranks, naming all_rank_data:
+  each rank data including:
   ```
-  (a, b, a_scale, b_scale, c)
+  num_tokens, indices, weights, x
   ```
-  where `a` and `b` are the input matrices, `a_scale` and `b_scale` are the scaling factors for `a` and `b` respectively,
-  and `c` is the output matrix:
-  * `a` is M x K in column-major order in e4m3fnuz
-  * `b` is N x K in column-major order in e4m3fnuz
-  * `a_scale` is M x K // 128 in column-major order in fp32
-  * `b_scale` is N // 128 x K // 128 in column-major order in fp32
-  * `c` is M x N in ROW-major order in bf16
+  where `x` are the tokens data at each rank, `num_tokens` is a tensor indicating numbers of token at each rank,
+  `` is the output matrix:
+  * `x` is tokens data at each rank, with (num_tokens, hidden_dim) shape
+  * `num_tokens` is the numbers of tokens at each rank, a scalar with maximum numbers: max number tokens defined in MoEConfig
+  * `indices` is the token to expert map, indicating which experts each token dispatch to, with (num_tokens, experts_per_token) shape
+  * `weights` is weights of topk experts, used in combine, with (num_tokens, experts_per_token) shape
   
-  Matrix sizes `m` and `n` are divisible by 64, `k` is divisible by 128.
-
   The ranking criteria is the geometric mean of the benchmark results.
 
-  For the grand price, your kernel will be evaluated against the speed of light analysis
-  and the solution closest to the speed of light will be awarded the grand price.
+  For the grand prize, your kernel will be evaluated against the speed of light analysis and AMD implementations; 
+  the solution closest to the speed of light and AMD implementations will be awarded the grand prize.
   ```
   The speed of light analysis is:
-   M       N       K     time[us]
-
+   num_experts   experts_per_token   hidden_dim   max_num_tokens   time[us]
+        8                2              6144            16           6.33
+       64                6              2048            32           7.37
+       128               4              2880            128          14.98
+       128               8              4096            256          61.78
+       256               8              7168            256          104.36
   ```
 
 
 tests:
-  - {"num_experts": 16, "experts_per_token": 4, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 24, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 32, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 48, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 64, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 80, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 96, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 192, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 2, "seed": 6635}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 4, "seed": 1236}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 4, "seed": 1234}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 8, "seed": 542}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 16, "seed": 347}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 32, "seed": 51}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 64, "seed": 175}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 534}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 64, "seed": 897}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 4}
 
 
 benchmarks:
-  - {"num_experts": 32, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 96, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 8192, "max_num_tokens": 256, "seed": 6635}
-  - {"num_experts": 192, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 6635}
-  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 6635}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 16, "seed": 6635}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 32, "seed": 1234}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 128, "seed": 51}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 175}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 4}
 
 
 ranking_by: "geom"

From a62277e895d073b293be35f7fa19ae231ae7ae35 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 25 Aug 2025 07:44:33 +0000
Subject: [PATCH 097/207] fix typos

---
 problems/amd/all2all/task.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/problems/amd/all2all/task.yml b/problems/amd/all2all/task.yml
index 2aff0881..a34e982b 100644
--- a/problems/amd/all2all/task.yml
+++ b/problems/amd/all2all/task.yml
@@ -14,13 +14,13 @@ description: |
   You will implement a custom single node all2all kernel optimized for 8xMI300.
   You will be given MoEConfig, which is the main hypeparameter, including numbers of experts, experts per token, hidden dim, max number tokens each dp rank and input output dtype
 
-  To be explicit, you will be given data of all ranks, naming all_rank_data:
+  To be explicit, you will be given the data of all ranks, named `all_rank_data`.
   each rank data including:
   ```
   num_tokens, indices, weights, x
   ```
-  where `x` are the tokens data at each rank, `num_tokens` is a tensor indicating numbers of token at each rank,
-  `` is the output matrix:
+  Let's explain the input arguments one by one.
+
   * `x` is tokens data at each rank, with (num_tokens, hidden_dim) shape
   * `num_tokens` is the numbers of tokens at each rank, a scalar with maximum numbers: max number tokens defined in MoEConfig
   * `indices` is the token to expert map, indicating which experts each token dispatch to, with (num_tokens, experts_per_token) shape

From b14e0f7883a2c6f67af70c3131219158be7bca6a Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 26 Aug 2025 13:10:29 +0200
Subject: [PATCH 098/207] Feat: conform to kernelbot infrastructure

---
 problems/amd/all2all/reference.py  | 228 +++++++++++++++--------------
 problems/amd/all2all/submission.py | 216 +++++++++++----------------
 problems/amd/all2all/task.py       |  11 +-
 problems/amd/all2all/task.yml      |  34 +++--
 4 files changed, 228 insertions(+), 261 deletions(-)

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
index bc426005..d5ba04f4 100644
--- a/problems/amd/all2all/reference.py
+++ b/problems/amd/all2all/reference.py
@@ -2,10 +2,9 @@
 import os
 import torch
 import torch.distributed as dist
-import torch.multiprocessing as mp
 import dataclasses
 from task import input_t, output_t
-from utils import make_match_reference
+
 
 # ---------------- MoE config ----------------
 @dataclasses.dataclass
@@ -17,27 +16,45 @@ class MoEConfig:
     in_dtype: torch.dtype = torch.float16
     out_dtype: torch.dtype = torch.float16
 
+
 # ---------------- data per dp rank ----------------
 class RankTestData:
-    def __init__(self, cfg: MoEConfig, rng: torch.Generator):
-        self.num_tokens = int(torch.randint(1, cfg.max_num_tokens, [1], generator=rng).item())
+    def __init__(self, cfg: MoEConfig, rng: torch.Generator, rank: int):
+        device = torch.device(f"cuda:{rank}")
+        self.num_tokens = int(
+            torch.randint(
+                1, cfg.max_num_tokens, [1], generator=rng, device=device
+            ).item()
+        )
         # token expert map
-        self.indices = torch.empty(self.num_tokens, cfg.experts_per_token, dtype=torch.int32)
+        self.indices = torch.empty(
+            self.num_tokens, cfg.experts_per_token, dtype=torch.int32, device=device
+        )
         for i in range(self.num_tokens):
-            perm = torch.randperm(cfg.num_experts, generator=rng)
+            perm = torch.randperm(cfg.num_experts, generator=rng, device=device)
             self.indices[i] = perm[: cfg.experts_per_token]
-        # topk weights 
+        # topk weights
         self.weights = torch.rand(
-            self.num_tokens, cfg.experts_per_token, dtype=torch.float32, generator=rng
+            self.num_tokens,
+            cfg.experts_per_token,
+            dtype=torch.float32,
+            generator=rng,
+            device=device,
         )
         # dp tokens, input of dispatch
-        self.x = torch.randn(self.num_tokens, cfg.hidden_dim, dtype=cfg.in_dtype, generator=rng)
+        self.x = torch.randn(
+            self.num_tokens,
+            cfg.hidden_dim,
+            dtype=cfg.in_dtype,
+            generator=rng,
+            device=device,
+        )
+
 
 # ---------------- All2All pytorch impl ----------------
 class PyTorchAllToAll:
-    
     META_DIM = 5  # global_exp, src_rank, src_token, src_k, pad
-    
+
     def __init__(self, cfg: MoEConfig, rank: int, world_size: int):
         self.cfg = cfg
         self.rank = rank
@@ -58,14 +75,16 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
         # 1.2 token id to send to each rank
         token_map = [[] for _ in range(self.world_size)]
         # 1.3 token meta data, need update for combine
-        meta_map = [[] for _ in range(self.world_size)] 
+        meta_map = [[] for _ in range(self.world_size)]
         for t, expert_list in enumerate(indices.tolist()):
             for k, e in enumerate(expert_list):
                 dst_rank = e // self.num_local_experts
                 send_counts[dst_rank] += 1
                 token_map[dst_rank].append(t)
-                meta_map[dst_rank].extend([e, self.rank, t, k, 0]) # srcGobalExpert, srcRank, srcIndex, expert index
-                
+                meta_map[dst_rank].extend(
+                    [e, self.rank, t, k, 0]
+                )  # srcGobalExpert, srcRank, srcIndex, expert index
+
         send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
         # 1.3 token nums to recv from each rank
         recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
@@ -73,43 +92,52 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
         # ---------2. send and recv buffer, order by tokens on each rank ----------
         send_buf = torch.cat([dp_x[idx_list] for idx_list in token_map], dim=0)
         total_recv = int(recv_counts_t.sum().item())
-        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
-                               dtype=cfg.in_dtype, device=device)
-        
+        recv_buf = torch.empty(
+            total_recv, cfg.hidden_dim, dtype=cfg.in_dtype, device=device
+        )
+
         # 2.1 meta buf for send and recv
         send_meta = torch.tensor(
-            [v for sub in meta_map for v in sub],
-            dtype=torch.int32,
-            device=device
+            [v for sub in meta_map for v in sub], dtype=torch.int32, device=device
         ).view(-1, self.META_DIM)
-        recv_meta = torch.empty(total_recv, self.META_DIM,
-                                dtype=torch.int32, device=device)        
+        recv_meta = torch.empty(
+            total_recv, self.META_DIM, dtype=torch.int32, device=device
+        )
         # ---------3. dispatch send_buf to recv_buf by recv and send counts--------------
         dist.all_to_all_single(
-            recv_buf, send_buf,
+            recv_buf,
+            send_buf,
             output_split_sizes=recv_counts_t.tolist(),
-            input_split_sizes=send_counts_t.tolist()
+            input_split_sizes=send_counts_t.tolist(),
         )
-        
+
         dist.all_to_all_single(
-            recv_meta.view(-1), send_meta.view(-1),
+            recv_meta.view(-1),
+            send_meta.view(-1),
             output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
             input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
         )
         recv_meta = recv_meta.view(-1, self.META_DIM)
         # ---------4. define output tensor of dispatch ------------
         # 4.1 num tokens per expert
-        expert_num_tokens = torch.zeros(self.num_local_experts,
-                                        dtype=torch.int32, device=device)
+        expert_num_tokens = torch.zeros(
+            self.num_local_experts, dtype=torch.int32, device=device
+        )
         # 4.2 token tensor on each expert
-        expert_x = torch.empty((self.num_local_experts, self.max_recv, cfg.hidden_dim),
-                               dtype=cfg.in_dtype, device=device)
-        expert_meta = torch.empty((self.num_local_experts, self.max_recv, self.META_DIM),
-                                  dtype=torch.int32, device=device)
+        expert_x = torch.empty(
+            (self.num_local_experts, self.max_recv, cfg.hidden_dim),
+            dtype=cfg.in_dtype,
+            device=device,
+        )
+        expert_meta = torch.empty(
+            (self.num_local_experts, self.max_recv, self.META_DIM),
+            dtype=torch.int32,
+            device=device,
+        )
         # ---------5. dispatch send_meta to recv_meta by recv and send counts------
         # ---------6. write tokens to each expert on each rank ------
         # 6.1 fetch the local expert id of corresponding token i
-        for i in range(total_recv): 
+        for i in range(total_recv):
             global_eid = int(recv_meta[i, 0].item())
             local_eid = global_eid % self.num_local_experts
             # output, store token buf and token meta and token nums of each expert
@@ -117,14 +145,17 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
             expert_meta[local_eid, expert_num_tokens[local_eid]] = recv_meta[i]
             expert_num_tokens[local_eid] += 1
         # 6.2 after dispatch, token nums and token and meta of token on expert
-        return expert_num_tokens, expert_x, expert_meta 
-    
+        return expert_num_tokens, expert_x, expert_meta
+
     # ---------- combine ----------
-    def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim)
-                weights: torch.Tensor, # topk weight
-                expert_meta: torch.Tensor, # input
-                expert_y: torch.Tensor, # input, (num_local_experts, max_num_tokens * num_dp, token_dim)
-                expert_num_tokens: torch.Tensor): # input
+    def combine(
+        self,
+        out_tokens: torch.Tensor,  # output, (max num tokens, token dim)
+        weights: torch.Tensor,  # topk weight
+        expert_meta: torch.Tensor,  # input
+        expert_y: torch.Tensor,  # input, (num_local_experts, max_num_tokens * num_dp, token_dim)
+        expert_num_tokens: torch.Tensor,
+    ):  # input
         device = out_tokens.device
         cfg = self.cfg
 
@@ -133,16 +164,16 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         # 1.1 token that will send back
         y_map = [[] for _ in range(self.world_size)]
         # 1.2 meta info of each token that send back to its src rank
-        meta_map = [[] for _ in range(self.world_size)] 
+        meta_map = [[] for _ in range(self.world_size)]
 
         # 2. traverse each token of each local expert of each rank, fill into send_counts and y_map and meta_map
         for local_eid in range(self.num_local_experts):
             cnt = int(expert_num_tokens[local_eid].item())
             for j in range(cnt):
                 # meta info token j of local eid
-                meta = expert_meta[local_eid, j] 
-                dst_rank = int(meta[1].item())  
-                send_counts[dst_rank] += 1 
+                meta = expert_meta[local_eid, j]
+                dst_rank = int(meta[1].item())
+                send_counts[dst_rank] += 1
                 # token j and its meta that send back to dst rank/local eid
                 y_map[dst_rank].append(expert_y[local_eid, j].unsqueeze(0))
                 meta_map[dst_rank].extend(meta.tolist())
@@ -150,7 +181,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
         # token nums that will recv from other ranks
         recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
-        # call all2all to send send counts and recv recv_counts_t at each rank by all2all        
+        # call all2all to send send counts and recv recv_counts_t at each rank by all2all
         dist.all_to_all_single(recv_counts_t, send_counts_t)
         # 3.send buffers of each rank, that is, the tokens at its experts
         y_map_tensors = []
@@ -158,30 +189,34 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             if sub_list:
                 y_map_tensors.append(torch.cat(sub_list, dim=0))
             else:
-                y_map_tensors.append(torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device))
+                y_map_tensors.append(
+                    torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device)
+                )
         send_buf = torch.cat(y_map_tensors, dim=0)
         # 4. flatten send meta by tokens
         send_meta = torch.tensor(
-            [v for sub in meta_map for v in sub],
-            dtype=torch.int32,
-            device=device
+            [v for sub in meta_map for v in sub], dtype=torch.int32, device=device
         ).view(-1, self.META_DIM)
         # 5. total recv tokens of cur rank
         total_recv = int(recv_counts_t.sum().item())
         # 6. recv buffer of cur rank
-        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
-                               dtype=cfg.out_dtype, device=device)
-        recv_meta = torch.empty(total_recv, self.META_DIM,
-                                dtype=torch.int32, device=device)
+        recv_buf = torch.empty(
+            total_recv, cfg.hidden_dim, dtype=cfg.out_dtype, device=device
+        )
+        recv_meta = torch.empty(
+            total_recv, self.META_DIM, dtype=torch.int32, device=device
+        )
         # 7. call all2all to send and recv for each rank
         dist.all_to_all_single(
-            recv_buf, send_buf,
-            output_split_sizes=recv_counts_t.tolist(), 
-            input_split_sizes=send_counts_t.tolist()
+            recv_buf,
+            send_buf,
+            output_split_sizes=recv_counts_t.tolist(),
+            input_split_sizes=send_counts_t.tolist(),
         )
         # 8. call all2all to send meta and recv meta for each rank
         dist.all_to_all_single(
-            recv_meta.view(-1), send_meta.view(-1),
+            recv_meta.view(-1),
+            send_meta.view(-1),
             output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
             input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
         )
@@ -191,41 +226,21 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         # 10. write back tokens from recv buf, per meta info, and do weighted sum
         for i in range(total_recv):
             src_token = int(recv_meta[i, 2].item())
-            src_k = int(recv_meta[i, 3].item()) 
+            src_k = int(recv_meta[i, 3].item())
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
             out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
 
         return out_tokens
 
-# ---------------- multi processing test ----------------
-def _worker(rank, world_size, rank_data, cfg: MoEConfig):
-    os.environ["MASTER_ADDR"] = "127.0.0.1"
-    os.environ["MASTER_PORT"] = "12356"
-    dist.init_process_group("nccl", rank=rank, init_method="env://", world_size=world_size)
-    torch.cuda.set_device(rank)
-
-    num_dp = world_size
-    dp_rank = rank
-
-    ata = PyTorchAllToAll(cfg, rank, world_size)
-    # ---------- dispatch ----------
-    expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x.cuda(), rank_data.indices.cuda()) # expert meta = (self.num_local_experts, self.max_recv, self.META_DIM)
-    # ---------- simulated moe computation ----------
-    expert_y = expert_x.to(cfg.out_dtype) * 2
-    # ---------- combine ----------
-    y = torch.zeros(cfg.max_num_tokens, cfg.hidden_dim,
-                   dtype=cfg.out_dtype, device="cuda")
-    ata.combine(y, rank_data.weights.cuda(), expert_meta, expert_y, expert_num)
-
-    dist.destroy_process_group()
-    return y[: rank_data.num_tokens].cpu()
 
-def generate_input(num_experts, experts_per_token, hidden_dim, max_num_tokens, seed):
-    gen = torch.Generator(device='cuda')
+def generate_input(
+    num_experts, experts_per_token, hidden_dim, max_num_tokens, seed, rank, world_size
+):
+    device = torch.device(f"cuda:{rank}")
+    gen = torch.Generator(device=device)
     gen.manual_seed(seed)
-    world_size = 8
-    
+
     cfg = MoEConfig(
         num_experts=num_experts,
         experts_per_token=experts_per_token,
@@ -234,33 +249,28 @@ def generate_input(num_experts, experts_per_token, hidden_dim, max_num_tokens, s
         in_dtype=torch.float16,
         out_dtype=torch.float16,
     )
-    all_rank_data = [RankTestData(cfg, gen) for _ in range(world_size)]  
-    return cfg, all_rank_data
+    rank_data = RankTestData(cfg, gen, rank)
+    return cfg, rank_data, rank, world_size
+
 
 def ref_kernel(data: input_t) -> output_t:
-    cfg, all_rank_data = data
-    world_size = 8
-    
-    mp.set_start_method("spawn", force=True)
-    pool = mp.Pool(processes=world_size)
-    rets = []
-    for i in range(world_size):
-        rets.append(
-            pool.apply_async(  
-                  _worker, 
-                  args=(i,
-                        world_size, 
-                        all_rank_data[i], 
-                        cfg),
-            )
-        )
-    pool.close()
-    pool.join()
-    rets = [el.get() for el in rets]
-    ret_out = torch.cat(rets, dim=0)
-    return ret_out
+    cfg, rank_data, rank, world_size = data
+
+    ata = PyTorchAllToAll(cfg, rank, world_size)
+
+    expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x, rank_data.indices)
+    expert_y = expert_x.to(cfg.out_dtype) * 2
+    y = torch.zeros(
+        cfg.max_num_tokens,
+        cfg.hidden_dim,
+        dtype=cfg.out_dtype,
+        device=rank_data.x.device,
+    )
+
+    ata.combine(y, rank_data.weights, expert_meta, expert_y, expert_num)
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=5e-3) 
+    return y[:, rank_data.num_tokens]
 
- 
 
+def check_implementation(data: input_t, output: output_t):
+    return True, ""
diff --git a/problems/amd/all2all/submission.py b/problems/amd/all2all/submission.py
index 342388b4..385bf289 100644
--- a/problems/amd/all2all/submission.py
+++ b/problems/amd/all2all/submission.py
@@ -1,43 +1,15 @@
-import os
+#!POPCORN leaderboard all2all-dev
+
 import torch
 import torch.distributed as dist
-import torch.multiprocessing as mp
-import dataclasses
 from task import input_t, output_t
-from utils import make_match_reference
-
-# ---------------- MoE config ----------------
-@dataclasses.dataclass
-class MoEConfig:
-    num_experts: int
-    experts_per_token: int
-    hidden_dim: int
-    max_num_tokens: int
-    in_dtype: torch.dtype = torch.float16
-    out_dtype: torch.dtype = torch.float16
 
-# ---------------- data per dp rank ----------------
-class RankTestData:
-    def __init__(self, cfg: MoEConfig, rng: torch.Generator):
-        self.num_tokens = int(torch.randint(1, cfg.max_num_tokens, [1], generator=rng).item())
-        # token expert map
-        self.indices = torch.empty(self.num_tokens, cfg.experts_per_token, dtype=torch.int32)
-        for i in range(self.num_tokens):
-            perm = torch.randperm(cfg.num_experts, generator=rng)
-            self.indices[i] = perm[: cfg.experts_per_token]
-        # topk weights 
-        self.weights = torch.rand(
-            self.num_tokens, cfg.experts_per_token, dtype=torch.float32, generator=rng
-        )
-        # dp tokens, input of dispatch
-        self.x = torch.randn(self.num_tokens, cfg.hidden_dim, dtype=cfg.in_dtype, generator=rng)
 
 # ---------------- All2All pytorch impl ----------------
 class PyTorchAllToAll:
-    
     META_DIM = 5  # global_exp, src_rank, src_token, src_k, pad
-    
-    def __init__(self, cfg: MoEConfig, rank: int, world_size: int):
+
+    def __init__(self, cfg, rank: int, world_size: int):
         self.cfg = cfg
         self.rank = rank
         self.world_size = world_size
@@ -57,14 +29,16 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
         # 1.2 token id to send to each rank
         token_map = [[] for _ in range(self.world_size)]
         # 1.3 token meta data, need update for combine
-        meta_map = [[] for _ in range(self.world_size)] 
+        meta_map = [[] for _ in range(self.world_size)]
         for t, expert_list in enumerate(indices.tolist()):
             for k, e in enumerate(expert_list):
                 dst_rank = e // self.num_local_experts
                 send_counts[dst_rank] += 1
                 token_map[dst_rank].append(t)
-                meta_map[dst_rank].extend([e, self.rank, t, k, 0]) # srcGobalExpert, srcRank, srcIndex, expert index
-                
+                meta_map[dst_rank].extend(
+                    [e, self.rank, t, k, 0]
+                )  # srcGobalExpert, srcRank, srcIndex, expert index
+
         send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
         # 1.3 token nums to recv from each rank
         recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
@@ -72,43 +46,52 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
         # ---------2. send and recv buffer, order by tokens on each rank ----------
         send_buf = torch.cat([dp_x[idx_list] for idx_list in token_map], dim=0)
         total_recv = int(recv_counts_t.sum().item())
-        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
-                               dtype=cfg.in_dtype, device=device)
-        
+        recv_buf = torch.empty(
+            total_recv, cfg.hidden_dim, dtype=cfg.in_dtype, device=device
+        )
+
         # 2.1 meta buf for send and recv
         send_meta = torch.tensor(
-            [v for sub in meta_map for v in sub],
-            dtype=torch.int32,
-            device=device
+            [v for sub in meta_map for v in sub], dtype=torch.int32, device=device
         ).view(-1, self.META_DIM)
-        recv_meta = torch.empty(total_recv, self.META_DIM,
-                                dtype=torch.int32, device=device)        
+        recv_meta = torch.empty(
+            total_recv, self.META_DIM, dtype=torch.int32, device=device
+        )
         # ---------3. dispatch send_buf to recv_buf by recv and send counts--------------
         dist.all_to_all_single(
-            recv_buf, send_buf,
+            recv_buf,
+            send_buf,
             output_split_sizes=recv_counts_t.tolist(),
-            input_split_sizes=send_counts_t.tolist()
+            input_split_sizes=send_counts_t.tolist(),
         )
-        
+
         dist.all_to_all_single(
-            recv_meta.view(-1), send_meta.view(-1),
+            recv_meta.view(-1),
+            send_meta.view(-1),
             output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
             input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
         )
         recv_meta = recv_meta.view(-1, self.META_DIM)
         # ---------4. define output tensor of dispatch ------------
         # 4.1 num tokens per expert
-        expert_num_tokens = torch.zeros(self.num_local_experts,
-                                        dtype=torch.int32, device=device)
+        expert_num_tokens = torch.zeros(
+            self.num_local_experts, dtype=torch.int32, device=device
+        )
         # 4.2 token tensor on each expert
-        expert_x = torch.empty((self.num_local_experts, self.max_recv, cfg.hidden_dim),
-                               dtype=cfg.in_dtype, device=device)
-        expert_meta = torch.empty((self.num_local_experts, self.max_recv, self.META_DIM),
-                                  dtype=torch.int32, device=device)
+        expert_x = torch.empty(
+            (self.num_local_experts, self.max_recv, cfg.hidden_dim),
+            dtype=cfg.in_dtype,
+            device=device,
+        )
+        expert_meta = torch.empty(
+            (self.num_local_experts, self.max_recv, self.META_DIM),
+            dtype=torch.int32,
+            device=device,
+        )
         # ---------5. dispatch send_meta to recv_meta by recv and send counts------
         # ---------6. write tokens to each expert on each rank ------
         # 6.1 fetch the local expert id of corresponding token i
-        for i in range(total_recv): 
+        for i in range(total_recv):
             global_eid = int(recv_meta[i, 0].item())
             local_eid = global_eid % self.num_local_experts
             # output, store token buf and token meta and token nums of each expert
@@ -116,14 +99,17 @@ def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):
             expert_meta[local_eid, expert_num_tokens[local_eid]] = recv_meta[i]
             expert_num_tokens[local_eid] += 1
         # 6.2 after dispatch, token nums and token and meta of token on expert
-        return expert_num_tokens, expert_x, expert_meta 
-    
+        return expert_num_tokens, expert_x, expert_meta
+
     # ---------- combine ----------
-    def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim)
-                weights: torch.Tensor, # topk weight
-                expert_meta: torch.Tensor, # input
-                expert_y: torch.Tensor, # input, (num_local_experts, max_num_tokens * num_dp, token_dim)
-                expert_num_tokens: torch.Tensor): # input
+    def combine(
+        self,
+        out_tokens: torch.Tensor,  # output, (max num tokens, token dim)
+        weights: torch.Tensor,  # topk weight
+        expert_meta: torch.Tensor,  # input
+        expert_y: torch.Tensor,  # input, (num_local_experts, max_num_tokens * num_dp, token_dim)
+        expert_num_tokens: torch.Tensor,
+    ):  # input
         device = out_tokens.device
         cfg = self.cfg
 
@@ -132,16 +118,16 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         # 1.1 token that will send back
         y_map = [[] for _ in range(self.world_size)]
         # 1.2 meta info of each token that send back to its src rank
-        meta_map = [[] for _ in range(self.world_size)] 
+        meta_map = [[] for _ in range(self.world_size)]
 
         # 2. traverse each token of each local expert of each rank, fill into send_counts and y_map and meta_map
         for local_eid in range(self.num_local_experts):
             cnt = int(expert_num_tokens[local_eid].item())
             for j in range(cnt):
                 # meta info token j of local eid
-                meta = expert_meta[local_eid, j] 
-                dst_rank = int(meta[1].item())  
-                send_counts[dst_rank] += 1 
+                meta = expert_meta[local_eid, j]
+                dst_rank = int(meta[1].item())
+                send_counts[dst_rank] += 1
                 # token j and its meta that send back to dst rank/local eid
                 y_map[dst_rank].append(expert_y[local_eid, j].unsqueeze(0))
                 meta_map[dst_rank].extend(meta.tolist())
@@ -149,7 +135,7 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         send_counts_t = torch.tensor(send_counts, dtype=torch.long, device=device)
         # token nums that will recv from other ranks
         recv_counts_t = torch.empty(self.world_size, dtype=torch.long, device=device)
-        # call all2all to send send counts and recv recv_counts_t at each rank by all2all        
+        # call all2all to send send counts and recv recv_counts_t at each rank by all2all
         dist.all_to_all_single(recv_counts_t, send_counts_t)
         # 3.send buffers of each rank, that is, the tokens at its experts
         y_map_tensors = []
@@ -157,30 +143,34 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
             if sub_list:
                 y_map_tensors.append(torch.cat(sub_list, dim=0))
             else:
-                y_map_tensors.append(torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device))
+                y_map_tensors.append(
+                    torch.empty((0, cfg.hidden_dim), dtype=cfg.out_dtype, device=device)
+                )
         send_buf = torch.cat(y_map_tensors, dim=0)
         # 4. flatten send meta by tokens
         send_meta = torch.tensor(
-            [v for sub in meta_map for v in sub],
-            dtype=torch.int32,
-            device=device
+            [v for sub in meta_map for v in sub], dtype=torch.int32, device=device
         ).view(-1, self.META_DIM)
         # 5. total recv tokens of cur rank
         total_recv = int(recv_counts_t.sum().item())
         # 6. recv buffer of cur rank
-        recv_buf = torch.empty(total_recv, cfg.hidden_dim,
-                               dtype=cfg.out_dtype, device=device)
-        recv_meta = torch.empty(total_recv, self.META_DIM,
-                                dtype=torch.int32, device=device)
+        recv_buf = torch.empty(
+            total_recv, cfg.hidden_dim, dtype=cfg.out_dtype, device=device
+        )
+        recv_meta = torch.empty(
+            total_recv, self.META_DIM, dtype=torch.int32, device=device
+        )
         # 7. call all2all to send and recv for each rank
         dist.all_to_all_single(
-            recv_buf, send_buf,
-            output_split_sizes=recv_counts_t.tolist(), 
-            input_split_sizes=send_counts_t.tolist()
+            recv_buf,
+            send_buf,
+            output_split_sizes=recv_counts_t.tolist(),
+            input_split_sizes=send_counts_t.tolist(),
         )
         # 8. call all2all to send meta and recv meta for each rank
         dist.all_to_all_single(
-            recv_meta.view(-1), send_meta.view(-1),
+            recv_meta.view(-1),
+            send_meta.view(-1),
             output_split_sizes=[c * self.META_DIM for c in recv_counts_t.tolist()],
             input_split_sizes=[c * self.META_DIM for c in send_counts_t.tolist()],
         )
@@ -190,71 +180,29 @@ def combine(self, out_tokens: torch.Tensor, # output, (max num tokens, token dim
         # 10. write back tokens from recv buf, per meta info, and do weighted sum
         for i in range(total_recv):
             src_token = int(recv_meta[i, 2].item())
-            src_k = int(recv_meta[i, 3].item()) 
+            src_k = int(recv_meta[i, 3].item())
             src_rank = int(recv_meta[i, 1].item())
             w = weights[src_token, src_k].to(torch.float32)
             out_tokens[src_token] += recv_buf[i].to(torch.float32) * w
 
         return out_tokens
 
-# ---------------- multi processing test ----------------
-def _worker(rank, world_size, rank_data, cfg: MoEConfig):
-    os.environ["MASTER_ADDR"] = "127.0.0.1"
-    os.environ["MASTER_PORT"] = "12356"
-    dist.init_process_group("nccl", rank=rank, init_method="env://", world_size=world_size)
-    torch.cuda.set_device(rank)
 
-    num_dp = world_size
-    dp_rank = rank
+def custom_kernel(data: input_t) -> output_t:
+    cfg, rank_data, rank, world_size = data
+    torch.cuda.set_device(rank)
 
     ata = PyTorchAllToAll(cfg, rank, world_size)
-    # ---------- dispatch ----------
-    expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x.cuda(), rank_data.indices.cuda()) # expert meta = (self.num_local_experts, self.max_recv, self.META_DIM)
-    # ---------- simulated moe computation ----------
-    expert_y = expert_x.to(cfg.out_dtype) * 2
-    # ---------- combine ----------
-    y = torch.zeros(cfg.max_num_tokens, cfg.hidden_dim,
-                   dtype=cfg.out_dtype, device="cuda")
-    ata.combine(y, rank_data.weights.cuda(), expert_meta, expert_y, expert_num)
-    # ----------- copied to CPU by shared mem--------    
-    dist.destroy_process_group()
-    return y[: rank_data.num_tokens].cpu()
 
-def generate_input(num_experts, experts_per_token, hidden_dim, max_num_tokens, seed):
-    gen = torch.Generator(device='cuda')
-    gen.manual_seed(seed)
-    world_size = 8
-    
-    cfg = MoEConfig(
-        num_experts=num_experts,
-        experts_per_token=experts_per_token,
-        hidden_dim=hidden_dim,
-        max_num_tokens=max_num_tokens,
-        in_dtype=torch.float16,
-        out_dtype=torch.float16,
+    expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x, rank_data.indices)
+    expert_y = expert_x.to(cfg.out_dtype) * 2
+    y = torch.zeros(
+        cfg.max_num_tokens,
+        cfg.hidden_dim,
+        dtype=cfg.out_dtype,
+        device=rank_data.x.device,
     )
-    all_rank_data = [RankTestData(cfg, gen) for _ in range(world_size)]  
-    return cfg, all_rank_data
 
-def custom_kernel(data: input_t) -> output_t:
-    cfg, all_rank_data = data
-    world_size = 8
-    
-    mp.set_start_method("spawn", force=True)
-    pool = mp.Pool(processes=world_size)
-    rets = []
-    for i in range(world_size):
-        rets.append(
-            pool.apply_async(  
-                  _worker, 
-                  args=(i,
-                        world_size, 
-                        all_rank_data[i], 
-                        cfg),
-            )
-        )
-    pool.close()
-    pool.join()
-    rets = [el.get() for el in rets]
-    ret_out = torch.cat(rets, dim=0)
-    return ret_out
+    ata.combine(y, rank_data.weights, expert_meta, expert_y, expert_num)
+
+    return y[:, rank_data.num_tokens]
diff --git a/problems/amd/all2all/task.py b/problems/amd/all2all/task.py
index aaf2de6f..ee2016a0 100644
--- a/problems/amd/all2all/task.py
+++ b/problems/amd/all2all/task.py
@@ -1,12 +1,17 @@
 import torch
-from typing import TypeVar, TypedDict
+from typing import TypeVar, TypedDict, TYPE_CHECKING
 
-input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
+if TYPE_CHECKING:
+    from reference import MoEConfig, RankTestData
+
+
+input_t = TypeVar("input_t", bound=tuple["MoEConfig", "RankTestData", int, int])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
+
 class TestSpec(TypedDict):
     num_experts: int
     experts_per_token: int
     hidden_dim: int
     max_num_tokens: int
-    seed: int
\ No newline at end of file
+    seed: int
diff --git a/problems/amd/all2all/task.yml b/problems/amd/all2all/task.yml
index a34e982b..a0af424b 100644
--- a/problems/amd/all2all/task.yml
+++ b/problems/amd/all2all/task.yml
@@ -8,6 +8,10 @@ files:
   - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
+multi_gpu: true
+
+config:
+  main: "eval.py"
 
 description: |
   
@@ -42,24 +46,24 @@ description: |
 
 
 tests:
-  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 2, "seed": 6635}
-  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 4, "seed": 1236}
-  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 4, "seed": 1234}
-  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 8, "seed": 542}
-  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 16, "seed": 347}
-  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 32, "seed": 51}
-  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 64, "seed": 175}
-  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 534}
-  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 64, "seed": 897}
-  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 4}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 2, "seed": 6635, "world_size": 8}
+  # - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 4, "seed": 1236, "world_size": 8}
+  # - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 4, "seed": 1234, "world_size": 8}
+  # - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 8, "seed": 542, "world_size": 8}
+  # - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 16, "seed": 347, "world_size": 8}
+  # - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 32, "seed": 51, "world_size": 8}
+  # - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 64, "seed": 175, "world_size": 8}
+  # - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 534, "world_size": 8}
+  # - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 64, "seed": 897, "world_size": 8}
+  # - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 4, "world_size": 8}
 
 
 benchmarks:
-  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 16, "seed": 6635}
-  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 32, "seed": 1234}
-  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 128, "seed": 51}
-  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 175}
-  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 4}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 16, "seed": 6635, "world_size": 8}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 32, "seed": 1234, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 128, "seed": 51, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 256, "seed": 175, "world_size": 8}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 256, "seed": 4, "world_size": 8}
 
 
 ranking_by: "geom"

From b1b14b3e3d7879d196afcea952901e252b0bc774 Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Thu, 28 Aug 2025 00:15:52 +0200
Subject: [PATCH 099/207] fix pmpp_v2; use [bool, str] return consistently

---
 problems/pmpp_v2/eval.py                   | 15 ++-------------
 problems/pmpp_v2/histogram_py/reference.py |  4 ++--
 problems/pmpp_v2/utils.py                  |  6 +++---
 3 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/problems/pmpp_v2/eval.py b/problems/pmpp_v2/eval.py
index ac3a6325..56222a1e 100644
--- a/problems/pmpp_v2/eval.py
+++ b/problems/pmpp_v2/eval.py
@@ -138,17 +138,6 @@ def _clone_data(data):
         return data
 
 
-def wrap_check_implementation(data, submission_output):
-    # Old version returned just a single string, new version
-    # returns (bool, str); this function ensures compatibility with old
-    # problem definitions.
-    result = check_implementation(data, submission_output)
-    if isinstance(result, tuple):
-        return result
-    else:
-        return not bool(result), result
-
-
 def _run_single_test(test: TestCase):
     """
     Runs a single test case. Do not call directly
@@ -158,7 +147,7 @@ def _run_single_test(test: TestCase):
     torch.cuda.synchronize()
     submission_output = custom_kernel(_clone_data(data))
     torch.cuda.synchronize()
-    return wrap_check_implementation(data, submission_output)
+    return check_implementation(data, submission_output)
 
 
 def run_single_test(pool: multiprocessing.Pool, test: TestCase):
@@ -210,7 +199,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     check_copy = _clone_data(data)
     #  first, one obligatory correctness check
     output = custom_kernel(data)
-    good, message = wrap_check_implementation(check_copy, output)
+    good, message = check_implementation(check_copy, output)
     if not good:
         return message
 
diff --git a/problems/pmpp_v2/histogram_py/reference.py b/problems/pmpp_v2/histogram_py/reference.py
index fc573f48..4268cec7 100644
--- a/problems/pmpp_v2/histogram_py/reference.py
+++ b/problems/pmpp_v2/histogram_py/reference.py
@@ -50,7 +50,7 @@ def check_implementation(data, output):
     reasons = verbose_allequal(output, expected)
 
     if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+        return False, "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
 
-    return ''
+    return True, ''
 
diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
index ee6349d1..b5c8a7a7 100644
--- a/problems/pmpp_v2/utils.py
+++ b/problems/pmpp_v2/utils.py
@@ -126,7 +126,7 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print:
     return []
 
 
-def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> Tuple[bool, str]:
     """
     Convenient "default" implementation for tasks' `check_implementation` function.
     """
@@ -134,9 +134,9 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
     reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
 
     if len(reasons) > 0:
-        return "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+        return False, "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
 
-    return ''
+    return True, ''
 
 
 def make_match_reference(reference: callable, **kwargs):

From 7a281f541d0111035588f5e940683e73de64c485 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Thu, 28 Aug 2025 21:34:48 +0200
Subject: [PATCH 100/207] Fix: typo

---
 problems/amd/all2all/reference.py  | 2 +-
 problems/amd/all2all/submission.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
index d5ba04f4..001fed19 100644
--- a/problems/amd/all2all/reference.py
+++ b/problems/amd/all2all/reference.py
@@ -269,7 +269,7 @@ def ref_kernel(data: input_t) -> output_t:
 
     ata.combine(y, rank_data.weights, expert_meta, expert_y, expert_num)
 
-    return y[:, rank_data.num_tokens]
+    return y[:rank_data.num_tokens]
 
 
 def check_implementation(data: input_t, output: output_t):
diff --git a/problems/amd/all2all/submission.py b/problems/amd/all2all/submission.py
index 385bf289..4b8ec666 100644
--- a/problems/amd/all2all/submission.py
+++ b/problems/amd/all2all/submission.py
@@ -205,4 +205,4 @@ def custom_kernel(data: input_t) -> output_t:
 
     ata.combine(y, rank_data.weights, expert_meta, expert_y, expert_num)
 
-    return y[:, rank_data.num_tokens]
+    return y[: rank_data.num_tokens]

From 81e2f45ef1bae28d9ad43108ec6cb35179d04a31 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Thu, 28 Aug 2025 22:11:19 +0200
Subject: [PATCH 101/207] Feat: reenable tests, add basic check_implementation

---
 problems/amd/all2all/reference.py |  7 +++++--
 problems/amd/all2all/task.yml     | 19 +++++++++----------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/problems/amd/all2all/reference.py b/problems/amd/all2all/reference.py
index 001fed19..eda1ed16 100644
--- a/problems/amd/all2all/reference.py
+++ b/problems/amd/all2all/reference.py
@@ -269,8 +269,11 @@ def ref_kernel(data: input_t) -> output_t:
 
     ata.combine(y, rank_data.weights, expert_meta, expert_y, expert_num)
 
-    return y[:rank_data.num_tokens]
+    return y[: rank_data.num_tokens]
 
 
 def check_implementation(data: input_t, output: output_t):
-    return True, ""
+    expected = ref_kernel(data)  # reference result for the same inputs
+    if output.device != expected.device:
+        return False, f"Output device mismatch: {output.device} != {expected.device}"
+    return (True, "") if torch.allclose(output, expected) else (False, f"Output mismatch: {output} != {expected}")  # message only on failure
diff --git a/problems/amd/all2all/task.yml b/problems/amd/all2all/task.yml
index a0af424b..a9c5b863 100644
--- a/problems/amd/all2all/task.yml
+++ b/problems/amd/all2all/task.yml
@@ -46,16 +46,15 @@ description: |
 
 
 tests:
-  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 2, "seed": 6635, "world_size": 8}
-  # - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 4, "seed": 1236, "world_size": 8}
-  # - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 4, "seed": 1234, "world_size": 8}
-  # - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 8, "seed": 542, "world_size": 8}
-  # - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 16, "seed": 347, "world_size": 8}
-  # - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 32, "seed": 51, "world_size": 8}
-  # - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 64, "seed": 175, "world_size": 8}
-  # - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 534, "world_size": 8}
-  # - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 64, "seed": 897, "world_size": 8}
-  # - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 4, "world_size": 8}
+  - {"num_experts": 8, "experts_per_token": 2, "hidden_dim": 6144, "max_num_tokens": 4, "seed": 1236, "world_size": 8}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 4, "seed": 1234, "world_size": 8}
+  - {"num_experts": 64, "experts_per_token": 6, "hidden_dim": 2048, "max_num_tokens": 8, "seed": 542, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 16, "seed": 347, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 4, "hidden_dim": 2880, "max_num_tokens": 32, "seed": 51, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 64, "seed": 175, "world_size": 8}
+  - {"num_experts": 128, "experts_per_token": 8, "hidden_dim": 4096, "max_num_tokens": 128, "seed": 534, "world_size": 8}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 64, "seed": 897, "world_size": 8}
+  - {"num_experts": 256, "experts_per_token": 8, "hidden_dim": 7168, "max_num_tokens": 128, "seed": 4, "world_size": 8}
 
 
 benchmarks:

From fb53db154f400701bdc1ff8684b99cfb5005e6cd Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Fri, 29 Aug 2025 21:02:13 +0200
Subject: [PATCH 102/207] fix import

---
 problems/pmpp_v2/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
index b5c8a7a7..7b387d7c 100644
--- a/problems/pmpp_v2/utils.py
+++ b/problems/pmpp_v2/utils.py
@@ -2,6 +2,7 @@
 import random
 import numpy as np
 import torch
+from typing import Tuple
 
 
 def set_seed(seed=42):

From 1a86955792bfc967efcdfed95e74e530d169204d Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 30 Aug 2025 20:26:58 +0200
Subject: [PATCH 103/207] Fix: replace Tuple with tuple

---
 problems/pmpp_v2/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
index 7b387d7c..7ef4d79c 100644
--- a/problems/pmpp_v2/utils.py
+++ b/problems/pmpp_v2/utils.py
@@ -2,7 +2,6 @@
 import random
 import numpy as np
 import torch
-from typing import Tuple
 
 
 def set_seed(seed=42):
@@ -127,7 +126,7 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print:
     return []
 
 
-def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> Tuple[bool, str]:
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
     """
     Convenient "default" implementation for tasks' `check_implementation` function.
     """

From e5db6640e9588896ef3230e95e79b6cf67af86b0 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 31 Aug 2025 13:23:45 +0300
Subject: [PATCH 104/207] Feat: move to separate leaderboard

---
 problems/amd_distributed.yaml                 |  12 +
 .../all2all/reference.py                      |   0
 .../all2all/submission.py                     |   0
 .../{amd => amd_distributed}/all2all/task.py  |   0
 .../{amd => amd_distributed}/all2all/task.yml |   0
 problems/amd_distributed/eval.py              | 578 ++++++++++++++++++
 6 files changed, 590 insertions(+)
 create mode 100644 problems/amd_distributed.yaml
 rename problems/{amd => amd_distributed}/all2all/reference.py (100%)
 rename problems/{amd => amd_distributed}/all2all/submission.py (100%)
 rename problems/{amd => amd_distributed}/all2all/task.py (100%)
 rename problems/{amd => amd_distributed}/all2all/task.yml (100%)
 create mode 100644 problems/amd_distributed/eval.py

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
new file mode 100644
index 00000000..dece3169
--- /dev/null
+++ b/problems/amd_distributed.yaml
@@ -0,0 +1,12 @@
+name: AMD Developer Challenge 2025 - Distributed Edition
+# when does this end (individual problems might close earlier)
+deadline: "2025-10-14"
+# A description for this particular competition
+description: "AMD Developer Challenge 2025: Distributed Edition"
+# the list of problems
+problems:
+  - directory: amd_distributed/all2all
+    name: all2all
+    deadline: "2025-10-14"
+    gpus:
+      - MI300x8
diff --git a/problems/amd/all2all/reference.py b/problems/amd_distributed/all2all/reference.py
similarity index 100%
rename from problems/amd/all2all/reference.py
rename to problems/amd_distributed/all2all/reference.py
diff --git a/problems/amd/all2all/submission.py b/problems/amd_distributed/all2all/submission.py
similarity index 100%
rename from problems/amd/all2all/submission.py
rename to problems/amd_distributed/all2all/submission.py
diff --git a/problems/amd/all2all/task.py b/problems/amd_distributed/all2all/task.py
similarity index 100%
rename from problems/amd/all2all/task.py
rename to problems/amd_distributed/all2all/task.py
diff --git a/problems/amd/all2all/task.yml b/problems/amd_distributed/all2all/task.yml
similarity index 100%
rename from problems/amd/all2all/task.yml
rename to problems/amd_distributed/all2all/task.yml
diff --git a/problems/amd_distributed/eval.py b/problems/amd_distributed/eval.py
new file mode 100644
index 00000000..c3d20f90
--- /dev/null
+++ b/problems/amd_distributed/eval.py
@@ -0,0 +1,578 @@
+import base64
+import copy
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+
+import torch.cuda
+
+from utils import set_seed, clear_l2_cache
+
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, 'w')
+        os.set_inheritable(fd, False)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a + b) * (a + b + 1) // 2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        exit(113)
+
+    tests = []
+    lines = content.splitlines()
+    match = r"\s*([a-zA-Z_]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
+    for line in lines:
+        parts = line.split(";")
+        case = {}
+        for part in parts:
+            matched = re.match(match, part)
+            if not re.fullmatch(match, part):
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                exit(113)
+            key = matched[1]
+            val = matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate summary statistics from a non-empty list of durations.
+
+    @param durations: A list of durations in nanoseconds (a single sample yields std = err = 0).
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    variance = sum((x - avg) ** 2 for x in durations)
+    # sample standard deviation divides by (runs - 1); with one sample report 0 instead of dividing by zero
+    std = math.sqrt(variance / (runs - 1)) if runs > 1 else 0.0
+    err = std / math.sqrt(runs)
+    return Stats(runs=runs, mean=avg, std=std, err=err, best=float(best),
+                 worst=float(worst))
+
+
+def _clone_data(data, rank: int):
+    """
+    Recursively goes through data and clones all tensors.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x, rank) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x, rank) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v, rank) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        device = f"cuda:{rank}"
+        return data.clone().to(device)
+    else:
+        return data
+
+
+def wrap_check_implementation(data, submission_output):
+    # Old version returned just a single string, new version
+    # returns (bool, str); this function ensures compatibility with old
+    # problem definitions.
+    result = check_implementation(data, submission_output)
+    if isinstance(result, tuple):
+        return result
+    else:
+        return not bool(result), result
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    submission_output = custom_kernel(_clone_data(data, 0))
+    torch.cuda.synchronize()
+    return wrap_check_implementation(data, submission_output)
+
+
+def _run_distributed_test(test: TestCase, rank: int):
+    """
+    Runs a single test case as rank `rank` of an NCCL process group and returns the check result. Do not call directly
+    """
+    from submission import custom_kernel
+    import torch.distributed as dist
+    world_size = test.args["world_size"]
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "12356"
+    dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
+    try:
+        data = generate_input(**test.args, rank=rank)
+        torch.cuda.synchronize()
+        submission_output = custom_kernel(_clone_data(data, rank))  # the kernel gets a clone; `data` is kept for the check
+        torch.cuda.synchronize()
+        return wrap_check_implementation(data, submission_output)
+    finally:
+        dist.destroy_process_group()  # always tear the group down, even if the kernel or the check raised
+
+
+def run_multi_gpu_test(pool: multiprocessing.Pool, test: TestCase, world_size: int) -> tuple[bool, str]:
+    """
+    Runs a single test on `world_size` ranks (one pool task per rank) and merges the per-rank results.
+    """
+    rets = []
+    # world_size is a mandatory argument for multi-gpu tests
+    for i in range(world_size):
+        rets.append(
+            pool.apply_async(
+                _run_distributed_test,
+                args=(test, i),
+            )
+        )
+    # 60 seconds should be more than enough, we want tests to be fast
+    rets = [el.get(60) for el in rets]
+
+    correct = all(ret[0] for ret in rets)  # the test passes only if every rank passed
+    error_messages = str.join("\n", [f"rank {rank} - {ret[1]}" for rank, ret in enumerate(rets) if not ret[0]])
+    return correct, error_messages
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in another process.
+    """
+    world_size = test.args.get("world_size", None)
+    if world_size is None:
+        return pool.apply(_run_single_test, (test,))
+    else:
+        return run_multi_gpu_test(pool, test, world_size)
+
+
+def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if not good:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
+    """
+    Runs one benchmark and returns its Stats, or an error message if a correctness check fails. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data, 0)
+    # first, one obligatory correctness check
+    output = custom_kernel(data)
+    good, message = wrap_check_implementation(check_copy, output)
+    if not good:
+        return message
+
+    # now, do multiple timing runs (correctness is re-verified per run only when `recheck` is set);
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until the summed kernel time exceeds `max_time_ns`,
+    # 2 minutes of wallclock have passed, or the relative error of the mean is below 0.1%.
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data, 0)
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        clear_l2_cache()
+
+        start_event.record()
+        output = custom_kernel(data)
+        end_event.record()
+        torch.cuda.synchronize()
+        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
+
+        if recheck:
+            good, message = check_implementation(check_copy, output)
+            if not good:
+                return message
+
+        del output
+        durations.append(duration)
+
+        if i > 1:
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns or total_bm_duration > 120e9:
+                break
+
+    return calculate_stats(durations)
+
+
+def _run_distributed_benchmark(test: TestCase, rank: int, recheck: bool, max_repeats: int,
+                               max_time_ns: float) -> Stats | Any:
+    """
+    Runs one distributed benchmark as rank `rank`; returns Stats (timed on rank 0 only) or an error message. Do not call directly.
+    """
+    from submission import custom_kernel
+    import torch.distributed as dist
+
+    world_size = test.args["world_size"]
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "12356"
+    dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
+
+    try:
+        durations = []
+        # generate input data once
+        data = generate_input(**test.args, rank=rank)
+        check_copy = _clone_data(data, rank)
+
+        # first, one obligatory correctness check
+        output = custom_kernel(_clone_data(data, rank))
+        good, message = wrap_check_implementation(check_copy, output)
+        if not good:
+            return message
+
+        # now, do multiple timing runs with proper distributed synchronization
+        bm_start_time = time.perf_counter_ns()
+        for i in range(max_repeats):
+            error_message = None
+            if recheck:
+                # ensure we use a different seed for every benchmark
+                if "seed" in test.args:
+                    test.args["seed"] += 13
+
+                data = generate_input(**test.args, rank=rank)
+                check_copy = _clone_data(data, rank)
+
+            # Synchronize all ranks before timing
+            clear_l2_cache()
+            torch.cuda.synchronize()
+            dist.barrier()
+
+            # Use distributed timing - only rank 0 records the overall time
+            if rank == 0:
+                start_time = time.perf_counter_ns()
+
+            # All ranks execute the kernel (NOTE: the input clone happens inside the timed region)
+            output = custom_kernel(_clone_data(data, rank))
+
+            # Synchronize all ranks after kernel execution
+            torch.cuda.synchronize()
+            dist.barrier()
+
+            if rank == 0:
+                end_time = time.perf_counter_ns()
+                duration = end_time - start_time  # Already in nanoseconds
+                durations.append(duration)
+
+            if recheck:
+                good, message = check_implementation(check_copy, output)
+                if not good:
+                    error_message = message
+
+            del output
+            # all_reduce (not reduce): every rank must learn about a failure, otherwise the others hang in the next collective
+            has_error = torch.tensor(1 if error_message is not None else 0, dtype=torch.int32, device=f'cuda:{rank}')
+            dist.all_reduce(has_error)
+            if has_error.item() > 0:
+                return error_message or "correctness check failed on another rank"
+
+            # Only rank 0 checks convergence criteria
+            if rank == 0 and i > 1:
+                total_bm_duration = time.perf_counter_ns() - bm_start_time
+                stats = calculate_stats(durations)
+                # stop if either
+                # a) relative error dips below 0.1%
+                # b) we exceed the total time limit for benchmarking the kernel
+                # c) we exceed 2 minutes of total wallclock time.
+                should_stop = (stats.err / stats.mean < 0.001 or
+                               stats.mean * stats.runs > max_time_ns or
+                               total_bm_duration > 120e9)
+            else:
+                should_stop = False
+
+            # Broadcast stop decision to all ranks
+            stop_tensor = torch.tensor(should_stop, dtype=torch.bool, device=f'cuda:{rank}')
+            dist.broadcast(stop_tensor, 0)
+
+            if stop_tensor.item():
+                break
+
+        # Only rank 0 returns meaningful stats
+        if rank == 0:
+            return calculate_stats(durations)
+        else:
+            # Non-zero ranks return a dummy stats object
+            return Stats(runs=len(durations), mean=0.0, std=0.0, err=0.0, best=0.0, worst=0.0)
+
+    finally:
+        dist.destroy_process_group()
+
+
+def run_multi_gpu_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int,
+                            max_time_ns: float, world_size: int):
+    """
+    Runs a multi-GPU benchmark across all ranks.
+    """
+    rets = []
+    for i in range(world_size):
+        rets.append(
+            pool.apply_async(
+                _run_distributed_benchmark,
+                args=(test, i, recheck, max_repeats, max_time_ns),
+            )
+        )
+
+    # 120 seconds for benchmarking + we run a pre-benchmark test and want to leave some slack
+    rets = [el.get(timeout=180) for el in rets]
+
+    # For multi-GPU benchmarking, only rank 0 has meaningful stats
+    failed_ranks = []
+    rank_0_result = None
+
+    for rank, ret in enumerate(rets):
+        if isinstance(ret, Stats):
+            if rank == 0:
+                rank_0_result = ret
+        else:
+            # ret is an error message
+            failed_ranks.append((rank, ret))
+
+    if failed_ranks:
+        error_messages = str.join("\n", [f"rank {rank} - {msg}" for rank, msg in failed_ranks])
+        return error_messages
+    else:
+        return rank_0_result if rank_0_result else "No stats returned from rank 0"
+
+
+def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int,
+                         max_time_ns: float):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param pool: Process on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+
+    world_size: Optional[int] = test.args.get("world_size", None)
+    if world_size is None:
+        return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+    else:
+        return run_multi_gpu_benchmark(pool, test, recheck, max_repeats, max_time_ns, world_size)
+
+
+def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Process on which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    # warm up
+    run_single_benchmark(pool, tests[0], False, 100, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def run_single_profile(test: TestCase) -> str:
+    """
+    Profiles one run of the submission on a single test case and returns the profiler table (top 20 rows by self CUDA time).
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, record_function, ProfilerActivity
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        submission_output = custom_kernel(_clone_data(data, 0))
+        torch.cuda.synchronize()  # wait for the kernel to finish inside the profiled region
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test)
+        logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"))
+    logger.log("check", "pass")
+    return 0
+
+
+def main():
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    n_gpus = int(os.getenv("POPCORN_GPUS", "1"))
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+    tests = get_test_cases(sys.argv[2], seed)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+        mp_context = multiprocessing.get_context('spawn')
+        with mp_context.Pool(n_gpus) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                # warmup
+                run_single_benchmark(pool, tests[0], False, 100, 1e7)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 100, 30e9)
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name))
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(f"benchmark.{i}.error", str(result))  # TODO: Make sure result implements __str__?
+                        break
+
+                logger.log("check", "pass" if passed else "fail")
+            elif mode == "profile":
+                run_profiling(logger, tests)
+            else:
+                # invalid mode
+                return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 9230c02e9c0372b1d49d0930d91510c60bac3be4 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 31 Aug 2025 21:32:52 +0300
Subject: [PATCH 105/207] Fix: add utils

---
 problems/amd_distributed/utils.py | 143 ++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 problems/amd_distributed/utils.py

diff --git a/problems/amd_distributed/utils.py b/problems/amd_distributed/utils.py
new file mode 100644
index 00000000..73551022
--- /dev/null
+++ b/problems/amd_distributed/utils.py
@@ -0,0 +1,143 @@
+import random
+from typing import Tuple
+
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> Tuple[bool, list[str]]:
+    """
+    Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return False, ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32))
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return False, mismatch_details
+
+    return True, [f"Maximum error: {torch.max(diff)}"]
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int = 5) -> Tuple[bool, list[str]]:
+    """
+    Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         (True, []) if tensors are equal, otherwise (False, list of mismatch detail strings)
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return False, mismatch_details
+
+    return True, []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    good, reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return good, "\\n".join(reasons)
+
+    return good, ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped

From 0e549208105edfc2ab54a6c5fbdd021c272c7040 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 31 Aug 2025 21:40:56 +0300
Subject: [PATCH 106/207] Fix: add timezone

---
 problems/amd_distributed.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
index dece3169..4f9afbc9 100644
--- a/problems/amd_distributed.yaml
+++ b/problems/amd_distributed.yaml
@@ -1,12 +1,11 @@
 name: AMD Developer Challenge 2025 - Distributed Edition
 # when does this end (individual problems might close earlier)
-deadline: "2025-10-14"
+deadline: "2025-10-14T23:59:59Z"
 # A description for this particular competition
 description: "AMD Developer Challenge 2025: Distributed Edition"
 # the list of problems
 problems:
   - directory: amd_distributed/all2all
-    name: all2all
-    deadline: "2025-10-14"
+    deadline: "2025-10-14T23:59:59Z"
     gpus:
       - MI300x8

From ea23b83d041fcecf0ab0ffa7c908c974866f689d Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Sun, 31 Aug 2025 21:57:14 +0300
Subject: [PATCH 107/207] Fix: add amd-all2all name

---
 problems/amd_distributed.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
index 4f9afbc9..edc4c568 100644
--- a/problems/amd_distributed.yaml
+++ b/problems/amd_distributed.yaml
@@ -6,6 +6,7 @@ description: "AMD Developer Challenge 2025: Distributed Edition"
 # the list of problems
 problems:
   - directory: amd_distributed/all2all
+    name: amd-all2all
     deadline: "2025-10-14T23:59:59Z"
     gpus:
       - MI300x8

From 8856396cd3b626a32d49768bf5c816f729de1f6e Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Sun, 31 Aug 2025 21:58:02 +0300
Subject: [PATCH 108/207] Change deadline format in amd_distributed.yaml

Updated deadline format for the competition and problems.
---
 problems/amd_distributed.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
index edc4c568..86fed95a 100644
--- a/problems/amd_distributed.yaml
+++ b/problems/amd_distributed.yaml
@@ -1,12 +1,12 @@
 name: AMD Developer Challenge 2025 - Distributed Edition
 # when does this end (individual problems might close earlier)
-deadline: "2025-10-14T23:59:59Z"
+deadline: "2025-10-14"
 # A description for this particular competition
 description: "AMD Developer Challenge 2025: Distributed Edition"
 # the list of problems
 problems:
   - directory: amd_distributed/all2all
     name: amd-all2all
-    deadline: "2025-10-14T23:59:59Z"
+    deadline: "2025-10-14"
     gpus:
       - MI300x8

From 6d188a6380b688548136cb6c6c803ff22aa02b93 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 31 Aug 2025 22:04:21 +0300
Subject: [PATCH 109/207] Fix: remove kernelbot directive

---
 problems/amd_distributed/all2all/submission.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/problems/amd_distributed/all2all/submission.py b/problems/amd_distributed/all2all/submission.py
index 4b8ec666..d5506406 100644
--- a/problems/amd_distributed/all2all/submission.py
+++ b/problems/amd_distributed/all2all/submission.py
@@ -1,5 +1,3 @@
-#!POPCORN leaderboard all2all-dev
-
 import torch
 import torch.distributed as dist
 from task import input_t, output_t

From 171e74be3b42a0bf1aedac98332eaf0ee1647a06 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sun, 31 Aug 2025 12:05:16 -0700
Subject: [PATCH 110/207] Add AMD $100K distributed kernel competition to
 README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 89413b2c..f362097e 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@ You can see what's going on [gpumode.com](https://www.gpumode.com/)
 1. [PMPP practice problems](https://gpu-mode.github.io/discord-cluster-manager/docs/active#practice-round-leaderboard): Starting on Sunday Feb 21, 2025.
 2. [AMD $100K kernel competition](problems/amd)
 3. [BioML kernels](problems/bioml)
+4. [AMD $100K distributed kernel competition](problems/amd_distributed)
 
 ## Making a Leaderboard Submission
 

From 53547fc2aaf46490155f4b752aa62482f4dd22e9 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 31 Aug 2025 22:05:36 +0300
Subject: [PATCH 111/207] Fix: add correct utils

---
 problems/amd_distributed/utils.py | 55 ++++++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/problems/amd_distributed/utils.py b/problems/amd_distributed/utils.py
index 73551022..396c6bfe 100644
--- a/problems/amd_distributed/utils.py
+++ b/problems/amd_distributed/utils.py
@@ -29,11 +29,7 @@ def get_device(use_cuda: bool = True) -> torch.device:
 # Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
 @torch.no_grad()
 def verbose_allclose(
-        received: torch.Tensor,
-        expected: torch.Tensor,
-        rtol=1e-05,
-        atol=1e-08,
-        max_print=5
+    received: torch.Tensor, expected: torch.Tensor, rtol=1e-05, atol=1e-08, max_print=5
 ) -> Tuple[bool, list[str]]:
     """
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
@@ -62,9 +58,13 @@ def verbose_allclose(
     nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
 
     # Find +inf mismatched elements
-    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    posinf_mismatched = torch.logical_xor(
+        torch.isposinf(received), torch.isposinf(expected)
+    )
     # Find -inf mismatched elements
-    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+    neginf_mismatched = torch.logical_xor(
+        torch.isneginf(received), torch.isneginf(expected)
+    )
 
     # Find all mismatched elements
     mismatched = torch.logical_or(
@@ -85,14 +85,18 @@ def verbose_allclose(
             i = tuple(index.tolist())
             mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+            mismatch_details.append(
+                f"... and {num_mismatched - max_print} more mismatched elements."
+            )
         return False, mismatch_details
 
     return True, [f"Maximum error: {torch.max(diff)}"]
 
 
 @torch.no_grad()
-def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int = 5) -> Tuple[bool, list[str]]:
+def verbose_allequal(
+    received: torch.Tensor, expected: torch.Tensor, max_print: int = 5
+) -> Tuple[bool, list[str]]:
     """
     Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
 
@@ -118,7 +122,9 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print:
             i = tuple(index.tolist())
             mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+            mismatch_details.append(
+                f"... and {num_mismatched - max_print} more mismatched elements."
+            )
         return False, mismatch_details
 
     return True, []
@@ -134,10 +140,37 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08):
     if len(reasons) > 0:
         return good, "\\n".join(reasons)
 
-    return good, ''
+    return good, ""
 
 
 def make_match_reference(reference: callable, **kwargs):
     def wrapped(data, output):
         return match_reference(data, output, reference=reference, **kwargs)
+
     return wrapped
+
+
+class DisableCuDNNTF32:
+    def __init__(self):
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        pass
+
+    def __enter__(self):
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+
+
+def clear_l2_cache():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # write to it so the allocated memory is actually touched
+    dummy.fill_(42)
+    del dummy

From fab1acbc584be539d40d0c1bd14f2a5d5ee7231a Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 1 Sep 2025 02:18:10 +0000
Subject: [PATCH 112/207] fix max_recv

---
 problems/amd_distributed/all2all/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/reference.py b/problems/amd_distributed/all2all/reference.py
index eda1ed16..8bfd0e74 100644
--- a/problems/amd_distributed/all2all/reference.py
+++ b/problems/amd_distributed/all2all/reference.py
@@ -62,7 +62,7 @@ def __init__(self, cfg: MoEConfig, rank: int, world_size: int):
         # num experts per rank
         self.num_local_experts = cfg.num_experts // world_size
         # max recv tokens per rank
-        self.max_recv = cfg.max_num_tokens * cfg.experts_per_token
+        self.max_recv = cfg.max_num_tokens * world_size
 
     # ---------- dispatch ----------
     def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):

From 5a2a062c7d3e65139cd6f9e44d666064f73dee20 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 1 Sep 2025 03:18:31 +0000
Subject: [PATCH 113/207] fix max_recv in submission file

---
 problems/amd_distributed/all2all/submission.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/submission.py b/problems/amd_distributed/all2all/submission.py
index d5506406..a036f7a8 100644
--- a/problems/amd_distributed/all2all/submission.py
+++ b/problems/amd_distributed/all2all/submission.py
@@ -14,7 +14,7 @@ def __init__(self, cfg, rank: int, world_size: int):
         # num experts per rank
         self.num_local_experts = cfg.num_experts // world_size
         # max recv tokens per rank
-        self.max_recv = cfg.max_num_tokens * cfg.experts_per_token
+        self.max_recv = cfg.max_num_tokens * world_size
 
     # ---------- dispatch ----------
     def dispatch(self, dp_x: torch.Tensor, indices: torch.Tensor):

From fc547c6a487c5c279ee197060e5d58b7c1ce0670 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Mon, 1 Sep 2025 22:29:21 +0300
Subject: [PATCH 114/207] Fix: all2all timeouts + check_implementation

---
 problems/amd_distributed/all2all/reference.py | 6 +++++-
 problems/amd_distributed/all2all/task.yml     | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/reference.py b/problems/amd_distributed/all2all/reference.py
index 8bfd0e74..26c4ac7f 100644
--- a/problems/amd_distributed/all2all/reference.py
+++ b/problems/amd_distributed/all2all/reference.py
@@ -276,4 +276,8 @@ def check_implementation(data: input_t, output: output_t):
     expected = ref_kernel(data)
     if output.device != expected.device:
         return False, f"Output device mismatch: {output.device} != {expected.device}"
-    return torch.allclose(output, expected), f"Output mismatch: {output} != {expected}"
+    res = torch.allclose(output, expected, rtol=1e-2, atol=5e-3)
+    if not res:
+        return False, f"Output values mismatch, {output} != {expected}"
+
+    return True, ""
diff --git a/problems/amd_distributed/all2all/task.yml b/problems/amd_distributed/all2all/task.yml
index a9c5b863..688f62d2 100644
--- a/problems/amd_distributed/all2all/task.yml
+++ b/problems/amd_distributed/all2all/task.yml
@@ -66,3 +66,4 @@ benchmarks:
 
 
 ranking_by: "geom"
+ranked_timeout: 420

From dfced0bc7bfe8631df4f77f1293023a7986aca5a Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Tue, 2 Sep 2025 06:21:35 -0500
Subject: [PATCH 115/207] add reference impl of gemm-reducescatter

---
 problems/amd_distributed/gemm-rs/reference.py | 63 +++++++++++++++++++
 .../amd_distributed/gemm-rs/submission.py     | 33 ++++++++++
 problems/amd_distributed/gemm-rs/task.py      | 13 ++++
 problems/amd_distributed/gemm-rs/task.yml     | 61 ++++++++++++++++++
 4 files changed, 170 insertions(+)
 create mode 100644 problems/amd_distributed/gemm-rs/reference.py
 create mode 100644 problems/amd_distributed/gemm-rs/submission.py
 create mode 100644 problems/amd_distributed/gemm-rs/task.py
 create mode 100644 problems/amd_distributed/gemm-rs/task.yml

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
new file mode 100644
index 00000000..5ff5904d
--- /dev/null
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -0,0 +1,63 @@
+from utils import make_match_reference
+from task import input_t, output_t
+import torch
+
+
+def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int) -> input_t:
+    """
+    Generate random input and weights for the AG-GEMM operation.
+
+    Returns:
+        Tuple of (
+            input: torch.Tensor,  # [M, local_K]
+            weight: torch.Tensor,  # [N, local_K]
+            transposed_weight: bool,  # Whether the weight is transposed
+            bias: Optional[torch.Tensor],  # [N] or None
+        )
+    """
+    gen = torch.Generator(device='cuda')
+    gen.manual_seed(seed + RANK)
+
+    assert m % world_size == 0, "m must be divisible by world_size"
+    assert k % world_size == 0, "k must be divisible by world_size"
+    local_k = k // world_size
+
+    # Generate random inputs and weights
+    input = (torch.rand((m, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
+    weight = (torch.rand((n, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
+
+    return (input, weight, False, None)
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference kernel for Gemm-ReduceScatter operation.
+
+    Args:
+        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, transposed_weight: bool,
+                bias: Optional[torch.Tensor])
+            - input: Local input tensor of shape [M, local_K].
+            - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
+            - transposed_weight: Whether the weight is transposed.
+            - bias: Optional bias tensor of shape [N] or None.
+    Returns:
+        Tuple containing:
+            - output: Resulting tensor of shape [M // world_size, N].
+    """
+    input, weight, transposed_weight, bias = data
+    M, local_K = input.shape
+    if not transposed_weight:
+        weight = weight.T
+    N = weight.shape[1]
+    world_size = torch.distributed.get_world_size()
+    # matmul
+    output = torch.matmul(input, weight)
+    if bias is not None:
+        output = output + bias
+    # reduce scatter
+    rs_output = torch.empty((M // world_size, N), dtype=output.dtype, device=input.device)
+    torch.distributed.reduce_scatter_tensor(rs_output, output)
+    return rs_output
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2)
diff --git a/problems/amd_distributed/gemm-rs/submission.py b/problems/amd_distributed/gemm-rs/submission.py
new file mode 100644
index 00000000..dce77b4a
--- /dev/null
+++ b/problems/amd_distributed/gemm-rs/submission.py
@@ -0,0 +1,33 @@
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference kernel for Gemm-ReduceScatter operation.
+
+    Args:
+        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, transposed_weight: bool,
+                bias: Optional[torch.Tensor])
+            - input: Local input tensor of shape [M, local_K].
+            - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
+            - transposed_weight: Whether the weight is transposed.
+            - bias: Optional bias tensor of shape [N] or None.
+    Returns:
+        Tuple containing:
+            - output: Resulting tensor of shape [M // world_size, N].
+    """
+    input, weight, transposed_weight, bias = data
+    M, local_K = input.shape
+    if not transposed_weight:
+        weight = weight.T
+    N = weight.shape[1]
+    world_size = torch.distributed.get_world_size()
+    # matmul
+    output = torch.matmul(input, weight)
+    if bias is not None:
+        output = output + bias
+    # reduce scatter
+    rs_output = torch.empty((M // world_size, N), dtype=output.dtype, device=input.device)
+    torch.distributed.reduce_scatter_tensor(rs_output, output)
+    return rs_output
diff --git a/problems/amd_distributed/gemm-rs/task.py b/problems/amd_distributed/gemm-rs/task.py
new file mode 100644
index 00000000..9be0c81b
--- /dev/null
+++ b/problems/amd_distributed/gemm-rs/task.py
@@ -0,0 +1,13 @@
+from typing import TypedDict, TypeVar, Tuple, Dict
+import torch
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, Dict[str, torch.Tensor], Dict])
+output_t = TypeVar("output_t", bound=Tuple[torch.Tensor, Dict])
+
+
+class TestSpec(TypedDict):
+    world_size: int
+    m: int
+    n: int
+    k: int
+    seed: int
\ No newline at end of file
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
new file mode 100644
index 00000000..c8b6bb8b
--- /dev/null
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -0,0 +1,61 @@
+# name: gemm-rs
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a Gemm-ReduceScatter kernel for efficient transformer models
+  on a single MI300X device.
+
+  ReduceScatter-Gemm (RS-Gemm) is a technique that combines the ReduceScatter
+  communication pattern with General Matrix Multiplication (GEMM) to optimize
+  the performance of transformer models on GPUs. It is particularly useful for
+  handling large models that exceed the memory capacity of a single GPU by
+  distributing the model across multiple GPUs and efficiently scattering the
+  results of matrix multiplications.
+
+  Your task:
+  - Implement the Gemm-RS kernel to perform matrix multiplications in a
+    distributed manner, leveraging the ReduceScatter operation to distribute
+    data across multiple GPUs.
+  - Ensure that the implementation is optimized for the MI300X architecture,
+    taking advantage of its specific hardware features for maximum performance.
+
+  Input:
+  - `data`: Tuple of (input: torch.Tensor, weights: torch.Tensor, transposed_weight: bool,
+            bias: Optional, None or torch.Tensor, TP_GROUP: group object)
+    - input: Local input tensor of shape [M, local_K].
+    - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
+    - transposed_weight: Whether the weight is transposed.
+    - bias: bias tensor of shape [N] or None.
+    - TP_GROUP: Process group for tensor parallelism
+
+  Output:
+  - Tuple containing:
+    - output: Resulting tensor of shape [M // world_size, N]
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "submission.py"
+
+ranking_by: "geom"
+
+tests:
+  - {"world_size": 8, "m": 8192, "n": 3584, "k": 14336, "seed": 42}
+  - {"world_size": 8, "m": 8192, "n": 4096, "k": 12288, "seed": 6635}
+  - {"world_size": 8, "m": 8192, "n": 4608, "k": 36864, "seed": 4422}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 28672, "seed": 1536}
+
+
+benchmarks:
+  - {"world_size": 8, "m": 8192, "n": 4096, "k": 14336, "seed": 7168}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 29568, "seed": 1024}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 30720, "seed": 2035}

From eff169759596326890b23d4625cb6d5923266e55 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Wed, 3 Sep 2025 02:02:32 +0000
Subject: [PATCH 116/207] [enhance] change seed and moe const different for
 each rank for debug and check

---
 problems/amd_distributed/all2all/reference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/amd_distributed/all2all/reference.py b/problems/amd_distributed/all2all/reference.py
index 26c4ac7f..fbeb0685 100644
--- a/problems/amd_distributed/all2all/reference.py
+++ b/problems/amd_distributed/all2all/reference.py
@@ -239,7 +239,7 @@ def generate_input(
 ):
     device = torch.device(f"cuda:{rank}")
     gen = torch.Generator(device=device)
-    gen.manual_seed(seed)
+    gen.manual_seed(seed + rank)
 
     cfg = MoEConfig(
         num_experts=num_experts,
@@ -259,7 +259,7 @@ def ref_kernel(data: input_t) -> output_t:
     ata = PyTorchAllToAll(cfg, rank, world_size)
 
     expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x, rank_data.indices)
-    expert_y = expert_x.to(cfg.out_dtype) * 2
+    expert_y = expert_x.to(cfg.out_dtype) * (1 + rank)
     y = torch.zeros(
         cfg.max_num_tokens,
         cfg.hidden_dim,

From ecd9b59facd61b4cac5c07acc1003a572aad2ee5 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Wed, 3 Sep 2025 03:03:24 +0000
Subject: [PATCH 117/207] apply same change to submission.py

---
 problems/amd_distributed/all2all/submission.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/submission.py b/problems/amd_distributed/all2all/submission.py
index a036f7a8..5eddcf68 100644
--- a/problems/amd_distributed/all2all/submission.py
+++ b/problems/amd_distributed/all2all/submission.py
@@ -193,7 +193,7 @@ def custom_kernel(data: input_t) -> output_t:
     ata = PyTorchAllToAll(cfg, rank, world_size)
 
     expert_num, expert_x, expert_meta = ata.dispatch(rank_data.x, rank_data.indices)
-    expert_y = expert_x.to(cfg.out_dtype) * 2
+    expert_y = expert_x.to(cfg.out_dtype) * (1 + rank)
     y = torch.zeros(
         cfg.max_num_tokens,
         cfg.hidden_dim,

From 35eb4e12cc4990200091f9675cbf786b6536fd2d Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Mon, 8 Sep 2025 00:55:04 -0500
Subject: [PATCH 118/207] typo fix

---
 problems/amd_distributed/gemm-rs/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index 5ff5904d..52f4a3af 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -5,7 +5,7 @@
 
 def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int) -> input_t:
     """
-    Generate random input and weights for the AG-GEMM operation.
+    Generate random input and weights for the Gemm-ReduceScatter operation.
 
     Returns:
         Tuple of (

From 8e35bf1e9d1140380e41d48e53f854c95455a404 Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Mon, 8 Sep 2025 01:55:56 -0500
Subject: [PATCH 119/207] remove transposed_weight bool variable

---
 problems/amd_distributed/gemm-rs/reference.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index 52f4a3af..5a121563 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -11,7 +11,6 @@ def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int
         Tuple of (
             input: torch.Tensor,  # [M, local_K]
             weight: torch.Tensor,  # [N, local_K]
-            transposed_weight: bool,  # Whether the weight is transposed
             bias: Optional[torch.Tensor],  # [N] or None
         )
     """
@@ -26,7 +25,7 @@ def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int
     input = (torch.rand((m, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
     weight = (torch.rand((n, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
 
-    return (input, weight, False, None)
+    return (input, weight, None)
 
 
 def ref_kernel(data: input_t) -> output_t:
@@ -34,24 +33,20 @@ def ref_kernel(data: input_t) -> output_t:
     Reference kernel for Gemm-ReduceScatter operation.
 
     Args:
-        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, transposed_weight: bool,
-                bias: Optional[torch.Tensor])
+        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor])
             - input: Local input tensor of shape [M, local_K].
-            - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
-            - transposed_weight: Whether the weight is transposed.
+            - weight: Weight tensor of shape [N, local_K].
             - bias: Optional bias tensor of shape [N] or None.
     Returns:
         Tuple containing:
             - output: Resulting tensor of shape [M // world_size, N].
     """
-    input, weight, transposed_weight, bias = data
+    input, weight, bias = data
     M, local_K = input.shape
-    if not transposed_weight:
-        weight = weight.T
-    N = weight.shape[1]
+    N = weight.shape[0]
     world_size = torch.distributed.get_world_size()
     # matmul
-    output = torch.matmul(input, weight)
+    output = torch.matmul(input, weight.T)
     if bias is not None:
         output = output + bias
     # reduce scatter

From 11303bea37bcf061564a9a37e24e75bb66a96d00 Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Mon, 8 Sep 2025 02:10:22 -0500
Subject: [PATCH 120/207] update custom_kernel and yaml doc

---
 problems/amd_distributed/gemm-rs/submission.py | 14 +++++---------
 problems/amd_distributed/gemm-rs/task.yml      | 13 +++++--------
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/submission.py b/problems/amd_distributed/gemm-rs/submission.py
index dce77b4a..4212d5ad 100644
--- a/problems/amd_distributed/gemm-rs/submission.py
+++ b/problems/amd_distributed/gemm-rs/submission.py
@@ -7,24 +7,20 @@ def custom_kernel(data: input_t) -> output_t:
     Reference kernel for Gemm-ReduceScatter operation.
 
     Args:
-        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, transposed_weight: bool,
-                bias: Optional[torch.Tensor])
+        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor])
             - input: Local input tensor of shape [M, local_K].
-            - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
-            - transposed_weight: Whether the weight is transposed.
+            - weight: Weight tensor of shape [N, local_K].
             - bias: Optional bias tensor of shape [N] or None.
     Returns:
         Tuple containing:
             - output: Resulting tensor of shape [M // world_size, N].
     """
-    input, weight, transposed_weight, bias = data
+    input, weight, bias = data
     M, local_K = input.shape
-    if not transposed_weight:
-        weight = weight.T
-    N = weight.shape[1]
+    N = weight.shape[0]
     world_size = torch.distributed.get_world_size()
     # matmul
-    output = torch.matmul(input, weight)
+    output = torch.matmul(input, weight.T)
     if bias is not None:
         output = output + bias
     # reduce scatter
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
index c8b6bb8b..4d36fdc8 100644
--- a/problems/amd_distributed/gemm-rs/task.yml
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -10,10 +10,9 @@ files:
 lang: "py"
 
 description: |
-  Implement a Gemm-ReduceScatter kernel for efficient transformer models
-  on a single MI300X device.
+  Implement a Gemm-ReduceScatter kernel on a single MI300X node.
 
-  ReduceScatter-Gemm (RS-Gemm) is a technique that combines the ReduceScatter
+  Gemm-ReduceScatter is a technique that combines the ReduceScatter
   communication pattern with General Matrix Multiplication (GEMM) to optimize
   the performance of transformer models on GPUs. It is particularly useful for
   handling large models that exceed the memory capacity of a single GPU by
@@ -28,13 +27,11 @@ description: |
     taking advantage of its specific hardware features for maximum performance.
 
   Input:
-  - `data`: Tuple of (input: torch.Tensor, weights: torch.Tensor, transposed_weight: bool,
-            bias: Optional, None or torch.Tensor, TP_GROUP: group object)
+  - `data`: Tuple of (input: torch.Tensor, weights: torch.Tensor,
+            bias: Optional, None or torch.Tensor)
     - input: Local input tensor of shape [M, local_K].
-    - weight: Weight tensor of shape [N, local_K] or [local_K, N] if transed_weight is True.
-    - transposed_weight: Whether the weight is transposed.
+    - weight: Weight tensor of shape [N, local_K].
     - bias: bias tensor of shape [N] or None.
-    - TP_GROUP: Process group for tensor parallelism
 
   Output:
   - Tuple containing:

From 38225f614dbf2efd9619c70d9055784c5469a0fe Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Mon, 8 Sep 2025 05:34:36 -0500
Subject: [PATCH 121/207] add has_bias in test spec

---
 problems/amd_distributed/gemm-rs/reference.py |  9 +++++++--
 problems/amd_distributed/gemm-rs/task.py      |  1 +
 problems/amd_distributed/gemm-rs/task.yml     | 14 +++++++-------
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index 5a121563..03e4e5d6 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -3,7 +3,7 @@
 import torch
 
 
-def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int) -> input_t:
+def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, has_bias: bool, seed: int) -> input_t:
     """
     Generate random input and weights for the Gemm-ReduceScatter operation.
 
@@ -25,7 +25,12 @@ def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, seed: int
     input = (torch.rand((m, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
     weight = (torch.rand((n, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
 
-    return (input, weight, None)
+    bias = None
+    if has_bias:
+        gen.manual_seed(seed)
+        bias = (torch.rand((n,), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
+
+    return (input, weight, bias)
 
 
 def ref_kernel(data: input_t) -> output_t:
diff --git a/problems/amd_distributed/gemm-rs/task.py b/problems/amd_distributed/gemm-rs/task.py
index 9be0c81b..1245626c 100644
--- a/problems/amd_distributed/gemm-rs/task.py
+++ b/problems/amd_distributed/gemm-rs/task.py
@@ -10,4 +10,5 @@ class TestSpec(TypedDict):
     m: int
     n: int
     k: int
+    has_bias: bool
     seed: int
\ No newline at end of file
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
index 4d36fdc8..5988132a 100644
--- a/problems/amd_distributed/gemm-rs/task.yml
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -46,13 +46,13 @@ templates:
 ranking_by: "geom"
 
 tests:
-  - {"world_size": 8, "m": 8192, "n": 3584, "k": 14336, "seed": 42}
-  - {"world_size": 8, "m": 8192, "n": 4096, "k": 12288, "seed": 6635}
-  - {"world_size": 8, "m": 8192, "n": 4608, "k": 36864, "seed": 4422}
-  - {"world_size": 8, "m": 8192, "n": 8192, "k": 28672, "seed": 1536}
+  - {"world_size": 8, "m": 8192, "n": 3584, "k": 14336, "has_bias": True, "seed": 42}
+  - {"world_size": 8, "m": 8192, "n": 4096, "k": 12288, "has_bias": False, "seed": 6635}
+  - {"world_size": 8, "m": 8192, "n": 4608, "k": 36864, "has_bias": True, "seed": 4422}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 28672, "has_bias": False, "seed": 1536}
 
 
 benchmarks:
-  - {"world_size": 8, "m": 8192, "n": 4096, "k": 14336, "seed": 7168}
-  - {"world_size": 8, "m": 8192, "n": 8192, "k": 29568, "seed": 1024}
-  - {"world_size": 8, "m": 8192, "n": 8192, "k": 30720, "seed": 2035}
+  - {"world_size": 8, "m": 8192, "n": 4096, "k": 14336, "has_bias": True, "seed": 7168}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 29568, "has_bias": False, "seed": 1024}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 30720, "has_bias": True, "seed": 2035}

From ed0535726bb49a9a5a3442c13df6b5758d9763bb Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 8 Sep 2025 18:18:23 -0700
Subject: [PATCH 122/207] clarify rules

---
 problems/amd_distributed/all2all/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/task.yml b/problems/amd_distributed/all2all/task.yml
index 688f62d2..f35046f8 100644
--- a/problems/amd_distributed/all2all/task.yml
+++ b/problems/amd_distributed/all2all/task.yml
@@ -15,7 +15,7 @@ config:
 
 description: |
   
-  You will implement a custom single node all2all kernel optimized for 8xMI300.
+  You are expected to implement dispatch and simulated moe and combine kernels with intra node communication, which jointly made a custom single node all2all kernel optimized for 8xMI300
   You will be given MoEConfig, which is the main hypeparameter, including numbers of experts, experts per token, hidden dim, max number tokens each dp rank and input output dtype
 
   To be explicit, you will be given data of all ranks, naming all_rank_data.

From 54ff613410b5f6374accb811dfc7144d16c7d497 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Mon, 8 Sep 2025 18:26:35 -0700
Subject: [PATCH 123/207] clarify rules

---
 problems/amd_distributed/all2all/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd_distributed/all2all/task.yml b/problems/amd_distributed/all2all/task.yml
index f35046f8..2e0f68f7 100644
--- a/problems/amd_distributed/all2all/task.yml
+++ b/problems/amd_distributed/all2all/task.yml
@@ -15,7 +15,7 @@ config:
 
 description: |
   
-  You are expected to implement dispatch and simulated moe and combine kernels with intra node communication, which jointly made a custom single node all2all kernel optimized for 8xMI300
+  You are expected to implement dispatch and simulated moe and combine kernels with intra node communication, refering to reference.py, which jointly made a custom single node all2all kernel optimized for 8xMI300.
   You will be given MoEConfig, which is the main hypeparameter, including numbers of experts, experts per token, hidden dim, max number tokens each dp rank and input output dtype
 
   To be explicit, you will be given data of all ranks, naming all_rank_data.

From 4c5405d6c67fc62b372f10a5172b6de79fde2646 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 9 Sep 2025 17:15:28 -0700
Subject: [PATCH 124/207] Update deadline for amd-identity problem

---
 problems/amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/amd.yaml b/problems/amd.yaml
index ad721d6c..d522b6bd 100644
--- a/problems/amd.yaml
+++ b/problems/amd.yaml
@@ -7,7 +7,7 @@ description: "AMD Developer Challenge 2025: Inference Sprint"
 problems:
   - directory: amd/identity
     name: amd-identity
-    deadline: "2025-09-02"
+    deadline: "2025-12-30"
     gpus:
       - MI300
   - directory: amd/fp8-mm

From 5afcc47169d44e3d49024622d353c969cd3d27a5 Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Tue, 9 Sep 2025 21:31:17 -0500
Subject: [PATCH 125/207] update test and benchmark shapes

---
 problems/amd_distributed/gemm-rs/task.yml | 33 ++++++++++++++++++++---
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
index 5988132a..360ff1e7 100644
--- a/problems/amd_distributed/gemm-rs/task.yml
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -37,6 +37,21 @@ description: |
   - Tuple containing:
     - output: Resulting tensor of shape [M // world_size, N]
 
+  The ranking criterion is the geometric mean of the benchmark results.
+
+  For the grand prize, your kernel will be evaluated against the speed of light
+  analysis and AMD implementations; the solution closest to the speed of light
+  and AMD implementations will be awarded the grand prize.
+  ```
+  The speed of light analysis is:
+   m      n      k      has_bias      time[us]
+   64     7168   18432  False         6.46
+   512    4096   12288  True          8.19
+   2048   2880   2880   True          23.04
+   4096   4096   4096   False         65.54
+   8192   4096   14336  True          131.07
+   8192   8192   29568  False         379.43
+  ```
 config:
   main: "eval.py"
 
@@ -46,13 +61,23 @@ templates:
 ranking_by: "geom"
 
 tests:
-  - {"world_size": 8, "m": 8192, "n": 3584, "k": 14336, "has_bias": True, "seed": 42}
-  - {"world_size": 8, "m": 8192, "n": 4096, "k": 12288, "has_bias": False, "seed": 6635}
+  - {"world_size": 8, "m": 64, "n": 2880, "k": 2880, "has_bias": True, "seed": 2035}
+  - {"world_size": 8, "m": 64, "n": 3584, "k": 14336, "has_bias": True, "seed": 13}
+  - {"world_size": 8, "m": 512, "n": 3584, "k": 14336, "has_bias": True, "seed": 4297}
+  - {"world_size": 8, "m": 512, "n": 4608, "k": 36864, "has_bias": False, "seed": 1597}
+  - {"world_size": 8, "m": 2048, "n": 4096, "k": 7168, "has_bias": False, "seed": 716}
+  - {"world_size": 8, "m": 2048, "n": 8192, "k": 30720, "has_bias": False, "seed": 20201}
+  - {"world_size": 8, "m": 4096, "n": 2880, "k": 2880, "has_bias": True, "seed": 136}
+  - {"world_size": 8, "m": 4096, "n": 8192, "k": 2048, "has_bias": True, "seed": 138}
+  - {"world_size": 8, "m": 8192, "n": 3584, "k": 14336, "has_bias": True, "seed": 748}
   - {"world_size": 8, "m": 8192, "n": 4608, "k": 36864, "has_bias": True, "seed": 4422}
   - {"world_size": 8, "m": 8192, "n": 8192, "k": 28672, "has_bias": False, "seed": 1536}
 
 
 benchmarks:
+  - {"world_size": 8, "m": 64, "n": 7168, "k": 18432, "has_bias": False, "seed": 1234}
+  - {"world_size": 8, "m": 512, "n": 4096, "k": 12288, "has_bias": True, "seed": 663}
+  - {"world_size": 8, "m": 2048, "n": 2880, "k": 2880, "has_bias": True, "seed": 166}
+  - {"world_size": 8, "m": 4096, "n": 4096, "k": 4096, "has_bias": False, "seed": 1371}
   - {"world_size": 8, "m": 8192, "n": 4096, "k": 14336, "has_bias": True, "seed": 7168}
-  - {"world_size": 8, "m": 8192, "n": 8192, "k": 29568, "has_bias": False, "seed": 1024}
-  - {"world_size": 8, "m": 8192, "n": 8192, "k": 30720, "has_bias": True, "seed": 2035}
+  - {"world_size": 8, "m": 8192, "n": 8192, "k": 29568, "has_bias": False, "seed": 42}

From 10015ea46d41d0ffd55056905566f13645e8063f Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Wed, 10 Sep 2025 21:09:37 -0500
Subject: [PATCH 126/207] change dtype

---
 problems/amd_distributed/gemm-rs/reference.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index 03e4e5d6..6cb363d4 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -22,13 +22,13 @@ def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, has_bias:
     local_k = k // world_size
 
     # Generate random inputs and weights
-    input = (torch.rand((m, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
-    weight = (torch.rand((n, local_k), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
+    input = (torch.rand((m, local_k), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
+    weight = (torch.rand((n, local_k), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
 
     bias = None
     if has_bias:
         gen.manual_seed(seed)
-        bias = (torch.rand((n,), dtype=torch.float16, device="cuda", generator=gen) * 2 - 1) * 0.01
+        bias = (torch.rand((n,), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
 
     return (input, weight, bias)
 

From 5f7edb501c809a7300569f2cba96b2f442c3d42b Mon Sep 17 00:00:00 2001
From: achalpandeyy <pandeyachal.ap@gmail.com>
Date: Thu, 11 Sep 2025 07:02:18 +0100
Subject: [PATCH 127/207] Use events for pmpp_v2 benchmarking

---
 problems/pmpp_v2/eval.py  | 13 +++++++++----
 problems/pmpp_v2/utils.py |  9 +++++++++
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/problems/pmpp_v2/eval.py b/problems/pmpp_v2/eval.py
index 56222a1e..981b9322 100644
--- a/problems/pmpp_v2/eval.py
+++ b/problems/pmpp_v2/eval.py
@@ -11,7 +11,7 @@
 
 import torch.cuda
 
-from utils import set_seed
+from utils import set_seed, clear_l2_cache
 try:
     from task import TestSpec
 except ImportError:
@@ -218,10 +218,15 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
             data = generate_input(**test.args)
             check_copy = _clone_data(data)
         torch.cuda.synchronize()
-        start = time.perf_counter_ns()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        clear_l2_cache()
+
+        start_event.record()
         output = custom_kernel(data)
+        end_event.record()
         torch.cuda.synchronize()
-        end = time.perf_counter_ns()
+        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
 
         if recheck:
             good, message = check_implementation(check_copy, output)
@@ -229,7 +234,7 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
                 return message
 
         del output
-        durations.append(end - start)
+        durations.append(duration)
 
         if i > 1:
             total_bm_duration = time.perf_counter_ns() - bm_start_time
diff --git a/problems/pmpp_v2/utils.py b/problems/pmpp_v2/utils.py
index 7ef4d79c..e8a9082f 100644
--- a/problems/pmpp_v2/utils.py
+++ b/problems/pmpp_v2/utils.py
@@ -165,3 +165,12 @@ def __exit__(self, exc_type, exc_value, traceback):
         torch.backends.cudnn.deterministic = self.deterministic
         torch.use_deterministic_algorithms(False)
         os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
+
+def clear_l2_cache():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # write stuff to
+    dummy.fill_(42)
+    del dummy
\ No newline at end of file

From 75bd4abc5f4acab1ee1ec38b8a22adb85b6b5d74 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Fri, 12 Sep 2025 14:24:43 +0200
Subject: [PATCH 128/207] Feat: works

---
 problems/amd_distributed.yaml                 |  5 ++++
 problems/amd_distributed/gemm-rs/reference.py | 23 +++++++++++++------
 problems/amd_distributed/gemm-rs/task.yml     |  1 +
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
index 86fed95a..fc9121f7 100644
--- a/problems/amd_distributed.yaml
+++ b/problems/amd_distributed.yaml
@@ -10,3 +10,8 @@ problems:
     deadline: "2025-10-14"
     gpus:
       - MI300x8
+  - directory: amd_distributed/gemm-rs
+    name: amd-gemm-rs
+    deadline: "2025-10-14"
+    gpus:
+      - MI300x8
diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index 6cb363d4..dcffa085 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -3,7 +3,7 @@
 import torch
 
 
-def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, has_bias: bool, seed: int) -> input_t:
+def generate_input(rank: int, world_size: int, m: int, n: int, k: int, has_bias: bool, seed: int) -> input_t:
     """
     Generate random input and weights for the Gemm-ReduceScatter operation.
 
@@ -14,21 +14,22 @@ def generate_input(RANK: int, world_size: int, m: int, n: int, k: int, has_bias:
             bias: Optional[torch.Tensor],  # [N] or None
         )
     """
-    gen = torch.Generator(device='cuda')
-    gen.manual_seed(seed + RANK)
+    device = torch.device(f'cuda:{rank}')
+    gen = torch.Generator(device=device)
+    gen.manual_seed(seed + rank)
 
     assert m % world_size == 0, "m must be divisible by world_size"
     assert k % world_size == 0, "k must be divisible by world_size"
     local_k = k // world_size
 
     # Generate random inputs and weights
-    input = (torch.rand((m, local_k), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
-    weight = (torch.rand((n, local_k), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
+    input = (torch.rand((m, local_k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
+    weight = (torch.rand((n, local_k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
 
     bias = None
     if has_bias:
         gen.manual_seed(seed)
-        bias = (torch.rand((n,), dtype=torch.bfloat16, device="cuda", generator=gen) * 2 - 1) * 0.01
+        bias = (torch.rand((n,), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
 
     return (input, weight, bias)
 
@@ -60,4 +61,12 @@ def ref_kernel(data: input_t) -> output_t:
     return rs_output
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-2, atol=1e-2)
+def check_implementation(data: input_t, output: output_t):
+    expected = ref_kernel(data)
+    if output.device != expected.device:
+        return False, f"Output device mismatch: {output.device} != {expected.device}"
+    res = torch.allclose(output, expected, rtol=1e-2, atol=1e-2)
+    if not res:
+        return False, f"Output values mismatch, {output} != {expected}"
+
+    return True, ""
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
index 360ff1e7..d0501362 100644
--- a/problems/amd_distributed/gemm-rs/task.yml
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -8,6 +8,7 @@ files:
   - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
+multi_gpu: true
 
 description: |
   Implement a Gemm-ReduceScatter kernel on a single MI300X node.

From 0d0ca8484c27a214d9b8f0c6f8b49253b0520da8 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 14 Sep 2025 12:35:15 +0200
Subject: [PATCH 129/207] Final

---
 problems/amd_distributed/gemm-rs/reference.py | 1 -
 problems/amd_distributed/gemm-rs/task.py      | 8 ++++----
 problems/amd_distributed/gemm-rs/task.yml     | 1 +
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/problems/amd_distributed/gemm-rs/reference.py b/problems/amd_distributed/gemm-rs/reference.py
index dcffa085..cb60206b 100644
--- a/problems/amd_distributed/gemm-rs/reference.py
+++ b/problems/amd_distributed/gemm-rs/reference.py
@@ -1,4 +1,3 @@
-from utils import make_match_reference
 from task import input_t, output_t
 import torch
 
diff --git a/problems/amd_distributed/gemm-rs/task.py b/problems/amd_distributed/gemm-rs/task.py
index 1245626c..1de3eddd 100644
--- a/problems/amd_distributed/gemm-rs/task.py
+++ b/problems/amd_distributed/gemm-rs/task.py
@@ -1,8 +1,8 @@
-from typing import TypedDict, TypeVar, Tuple, Dict
+from typing import TypedDict, TypeVar, Tuple, Optional
 import torch
 
-input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, Dict[str, torch.Tensor], Dict])
-output_t = TypeVar("output_t", bound=Tuple[torch.Tensor, Dict])
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]])
+output_t = TypeVar("output_t", bound=torch.Tensor)
 
 
 class TestSpec(TypedDict):
@@ -11,4 +11,4 @@ class TestSpec(TypedDict):
     n: int
     k: int
     has_bias: bool
-    seed: int
\ No newline at end of file
+    seed: int
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
index d0501362..6eac2741 100644
--- a/problems/amd_distributed/gemm-rs/task.yml
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -60,6 +60,7 @@ templates:
   Python: "submission.py"
 
 ranking_by: "geom"
+ranked_timeout: 360 # just in case
 
 tests:
   - {"world_size": 8, "m": 64, "n": 2880, "k": 2880, "has_bias": True, "seed": 2035}

From 40fa8f03e293c349c87bbe9e5fe5c395fa5cbf9c Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Sun, 21 Sep 2025 02:55:29 -0500
Subject: [PATCH 130/207] add allgather gemm reference implementation

---
 problems/amd_distributed/ag-gemm/reference.py | 69 ++++++++++++++++
 .../amd_distributed/ag-gemm/submission.py     | 28 +++++++
 problems/amd_distributed/ag-gemm/task.py      | 14 ++++
 problems/amd_distributed/ag-gemm/task.yml     | 79 +++++++++++++++++++
 4 files changed, 190 insertions(+)
 create mode 100644 problems/amd_distributed/ag-gemm/reference.py
 create mode 100644 problems/amd_distributed/ag-gemm/submission.py
 create mode 100644 problems/amd_distributed/ag-gemm/task.py
 create mode 100644 problems/amd_distributed/ag-gemm/task.yml

diff --git a/problems/amd_distributed/ag-gemm/reference.py b/problems/amd_distributed/ag-gemm/reference.py
new file mode 100644
index 00000000..6f2cb24c
--- /dev/null
+++ b/problems/amd_distributed/ag-gemm/reference.py
@@ -0,0 +1,69 @@
+from task import input_t, output_t
+import torch
+
+
+def generate_input(rank: int, world_size: int, m: int, n: int, k: int, has_bias: bool, seed: int) -> input_t:
+    """
+    Generate random input and weights for the Allgather-Gemm operation.
+
+    Returns:
+        Tuple of (
+            input: torch.Tensor,  # [local_M, K]
+            weight: torch.Tensor,  # [local_N, K]
+            bias: Optional[torch.Tensor],  # [local_N] or None
+        )
+    """
+    device = torch.device(f"cuda:{rank}")
+    gen = torch.Generator(device=device)
+    gen.manual_seed(seed + rank)
+
+    assert m % world_size == 0, "m must be divisible by world_size"
+    assert n % world_size == 0, "n must be divisible by world_size"
+    local_m = m // world_size
+    local_n = n // world_size
+
+    # Generate random inputs and weights
+    input = (torch.randn((local_m, k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
+    weight = (torch.randn((local_n, k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
+
+    bias = None
+    if has_bias:
+        bias = (torch.rand((local_n,), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
+    return (input, weight, bias)
+
+
+def ref_kernel(data: input_t) -> output_t:
+    """
+    Reference kernel for AG-GEMM operation.
+    Args:
+        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor])
+            - input: Local input tensor of shape [local_M, K].
+            - weight: Weight tensor of shape [local_N, K].
+            - bias: Optional bias tensor of shape [local_N] or None.
+    Returns:
+        output: Resulting tensor of shape [local_M * world_size, local_N].
+    """
+    input, weight, bias = data
+    local_M, K = input.shape
+    world_size = torch.distributed.get_world_size()
+    full_input = torch.empty((local_M * world_size, K), dtype=input.dtype, device=input.device)
+    # allgather
+    torch.distributed.all_gather_into_tensor(full_input, input)
+    # matmul
+    output = torch.matmul(full_input, weight.T)
+
+    if bias is not None:
+        output = output + bias
+
+    return output
+
+
+def check_implementation(data: input_t, output: output_t):
+    expected = ref_kernel(data)
+    if output.device != expected.device:
+        return False, f"Output device mismatch: {output.device} != {expected.device}"
+    res = torch.allclose(output, expected, rtol=1e-2, atol=1e-2)
+    if not res:
+        return False, f"Output values mismatch, {output} != {expected}"
+
+    return True, ""
diff --git a/problems/amd_distributed/ag-gemm/submission.py b/problems/amd_distributed/ag-gemm/submission.py
new file mode 100644
index 00000000..23942227
--- /dev/null
+++ b/problems/amd_distributed/ag-gemm/submission.py
@@ -0,0 +1,28 @@
+from task import input_t, output_t
+import torch
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Reference kernel for AG-GEMM operation.
+    Args:
+        data: Tuple of (input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor])
+            - input: Local input tensor of shape [local_M, K].
+            - weight: Weight tensor of shape [local_N, K].
+            - bias: Optional bias tensor of shape [local_N] or None.
+    Returns:
+        output: Resulting tensor of shape [local_M * world_size, local_N].
+    """
+    input, weight, bias = data
+    local_M, K = input.shape
+    world_size = torch.distributed.get_world_size()
+    full_input = torch.empty((local_M * world_size, K), dtype=input.dtype, device=input.device)
+    # allgather
+    torch.distributed.all_gather_into_tensor(full_input, input)
+    # matmul
+    output = torch.matmul(full_input, weight.T)
+
+    if bias is not None:
+        output = output + bias
+
+    return output
diff --git a/problems/amd_distributed/ag-gemm/task.py b/problems/amd_distributed/ag-gemm/task.py
new file mode 100644
index 00000000..1de3eddd
--- /dev/null
+++ b/problems/amd_distributed/ag-gemm/task.py
@@ -0,0 +1,14 @@
+from typing import TypedDict, TypeVar, Tuple, Optional
+import torch
+
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+
+
+class TestSpec(TypedDict):
+    world_size: int
+    m: int
+    n: int
+    k: int
+    has_bias: bool
+    seed: int
diff --git a/problems/amd_distributed/ag-gemm/task.yml b/problems/amd_distributed/ag-gemm/task.yml
new file mode 100644
index 00000000..b1fd45b2
--- /dev/null
+++ b/problems/amd_distributed/ag-gemm/task.yml
@@ -0,0 +1,79 @@
+# name: ag-gemm
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  Implement a AllGather-Gemm kernel on a single MI300X device.
+
+  AllGather-Gemm (AG-Gemm) is a technique that combines the AllGather communication
+  pattern with General Matrix Multiplication (GEMM) to optimize the performance
+  of transformer models on GPUs.
+
+  Your task:
+  - Implement the AG-Gemm kernel to perform matrix multiplications
+    in a distributed manner, leveraging the AllGather operation to collect
+    data from multiple GPUs.
+  - Ensure that the implementation is optimized for the MI300X architecture,
+    taking advantage of its specific hardware features for maximum performance.
+
+  Input:
+  - `data`: Tuple of (input: torch.Tensor, weights: torch.Tensor,
+            bias: Optional, None or torch.Tensor)
+    - input: Local input tensor of shape [local_M, K].
+    - weight: Weight tensor of shape [local_N, K].
+    - bias: bias tensor of shape [local_N] or None.
+
+  Output:
+  - A single torch.Tensor (not a tuple):
+    - output: Resulting tensor of shape [local_M * world_size, local_N]
+
+  The ranking criterion is the geometric mean of the benchmark results.
+
+  For the grand prize, your kernel will be evaluated against the speed of light
+  analysis and AMD implementations; the solution closest to the speed of light
+  and AMD implementations will be awarded the grand prize.
+  ```
+  The speed of light analysis is:
+   m      n      k      has_bias      time[us]
+   64     18432  7168   False         6.46
+   512    12288  4096   True          24.58
+   2048   2880   2880   True          23.04
+   4096   4096   4096   False         65.54
+   8192   14336  4096   True          458.75
+   8192   29568  8192   False         946.18
+  ```
+config:
+  main: "eval.py"
+
+templates:
+  Python: "submission.py"
+
+ranking_by: "geom"
+
+tests:
+  - {"world_size": 8, "m": 64, "n": 2880, "k": 2880, "has_bias": True, "seed":402}
+  - {"world_size": 8, "m": 64, "n": 14336, "k": 3584, "has_bias": True, "seed":562}
+  - {"world_size": 8, "m": 512, "n": 14336, "k": 3584, "has_bias": True, "seed":531}
+  - {"world_size": 8, "m": 512, "n": 36864, "k": 4608, "has_bias": False, "seed":2090}
+  - {"world_size": 8, "m": 2048, "n": 7168, "k": 4096, "has_bias": False, "seed":90953}
+  - {"world_size": 8, "m": 2048, "n": 30720, "k": 8192, "has_bias": False, "seed":7770}
+  - {"world_size": 8, "m": 4096, "n": 2880, "k": 2880, "has_bias": True, "seed":5312}
+  - {"world_size": 8, "m": 4096, "n": 2048, "k": 8192, "has_bias": True, "seed":99}
+  - {"world_size": 8, "m": 8192, "n": 14336, "k": 3584, "has_bias": True, "seed":11101}
+  # - {"world_size": 8, "m": 8192, "n": 36864, "k": 4608, "has_bias": True, "seed":42}
+  - {"world_size": 8, "m": 8192, "n": 28672, "k": 8192, "has_bias": False, "seed":7188}
+
+benchmarks:
+  - {"world_size": 8, "m": 64, "n": 18432, "k": 7168, "has_bias": False, "seed": 1212}
+  - {"world_size": 8, "m": 512, "n": 12288, "k": 4096, "has_bias": True, "seed": 8861}
+  - {"world_size": 8, "m": 2048, "n": 2880, "k": 2880, "has_bias": True, "seed": 8080}
+  - {"world_size": 8, "m": 4096, "n": 4096, "k": 4096, "has_bias": False, "seed": 601}
+  - {"world_size": 8, "m": 8192, "n": 14336, "k": 4096, "has_bias": True, "seed": 3062}
+  - {"world_size": 8, "m": 8192, "n": 29568, "k": 8192, "has_bias": False, "seed": 4406}
\ No newline at end of file

From b03fd0906e939fa45670edba762675436ded6c15 Mon Sep 17 00:00:00 2001
From: Xun Wang <xunwang2@amd.com>
Date: Mon, 22 Sep 2025 06:57:29 -0500
Subject: [PATCH 131/207] use rand in data generator

---
 problems/amd_distributed/ag-gemm/reference.py | 4 ++--
 problems/amd_distributed/ag-gemm/task.yml     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/problems/amd_distributed/ag-gemm/reference.py b/problems/amd_distributed/ag-gemm/reference.py
index 6f2cb24c..c8bd4299 100644
--- a/problems/amd_distributed/ag-gemm/reference.py
+++ b/problems/amd_distributed/ag-gemm/reference.py
@@ -23,8 +23,8 @@ def generate_input(rank: int, world_size: int, m: int, n: int, k: int, has_bias:
     local_n = n // world_size
 
     # Generate random inputs and weights
-    input = (torch.randn((local_m, k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
-    weight = (torch.randn((local_n, k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
+    input = (torch.rand((local_m, k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
+    weight = (torch.rand((local_n, k), dtype=torch.bfloat16, device=device, generator=gen) * 2 - 1) * 0.01
 
     bias = None
     if has_bias:
diff --git a/problems/amd_distributed/ag-gemm/task.yml b/problems/amd_distributed/ag-gemm/task.yml
index b1fd45b2..3115ba0b 100644
--- a/problems/amd_distributed/ag-gemm/task.yml
+++ b/problems/amd_distributed/ag-gemm/task.yml
@@ -67,7 +67,7 @@ tests:
   - {"world_size": 8, "m": 4096, "n": 2880, "k": 2880, "has_bias": True, "seed":5312}
   - {"world_size": 8, "m": 4096, "n": 2048, "k": 8192, "has_bias": True, "seed":99}
   - {"world_size": 8, "m": 8192, "n": 14336, "k": 3584, "has_bias": True, "seed":11101}
-  # - {"world_size": 8, "m": 8192, "n": 36864, "k": 4608, "has_bias": True, "seed":42}
+  - {"world_size": 8, "m": 8192, "n": 36864, "k": 4608, "has_bias": True, "seed":42}
   - {"world_size": 8, "m": 8192, "n": 28672, "k": 8192, "has_bias": False, "seed":7188}
 
 benchmarks:

From 64fcc28074b0e4829d5614fec56c3a686f906db6 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Wed, 24 Sep 2025 12:21:18 +0200
Subject: [PATCH 132/207] Feat: works

---
 problems/amd_distributed/ag-gemm/task.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/problems/amd_distributed/ag-gemm/task.yml b/problems/amd_distributed/ag-gemm/task.yml
index 3115ba0b..e7830f24 100644
--- a/problems/amd_distributed/ag-gemm/task.yml
+++ b/problems/amd_distributed/ag-gemm/task.yml
@@ -8,6 +8,7 @@ files:
   - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
+multi_gpu: true
 
 description: |
   Implement a AllGather-Gemm kernel on a single MI300X device.

From 304ef20b83f4a8f8ea2753684a13379298f92d16 Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Sat, 27 Sep 2025 14:53:19 -0400
Subject: [PATCH 133/207] Increase timeouts as temporary solution to Cauchy
 distribution overhead

---
 problems/bioml/trimul/task.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/problems/bioml/trimul/task.yml b/problems/bioml/trimul/task.yml
index 66f32757..a9d2d9ce 100644
--- a/problems/bioml/trimul/task.yml
+++ b/problems/bioml/trimul/task.yml
@@ -37,9 +37,9 @@ config:
 templates:
   Python: "submission.py"
 
-test_timeout: 540
-benchmark_timeout: 540
-ranked_timeout: 540
+test_timeout: 1200
+benchmark_timeout: 1200
+ranked_timeout: 1200
 ranking_by: "geom"
 
 tests:
@@ -69,4 +69,4 @@ benchmarks:
   - {"seqlen": 512, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 12819, "nomask": True, "distribution": "normal"}
   - {"seqlen": 1024, "bs": 1, "dim": 128, "hiddendim": 128, "seed": 381, "nomask": True, "distribution": "cauchy"}
   - {"seqlen": 768, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 481, "nomask": False, "distribution": "normal"}
-  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}
\ No newline at end of file
+  - {"seqlen": 1024, "bs": 1, "dim": 384, "hiddendim": 128, "seed": 23291, "nomask": True, "distribution": "normal"}

From 321e04ffc0d2b164a83615c5923edde37ca3c810 Mon Sep 17 00:00:00 2001
From: danielhua23 <daniel.huang2@amd.com>
Date: Sat, 27 Sep 2025 20:25:38 -0700
Subject: [PATCH 134/207] add rocshmem minimal example

---
 problems/amd_distributed/rocshmem_example.py | 92 ++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 problems/amd_distributed/rocshmem_example.py

diff --git a/problems/amd_distributed/rocshmem_example.py b/problems/amd_distributed/rocshmem_example.py
new file mode 100644
index 00000000..c2a9cd70
--- /dev/null
+++ b/problems/amd_distributed/rocshmem_example.py
@@ -0,0 +1,92 @@
+import torch
+from torch.utils.cpp_extension import load_inline
+import os
+
+# This is a minimal example showing how to compile and link when you're using rocSHMEM.
+# For more examples, see https://rocm.docs.amd.com/projects/rocSHMEM/en/latest/ and https://github.com/ROCm/rocSHMEM/tree/develop
+def test_rocshmem_compilation():
+    """Test ROCshmem compilation using PyTorch's load_inline"""
+    
+    print("=== ROCshmem PyTorch Inline Test ===")
+    
+    # C++ source code for ROCshmem test
+    cpp_source = """
+    #include <rocshmem.hpp>
+    #include <iostream>
+    #include <torch/extension.h>
+    
+    void test_rocshmem() {
+        std::cout << "Testing ROCshmem compilation..." << std::endl;
+        
+        // Just test that we can compile and link with rocshmem
+        // Don't actually initialize since we may not have proper MPI setup
+        std::cout << "ROCshmem headers included successfully!" << std::endl;
+        std::cout << "Compilation test passed!" << std::endl;
+    }
+    
+    PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+        m.def("test_rocshmem", &test_rocshmem, "Test ROCshmem compilation");
+    }
+    """
+    
+    # Set up include paths and libraries
+    rocm_path = os.environ.get('ROCM_PATH', '/opt/rocm')
+    rocshmem_path = os.environ.get('ROCSHMEM_INSTALL_DIR', '/home/runner/rocshmem')
+    ompi_path = os.environ.get('OMPI_INSTALL_DIR', '/opt/openmpi')
+    # dirs that must be included
+    include_dirs = [
+        f"{rocm_path}/include",
+        f"{rocshmem_path}/include/rocshmem",
+        f"{ompi_path}/include"
+    ]
+    # libs that must be linked
+    library_dirs = [
+        f"{rocm_path}/lib",
+        f"{rocshmem_path}/lib",
+        f"{ompi_path}/lib"
+    ]
+    libraries = [
+        "rocshmem",
+        "mpi", 
+        "amdhip64",
+        "hsa-runtime64"
+    ]
+
+    ldflags = []
+    for lib_dir in library_dirs:
+        ldflags.append(f"-L{lib_dir}")
+
+    for lib in libraries:
+        ldflags.append(f"-l{lib}")
+
+    extra_cflags = [f"-I{include_dir}" for include_dir in include_dirs]
+
+    extra_ldflags = [
+        "--hip-link"
+    ] + ldflags
+    
+    try:
+        # Use torch.utils.cpp_extension.load_inline to compile
+        rocshmem_module = load_inline(
+            name="rocshmem_test",
+            cpp_sources=cpp_source,
+            extra_cflags=extra_cflags,
+            extra_ldflags=extra_ldflags,
+            verbose=True
+        )
+        
+        print("Compilation successful!")
+        print("Linking successful!")
+        
+        # Run the test
+        rocshmem_module.test_rocshmem()
+        
+        print("ROCshmem test completed successfully!")
+        return True
+        
+    except Exception as e:
+        print(f"ROCshmem test failed: {e}")
+        return False
+
+if __name__ == "__main__":
+    test_rocshmem_compilation()
\ No newline at end of file

From 36a1b85d5fe41588da05d1dcd62f85925c7a57c0 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 28 Sep 2025 14:12:38 +0200
Subject: [PATCH 135/207] Finalize

---
 problems/amd_distributed.yaml             | 5 +++++
 problems/amd_distributed/ag-gemm/task.yml | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
index fc9121f7..efc8a0e5 100644
--- a/problems/amd_distributed.yaml
+++ b/problems/amd_distributed.yaml
@@ -15,3 +15,8 @@ problems:
     deadline: "2025-10-14"
     gpus:
       - MI300x8
+  - directory: amd_distributed/ag-gemm
+    name: amd-ag-gemm
+    deadline: "2025-10-14"
+    gpus:
+      - MI300x8
diff --git a/problems/amd_distributed/ag-gemm/task.yml b/problems/amd_distributed/ag-gemm/task.yml
index e7830f24..0c8abc10 100644
--- a/problems/amd_distributed/ag-gemm/task.yml
+++ b/problems/amd_distributed/ag-gemm/task.yml
@@ -77,4 +77,4 @@ benchmarks:
   - {"world_size": 8, "m": 2048, "n": 2880, "k": 2880, "has_bias": True, "seed": 8080}
   - {"world_size": 8, "m": 4096, "n": 4096, "k": 4096, "has_bias": False, "seed": 601}
   - {"world_size": 8, "m": 8192, "n": 14336, "k": 4096, "has_bias": True, "seed": 3062}
-  - {"world_size": 8, "m": 8192, "n": 29568, "k": 8192, "has_bias": False, "seed": 4406}
\ No newline at end of file
+  - {"world_size": 8, "m": 8192, "n": 29568, "k": 8192, "has_bias": False, "seed": 4406}

From a6ff5fad339e30c0b203e5adc1fafd3f0e8afb83 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 12 Oct 2025 14:22:43 +0200
Subject: [PATCH 136/207] tmp: increase timeout

---
 problems/amd_distributed/ag-gemm/task.yml | 3 +++
 problems/amd_distributed/all2all/task.yml | 4 +++-
 problems/amd_distributed/gemm-rs/task.yml | 4 +++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/problems/amd_distributed/ag-gemm/task.yml b/problems/amd_distributed/ag-gemm/task.yml
index 0c8abc10..091dc4dc 100644
--- a/problems/amd_distributed/ag-gemm/task.yml
+++ b/problems/amd_distributed/ag-gemm/task.yml
@@ -57,6 +57,9 @@ templates:
   Python: "submission.py"
 
 ranking_by: "geom"
+test_timeout: 640
+benchmark_timeout: 640
+ranked_timeout: 640
 
 tests:
   - {"world_size": 8, "m": 64, "n": 2880, "k": 2880, "has_bias": True, "seed":402}
diff --git a/problems/amd_distributed/all2all/task.yml b/problems/amd_distributed/all2all/task.yml
index 2e0f68f7..76210254 100644
--- a/problems/amd_distributed/all2all/task.yml
+++ b/problems/amd_distributed/all2all/task.yml
@@ -66,4 +66,6 @@ benchmarks:
 
 
 ranking_by: "geom"
-ranked_timeout: 420
+test_timeout: 640
+benchmark_timeout: 640
+ranked_timeout: 640
diff --git a/problems/amd_distributed/gemm-rs/task.yml b/problems/amd_distributed/gemm-rs/task.yml
index 6eac2741..ab2916ed 100644
--- a/problems/amd_distributed/gemm-rs/task.yml
+++ b/problems/amd_distributed/gemm-rs/task.yml
@@ -60,7 +60,9 @@ templates:
   Python: "submission.py"
 
 ranking_by: "geom"
-ranked_timeout: 360 # just in case
+test_timeout: 640
+benchmark_timeout: 640
+ranked_timeout: 640 # just in case
 
 tests:
   - {"world_size": 8, "m": 64, "n": 2880, "k": 2880, "has_bias": True, "seed": 2035}

From 74ccfd902ddb846d5d34f1dd8d89fecb97e8b866 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Mon, 13 Oct 2025 18:21:04 +0200
Subject: [PATCH 137/207] Change deadlines to 2025-10-15 for AMD problems

Updated deadlines for multiple AMD distributed problems.
---
 problems/amd_distributed.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
index efc8a0e5..6e0025f8 100644
--- a/problems/amd_distributed.yaml
+++ b/problems/amd_distributed.yaml
@@ -7,16 +7,16 @@ description: "AMD Developer Challenge 2025: Distributed Edition"
 problems:
   - directory: amd_distributed/all2all
     name: amd-all2all
-    deadline: "2025-10-14"
+    deadline: "2025-10-15"
     gpus:
       - MI300x8
   - directory: amd_distributed/gemm-rs
     name: amd-gemm-rs
-    deadline: "2025-10-14"
+    deadline: "2025-10-15"
     gpus:
       - MI300x8
   - directory: amd_distributed/ag-gemm
     name: amd-ag-gemm
-    deadline: "2025-10-14"
+    deadline: "2025-10-15"
     gpus:
       - MI300x8

From d43665447916984157e0799dcfbd6a04b8e2560d Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 14 Oct 2025 16:03:03 +0200
Subject: [PATCH 138/207] Fix: deadline to pst

---
 problems/amd_distributed.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/problems/amd_distributed.yaml b/problems/amd_distributed.yaml
index 6e0025f8..c3eec7fc 100644
--- a/problems/amd_distributed.yaml
+++ b/problems/amd_distributed.yaml
@@ -7,16 +7,16 @@ description: "AMD Developer Challenge 2025: Distributed Edition"
 problems:
   - directory: amd_distributed/all2all
     name: amd-all2all
-    deadline: "2025-10-15"
+    deadline: "2025-10-15 07:00" # midnight pst
     gpus:
       - MI300x8
   - directory: amd_distributed/gemm-rs
     name: amd-gemm-rs
-    deadline: "2025-10-15"
+    deadline: "2025-10-15 07:00" # midnight pst
     gpus:
       - MI300x8
   - directory: amd_distributed/ag-gemm
     name: amd-ag-gemm
-    deadline: "2025-10-15"
+    deadline: "2025-10-15 07:00" # midnight pst
     gpus:
       - MI300x8

From ed735f7d5a5d0c44467a16a4abd028121b9caeb3 Mon Sep 17 00:00:00 2001
From: achal <pandeyachal.ap@gmail.com>
Date: Wed, 15 Oct 2025 06:11:24 +0100
Subject: [PATCH 139/207] fix getting of input dimensions in
 pmpp_v2/prefixsum_py

---
 problems/pmpp_v2/prefixsum_py/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/pmpp_v2/prefixsum_py/reference.py b/problems/pmpp_v2/prefixsum_py/reference.py
index 8719185b..cc25c10a 100644
--- a/problems/pmpp_v2/prefixsum_py/reference.py
+++ b/problems/pmpp_v2/prefixsum_py/reference.py
@@ -36,7 +36,7 @@ def generate_input(size: int, seed: int) -> input_t:
 # The tolerance is scaled by the square root of the input size
 def check_implementation(data: input_t, output: output_t) -> str:
     # Then get the size for scaling the tolerance
-    n = data.numel()
+    n = data[0].numel()
 
     scale_factor = n ** 0.5  # Square root of input size
     rtol = 1e-5 * scale_factor

From e28ee1800b31cfaf06ed8c6dd64ee03c159d5812 Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Sat, 1 Nov 2025 14:46:38 -0400
Subject: [PATCH 140/207] Fix reference for matmul_py

---
 problems/pmpp_v2/matmul_py/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/pmpp_v2/matmul_py/reference.py b/problems/pmpp_v2/matmul_py/reference.py
index 9962f660..dc34cd32 100644
--- a/problems/pmpp_v2/matmul_py/reference.py
+++ b/problems/pmpp_v2/matmul_py/reference.py
@@ -16,7 +16,7 @@ def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
 
 def ref_kernel(data: input_t) -> output_t:
     with DeterministicContext():
-        a, b = data
+        a, b, c = data
         return a @ b
 
 

From ea4bfbce7d10391abea41ef4110cb8a99e1fbda0 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Mon, 10 Nov 2025 18:56:11 +0100
Subject: [PATCH 141/207] Feat: initial setup

---
 problems/nvidia.yaml                     |  12 +
 problems/nvidia/eval.py                  | 485 +++++++++++++++++++++++
 problems/nvidia/nvfp4_gemv/reference.py  | 166 ++++++++
 problems/nvidia/nvfp4_gemv/submission.py |  54 +++
 problems/nvidia/nvfp4_gemv/task.py       |  10 +
 problems/nvidia/nvfp4_gemv/task.yml      |  59 +++
 problems/nvidia/nvfp4_gemv/template.py   |  25 ++
 problems/nvidia/utils.py                 | 176 ++++++++
 8 files changed, 987 insertions(+)
 create mode 100644 problems/nvidia.yaml
 create mode 100644 problems/nvidia/eval.py
 create mode 100644 problems/nvidia/nvfp4_gemv/reference.py
 create mode 100644 problems/nvidia/nvfp4_gemv/submission.py
 create mode 100644 problems/nvidia/nvfp4_gemv/task.py
 create mode 100644 problems/nvidia/nvfp4_gemv/task.yml
 create mode 100644 problems/nvidia/nvfp4_gemv/template.py
 create mode 100644 problems/nvidia/utils.py

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
new file mode 100644
index 00000000..867d6687
--- /dev/null
+++ b/problems/nvidia.yaml
@@ -0,0 +1,12 @@
+name: NVIDIA Blackwell NVFP4 Kernel Hackathon
+
+deadline: 2026-02-14
+
+description: "NVIDIA Blackwell NVFP4 Kernel Hackathon"
+
+problems:
+  - directory: nvidia/nvfp4_gemv
+    name: nvfp4_gemv
+    deadline: 2026-11-28
+    gpus:
+      - NVIDIA
diff --git a/problems/nvidia/eval.py b/problems/nvidia/eval.py
new file mode 100644
index 00000000..ed370157
--- /dev/null
+++ b/problems/nvidia/eval.py
@@ -0,0 +1,485 @@
+import base64
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+
+import torch.cuda
+from cutlass.cute.nvgpu.common import OpError
+from torch.cuda.nvtx import range as nvtx_range
+
+from utils import set_seed, clear_l2_cache
+
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, "w")
+        os.set_inheritable(fd, False)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a + b) * (a + b + 1) // 2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        exit(113)
+
+    tests = []
+    lines = content.splitlines()
+    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
+    for line in lines:
+        parts = line.split(";")
+        case = {}
+        for part in parts:
+            matched = re.match(match, part)
+            if not re.fullmatch(match, part):
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                exit(113)
+            key = matched[1]
+            val = matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a list of durations.
+
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    variance = sum(map(lambda x: (x - avg) ** 2, durations))
+    std = math.sqrt(variance / (runs - 1))
+    err = std / math.sqrt(runs)
+
+    return Stats(
+        runs=runs, mean=avg, std=std, err=err, best=float(best), worst=float(worst)
+    )
+
+
+def _clone_data(data):
+    """
+    Recursively goes through data and clones all tensors.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        return data.clone()
+    else:
+        return data
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    try:
+        submission_output = custom_kernel(_clone_data(data))
+
+    except OpError as E:
+        print(f"Encountered {E}", file=sys.stderr)
+        return False, str(E)
+    torch.cuda.synchronize()
+    return check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in another process.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if not good:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_benchmark(
+    test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float
+) -> Stats | Any:
+    """
+    Runs one benchmark. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data)
+
+    #  first, one obligatory correctness check
+    try:
+        output = custom_kernel(_clone_data(data))
+    except OpError as E:
+        return f"Encountered {E}"
+    good, message = check_implementation(check_copy, output)
+    if not good:
+        return message
+
+    # now, do multiple timing runs without further correctness testing
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until we either exceed `max_time_ns` of measured kernel time,
+    # 2 minutes of wallclock time, or the relative error of the mean is below 0.1%.
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data)
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        clear_l2_cache()
+
+        start_event.record()
+        output = custom_kernel(data)
+        end_event.record()
+        torch.cuda.synchronize()
+        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
+
+        if recheck:
+            good, message = check_implementation(check_copy, output)
+            if not good:
+                return message
+
+        del output
+        durations.append(duration)
+
+        if i > 1:
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if (
+                stats.err / stats.mean < 0.001
+                or stats.mean * stats.runs > max_time_ns
+                or total_bm_duration > 120e9
+            ):
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(
+    pool: multiprocessing.Pool,
+    test: TestCase,
+    recheck: bool,
+    max_repeats: int,
+    max_time_ns: float,
+):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param pool: Process on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+
+
+def run_benchmarking(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Process on which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+
+    run_single_benchmark(pool, tests[0], False, 200, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 200, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_profile_torch(test: TestCase) -> str:
+    """
+    Profiles a single benchmark using the torch profiler.
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, ProfilerActivity
+
+    with nvtx_range("generate input"):
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+
+    cloned = _clone_data(data)
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        with nvtx_range("custom_kernel"):
+            submission_output = custom_kernel(cloned)
+            torch.cuda.synchronize()
+
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def _run_single_profile_ncu(test: TestCase) -> str:
+    """
+    Profiles a single benchmark using ncu. Note: this does not
+    invoke NCU; instead, it is expected that eval is launched
+    under NCU, and this function will run the kernel exactly
+    once in the 'custom_kernel' nvtx range.
+    """
+    from submission import custom_kernel
+
+    with nvtx_range("generate input"):
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+
+    cloned = _clone_data(data)
+    with nvtx_range("custom_kernel"):
+        submission_output = custom_kernel(cloned)
+        torch.cuda.synchronize()
+
+    return ""
+
+
+def _combine_traces(traces: list["EventList"]) -> "EventList":
+    """
+    Combine multiple event traces obtained from multiple (distributed) torch.profiler
+    activities. This function simply aggregates the data just like `prof.key_averages()`,
+    except over multiple traces. Most of this function is reimplemented
+    from `torch.autograd.profiler_util.EventList.key_averages()`.
+    """
+    from torch.autograd.profiler_util import FunctionEventAvg, EventList
+    from collections import defaultdict
+
+    def get_key(event) -> tuple[str, ...]:
+        return (
+            str(event.key),
+            str(event.node_id),
+            str(event.device_type),
+            str(event.is_legacy),
+            str(event.is_user_annotation),
+        )
+
+    stats: dict[tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)
+
+    for events in traces:
+        for event in events:
+            stats[get_key(event)].add(event)
+
+    avg_list = EventList(stats.values())
+    for event in avg_list:
+        event.stack = []
+        event.input_shapes = ""
+        event.overload_name = ""
+
+    return avg_list
+
+
+def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
+    """
+    Runs a single profiling activity in another process; POPCORN_NCU selects ncu mode, else the torch profiler.
+    """
+    if os.getenv("POPCORN_NCU", "0").strip().lower() not in ("", "0", "false", "no", "off"):  # bool("0") is True
+        return pool.apply(_run_single_profile_ncu, (test,))
+    else:
+        return pool.apply(_run_single_profile_torch, (test,))
+
+
+def run_profiling(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test, pool)
+        logger.log(
+            f"benchmark.{idx}.report",
+            base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"),
+        )
+    logger.log("check", "pass")
+    return 0
+
+
+def main():
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+
+    tests = get_test_cases(sys.argv[2], seed)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+
+        mp_context = multiprocessing.get_context("spawn")
+        with mp_context.Pool(1) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                run_single_benchmark(pool, tests[0], False, 200, 1e7)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 200, 30e9)
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(
+                                f"benchmark.{i}.{field.name}",
+                                getattr(result, field.name),
+                            )
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(
+                            f"benchmark.{i}.error", str(result)
+                        )  # TODO: Make sure result implements __str__?
+                        break
+
+                logger.log("check", "pass" if passed else "fail")
+            elif mode == "profile":
+                run_profiling(logger, pool, tests)
+            else:
+                # TODO: Implement script mode
+                return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/problems/nvidia/nvfp4_gemv/reference.py b/problems/nvidia/nvfp4_gemv/reference.py
new file mode 100644
index 00000000..f204663f
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemv/reference.py
@@ -0,0 +1,166 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+# Scaling factor vector size
+sf_vec_size = 16
+
+# Helper function for ceiling division
+def ceil_div(a, b):
+    return (a + b - 1) // b  # integer ceiling of a / b (for positive integer b)
+
+
+# Helper function to convert scale factor tensor to blocked format
+def to_blocked(input_matrix):
+    rows, cols = input_matrix.shape
+
+    # No padding is done here: rows must be a multiple of 128 and cols a multiple of 4
+    n_row_blocks = ceil_div(rows, 128)
+    n_col_blocks = ceil_div(cols, 4)
+
+    padded = input_matrix  # named `padded`, but the input is used as-is
+    blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
+    rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
+
+    return rearranged.flatten()  # 1-D tensor of the rearranged scale factors
+
+
+def ref_kernel(
+    data: input_t,
+) -> output_t:
+    """
+    PyTorch reference for NVFP4 block-scaled GEMV: one torch._scaled_mm per batch, written into c_ref in place.
+    """
+    a_ref, b_ref, sfa_ref_cpu, sfb_ref_cpu, _, _, c_ref = data
+    # The two permuted scale-factor tensors (tuple positions 4 and 5) are not used here
+    # Get dimensions from MxNxL layout
+    _, _, l = c_ref.shape
+
+    # Call torch._scaled_mm to compute the GEMV result
+    for l_idx in range(l):
+        # Convert the scale factor tensor to blocked format
+        scale_a = to_blocked(sfa_ref_cpu[:, :, l_idx])
+        scale_b = to_blocked(sfb_ref_cpu[:, :, l_idx])
+        # (m, k) @ (n, k).T -> (m, n)
+        res = torch._scaled_mm(
+            a_ref[:, :, l_idx],
+            b_ref[:, :, l_idx].transpose(0, 1),
+            scale_a.cuda(),
+            scale_b.cuda(),
+            bias=None,
+            out_dtype=torch.float16,
+        )
+        c_ref[:, 0, l_idx] = res[:, 0]  # keep only output column 0 of the (m, n) product
+    return c_ref
+
+
+def generate_input(
+    m: int,
+    k: int,
+    l: int,
+    seed: int,
+):
+    """
+    Generate input tensors for NVFP4 block-scaled GEMV.
+
+    Args:
+        m: Number of rows in matrix A
+        k: Number of columns in A (and length of vector b)
+        l: Batch size
+        seed: Random seed for reproducibility
+
+    Returns:
+        Tuple of (a, b, scale_a, scale_b, scale_a_permuted, scale_b_permuted, c) where:
+            a: [m, k // 2, l] - Input matrix, uint8 storage viewed as torch.float4_e2m1fn_x2 (CUDA)
+            b: [128, k // 2, l] - Input vector with N padded from 1 to 128, same dtype as a (CUDA)
+            scale_a: [m, ceil(k / 16), l] - Input scale factors in torch.float8_e4m3fn (CPU)
+            scale_b: [128, ceil(k / 16), l] - Input scale factors in torch.float8_e4m3fn (CPU)
+            scale_a_permuted: [32, 4, rest_m, 4, rest_k, l] - Reordered scale_a in torch.float8_e4m3fn (CUDA)
+            scale_b_permuted: [32, 4, rest_n, 4, rest_k, l] - Reordered scale_b in torch.float8_e4m3fn (CUDA)
+            c: [m, 1, l] - Output vector in torch.float16 data type (CUDA)
+    """
+    torch.manual_seed(seed)
+
+    # GEMV N dimension is always 1
+    n = 1
+    # Scaling factor needs to pad the N size to 128
+    n_padded_128 = 128
+
+    # Generate uint8 tensor (values 0 or 1), then view it as float4_e2m1fn_x2
+    a_ref = torch.randint(
+        0, 2, (l, m, k // 2), dtype=torch.uint8, device="cuda"
+    ).permute(1, 2, 0)
+    # Pad b tensor's N dimension to 128 to call torch._scaled_mm for nvfp4 dot product computation
+    b_ref = torch.randint(
+        0, 2, (l, n_padded_128, k // 2), dtype=torch.uint8, device="cuda"
+    ).permute(1, 2, 0)
+    a_ref = a_ref.view(torch.float4_e2m1fn_x2)
+    b_ref = b_ref.view(torch.float4_e2m1fn_x2)
+
+    # Create float16 output tensor: allocated as (l, m, n), exposed as an (m, n, l) view
+    c_ref = torch.randn((l, m, n), dtype=torch.float16, device="cuda").permute(
+        1, 2, 0
+    )
+
+    # Helper function to prepare the scale factor tensors for both reference
+    # kernel and customize kernel. The customized data layout can be found in:
+    # https://docs.nvidia.com/cuda/cublas/index.html?highlight=fp4#d-block-scaling-factors-layout
+    def create_scale_factor_tensors(l, mn, sf_k):
+        # Create the reference scale factor tensor (mn, sf_k, l); built on CUDA, moved to CPU on return.
+        ref_shape = (l, mn, sf_k)
+        ref_permute_order = (1, 2, 0)
+        # Init with int8 tensor (values 1 or 2), then convert to float8_e4m3fn
+        ref_f8_random_int = torch.randint(1, 3, ref_shape, dtype=torch.int8, device='cuda')
+        ref_f8_torch_tensor = ref_f8_random_int.to(dtype=torch.float8_e4m3fn)
+        # permute to match ref_permute_order
+        ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)
+
+        atom_m = (32, 4)
+        atom_k = 4
+        mma_shape = (
+            l,  # batch size
+            ceil_div(mn, atom_m[0] * atom_m[1]),
+            ceil_div(sf_k, atom_k),
+            atom_m[0],
+            atom_m[1],
+            atom_k,
+        )
+
+        # Reorder scale factor tensor to (32, 4, rest_m, 4, rest_k, l) layout
+        # Which is needed by the CuTe customized kernel
+        mma_permute_order = (3, 4, 1, 5, 2, 0)
+        # Generate a random int8 tensor (values 0 or 1) as the destination, then convert to float8_e4m3fn
+        rand_int_tensor = torch.randint(0, 2, mma_shape, dtype=torch.int8, device='cuda')
+        reordered_f8_torch_tensor = rand_int_tensor.to(dtype=torch.float8_e4m3fn)
+        # Permute according to mma_permute_order
+        reordered_f8_torch_tensor = reordered_f8_torch_tensor.permute(*mma_permute_order)
+
+        # GPU-side vectorized reordering (replaces slow CPU nested loops)
+        # Create index grids for all dimensions
+        i_idx = torch.arange(mn, device='cuda')
+        j_idx = torch.arange(sf_k, device='cuda')
+        b_idx = torch.arange(l, device='cuda')
+
+        # Create meshgrid for all combinations of (i, j, b)
+        i_grid, j_grid, b_grid = torch.meshgrid(i_idx, j_idx, b_idx, indexing='ij')
+
+        # Calculate target indices in vectorized manner
+        mm = i_grid // (atom_m[0] * atom_m[1])
+        mm32 = i_grid % atom_m[0]
+        mm4 = (i_grid % 128) // atom_m[0]  # so that i == mm * 128 + mm4 * 32 + mm32
+        kk = j_grid // atom_k
+        kk4 = j_grid % atom_k  # so that j == kk * 4 + kk4
+
+        # Perform the reordering with advanced indexing (all on GPU)
+        reordered_f8_torch_tensor[mm32, mm4, mm, kk4, kk, b_grid] = ref_f8_torch_tensor_permuted[i_grid, j_grid, b_grid]
+
+        return ref_f8_torch_tensor_permuted.cpu(), reordered_f8_torch_tensor
+
+    sf_k = ceil_div(k, sf_vec_size)
+    sfa_ref_cpu, sfa_permuted = create_scale_factor_tensors(l, m, sf_k)
+    sfb_ref_cpu, sfb_permuted = create_scale_factor_tensors(l, n_padded_128, sf_k)
+
+    return (a_ref, b_ref, sfa_ref_cpu, sfb_ref_cpu, sfa_permuted, sfb_permuted, c_ref)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-03, atol=1e-03)
diff --git a/problems/nvidia/nvfp4_gemv/submission.py b/problems/nvidia/nvfp4_gemv/submission.py
new file mode 100644
index 00000000..2db65cdd
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemv/submission.py
@@ -0,0 +1,54 @@
+import torch
+from task import input_t, output_t
+
+# Kernel configuration parameters
+sf_vec_size = 16
+
+
+# Helper function for ceiling division
+def ceil_div(a, b):
+    return (a + b - 1) // b  # integer ceiling of a / b (for positive integer b)
+
+
+# Helper function to convert scale factor tensor to blocked format
+def to_blocked(input_matrix):
+    rows, cols = input_matrix.shape
+
+    # No padding is done here: rows must be a multiple of 128 and cols a multiple of 4
+    n_row_blocks = ceil_div(rows, 128)
+    n_col_blocks = ceil_div(cols, 4)
+
+    padded = input_matrix  # named `padded`, but the input is used as-is
+    blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
+    rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
+
+    return rearranged.flatten()  # 1-D tensor of the rearranged scale factors
+
+
+def custom_kernel(
+    data: input_t,
+) -> output_t:
+    """
+    Example submission for NVFP4 block-scaled GEMV: one torch._scaled_mm per batch, written into c_ref in place.
+    """
+    a_ref, b_ref, sfa_ref_cpu, sfb_ref_cpu, _, _, c_ref = data
+    # The two permuted scale-factor tensors (tuple positions 4 and 5) are not used here
+    # Get dimensions from MxNxL layout
+    _, _, l = c_ref.shape
+
+    # Call torch._scaled_mm to compute the GEMV result
+    for l_idx in range(l):
+        # Convert the scale factor tensor to blocked format
+        scale_a = to_blocked(sfa_ref_cpu[:, :, l_idx])
+        scale_b = to_blocked(sfb_ref_cpu[:, :, l_idx])
+        # (m, k) @ (n, k).T -> (m, n)
+        res = torch._scaled_mm(
+            a_ref[:, :, l_idx],
+            b_ref[:, :, l_idx].transpose(0, 1),
+            scale_a.cuda(),
+            scale_b.cuda(),
+            bias=None,
+            out_dtype=torch.float16,
+        )
+        c_ref[:, 0, l_idx] = res[:, 0]  # keep only output column 0 of the (m, n) product
+    return c_ref
diff --git a/problems/nvidia/nvfp4_gemv/task.py b/problems/nvidia/nvfp4_gemv/task.py
new file mode 100644
index 00000000..487112b6
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemv/task.py
@@ -0,0 +1,10 @@
+import torch
+from typing import TypedDict, TypeVar
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+class TestSpec(TypedDict):
+    m: int  # number of rows of A
+    k: int  # number of columns of A / length of b
+    l: int  # batch size
+    seed: int  # RNG seed for input generation
diff --git a/problems/nvidia/nvfp4_gemv/task.yml b/problems/nvidia/nvfp4_gemv/task.yml
new file mode 100644
index 00000000..756fe80d
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemv/task.yml
@@ -0,0 +1,59 @@
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  
+  You will implement a batched matrix-vector multiplication kernel optimized for NVIDIA B200.
+  To be explicit, you will be given a tuple of tensors:
+  ```
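+  (a, b, sfa, sfb, sfa_permuted, sfb_permuted, c)
+  ```
+  where:
+  * `a` is M x K x L in K-major order in nvfp4(e2m1)
+  * `b` is 1 x K x L in K-major order in nvfp4(e2m1); the tensor you receive is padded to 128 rows and only row 0 contributes to the reference result
+  * `sfa` is M x (K // 16) x L in K-major order in fp8(e4m3fn); `sfa_permuted` holds the same scale factors in a [32, 4, rest_m, 4, rest_k, L] layout
+  * `sfb` is 1 x (K // 16) x L in K-major order in fp8(e4m3fn), padded to 128 rows like `b`; `sfb_permuted` is its [32, 4, rest_n, 4, rest_k, L] layout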
+  * `c` is M x 1 x L in fp16
+  
+  The matrix size `M` is divisible by mma_tiler_mn[0] defined in the kernel, and `K` is divisible by 64.
+  The ranking criterion is the geometric mean of the benchmark results.
+  For the grand prize, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand prize.
+  ```
+  The speed of light analysis is based on max(FFMA math throughput, DRAM memory throughput) of the B200, tested under a 1.5 GHz clock:
+  M    K     L time[us]
+  7168 16384 1 8.622
+  4096 7168  8 17.275
+  7168 2048  4 4.317
+  ```
+config:
+  main: "eval.py"
+
+templates:
+  Python: "template.py"
+
+tests:
+  - {"m": 128, "k": 256, "l": 1, "seed": 1111}
+  - {"m": 128, "k": 1536, "l": 1, "seed": 1111}
+  - {"m": 128, "k": 3072, "l": 1, "seed": 1111}
+  - {"m": 256, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 256, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 2432, "k": 4608, "l": 2, "seed": 1111}
+  - {"m": 384, "k": 7168, "l": 2, "seed": 1111}
+  - {"m": 512, "k": 512, "l": 2, "seed": 1111}
+  - {"m": 512, "k": 4096, "l": 2, "seed": 1111}
+  - {"m": 512, "k": 1536, "l": 2, "seed": 1111}
+
+
+benchmarks:
+  - {"m": 7168, "k": 16384, "l":1, "seed": 1111}
+  - {"m": 4096, "k": 7168, "l":8, "seed": 1111}
+  - {"m": 7168, "k": 2048, "l":4, "seed": 1111}
+
+ranking_by: "geom"
diff --git a/problems/nvidia/nvfp4_gemv/template.py b/problems/nvidia/nvfp4_gemv/template.py
new file mode 100644
index 00000000..299c527d
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemv/template.py
@@ -0,0 +1,25 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Template for an NVFP4 block-scaled GEMV submission
+    Args:
+        data: Tuple that expands to:
+            a: torch.Tensor[float4_e2m1fn_x2] of shape [m, k, l],
+            b: torch.Tensor[float4_e2m1fn_x2] of shape [1, k, l] (generate_input pads the first dim to 128),
+            sfa: torch.Tensor[float8_e4m3fn] of shape [m, k // 16, l], used by reference implementation
+            sfb: torch.Tensor[float8_e4m3fn] of shape [1, k // 16, l] (first dim padded to 128), used by reference implementation
+            sfa_permuted: torch.Tensor[float8_e4m3fn] of shape [32, 4, rest_m, 4, rest_k, l],
+            sfb_permuted: torch.Tensor[float8_e4m3fn] of shape [32, 4, rest_n, 4, rest_k, l],
+            c: torch.Tensor[float16] of shape [m, 1, l]
+    Returns:
+        Tensor containing output in float16
+        c: torch.Tensor[float16] of shape [m, 1, l]
+    """
+    # c has shape [m, 1, l] (a permuted view of an [l, m, 1] buffer); it is pre-allocated to avoid timing allocation overhead.
+    a, b, sfa, sfb, sfa_permuted, sfb_permuted, c = data
+
+    # Your implementation here
+
+    return c
diff --git a/problems/nvidia/utils.py b/problems/nvidia/utils.py
new file mode 100644
index 00000000..187123d6
--- /dev/null
+++ b/problems/nvidia/utils.py
@@ -0,0 +1,176 @@
+import os
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):  # seeds Python `random`, NumPy and torch with the same value
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)  # current CUDA device
+        torch.cuda.manual_seed_all(seed)  # every CUDA device
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Return "cuda" if available, else "mps", else "cpu"; always "cpu" when use_cuda is False."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Compare two tensors element-wise within a tolerance and describe any mismatches (nothing is raised).
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+    list[str]: Empty if everything matches, ["SIZE MISMATCH"] if the shapes differ, otherwise mismatch descriptions.
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received - expected)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Check whether two tensors are element-wise exactly equal and describe any mismatches (nothing is raised).
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings with detailed error information
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
+    """
+    Default `check_implementation`: runs `reference(data)` and compares `output` to it with verbose_allclose.
+    """
+    expected = reference(data)
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return False, "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return True, ''  # empty message on success
+
+
+def make_match_reference(reference: callable, **kwargs):  # binds `reference` and tolerances into a (data, output) checker
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
+
+
+class DeterministicContext:  # context manager that switches torch/cuDNN to deterministic settings inside `with`
+    def __init__(self):
+        self.allow_tf32 = None
+        self.deterministic = None
+        self.cublas = None
+
+    def __enter__(self):
+        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '')  # '' when the variable is unset
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        torch.use_deterministic_algorithms(True)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+        torch.use_deterministic_algorithms(False)  # NOTE(review): always disables; the pre-entry setting is not restored
+        os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas  # NOTE(review): leaves the variable set to '' if it was unset on entry
+
+def clear_l2_cache():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # write stuff to
+    dummy.fill_(42)
+    del dummy

From 6fc63234e5ab5bd895a22e8c4a50bd663955dfff Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Mon, 10 Nov 2025 18:57:56 +0100
Subject: [PATCH 142/207] Fix: deadline

---
 problems/nvidia.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 867d6687..22ddd562 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -1,12 +1,12 @@
 name: NVIDIA Blackwell NVFP4 Kernel Hackathon
 
-deadline: 2026-02-14
+deadline: "2026-02-14"
 
 description: "NVIDIA Blackwell NVFP4 Kernel Hackathon"
 
 problems:
   - directory: nvidia/nvfp4_gemv
     name: nvfp4_gemv
-    deadline: 2026-11-28
+    deadline: "2026-11-28"
     gpus:
       - NVIDIA

From 4fe41aa3935802e7ca1a5108484ddfc290425ce7 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Mon, 10 Nov 2025 21:01:38 +0100
Subject: [PATCH 143/207] Fix: deadline + l2_cache

---
 problems/nvidia.yaml     | 2 +-
 problems/nvidia/utils.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 22ddd562..64ca9f71 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -7,6 +7,6 @@ description: "NVIDIA Blackwell NVFP4 Kernel Hackathon"
 problems:
   - directory: nvidia/nvfp4_gemv
     name: nvfp4_gemv
-    deadline: "2026-11-28"
+    deadline: "2025-11-28"
     gpus:
       - NVIDIA
diff --git a/problems/nvidia/utils.py b/problems/nvidia/utils.py
index 187123d6..7997d3db 100644
--- a/problems/nvidia/utils.py
+++ b/problems/nvidia/utils.py
@@ -170,7 +170,5 @@ def clear_l2_cache():
     # import cupy as cp
     # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
     # create a large dummy tensor
-    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
-    # write stuff to
-    dummy.fill_(42)
+    dummy = torch.randn((1024, 1024, 1024), device="cuda")
     del dummy

From 062187947b4959c317bc6d2f88404ef29f35d2dd Mon Sep 17 00:00:00 2001
From: Erik Schultheis <erik.schultheis@aalto.fi>
Date: Mon, 10 Nov 2025 22:02:15 +0100
Subject: [PATCH 144/207] set a minimum benchmark time

---
 problems/nvidia/eval.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/nvidia/eval.py b/problems/nvidia/eval.py
index ed370157..1fe58d33 100644
--- a/problems/nvidia/eval.py
+++ b/problems/nvidia/eval.py
@@ -254,8 +254,8 @@ def _run_single_benchmark(
         del output
         durations.append(duration)
 
-        if i > 1:
-            total_bm_duration = time.perf_counter_ns() - bm_start_time
+        total_bm_duration = time.perf_counter_ns() - bm_start_time
+        if i > 1 and total_bm_duration > 1e8:       # at least 2 runs, and at least 100 ms total time
             stats = calculate_stats(durations)
             # stop if either
             # a) relative error dips below 0.1%

From 9f0b15098cc4372414f838a63ab9bc10ecbdbe29 Mon Sep 17 00:00:00 2001
From: Vicki Wang <vickiw@nvidia.com>
Date: Tue, 11 Nov 2025 05:47:39 -0800
Subject: [PATCH 145/207] add nvfp4 dual_gemm example

---
 problems/nvidia/nvfp4_dual_gemm/eval.py       | 500 +++++++++
 problems/nvidia/nvfp4_dual_gemm/reference.py  | 194 ++++
 problems/nvidia/nvfp4_dual_gemm/submission.py | 957 ++++++++++++++++++
 problems/nvidia/nvfp4_dual_gemm/task.py       |  11 +
 problems/nvidia/nvfp4_dual_gemm/template.py   |  28 +
 problems/nvidia/nvfp4_dual_gemm/utils.py      | 172 ++++
 6 files changed, 1862 insertions(+)
 create mode 100644 problems/nvidia/nvfp4_dual_gemm/eval.py
 create mode 100644 problems/nvidia/nvfp4_dual_gemm/reference.py
 create mode 100644 problems/nvidia/nvfp4_dual_gemm/submission.py
 create mode 100644 problems/nvidia/nvfp4_dual_gemm/task.py
 create mode 100644 problems/nvidia/nvfp4_dual_gemm/template.py
 create mode 100644 problems/nvidia/nvfp4_dual_gemm/utils.py

diff --git a/problems/nvidia/nvfp4_dual_gemm/eval.py b/problems/nvidia/nvfp4_dual_gemm/eval.py
new file mode 100644
index 00000000..e8bb5b21
--- /dev/null
+++ b/problems/nvidia/nvfp4_dual_gemm/eval.py
@@ -0,0 +1,500 @@
+import base64
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+import tempfile
+
+import torch.cuda
+from cutlass.cute.nvgpu.common import OpError
+
+from utils import set_seed, clear_l2_cache
+
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:  # context manager that writes "key: value" lines to a given file descriptor
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, "w")
+        os.set_inheritable(fd, False)  # mark the descriptor as non-inheritable
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)  # flushed on every call
+
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict  # parsed key -> value pairs, passed as keyword arguments to generate_input
+    spec: str  # the original, unparsed line from the test file
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor-pairing-style construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a + b) * (a + b + 1) // 2)  # a plus the (a + b)-th triangular number
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    try:  # parses one test per line, each line being ';'-separated "key: value" pairs
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        exit(113)  # exit code 113: the test file is unreadable or malformed
+
+    tests = []
+    lines = content.splitlines()
+    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"  # alphabetic key, then an alphabetic word or a signed integer
+    for line in lines:
+        parts = line.split(";")
+        case = {}
+        for part in parts:
+            matched = re.match(match, part)  # groups are read from this prefix match ...
+            if not re.fullmatch(match, part):  # ... but validity requires the whole part to match
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                exit(113)
+            key = matched[1]
+            val = matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass  # non-numeric values stay strings
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:  # mix the global seed into every per-test "seed" entry
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int  # number of durations aggregated
+    mean: float  # arithmetic mean of the durations
+    std: float  # sample standard deviation (divides by runs - 1)
+    err: float  # standard error of the mean: std / sqrt(runs)
+    best: float  # smallest duration
+    worst: float  # largest duration
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a list of durations (needs at least two: the variance divides by runs - 1).
+
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    variance = sum(map(lambda x: (x - avg) ** 2, durations))  # sum of squared deviations (not yet normalized)
+    std = math.sqrt(variance / (runs - 1))  # sample standard deviation
+    err = std / math.sqrt(runs)  # standard error of the mean
+
+    return Stats(
+        runs=runs, mean=avg, std=std, err=err, best=float(best), worst=float(worst)
+    )
+
+
+def _clone_data(data):
+    """
+    Recursively walks tuples, lists and dicts, cloning every tensor; any other value is returned as-is.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        return data.clone()
+    else:
+        return data
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case and returns the (good, message) pair. Do not call directly
+    """
+    from submission import custom_kernel
+
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    try:
+        submission_output = custom_kernel(_clone_data(data))  # the kernel gets a clone; `data` is kept for the check
+
+    except OpError as E:
+        print(f"Encountered {E}", file=sys.stderr)
+        return False, str(E)
+    torch.cuda.synchronize()
+    return check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs `_run_single_test` for this test via `pool.apply` and returns its (good, message) result.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Compiles the kernel once, then executes the test cases and checks them for correctness.
+    @param logger: A PopcornOutput object used for logging test results.
+    @param pool: Pool in which the compilation and every test are run.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, 112 if compilation or any test fails.
+    """
+    # Step 1: Compile kernel once before running tests
+    logger.log("compile", "start")
+    compile_success, compile_error = pool.apply(_compile_kernel_once)
+    if not compile_success:
+        logger.log("compile", "fail")
+        logger.log("compile.error", compile_error)
+        return 112
+    logger.log("compile", "pass")
+
+    # Step 2: Run all tests with compiled kernel
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if not good:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _compile_kernel_once():
+    """
+    Call the submission's `compile_kernel` once, ahead of testing or benchmarking.
+    Returns (True, None) on success and (False, "Compilation failed: ...") if anything is raised.
+    """
+    from submission import compile_kernel
+
+    try:
+        # Trigger compilation (will be cached)
+        compile_kernel()
+        torch.cuda.synchronize()
+        return True, None
+    except OpError as E:
+        return False, f"Compilation failed: {E}"
+    except Exception as E:
+        return False, f"Compilation failed: {E}"
+
+
+def _run_single_benchmark(
+    test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float
+) -> Stats | Any:
+    """
+    Runs one benchmark; returns Stats on success or an error message string. Do not call directly.
+    """
+    from submission import custom_kernel, compile_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data)
+
+    # Ensure kernel is compiled before any timing (compilation is cached)
+    try:
+        compile_kernel()
+        torch.cuda.synchronize()
+    except OpError as E:
+        return f"Compilation failed: {E}"
+    except Exception as E:
+        return f"Compilation failed: {E}"
+
+    #  first, one obligatory correctness check
+    try:
+        output = custom_kernel(_clone_data(data))
+    except OpError as E:
+        return f"Encountered {E}"
+    good, message = check_implementation(check_copy, output)
+    if not good:
+        return message
+
+    # now, do multiple timing runs (re-checking correctness only when `recheck` is set)
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until the summed kernel time exceeds `max_time_ns`,
+    # the wallclock time exceeds 2 minutes, or the relative error of the mean is below 0.1%.
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data)
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        clear_l2_cache()
+
+        start_event.record()
+        output = custom_kernel(data)  # the timed call receives `data` itself, not a clone
+        end_event.record()
+        torch.cuda.synchronize()
+        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
+
+        if recheck:
+            good, message = check_implementation(check_copy, output)
+            if not good:
+                return message
+
+        del output
+        durations.append(duration)
+
+        if i > 1:
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if (
+                stats.err / stats.mean < 0.001
+                or stats.mean * stats.runs > max_time_ns
+                or total_bm_duration > 120e9
+            ):
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(
+    pool: multiprocessing.Pool,
+    test: TestCase,
+    recheck: bool,
+    max_repeats: int,
+    max_time_ns: float,
+):
+    """
+    Benchmark one test case in the worker pool and hand back whatever the worker produced.
+    @param pool: Pool whose worker process executes the benchmark (blocks until it finishes).
+    @param test: TestCase describing the input to generate.
+    @param recheck: If True, regenerate the input and re-verify the output on every timed run.
+    @param max_repeats: Upper bound on the number of timed runs.
+    @param max_time_ns: Budget for the accumulated kernel time, in nanoseconds.
+    @return: A Stats object on success, otherwise an error message describing the failure.
+    """
+    worker_args = (test, recheck, max_repeats, max_time_ns)
+    return pool.apply(_run_single_benchmark, worker_args)
+
+
+def run_benchmarking(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Compile the submission, warm it up, then benchmark every test case and log the results.
+
+    @param logger: PopcornOutput that receives the compile status and per-benchmark entries.
+    @param pool: Pool whose worker process runs compilation and the benchmarks.
+    @param tests: TestCase objects to benchmark; the first one doubles as the warm-up case.
+    @return: 0 if compilation and all benchmarks succeed, otherwise 112.
+    """
+    # Step 1: compile once in the worker so build time never lands in a measurement
+    logger.log("compile", "start")
+    compile_success, compile_error = pool.apply(_compile_kernel_once)
+    if not compile_success:
+        logger.log("compile", "fail")
+        logger.log("compile.error", compile_error)
+        return 112
+    logger.log("compile", "pass")
+
+    # Step 2: short warm-up on the first case; its result is deliberately discarded
+    run_single_benchmark(pool, tests[0], False, 200, 10e7)
+
+    # Step 3: timed runs, one Stats record (or one error entry) per test case
+    logger.log("benchmark-count", len(tests))
+    failures = 0
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 200, 10e9)
+        if not isinstance(result, Stats):
+            failures += 1
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+            continue
+        for field in dataclasses.fields(Stats):
+            logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+
+    # overall verdict: a single failed benchmark fails the whole run
+    if failures:
+        logger.log("check", "fail")
+        return 112
+    logger.log("check", "pass")
+    return 0
+
+
+def run_single_profile(test: TestCase) -> str:
+    """
+    Profile one run of the submission on `test` and return the torch.profiler summary table.
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, record_function, ProfilerActivity
+
+    inputs = generate_input(**test.args)
+    torch.cuda.synchronize()  # wait for pending GPU work before profiling starts
+    traced = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
+    with profile(activities=traced) as prof:
+        _result = custom_kernel(_clone_data(inputs))  # the kernel gets a clone of the inputs
+        torch.cuda.synchronize()
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
+    """Profile every test case in this process and log each report base64-encoded."""
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        raw_report = run_single_profile(test).encode("utf-8")
+        # altchars b"+*": '+' stays '+', while '/' is emitted as '*'
+        encoded_report = base64.b64encode(raw_report, b"+*").decode("utf-8")
+        logger.log(f"benchmark.{idx}.report", encoded_report)
+    logger.log("check", "pass")
+    return 0
+
+
+def main():
+    """
+    Command-line entry point, invoked as `<script> <mode> <task.yml>`.
+
+    Reads POPCORN_FD (descriptor handed to PopcornOutput) and the optional POPCORN_SEED from
+    the environment, loads the test specs for `mode` from the task YAML and runs them.
+    @return: 111 if POPCORN_FD is unset; 2 on missing arguments or an unknown mode; the runner's
+             status for "test"/"benchmark"; 112 if the "leaderboard" compile step fails; None
+             (exit status 0) otherwise, i.e. for "profile" and a completed "leaderboard" run.
+    """
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    # TODO: Implement script mode. Unknown modes are rejected before any work is done; they
+    # used to crash on an unbound `tests_str` (leaking the temp file) before reaching `return 2`.
+    if mode not in ("test", "benchmark", "leaderboard", "profile"):
+        return 2
+
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+
+    def build_test_string(tests: list[dict]):
+        return "".join(
+            "; ".join(f"{k}: {v}" for k, v in test.items()) + "\n" for test in tests
+        )
+
+    import yaml
+    # close the task file deterministically instead of leaking the handle
+    with open(sys.argv[2], "r") as task_file:
+        yaml_content = yaml.safe_load(task_file)
+    section = "tests" if mode == "test" else "benchmarks"
+    tests_str = build_test_string(yaml_content.get(section, []))
+
+    # get_test_cases takes a file name: use a temp file, removed even if writing/parsing fails
+    tmp = tempfile.NamedTemporaryFile(delete=False)
+    try:
+        with tmp:
+            tmp.write(tests_str.encode("utf-8"))
+            tmp.flush()
+        tests = get_test_cases(tmp.name, seed)
+    finally:
+        os.unlink(tmp.name)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+
+        mp_context = multiprocessing.get_context("spawn")
+        with mp_context.Pool(1) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                # Step 1: Compile kernel once (outside of timing)
+                logger.log("compile", "start")
+                compile_success, compile_error = pool.apply(_compile_kernel_once)
+                if not compile_success:
+                    logger.log("compile", "fail")
+                    logger.log("compile.error", compile_error)
+                    return 112
+                logger.log("compile", "pass")
+
+                # Step 2: Warmup with compiled kernel
+                run_single_benchmark(pool, tests[0], False, 200, 1e7)
+
+                # Step 3: Run leaderboard benchmarks (compilation time excluded)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i, test in enumerate(tests):
+                    result = run_single_benchmark(pool, test, True, 200, 30e9)
+                    logger.log(f"benchmark.{i}.spec", test.spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name))
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        # TODO: Make sure result implements __str__?
+                        logger.log(f"benchmark.{i}.error", str(result))
+                        break
+
+                logger.log("check", "pass" if passed else "fail")
+            elif mode == "profile":
+                run_profiling(logger, tests)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/problems/nvidia/nvfp4_dual_gemm/reference.py b/problems/nvidia/nvfp4_dual_gemm/reference.py
new file mode 100644
index 00000000..e4835979
--- /dev/null
+++ b/problems/nvidia/nvfp4_dual_gemm/reference.py
@@ -0,0 +1,194 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+# Number of consecutive elements along K that share one scale factor
+sf_vec_size = 16
+
+# Ceiling division for integers: smallest q with q * b >= a (for positive b)
+def ceil_div(a, b):
+    return (b + a - 1) // b
+
+# Helper function to convert scale factor tensor to blocked format
+def to_blocked(input_matrix):
+    """
+    Rearrange a (rows, cols) scale-factor matrix into 128 x 4 tiles, each stored as 32 x 16.
+    rows / cols must be multiples of 128 / 4: nothing is padded, so other shapes fail in view().
+    """
+    rows, cols = input_matrix.shape
+    row_tiles, col_tiles = ceil_div(rows, 128), ceil_div(cols, 4)
+    # (row_tile, 128, col_tile, 4) -> (row_tile, col_tile, 128, 4): one 128 x 4 tile per block
+    tiles = input_matrix.view(row_tiles, 128, col_tiles, 4).permute(0, 2, 1, 3)
+    # split the 128 rows as 4 x 32, bring the 32 to the front, merge (4, 4) into 16 columns
+    interleaved = tiles.reshape(-1, 4, 32, 4).transpose(1, 2)
+    return interleaved.reshape(-1, 32, 16).flatten()
+
+
+def ref_kernel(
+    data: input_t,
+) -> output_t:
+    """
+    PyTorch reference implementation of NVFP4 block-scaled dual GEMM with silu activation,
+    C = silu(A @ B1) * (A @ B2), computed one batch slice at a time with torch._scaled_mm.
+
+    @param data: Tuple in the order returned by generate_input. The pre-permuted scale factors
+                 are ignored here and the incoming c tensor is only used for its (m, n, l) shape.
+    @return: New float16 CUDA tensor of shape (m, n, l).
+    """
+    a, b1, b2, sfa_cpu, sfb1_cpu, sfb2_cpu, _, _, _, c_in = data
+
+    # Get dimensions from MxNxL layout
+    m, n, l = c_in.shape
+
+    def empty_result():
+        # float32 buffer allocated as (l, m, n) and exposed as an (m, n, l) view
+        return torch.empty(
+            (l, m, n),
+            dtype=torch.float32,
+            device="cuda",
+        ).permute(1, 2, 0)
+
+    def scaled_gemm(a_l, b_l, scale_a, scale_b):
+        # (m, k) @ (n, k).T -> (m, n)
+        return torch._scaled_mm(
+            a_l,
+            b_l.transpose(0, 1),
+            scale_a.cuda(),
+            scale_b.cuda(),
+            bias=None,
+            out_dtype=torch.float32,
+        )
+
+    gemm1 = empty_result()
+    gemm2 = empty_result()
+
+    for l_idx in range(l):
+        # Convert the scale factor tensors of this slice to blocked format
+        scale_a = to_blocked(sfa_cpu[:, :, l_idx])
+        scale_b1 = to_blocked(sfb1_cpu[:, :, l_idx])
+        scale_b2 = to_blocked(sfb2_cpu[:, :, l_idx])
+        a_l = a[:, :, l_idx]
+        # first GEMM: A @ B1, second GEMM: A @ B2 (both share A and its scale factors)
+        gemm1[:, :, l_idx] = scaled_gemm(a_l, b1[:, :, l_idx], scale_a, scale_b1)
+        gemm2[:, :, l_idx] = scaled_gemm(a_l, b2[:, :, l_idx], scale_a, scale_b2)
+
+    # Do silu on the first GEMM result and multiply with the second GEMM result
+    gated = torch.nn.functional.silu(gemm1) * gemm2
+    return gated.to(torch.float16)
+
+
+def generate_input(
+    m: int,
+    n: int,
+    k: int,
+    l: int,
+    seed: int,
+):
+    """
+    Generate input tensors for NVFP4 block-scaled dual GEMM with silu activation,
+    C = silu(A @ B1) * (A @ B2).
+    
+    Args:
+        m: Number of rows in matrix A
+        n: Number of columns in matrix B1 and B2
+        k: Number of columns in A and rows of B1 and B2
+        l: Batch size
+        seed: Random seed for reproducibility (passed to torch.manual_seed)
+    
+    Returns:
+        Tuple of (a, b1, b2, scale_a, scale_b1, scale_b2, scale_a_permuted, scale_b1_permuted, scale_b2_permuted, c) where:
+            a: [m, k // 2, l] - random int8 bytes viewed as torch.float4_e2m1fn_x2, on CUDA
+            b1: [n, k // 2, l] - random int8 bytes viewed as torch.float4_e2m1fn_x2, on CUDA
+            b2: [n, k // 2, l] - random int8 bytes viewed as torch.float4_e2m1fn_x2, on CUDA
+            scale_a: [m, sf_k, l] with sf_k = ceil_div(k, 16) - torch.float8_e4m3fn scale factors, CPU copy
+            scale_b1: [n, sf_k, l] - torch.float8_e4m3fn scale factors, CPU copy
+            scale_b2: [n, sf_k, l] - torch.float8_e4m3fn scale factors, CPU copy
+            scale_a_permuted: [32, 4, rest_m, 4, rest_k, l] - scale_a values reordered, torch.float8_e4m3fn, on CUDA
+            scale_b1_permuted: [32, 4, rest_n, 4, rest_k, l] - scale_b1 values reordered, torch.float8_e4m3fn, on CUDA
+            scale_b2_permuted: [32, 4, rest_n, 4, rest_k, l] - scale_b2 values reordered, torch.float8_e4m3fn, on CUDA
+            c: [m, n, l] - torch.float16 tensor filled with torch.randn values, on CUDA
+    """
+    torch.manual_seed(seed)
+    
+    # Draw random int8 bytes in [-6, 6), then reinterpret them as float4_e2m1fn_x2 via view()
+    a_ref = torch.randint(
+        -6, 6, (l, m, k // 2), dtype=torch.int8, device="cuda"
+    ).permute(1, 2, 0)
+    b1_ref = torch.randint(
+        -6, 6, (l, n, k // 2), dtype=torch.int8, device="cuda"
+    ).permute(1, 2, 0)
+    b2_ref = torch.randint(
+        -6, 6, (l, n, k // 2), dtype=torch.int8, device="cuda"
+    ).permute(1, 2, 0)
+    a_ref = a_ref.view(torch.float4_e2m1fn_x2)
+    b1_ref = b1_ref.view(torch.float4_e2m1fn_x2)
+    b2_ref = b2_ref.view(torch.float4_e2m1fn_x2)
+
+    # Create float16 output tensor, allocated as (l, m, n) and exposed as (m, n, l)
+    c_ref = torch.randn((l, m, n), dtype=torch.float16, device="cuda").permute(
+        1, 2, 0
+    )
+    
+    # Helper function to prepare the scale factor tensors for both the reference
+    # kernel and the custom kernel. The customized data layout can be found in:
+    # https://docs.nvidia.com/cuda/cublas/index.html?highlight=fp4#d-block-scaling-factors-layout
+    def create_scale_factor_tensors(l, mn, sf_k):
+        # Create the reference scale factor tensor (mn, sf_k, l); built on the GPU, returned as a CPU copy.
+        ref_shape = (l, mn, sf_k)
+        ref_permute_order = (1, 2, 0)
+        # Draw random int8 values in [-3, 3), then convert them to float8_e4m3fn
+        ref_f8_random_int = torch.randint(-3, 3, ref_shape, dtype=torch.int8, device='cuda')
+        ref_f8_torch_tensor = ref_f8_random_int.to(dtype=torch.float8_e4m3fn)
+        # permute (l, mn, sf_k) -> (mn, sf_k, l)
+        ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)
+
+        atom_m = (32, 4)
+        atom_k = 4
+        mma_shape = (
+            l,  # batch size
+            ceil_div(mn, atom_m[0] * atom_m[1]),
+            ceil_div(sf_k, atom_k),
+            atom_m[0],
+            atom_m[1],
+            atom_k,
+        )
+
+        # Reorder scale factor tensor to (32, 4, rest_m, 4, rest_k, l) layout
+        # Which is needed by the CuTe customized kernel
+        mma_permute_order = (3, 4, 1, 5, 2, 0)
+        # Random int8 -> float8_e4m3fn fill; every in-range (i, j, b) slot is overwritten below, padding slots keep it
+        rand_int_tensor = torch.randint(-3, 3, mma_shape, dtype=torch.int8, device='cuda')
+        reordered_f8_torch_tensor = rand_int_tensor.to(dtype=torch.float8_e4m3fn)
+        # Permute according to mma_permute_order
+        reordered_f8_torch_tensor = reordered_f8_torch_tensor.permute(*mma_permute_order)
+
+        # GPU-side vectorized reordering (replaces slow CPU nested loops)
+        # Create index vectors for the mn, sf_k and batch dimensions
+        i_idx = torch.arange(mn, device='cuda')
+        j_idx = torch.arange(sf_k, device='cuda')
+        b_idx = torch.arange(l, device='cuda')
+        
+        # Create meshgrid for all combinations of (i, j, b)
+        i_grid, j_grid, b_grid = torch.meshgrid(i_idx, j_idx, b_idx, indexing='ij')
+        
+        # Split i into (128-row tile, row % 32, (row % 128) // 32) and j into (4-column tile, column % 4)
+        mm = i_grid // (atom_m[0] * atom_m[1])
+        mm32 = i_grid % atom_m[0]
+        mm4 = (i_grid % 128) // atom_m[0]
+        kk = j_grid // atom_k
+        kk4 = j_grid % atom_k
+        
+        # Perform the reordering with advanced indexing (all on GPU)
+        reordered_f8_torch_tensor[mm32, mm4, mm, kk4, kk, b_grid] = ref_f8_torch_tensor_permuted[i_grid, j_grid, b_grid]
+        
+        return ref_f8_torch_tensor_permuted.cpu(), reordered_f8_torch_tensor
+
+    sf_k = ceil_div(k, sf_vec_size)
+    sfa_ref_cpu, sfa_ref_permuted = create_scale_factor_tensors(l, m, sf_k)
+    sfb1_ref_cpu, sfb1_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
+    sfb2_ref_cpu, sfb2_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
+
+    return (a_ref, b1_ref, b2_ref, sfa_ref_cpu, sfb1_ref_cpu, sfb2_ref_cpu, sfa_ref_permuted, sfb1_ref_permuted, sfb2_ref_permuted, c_ref)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-03, atol=1e-03)  # presumably compares a submission's output with ref_kernel's within these tolerances - confirm in utils
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_dual_gemm/submission.py b/problems/nvidia/nvfp4_dual_gemm/submission.py
new file mode 100644
index 00000000..f733212d
--- /dev/null
+++ b/problems/nvidia/nvfp4_dual_gemm/submission.py
@@ -0,0 +1,957 @@
+from torch._higher_order_ops.torchbind import call_torchbind_fake
+import cuda.bindings.driver as cuda
+
+import torch
+from task import input_t, output_t
+
+import cutlass
+import cutlass.cute as cute
+import cutlass.utils as utils
+import cutlass.pipeline as pipeline
+from cutlass.cute.nvgpu import cpasync, tcgen05
+import cutlass.torch as cutlass_torch
+import cutlass.utils.blackwell_helpers as sm100_utils
+import cutlass.utils.blockscaled_layout as blockscaled_utils
+from cutlass.cute.runtime import make_ptr
+
+# Kernel configuration parameters
+# MMA tile shape as (M, N, K)
+mma_tiler_mnk= (128, 128, 256)  
+# Shape of the K dimension for the MMA instruction
+mma_inst_shape_k = 64
+# FP4 (E2M1) element type of A, B1 and B2
+ab_dtype = cutlass.Float4E2M1FN  
+# FP8 (E4M3) element type of the scale factors
+sf_dtype = cutlass.Float8E4M3FN  
+# FP16 element type of the output C
+c_dtype = cutlass.Float16  
+# Scale factor block size (16 elements share one scale)
+sf_vec_size = 16  
+# Number of threads per CUDA thread block
+threads_per_cta = 128  
+# Number of pipeline stages: accumulator (tmem) buffers and A/B shared-memory buffers
+num_acc_stage = 1
+num_ab_stage = 1
+# Number of tensor-memory columns requested from the tmem allocator
+num_tmem_alloc_cols = 512
+
+
+# Ceiling division for integers: smallest q with q * b >= a (for positive b)
+def ceil_div(a, b):
+    return (b + a - 1) // b
+
+
+#  GPU device kernel
+@cute.kernel
+def kernel(
+    tiled_mma: cute.TiledMma,
+    tma_atom_a: cute.CopyAtom,
+    mA_mkl: cute.Tensor,
+    tma_atom_b1: cute.CopyAtom,
+    mB_nkl1: cute.Tensor,
+    tma_atom_b2: cute.CopyAtom,
+    mB_nkl2: cute.Tensor,
+    tma_atom_sfa: cute.CopyAtom,
+    mSFA_mkl: cute.Tensor,
+    tma_atom_sfb1: cute.CopyAtom,
+    mSFB_nkl1: cute.Tensor,
+    tma_atom_sfb2: cute.CopyAtom,
+    mSFB_nkl2: cute.Tensor,
+    mC_mnl: cute.Tensor,
+    a_smem_layout_staged: cute.ComposedLayout,
+    b_smem_layout_staged: cute.ComposedLayout,
+    sfa_smem_layout_staged: cute.Layout,
+    sfb_smem_layout_staged: cute.Layout,
+    num_tma_load_bytes: cutlass.Constexpr[int],
+    epilogue_op: cutlass.Constexpr = lambda x: x
+    * (1.0 / (1.0 + cute.math.exp(-x, fastmath=True))),
+):
+    """
+    GPU device kernel performing the batched GEMM computation.
+    """
+    warp_idx = cute.arch.warp_idx()
+    warp_idx = cute.arch.make_warp_uniform(warp_idx)
+    tidx = cute.arch.thread_idx()
+
+    #
+    # Setup cta/thread coordinates
+    #
+    # Coords inside cluster
+    bidx, bidy, bidz = cute.arch.block_idx()
+    mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape)
+
+    # Coords outside cluster
+    cta_coord = (bidx, bidy, bidz)
+    mma_tile_coord_mnl = (
+        cta_coord[0] // cute.size(tiled_mma.thr_id.shape),
+        cta_coord[1],
+        cta_coord[2],
+    )
+    # Coord inside cta
+    tidx, _, _ = cute.arch.thread_idx()
+
+    #
+    # Define shared storage for kernel
+    #
+    @cute.struct
+    class SharedStorage:
+        ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_ab_stage * 2]
+        acc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_acc_stage * 2]
+        tmem_holding_buf: cutlass.Int32
+
+    smem = utils.SmemAllocator()
+    storage = smem.allocate(SharedStorage)
+    # (MMA, MMA_M, MMA_K, STAGE)
+    sA = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=a_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=a_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sB1 = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=b_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=b_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sB2 = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=b_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=b_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_M, MMA_K, STAGE)
+    sSFA = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfa_smem_layout_staged,
+        byte_alignment=128,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sSFB1 = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfb_smem_layout_staged,
+        byte_alignment=128,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sSFB2 = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfb_smem_layout_staged,
+        byte_alignment=128,
+    )
+
+    #
+    # Initialize mainloop ab_pipeline, acc_pipeline and their states
+    #
+    ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
+    ab_pipeline_consumer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, 1)
+    ab_producer, ab_consumer = pipeline.PipelineTmaUmma.create(
+        barrier_storage=storage.ab_mbar_ptr.data_ptr(),
+        num_stages=num_ab_stage,
+        producer_group=ab_pipeline_producer_group,
+        consumer_group=ab_pipeline_consumer_group,
+        tx_count=num_tma_load_bytes,
+    ).make_participants()
+    acc_producer, acc_consumer = pipeline.PipelineUmmaAsync.create(
+        barrier_storage=storage.acc_mbar_ptr.data_ptr(),
+        num_stages=num_acc_stage,
+        producer_group=ab_pipeline_producer_group,
+        consumer_group=pipeline.CooperativeGroup(
+            pipeline.Agent.Thread,
+            threads_per_cta,
+        ),
+    ).make_participants()
+
+    #
+    # Local_tile partition global tensors
+    #
+    # (bM, bK, RestM, RestK, RestL)
+    gA_mkl = cute.local_tile(
+        mA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gB_nkl1 = cute.local_tile(
+        mB_nkl1, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gB_nkl2 = cute.local_tile(
+        mB_nkl2, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    gSFA_mkl = cute.local_tile(
+        mSFA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    gSFB_nkl1 = cute.local_tile(
+        mSFB_nkl1, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gSFB_nkl2 = cute.local_tile(
+        mSFB_nkl2, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # (bM, bN, RestM, RestN, RestL)
+    gC_mnl = cute.local_tile(
+        mC_mnl, cute.slice_(mma_tiler_mnk, (None, None, 0)), (None, None, None)
+    )
+    k_tile_cnt = cute.size(gA_mkl, mode=[3])
+
+    #
+    # Partition global tensor for TiledMMA_A/B/SFA/SFB/C
+    #
+    # (MMA, MMA_M, MMA_K, RestK)
+    thr_mma = tiled_mma.get_slice(mma_tile_coord_v)
+    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
+    tCgA = thr_mma.partition_A(gA_mkl)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgB1 = thr_mma.partition_B(gB_nkl1)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgB2 = thr_mma.partition_B(gB_nkl2)
+    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
+    tCgSFA = thr_mma.partition_A(gSFA_mkl)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgSFB1 = thr_mma.partition_B(gSFB_nkl1)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgSFB2 = thr_mma.partition_B(gSFB_nkl2)
+    # (MMA, MMA_M, MMA_N, RestM, RestN, RestL)
+    tCgC = thr_mma.partition_C(gC_mnl)
+
+    #
+    # Partition global/shared tensor for TMA load A/B/SFA/SFB
+    #
+    # TMA Partition_S/D for A
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestM, RestK, RestL)
+    tAsA, tAgA = cpasync.tma_partition(
+        tma_atom_a,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sA, 0, 3),
+        cute.group_modes(tCgA, 0, 3),
+    )
+    # TMA Partition_S/D for B1
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsB1, tBgB1 = cpasync.tma_partition(
+        tma_atom_b1,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sB1, 0, 3),
+        cute.group_modes(tCgB1, 0, 3),
+    )
+    # TMA Partition_S/D for B2
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsB2, tBgB2 = cpasync.tma_partition(
+        tma_atom_b2,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sB2, 0, 3),
+        cute.group_modes(tCgB2, 0, 3),
+    )
+    #  TMA Partition_S/D for SFA
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestM, RestK, RestL)
+    tAsSFA, tAgSFA = cpasync.tma_partition(
+        tma_atom_sfa,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFA, 0, 3),
+        cute.group_modes(tCgSFA, 0, 3),
+    )
+    tAsSFA = cute.filter_zeros(tAsSFA)
+    tAgSFA = cute.filter_zeros(tAgSFA)
+    # TMA Partition_S/D for SFB1
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsSFB1, tBgSFB1 = cpasync.tma_partition(
+        tma_atom_sfb1,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFB1, 0, 3),
+        cute.group_modes(tCgSFB1, 0, 3),
+    )
+    tBsSFB1 = cute.filter_zeros(tBsSFB1)
+    tBgSFB1 = cute.filter_zeros(tBgSFB1)
+    # TMA Partition_S/D for SFB2
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsSFB2, tBgSFB2 = cpasync.tma_partition(
+        tma_atom_sfb2,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFB2, 0, 3),
+        cute.group_modes(tCgSFB2, 0, 3),
+    )
+    tBsSFB2 = cute.filter_zeros(tBsSFB2)
+    tBgSFB2 = cute.filter_zeros(tBgSFB2)
+
+    #
+    # Partition shared/tensor memory tensor for TiledMMA_A/B/C
+    #
+    # (MMA, MMA_M, MMA_K, STAGE)
+    tCrA = tiled_mma.make_fragment_A(sA)
+    # (MMA, MMA_N, MMA_K, STAGE)
+    tCrB1 = tiled_mma.make_fragment_B(sB1)
+    # (MMA, MMA_N, MMA_K, STAGE)
+    tCrB2 = tiled_mma.make_fragment_B(sB2)
+    # (MMA, MMA_M, MMA_N)
+    acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2])
+    # (MMA, MMA_M, MMA_N)
+    tCtAcc_fake = tiled_mma.make_fragment_C(acc_shape)
+
+    #
+    # Alloc tensor memory buffer
+    # Make ACC1 and ACC2 tmem tensor
+    # ACC1 += A @ B1
+    # ACC2 += A @ B2
+    #
+    tmem_alloc_barrier = pipeline.NamedBarrier(
+        barrier_id=1,
+        num_threads=threads_per_cta,
+    )
+    tmem = utils.TmemAllocator(
+        storage.tmem_holding_buf,
+        barrier_for_retrieve=tmem_alloc_barrier,
+    )
+    tmem.allocate(num_tmem_alloc_cols)
+    tmem.wait_for_alloc()
+    acc_tmem_ptr = tmem.retrieve_ptr(cutlass.Float32)
+    tCtAcc1 = cute.make_tensor(acc_tmem_ptr, tCtAcc_fake.layout)
+    acc_tmem_ptr1 = cute.recast_ptr(
+        acc_tmem_ptr + tcgen05.find_tmem_tensor_col_offset(tCtAcc1),
+        dtype=cutlass.Float32,
+    )
+    tCtAcc2 = cute.make_tensor(acc_tmem_ptr1, tCtAcc_fake.layout)
+
+    #
+    # Make SFA/SFB1/SFB2 tmem tensor
+    #
+    # SFA tmem layout: (MMA, MMA_M, MMA_K)
+    tCtSFA_layout = blockscaled_utils.make_tmem_layout_sfa(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        cute.slice_(sfa_smem_layout_staged, (None, None, None, 0)),
+    )
+    # Get SFA tmem ptr
+    sfa_tmem_ptr = cute.recast_ptr(
+        acc_tmem_ptr
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2),
+        dtype=sf_dtype,
+    )
+    tCtSFA = cute.make_tensor(sfa_tmem_ptr, tCtSFA_layout)
+
+    # SFB1, SFB2 tmem layout: (MMA, MMA_N, MMA_K)
+    tCtSFB_layout = blockscaled_utils.make_tmem_layout_sfb(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        cute.slice_(sfb_smem_layout_staged, (None, None, None, 0)),
+    )
+    # Get SFB1 tmem ptr
+    sfb_tmem_ptr1 = cute.recast_ptr(
+        acc_tmem_ptr
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2)
+        + tcgen05.find_tmem_tensor_col_offset(tCtSFA),
+        dtype=sf_dtype,
+    )
+    tCtSFB1 = cute.make_tensor(sfb_tmem_ptr1, tCtSFB_layout)
+    # Get SFB2 tmem ptr
+    sfb_tmem_ptr2 = cute.recast_ptr(
+        acc_tmem_ptr
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2)
+        + tcgen05.find_tmem_tensor_col_offset(tCtSFA)
+        + tcgen05.find_tmem_tensor_col_offset(tCtSFB1),
+        dtype=sf_dtype,
+    )
+    tCtSFB2 = cute.make_tensor(sfb_tmem_ptr2, tCtSFB_layout)
+
+    #
+    # Partition for S2T copy of SFA/SFB1/SFB2
+    #
+    # Make S2T CopyAtom
+    copy_atom_s2t = cute.make_copy_atom(
+        tcgen05.Cp4x32x128bOp(tcgen05.CtaGroup.ONE),
+        sf_dtype,
+    )
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact = cute.filter_zeros(sSFA)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFA_compact = cute.filter_zeros(tCtSFA)
+    tiled_copy_s2t_sfa = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFA_compact)
+    thr_copy_s2t_sfa = tiled_copy_s2t_sfa.get_slice(0)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact_s2t_ = thr_copy_s2t_sfa.partition_S(tCsSFA_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfa, tCsSFA_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFA_compact_s2t = thr_copy_s2t_sfa.partition_D(tCtSFA_compact)
+
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFB1_compact = cute.filter_zeros(sSFB1)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFB1_compact = cute.filter_zeros(tCtSFB1)
+    tiled_copy_s2t_sfb = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFB1_compact)
+    thr_copy_s2t_sfb = tiled_copy_s2t_sfb.get_slice(0)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB1_compact_s2t_ = thr_copy_s2t_sfb.partition_S(tCsSFB1_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB1_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfb, tCsSFB1_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFB1_compact_s2t = thr_copy_s2t_sfb.partition_D(tCtSFB1_compact)
+
+    # SFB2 S2T copy and partition
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFB2_compact = cute.filter_zeros(sSFB2)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFB2_compact = cute.filter_zeros(tCtSFB2)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB2_compact_s2t_ = thr_copy_s2t_sfb.partition_S(tCsSFB2_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB2_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfb, tCsSFB2_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFB2_compact_s2t = thr_copy_s2t_sfb.partition_D(tCtSFB2_compact)
+
+    #
+    # Slice to per mma tile index
+    #
+    # ((atom_v, rest_v), RestK)
+    tAgA = tAgA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgB1 = tBgB1[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgB2 = tBgB2[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tAgSFA = tAgSFA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgSFB1 = tBgSFB1[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgSFB2 = tBgSFB2[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+
+    #
+    # Execute Data copy and Math computation in the k_tile loop
+    #
+    if warp_idx == 0:
+        # Wait for accumulator buffer empty
+        acc_empty = acc_producer.acquire_and_advance()
+        # Set ACCUMULATE field to False for the first k_tile iteration
+        tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
+        # Execute k_tile loop
+        for k_tile in range(k_tile_cnt):
+            # Wait for AB buffer empty
+            ab_empty = ab_producer.acquire_and_advance()
+
+            #  TMA load A/B1/B2/SFA/SFB1/SFB2 to shared memory
+            cute.copy(
+                tma_atom_a,
+                tAgA[(None, ab_empty.count)],
+                tAsA[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_b1,
+                tBgB1[(None, ab_empty.count)],
+                tBsB1[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_b2,
+                tBgB2[(None, ab_empty.count)],
+                tBsB2[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_sfa,
+                tAgSFA[(None, ab_empty.count)],
+                tAsSFA[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_sfb1,
+                tBgSFB1[(None, ab_empty.count)],
+                tBsSFB1[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_sfb2,
+                tBgSFB2[(None, ab_empty.count)],
+                tBsSFB2[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+
+            # Wait for AB buffer full
+            ab_full = ab_consumer.wait_and_advance()
+
+            #  Copy SFA/SFB1/SFB2 to tmem
+            s2t_stage_coord = (None, None, None, None, ab_full.index)
+            tCsSFA_compact_s2t_staged = tCsSFA_compact_s2t[s2t_stage_coord]
+            tCsSFB1_compact_s2t_staged = tCsSFB1_compact_s2t[s2t_stage_coord]
+            tCsSFB2_compact_s2t_staged = tCsSFB2_compact_s2t[s2t_stage_coord]
+            cute.copy(
+                tiled_copy_s2t_sfa,
+                tCsSFA_compact_s2t_staged,
+                tCtSFA_compact_s2t,
+            )
+            cute.copy(
+                tiled_copy_s2t_sfb,
+                tCsSFB1_compact_s2t_staged,
+                tCtSFB1_compact_s2t,
+            )
+            cute.copy(
+                tiled_copy_s2t_sfb,
+                tCsSFB2_compact_s2t_staged,
+                tCtSFB2_compact_s2t,
+            )
+
+            # tCtAcc1 += tCrA * tCrSFA * tCrB1 * tCrSFB1
+            # tCtAcc2 += tCrA * tCrSFA * tCrB2 * tCrSFB2
+            num_kblocks = cute.size(tCrA, mode=[2])
+            for kblock_idx in cutlass.range(num_kblocks, unroll_full=True):
+                kblock_coord = (
+                    None,
+                    None,
+                    kblock_idx,
+                    ab_full.index,
+                )
+
+                # Set SFA/SFB tensor to tiled_mma
+                sf_kblock_coord = (None, None, kblock_idx)
+                tiled_mma.set(
+                    tcgen05.Field.SFA,
+                    tCtSFA[sf_kblock_coord].iterator,
+                )
+                tiled_mma.set(
+                    tcgen05.Field.SFB,
+                    tCtSFB1[sf_kblock_coord].iterator,
+                )
+                cute.gemm(
+                    tiled_mma,
+                    tCtAcc1,
+                    tCrA[kblock_coord],
+                    tCrB1[kblock_coord],
+                    tCtAcc1,
+                )
+
+                tiled_mma.set(
+                    tcgen05.Field.SFB,
+                    tCtSFB2[sf_kblock_coord].iterator,
+                )
+                cute.gemm(
+                    tiled_mma,
+                    tCtAcc2,
+                    tCrA[kblock_coord],
+                    tCrB2[kblock_coord],
+                    tCtAcc2,
+                )
+
+                # Enable accumulate on tCtAcc1/tCtAcc2 after first kblock
+                tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+
+            # Async arrive AB buffer empty
+            ab_full.release()
+        acc_empty.commit()
+
+    #
+    # Epilogue
+    # Partition for epilogue
+    #
+    op = tcgen05.Ld32x32bOp(tcgen05.Repetition.x128, tcgen05.Pack.NONE)
+    copy_atom_t2r = cute.make_copy_atom(op, cutlass.Float32)
+    tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tCtAcc1)
+    thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_tAcc1 = thr_copy_t2r.partition_S(tCtAcc1)
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_tAcc2 = thr_copy_t2r.partition_S(tCtAcc2)
+    # (T2R_M, T2R_N, EPI_M, EPI_N, RestM, RestN, RestL)
+    tTR_gC = thr_copy_t2r.partition_D(tCgC)
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_rAcc1 = cute.make_rmem_tensor(
+        tTR_gC[None, None, None, None, 0, 0, 0].shape, cutlass.Float32
+    )
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_rAcc2 = cute.make_rmem_tensor(
+        tTR_gC[None, None, None, None, 0, 0, 0].shape, cutlass.Float32
+    )
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_rC = cute.make_rmem_tensor(
+        tTR_gC[None, None, None, None, 0, 0, 0].shape, c_dtype
+    )
+    # STG Atom
+    simt_atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), c_dtype)
+    tTR_gC = tTR_gC[(None, None, None, None, *mma_tile_coord_mnl)]
+
+    # Wait for accumulator buffer full
+    acc_full = acc_consumer.wait_and_advance()
+
+    # Copy accumulator to register
+    cute.copy(tiled_copy_t2r, tTR_tAcc1, tTR_rAcc1)
+    cute.copy(tiled_copy_t2r, tTR_tAcc2, tTR_rAcc2)
+
+    # Silu activation on acc1 and multiply with acc2
+    acc_vec1 = epilogue_op(tTR_rAcc1.load())
+    acc_vec2 = tTR_rAcc2.load()
+    acc_vec = acc_vec1 * acc_vec2
+
+    tTR_rC.store(acc_vec.to(c_dtype))
+    # Store C to global memory
+    cute.copy(simt_atom, tTR_rC, tTR_gC)
+
+    acc_full.release()
+    # Deallocate TMEM
+    cute.arch.barrier()
+    tmem.free(acc_tmem_ptr)
+    return
+
+
+@cute.jit
+def my_kernel(
+    a_ptr: cute.Pointer,
+    b1_ptr: cute.Pointer,
+    b2_ptr: cute.Pointer,
+    sfa_ptr: cute.Pointer,
+    sfb1_ptr: cute.Pointer,
+    sfb2_ptr: cute.Pointer,
+    c_ptr: cute.Pointer,
+    problem_size: tuple,
+    epilogue_op: cutlass.Constexpr = lambda x: x
+    * (1.0 / (1.0 + cute.math.exp(-x, fastmath=True))),
+):
+    """
+    Host-side JIT function to prepare tensors and launch GPU kernel.
+    """
+    m, n, k, l = problem_size
+
+    # Setup attributes that depend on gemm inputs
+    a_tensor = cute.make_tensor(
+        a_ptr,
+        cute.make_layout(
+            (m, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(m * k, 32)),
+        ),
+    )
+    b_tensor1 = cute.make_tensor(
+        b1_ptr,
+        cute.make_layout(
+            (n, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(n * k, 32)),
+        ),
+    )
+    b_tensor2 = cute.make_tensor(
+        b2_ptr,
+        cute.make_layout(
+            (n, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(n * k, 32)),
+        ),
+    )
+    c_tensor = cute.make_tensor(
+        c_ptr, cute.make_layout((cute.assume(m, 32), n, l), stride=(n, 1, m * n))
+    )
+    # Setup sfa/sfb tensor by filling A/B tensor to scale factor atom layout
+    # ((Atom_M, Rest_M),(Atom_K, Rest_K),RestL)
+    sfa_layout = blockscaled_utils.tile_atom_to_shape_SF(
+        a_tensor.shape, sf_vec_size
+    )
+    sfa_tensor = cute.make_tensor(sfa_ptr, sfa_layout)
+
+    # ((Atom_N, Rest_N),(Atom_K, Rest_K),RestL)
+    sfb_layout = blockscaled_utils.tile_atom_to_shape_SF(
+        b_tensor1.shape, sf_vec_size
+    )
+    sfb_tensor1 = cute.make_tensor(sfb1_ptr, sfb_layout)
+    sfb_tensor2 = cute.make_tensor(sfb2_ptr, sfb_layout)
+
+    mma_op = tcgen05.MmaMXF4NVF4Op(
+        sf_dtype,
+        (mma_tiler_mnk[0], mma_tiler_mnk[1], mma_inst_shape_k),
+        tcgen05.CtaGroup.ONE,
+        tcgen05.OperandSource.SMEM,
+    )
+    tiled_mma = cute.make_tiled_mma(mma_op)
+
+    cluster_layout_vmnk  = cute.tiled_divide(
+        cute.make_layout((1, 1, 1)),
+        (tiled_mma.thr_id.shape,),
+    )
+
+    # Compute A/B/SFA/SFB/C shared memory layout
+    a_smem_layout_staged = sm100_utils.make_smem_layout_a(
+        tiled_mma,
+        mma_tiler_mnk,
+        ab_dtype,
+        num_ab_stage,
+    )
+    # B1 and B2 have the same size thus share the same smem layout
+    b_smem_layout_staged = sm100_utils.make_smem_layout_b(
+        tiled_mma,
+        mma_tiler_mnk,
+        ab_dtype,
+        num_ab_stage,
+    )
+    sfa_smem_layout_staged = blockscaled_utils.make_smem_layout_sfa(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        num_ab_stage,
+    )
+    # SFB1 and SFB2 have the same size thus share the same smem layout
+    sfb_smem_layout_staged = blockscaled_utils.make_smem_layout_sfb(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        num_ab_stage,
+    )
+    atom_thr_size = cute.size(tiled_mma.thr_id.shape)
+
+    # Setup TMA for A
+    a_smem_layout = cute.slice_(a_smem_layout_staged, (None, None, None, 0))
+    tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        a_tensor,
+        a_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+    )
+    # Setup TMA for B1
+    b_smem_layout = cute.slice_(b_smem_layout_staged, (None, None, None, 0))
+    tma_atom_b1, tma_tensor_b1 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        b_tensor1,
+        b_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+    )
+    # Setup TMA for B2
+    tma_atom_b2, tma_tensor_b2 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        b_tensor2,
+        b_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+    )
+    # Setup TMA for SFA
+    sfa_smem_layout = cute.slice_(
+        sfa_smem_layout_staged , (None, None, None, 0)
+    )
+    tma_atom_sfa, tma_tensor_sfa = cute.nvgpu.make_tiled_tma_atom_A(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        sfa_tensor,
+        sfa_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+        internal_type=cutlass.Int16,
+    )
+    # Setup TMA for SFB1
+    sfb_smem_layout = cute.slice_(
+        sfb_smem_layout_staged , (None, None, None, 0)
+    )
+    tma_atom_sfb1, tma_tensor_sfb1 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        sfb_tensor1,
+        sfb_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+        internal_type=cutlass.Int16,
+    )
+    # Setup TMA for SFB2
+    tma_atom_sfb2, tma_tensor_sfb2 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        sfb_tensor2,
+        sfb_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+        internal_type=cutlass.Int16,
+    )
+
+    # Compute TMA load bytes
+    a_copy_size = cute.size_in_bytes(ab_dtype, a_smem_layout)
+    b_copy_size = cute.size_in_bytes(ab_dtype, b_smem_layout)
+    sfa_copy_size = cute.size_in_bytes(sf_dtype, sfa_smem_layout)
+    sfb_copy_size = cute.size_in_bytes(sf_dtype, sfb_smem_layout)
+    num_tma_load_bytes = (
+        a_copy_size + b_copy_size * 2 + sfa_copy_size + sfb_copy_size * 2
+    ) * atom_thr_size
+
+    # Compute grid size
+    grid = (
+        cute.ceil_div(c_tensor.shape[0], mma_tiler_mnk[0]),
+        cute.ceil_div(c_tensor.shape[1], mma_tiler_mnk[1]),
+        c_tensor.shape[2],
+    )
+
+    # Launch the kernel.
+    kernel(
+        # MMA (Matrix Multiply-Accumulate) configuration
+        tiled_mma,                  # Tiled MMA object defining NVFP4 GEMM compute pattern
+        
+        # TMA (Tensor Memory Accelerator) atoms and tensors for shared input matrix A
+        tma_atom_a,                 # TMA copy atom defining how to load A from global memory
+        tma_tensor_a,               # Tensor descriptor for A matrix (m, k, l) - shared by both GEMMs
+        
+        # TMA atoms and tensors for first B matrix (B1)
+        tma_atom_b1,                # TMA copy atom defining how to load B1 from global memory
+        tma_tensor_b1,              # Tensor descriptor for B1 matrix (n, k, l) - first GEMM
+        
+        # TMA atoms and tensors for second B matrix (B2)
+        tma_atom_b2,                # TMA copy atom defining how to load B2 from global memory
+        tma_tensor_b2,              # Tensor descriptor for B2 matrix (n, k, l) - second GEMM
+        
+        # TMA atoms and tensors for scale factor A (shared)
+        tma_atom_sfa,               # TMA copy atom for loading scale factors for A
+        tma_tensor_sfa,             # Tensor descriptor for SFA (block scale factors for A) - shared
+        
+        # TMA atoms and tensors for scale factor B1
+        tma_atom_sfb1,              # TMA copy atom for loading scale factors for B1
+        tma_tensor_sfb1,            # Tensor descriptor for SFB1 (block scale factors for B1)
+        
+        # TMA atoms and tensors for scale factor B2
+        tma_atom_sfb2,              # TMA copy atom for loading scale factors for B2
+        tma_tensor_sfb2,            # Tensor descriptor for SFB2 (block scale factors for B2)
+        
+        # Output tensor C (single fused result: epilogue_op(A @ B1) * (A @ B2), not separate C1/C2)
+        c_tensor,                   # Output tensor where the fused dual-GEMM result is stored (m, n, l)
+        
+        # Shared memory layouts with staging for pipelined execution
+        a_smem_layout_staged,       # Staged shared memory layout for A (includes stage dimension)
+        b_smem_layout_staged,       # Staged shared memory layout for B1/B2 (includes stage dimension)
+        sfa_smem_layout_staged,     # Staged shared memory layout for SFA (includes stage dimension)
+        sfb_smem_layout_staged,     # Staged shared memory layout for SFB1/SFB2 (includes stage dimension)
+        
+        # Pipeline synchronization parameter
+        num_tma_load_bytes,         # Total bytes to load per TMA transaction (for barrier setup)
+        
+        # Epilogue operation
+        epilogue_op,                # Epilogue operation to apply to output (e.g., element-wise ops)
+    ).launch(
+        grid=grid,
+        block=[threads_per_cta, 1, 1],
+        cluster=(1, 1, 1),
+    )
+    return
+
+
+# Global cache for compiled kernel
+_compiled_kernel_cache = None
+# compile_kernel() compiles the kernel once and caches it, so that users can then
+# run the kernel multiple times to get more accurate timing results.
+def compile_kernel():
+    """
+    Compile the kernel once and cache it.
+    This should be called before any timing measurements.
+    
+    Returns:
+        The compiled kernel function
+    """
+    global _compiled_kernel_cache
+    
+    if _compiled_kernel_cache is not None:
+        return _compiled_kernel_cache
+    
+
+    # Create placeholder CuTe pointers (address 0) for A/B1/B2/C/SFA/SFB1/SFB2; they are only passed to cute.compile
+    a_ptr = make_ptr(
+        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    b1_ptr = make_ptr(
+        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    b2_ptr = make_ptr(
+        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    c_ptr = make_ptr(
+        c_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    sfa_ptr = make_ptr(
+        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb1_ptr = make_ptr(
+        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb2_ptr = make_ptr(
+        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
+    )
+
+    # Compile the kernel
+    _compiled_kernel_cache = cute.compile(my_kernel, a_ptr, b1_ptr, b2_ptr, sfa_ptr, sfb1_ptr, sfb2_ptr, c_ptr, (0, 0, 0, 0))
+    
+    return _compiled_kernel_cache
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Execute the block-scaled dual GEMM kernel with silu activation,
+    C = silu(A @ B1) * (A @ B2).
+    
+    This is the main entry point called by the evaluation framework.
+    It converts PyTorch tensors to CuTe tensors, launches the kernel,
+    and returns the result.
+    
+    Args:
+        data: Tuple of (a, b1, b2, sfa_cpu, sfb1_cpu, sfb2_cpu, sfa_permuted, sfb1_permuted, sfb2_permuted, c) PyTorch tensors
+            a: [m, k, l] - Input matrix in float4e2m1fn
+            b1: [n, k, l] - Input matrix in float4e2m1fn
+            b2: [n, k, l] - Input matrix in float4e2m1fn
+            sfa_cpu: [m, k // 16, l] - Scale factors in float8_e4m3fn, used by the reference implementation (ignored here)
+            sfb1_cpu: [n, k // 16, l] - Scale factors in float8_e4m3fn, used by the reference implementation (ignored here)
+            sfb2_cpu: [n, k // 16, l] - Scale factors in float8_e4m3fn, used by the reference implementation (ignored here)
+            sfa_permuted: [32, 4, rest_m, 4, rest_k, l] - Scale factors in float8_e4m3fn
+            sfb1_permuted: [32, 4, rest_n, 4, rest_k, l] - Scale factors in float8_e4m3fn
+            sfb2_permuted: [32, 4, rest_n, 4, rest_k, l] - Scale factors in float8_e4m3fn
+            c: [m, n, l] - Output tensor in float16
+    
+    Returns:
+        Output tensor c with computed results
+    """
+    a, b1, b2, _, _, _, sfa_permuted, sfb1_permuted, sfb2_permuted, c = data
+    
+    # Ensure kernel is compiled (will use cached version if available)
+    # To avoid the compilation overhead, we compile the kernel once and cache it.
+    compiled_func = compile_kernel()
+
+    # Get dimensions from MxKxL layout
+    _, k, _ = a.shape
+    m, n, l = c.shape
+    # Torch uses the packed e2m1_x2 data type (two FP4 values per element), so a.shape reports half of the logical k
+    k = k * 2 
+
+    # Create CuTe pointers for A/B/C/SFA/SFB via torch tensor data pointer
+    a_ptr = make_ptr(
+        ab_dtype, a.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    b1_ptr = make_ptr(
+        ab_dtype, b1.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    b2_ptr = make_ptr(
+        ab_dtype, b2.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    c_ptr = make_ptr(
+        c_dtype, c.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    sfa_ptr = make_ptr(
+        sf_dtype, sfa_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb1_ptr = make_ptr(
+        sf_dtype, sfb1_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb2_ptr = make_ptr(
+        sf_dtype, sfb2_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+
+    # Execute the compiled kernel
+    compiled_func(a_ptr, b1_ptr, b2_ptr, sfa_ptr, sfb1_ptr, sfb2_ptr, c_ptr, (m, n, k, l))
+
+    return c
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_dual_gemm/task.py b/problems/nvidia/nvfp4_dual_gemm/task.py
new file mode 100644
index 00000000..8facfb07
--- /dev/null
+++ b/problems/nvidia/nvfp4_dual_gemm/task.py
@@ -0,0 +1,11 @@
+import torch
+from typing import TypedDict, TypeVar
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+class TestSpec(TypedDict):
+    m: int
+    n: int
+    k: int
+    l: int
+    seed: int
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_dual_gemm/template.py b/problems/nvidia/nvfp4_dual_gemm/template.py
new file mode 100644
index 00000000..d8985df5
--- /dev/null
+++ b/problems/nvidia/nvfp4_dual_gemm/template.py
@@ -0,0 +1,28 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Template for a block-scaled fp4 dual gemm with silu activation, C = silu(A @ B1) * (A @ B2); fill in the implementation below.
+    Args:
+        data: Tuple that expands to:
+            a: torch.Tensor[float4e2m1fn] of shape [m, k, l],
+            b1: torch.Tensor[float4e2m1fn] of shape [n, k, l],
+            b2: torch.Tensor[float4e2m1fn] of shape [n, k, l],
+            sfa: torch.Tensor[float8_e4m3fn] of shape [m, k // 16, l], used by reference implementation
+            sfb1: torch.Tensor[float8_e4m3fn] of shape [n, k // 16, l], used by reference implementation
+            sfb2: torch.Tensor[float8_e4m3fn] of shape [n, k // 16, l], used by reference implementation
+            sfa_permuted: torch.Tensor[float8_e4m3fn] of shape [32, 4, rest_m, 4, rest_k, l],
+            sfb1_permuted: torch.Tensor[float8_e4m3fn] of shape [32, 4, rest_n, 4, rest_k, l],
+            sfb2_permuted: torch.Tensor[float8_e4m3fn] of shape [32, 4, rest_n, 4, rest_k, l],
+            c: torch.Tensor[float16] of shape [m, n, l]
+    Returns:
+        Tensor containing output in float16
+        c: torch.Tensor[float16] of shape [m, n, l]
+    """
+    # c: [m, n, l] is pre-allocated memory to avoid timing allocation overhead.
+    a, b1, b2, sfa, sfb1, sfb2, sfa_permuted, sfb1_permuted, sfb2_permuted, c = data
+
+    # Your implementation here
+
+    return c
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_dual_gemm/utils.py b/problems/nvidia/nvfp4_dual_gemm/utils.py
new file mode 100644
index 00000000..d9b3a69e
--- /dev/null
+++ b/problems/nvidia/nvfp4_dual_gemm/utils.py
@@ -0,0 +1,172 @@
+import os
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Check whether two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected.
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to report.
+    Returns:
+    list[str]: Empty if the tensors match; otherwise mismatch details (["SIZE MISMATCH"] if the shapes differ).
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received - expected)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Check whether two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to report.
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings with detailed error information
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return False, "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return True, ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
+
+
+class DeterministicContext:
+    def __init__(self):
+        self.allow_tf32 = None
+        self.deterministic = None
+        self.cublas = None
+
+    def __enter__(self):
+        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '')
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        torch.use_deterministic_algorithms(True)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+        torch.use_deterministic_algorithms(False)
+        os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
+
+def clear_l2_cache():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor (32 * 1024 * 1024 int64 values = 256 MiB)
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # write to the whole buffer; presumably this evicts previously cached data from L2 -- TODO confirm
+    dummy.fill_(42)
+    del dummy

From 847111cc260f809ae41cb7312452a71b8688d674 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 11 Nov 2025 20:04:59 +0100
Subject: [PATCH 146/207] Fix: to gpu scales

---
 problems/nvidia/nvfp4_gemv/reference.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/problems/nvidia/nvfp4_gemv/reference.py b/problems/nvidia/nvfp4_gemv/reference.py
index f204663f..a3271e32 100644
--- a/problems/nvidia/nvfp4_gemv/reference.py
+++ b/problems/nvidia/nvfp4_gemv/reference.py
@@ -159,8 +159,11 @@ def create_scale_factor_tensors(l, mn, sf_k):
     sf_k = ceil_div(k, sf_vec_size)
     sfa_ref_cpu, sfa_permuted = create_scale_factor_tensors(l, m, sf_k)
     sfb_ref_cpu, sfb_permuted = create_scale_factor_tensors(l, n_padded_128, sf_k)
+
+    sfa_ref = sfa_ref_cpu.to("cuda")
+    sfb_ref = sfb_ref_cpu.to("cuda")
     
-    return (a_ref, b_ref, sfa_ref_cpu, sfb_ref_cpu, sfa_permuted, sfb_permuted, c_ref)
+    return (a_ref, b_ref, sfa_ref, sfb_ref, sfa_permuted, sfb_permuted, c_ref)
 
 
 check_implementation = make_match_reference(ref_kernel, rtol=1e-03, atol=1e-03)

From db8cfd3c2310bf02f1c084daf0be86e6aa167ddb Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 11 Nov 2025 20:31:26 +0100
Subject: [PATCH 147/207] Feat: enlarge init range

---
 problems/nvidia/nvfp4_gemv/reference.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/problems/nvidia/nvfp4_gemv/reference.py b/problems/nvidia/nvfp4_gemv/reference.py
index a3271e32..8aeb41cf 100644
--- a/problems/nvidia/nvfp4_gemv/reference.py
+++ b/problems/nvidia/nvfp4_gemv/reference.py
@@ -88,11 +88,11 @@ def generate_input(
     
     # Generate uint8 tensor, then convert to float4e2m1fn_x2 data type
     a_ref = torch.randint(
-        0, 2, (l, m, k // 2), dtype=torch.uint8, device="cuda"
+        0, 4, (l, m, k // 2), dtype=torch.uint8, device="cuda"
     ).permute(1, 2, 0)
     # Pad b tensor's N dimension to 128 to call torch._scaled_mm for nvfp4 dot product computation
     b_ref = torch.randint(
-        0, 2, (l, n_padded_128, k // 2), dtype=torch.uint8, device="cuda"
+        0, 4, (l, n_padded_128, k // 2), dtype=torch.uint8, device="cuda"
     ).permute(1, 2, 0)
     a_ref = a_ref.view(torch.float4_e2m1fn_x2)
     b_ref = b_ref.view(torch.float4_e2m1fn_x2)
@@ -110,7 +110,7 @@ def create_scale_factor_tensors(l, mn, sf_k):
         ref_shape = (l, mn, sf_k)
         ref_permute_order = (1, 2, 0)
         # Init with uint8 tensor, then convert to float8_e4m3fn
-        ref_f8_random_int = torch.randint(1, 3, ref_shape, dtype=torch.int8, device='cuda')
+        ref_f8_random_int = torch.randint(0, 3, ref_shape, dtype=torch.int8, device='cuda')
         ref_f8_torch_tensor = ref_f8_random_int.to(dtype=torch.float8_e4m3fn)
         # permute to match ref_permute_order
         ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)
@@ -130,7 +130,7 @@ def create_scale_factor_tensors(l, mn, sf_k):
         # Which is needed by the CuTe customized kernel
         mma_permute_order = (3, 4, 1, 5, 2, 0)
         # Generate a random int8 tensor, then convert to float8_e4m3fn
-        rand_int_tensor = torch.randint(0, 2, mma_shape, dtype=torch.int8, device='cuda')
+        rand_int_tensor = torch.randint(0, 3, mma_shape, dtype=torch.int8, device='cuda')
         reordered_f8_torch_tensor = rand_int_tensor.to(dtype=torch.float8_e4m3fn)
         # Permute according to mma_permute_order
         reordered_f8_torch_tensor = reordered_f8_torch_tensor.permute(*mma_permute_order)

From 7e15d01188f2ecef58716efd2be5d855901077a9 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 11 Nov 2025 21:55:35 +0100
Subject: [PATCH 148/207] Feat: add cutedsl template

---
 problems/nvidia/nvfp4_gemv/task.yml         |   1 +
 problems/nvidia/nvfp4_gemv/template_cute.py | 247 ++++++++++++++++++++
 2 files changed, 248 insertions(+)
 create mode 100644 problems/nvidia/nvfp4_gemv/template_cute.py

diff --git a/problems/nvidia/nvfp4_gemv/task.yml b/problems/nvidia/nvfp4_gemv/task.yml
index 756fe80d..173914bd 100644
--- a/problems/nvidia/nvfp4_gemv/task.yml
+++ b/problems/nvidia/nvfp4_gemv/task.yml
@@ -37,6 +37,7 @@ config:
 
 templates:
   Python: "template.py"
+  CuteDSL: "template_cute.py"
 
 tests:
   - {"m": 128, "k": 256, "l": 1, "seed": 1111}
diff --git a/problems/nvidia/nvfp4_gemv/template_cute.py b/problems/nvidia/nvfp4_gemv/template_cute.py
new file mode 100644
index 00000000..1eebc8f7
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemv/template_cute.py
@@ -0,0 +1,247 @@
+import torch
+from task import input_t, output_t
+
+import cutlass
+import cutlass.cute as cute
+from cutlass.cute.runtime import make_ptr
+import cutlass.utils.blockscaled_layout as blockscaled_utils
+
+# Kernel configuration parameters
+mma_tiler_mnk = (128, 1, 64)  # Tile sizes for M, N, K dimensions
+ab_dtype = cutlass.Float4E2M1FN  # FP4 data type for A and B
+sf_dtype = cutlass.Float8E4M3FN  # FP8 data type for scale factors
+c_dtype = cutlass.Float16  # FP16 output type
+sf_vec_size = 16  # Scale factor block size (16 elements share one scale)
+threads_per_cta = 128  # Number of threads per CUDA thread block
+
+
+# Helper function for ceiling division
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+
+# The CuTe reference implementation for NVFP4 block-scaled GEMV
+@cute.kernel
+def kernel(
+    mA_mkl: cute.Tensor,
+    mB_nkl: cute.Tensor,
+    mSFA_mkl: cute.Tensor,
+    mSFB_nkl: cute.Tensor,
+    mC_mnl: cute.Tensor,
+):
+    # Get CUDA block and thread indices
+    bidx, bidy, bidz = cute.arch.block_idx()
+    tidx, _, _ = cute.arch.thread_idx()
+
+    # Extract the local tile for input matrix A (shape: [block_M, block_K, rest_M, rest_K, rest_L])
+    gA_mkl = cute.local_tile(
+        mA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    # Extract the local tile for scale factor tensor for A (same shape as gA_mkl)
+    # Here, block_M = (32, 4); block_K = (16, 4)
+    gSFA_mkl = cute.local_tile(
+        mSFA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    # Extract the local tile for input matrix B (shape: [block_N, block_K, rest_N, rest_K, rest_L])
+    gB_nkl = cute.local_tile(
+        mB_nkl, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # Extract the local tile for scale factor tensor for B (same shape as gB_nkl)
+    gSFB_nkl = cute.local_tile(
+        mSFB_nkl, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # Extract the local tile for output matrix C (shape: [block_M, block_N, rest_M, rest_N, rest_L])
+    gC_mnl = cute.local_tile(
+        mC_mnl, cute.slice_(mma_tiler_mnk, (None, None, 0)), (None, None, None)
+    )
+
+    # Select output element corresponding to this thread and block indices
+    tCgC = gC_mnl[tidx, None, bidx, bidy, bidz]
+    tCgC = cute.make_tensor(tCgC.iterator, 1)
+    res = cute.zeros_like(tCgC, cutlass.Float32)
+
+    # Get the number of k tiles (depth dimension) for the reduction loop
+    k_tile_cnt = gA_mkl.layout[3].shape
+    for k_tile in range(k_tile_cnt):
+        tAgA = gA_mkl[tidx, None, bidx, k_tile, bidz]
+        tBgB = gB_nkl[0, None, bidy, k_tile, bidz]
+        tAgSFA = gSFA_mkl[tidx, None, bidx, k_tile, bidz]
+        tBgSFB = gSFB_nkl[0, None, bidy, k_tile, bidz]
+
+        tArA = cute.make_rmem_tensor_like(tAgA, cutlass.Float32)
+        tBrB = cute.make_rmem_tensor_like(tBgB, cutlass.Float32)
+        tArSFA = cute.make_rmem_tensor_like(tAgSFA, cutlass.Float32)
+        tBrSFB = cute.make_rmem_tensor_like(tBgSFB, cutlass.Float32)
+
+        # Load NVFP4 or FP8 values from global memory
+        a_val_nvfp4 = tAgA.load()
+        b_val_nvfp4 = tBgB.load()
+        sfa_val_fp8 = tAgSFA.load()
+        sfb_val_fp8 = tBgSFB.load()
+
+        # Convert loaded values to float32 for computation (FFMA)
+        a_val = a_val_nvfp4.to(cutlass.Float32)
+        b_val = b_val_nvfp4.to(cutlass.Float32)
+        sfa_val = sfa_val_fp8.to(cutlass.Float32)
+        sfb_val = sfb_val_fp8.to(cutlass.Float32)
+
+        # Store the converted values to RMEM CuTe tensors
+        tArA.store(a_val)
+        tBrB.store(b_val)
+        tArSFA.store(sfa_val)
+        tBrSFB.store(sfb_val)
+
+        # Iterate over SF vector tiles and compute the scale&matmul accumulation
+        for i in cutlass.range_constexpr(mma_tiler_mnk[2]):
+            res += tArA[i] * tArSFA[i] * tBrB[i] * tBrSFB[i]
+
+    # Store the final float16 result back to global memory
+    tCgC.store(res.to(cutlass.Float16))
+    return
+
+
+@cute.jit
+def my_kernel(
+    a_ptr: cute.Pointer,
+    b_ptr: cute.Pointer,
+    sfa_ptr: cute.Pointer,
+    sfb_ptr: cute.Pointer,
+    c_ptr: cute.Pointer,
+    problem_size: tuple,
+):
+    """
+    Host-side JIT function to prepare tensors and launch GPU kernel.
+    """
+    m, _, k, l = problem_size
+    # Create CuTe Tensor via pointer and problem size.
+    a_tensor = cute.make_tensor(
+        a_ptr,
+        cute.make_layout(
+            (m, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(m * k, 32)),
+        ),
+    )
+    # The torch tensor for B is created with n padded to 128 so that the FP4 computation can be
+    # done via torch._scaled_mm; that torch tensor is then used as the CuTe tensor for the custom
+    # CuTe kernel, so b_tensor must be given the strides that match this 128-padded n dimension.
+    n_padded_128 = 128
+    b_tensor = cute.make_tensor(
+        b_ptr,
+        cute.make_layout(
+            (n_padded_128, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(n_padded_128 * k, 32)),
+        ),
+    )
+    c_tensor = cute.make_tensor(
+        c_ptr, cute.make_layout((cute.assume(m, 32), 1, l), stride=(1, 1, m))
+    )
+    # Convert scale factor tensors to MMA layout
+    # The layout matches Tensor Core requirements: (((32, 4), REST_M), ((SF_K, 4), REST_K), (1, REST_L))
+    sfa_layout = blockscaled_utils.tile_atom_to_shape_SF(a_tensor.shape, sf_vec_size)
+    sfa_tensor = cute.make_tensor(sfa_ptr, sfa_layout)
+
+    sfb_layout = blockscaled_utils.tile_atom_to_shape_SF(b_tensor.shape, sf_vec_size)
+    sfb_tensor = cute.make_tensor(sfb_ptr, sfb_layout)
+
+    # Compute grid dimensions
+    # Grid is (M_blocks, 1, L) where:
+    # - M_blocks = ceil(M / 128) to cover all output rows
+    # - L = batch size
+    grid = (
+        cute.ceil_div(c_tensor.shape[0], 128),
+        1,
+        c_tensor.shape[2],
+    )
+
+    # Launch the CUDA kernel
+    kernel(a_tensor, b_tensor, sfa_tensor, sfb_tensor, c_tensor).launch(
+        grid=grid,
+        block=[threads_per_cta, 1, 1],
+        cluster=(1, 1, 1),
+    )
+    return
+
+
+# Global cache for compiled kernel
+_compiled_kernel_cache = None
+
+
+# This function is used to compile the kernel once and cache it and then allow users to
+# run the kernel multiple times to get more accurate timing results.
+def compile_kernel():
+    """
+    Compile the kernel once and cache it.
+    This should be called before any timing measurements.
+
+    Returns:
+        The compiled kernel function
+    """
+    global _compiled_kernel_cache
+
+    if _compiled_kernel_cache is not None:
+        return _compiled_kernel_cache
+
+    # Create placeholder CuTe pointers (address 0) for A/B/C/SFA/SFB; real data pointers are passed at call time
+    a_ptr = make_ptr(ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16)
+    b_ptr = make_ptr(ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16)
+    c_ptr = make_ptr(c_dtype, 0, cute.AddressSpace.gmem, assumed_align=16)
+    sfa_ptr = make_ptr(sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32)
+    sfb_ptr = make_ptr(sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32)
+
+    # Compile the kernel
+    _compiled_kernel_cache = cute.compile(
+        my_kernel, a_ptr, b_ptr, sfa_ptr, sfb_ptr, c_ptr, (0, 0, 0, 0)
+    )
+
+    return _compiled_kernel_cache
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Execute the block-scaled GEMV kernel.
+
+    This is the main entry point called by the evaluation framework.
+    It converts PyTorch tensors to CuTe tensors, launches the kernel,
+    and returns the result.
+
+    Args:
+        data: Tuple of (a, b, sfa_cpu, sfb_cpu, sfa_permuted, sfb_permuted, c) PyTorch tensors
+            a: [m, k, l] - Input matrix in float4e2m1fn
+            b: [1, k, l] - Input vector in float4e2m1fn
+            sfa_cpu: [m, k, l] - Scale factors in float8_e4m3fn
+            sfb_cpu: [1, k, l] - Scale factors in float8_e4m3fn
+            sfa_permuted: [32, 4, rest_m, 4, rest_k, l] - Scale factors in float8_e4m3fn
+            sfb_permuted: [32, 4, rest_n, 4, rest_k, l] - Scale factors in float8_e4m3fn
+            c: [m, 1, l] - Output vector in float16
+
+    Returns:
+        Output tensor c with computed GEMV results
+    """
+    a, b, _, _, sfa_permuted, sfb_permuted, c = data
+
+    # Ensure kernel is compiled (will use cached version if available)
+    # To avoid the compilation overhead, we compile the kernel once and cache it.
+    compiled_func = compile_kernel()
+
+    # Get dimensions from MxKxL layout
+    m, k, l = a.shape
+    # Torch uses the packed e2m1_x2 data type, so the k read from a.shape is halved; double it
+    k = k * 2
+    # GEMV N dimension is always 1
+    n = 1
+
+    # Create CuTe pointers for A/B/C/SFA/SFB via torch tensor data pointer
+    a_ptr = make_ptr(ab_dtype, a.data_ptr(), cute.AddressSpace.gmem, assumed_align=16)
+    b_ptr = make_ptr(ab_dtype, b.data_ptr(), cute.AddressSpace.gmem, assumed_align=16)
+    c_ptr = make_ptr(c_dtype, c.data_ptr(), cute.AddressSpace.gmem, assumed_align=16)
+    sfa_ptr = make_ptr(
+        sf_dtype, sfa_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb_ptr = make_ptr(
+        sf_dtype, sfb_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+
+    # Execute the compiled kernel
+    compiled_func(a_ptr, b_ptr, sfa_ptr, sfb_ptr, c_ptr, (m, n, k, l))
+
+    return c

From 32cf8925ddb9132bb701ced187348889fe947a57 Mon Sep 17 00:00:00 2001
From: whadhack <xxx@xxxx.xxx>
Date: Sun, 16 Nov 2025 19:12:07 -0800
Subject: [PATCH 149/207] Fixes to profiler selection logic.

---
 problems/nvidia/eval.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/problems/nvidia/eval.py b/problems/nvidia/eval.py
index 1fe58d33..ccc337e6 100644
--- a/problems/nvidia/eval.py
+++ b/problems/nvidia/eval.py
@@ -405,7 +405,7 @@ def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
     """
     Runs a single profiling activity in another process.
     """
-    if bool(os.getenv("POPCORN_NCU", "0")):
+    if int(os.getenv("POPCORN_NCU", "0")) == 1:
         return pool.apply(_run_single_profile_ncu, (test,))
     else:
         return pool.apply(_run_single_profile_torch, (test,))
@@ -420,7 +420,8 @@ def run_profiling(
         report = run_single_profile(test, pool)
         logger.log(
             f"benchmark.{idx}.report",
-            base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"),
+            #base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"),
+            report
         )
     logger.log("check", "pass")
     return 0

From 9afb6011e78bd1f0773911acd8127170f6a48aa1 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Fri, 28 Nov 2025 01:39:17 -0500
Subject: [PATCH 150/207] add from vickiw's gemm

---
 problems/nvidia/nvfp4_gemm/eval.py       | 500 +++++++++++++++
 problems/nvidia/nvfp4_gemm/reference.py  | 161 +++++
 problems/nvidia/nvfp4_gemm/submission.py | 761 +++++++++++++++++++++++
 problems/nvidia/nvfp4_gemm/task.py       |  11 +
 problems/nvidia/nvfp4_gemm/template.py   |  25 +
 problems/nvidia/nvfp4_gemm/utils.py      | 172 +++++
 6 files changed, 1630 insertions(+)
 create mode 100644 problems/nvidia/nvfp4_gemm/eval.py
 create mode 100644 problems/nvidia/nvfp4_gemm/reference.py
 create mode 100644 problems/nvidia/nvfp4_gemm/submission.py
 create mode 100644 problems/nvidia/nvfp4_gemm/task.py
 create mode 100644 problems/nvidia/nvfp4_gemm/template.py
 create mode 100644 problems/nvidia/nvfp4_gemm/utils.py

diff --git a/problems/nvidia/nvfp4_gemm/eval.py b/problems/nvidia/nvfp4_gemm/eval.py
new file mode 100644
index 00000000..e8bb5b21
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemm/eval.py
@@ -0,0 +1,500 @@
+import base64
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+import tempfile
+
+import torch.cuda
+from cutlass.cute.nvgpu.common import OpError
+
+from utils import set_seed, clear_l2_cache
+
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, "w")
+        os.set_inheritable(fd, False)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a + b) * (a + b + 1) // 2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    """Parse `key: value; ...` lines of `file_name` into TestCases; exits with 113 on bad input."""
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        exit(113)
+
+    tests = []
+    # each `;`-separated part must be exactly `name: value` (letters-only word or signed integer)
+    pattern = re.compile(r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*")
+    for line in content.splitlines():
+        case = {}
+        for part in line.split(";"):
+            # a single fullmatch both validates the part and provides the captured groups
+            matched = pattern.fullmatch(part)
+            if not matched:
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                exit(113)
+            key, val = matched[1], matched[2]
+            try:
+                val = int(val)  # integer-looking values become ints; words stay strings
+            except ValueError:
+                pass
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a non-empty list of durations.
+
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    best = min(durations)
+    worst = max(durations)
+    avg = sum(durations) / runs
+
+    # Sample standard deviation divides by (runs - 1); for a single run report 0.0 instead of dividing by zero.
+    variance = sum((x - avg) ** 2 for x in durations)
+    std = math.sqrt(variance / (runs - 1)) if runs > 1 else 0.0
+    err = std / math.sqrt(runs)
+
+    return Stats(
+        runs=runs, mean=avg, std=std, err=err, best=float(best), worst=float(worst)
+    )
+
+
+def _clone_data(data):
+    """
+    Recursively goes through data and clones all tensors.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        return data.clone()
+    else:
+        return data
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    try:
+        submission_output = custom_kernel(_clone_data(data))
+
+    except OpError as E:
+        print(f"Encountered {E}", file=sys.stderr)
+        return False, str(E)
+    torch.cuda.synchronize()
+    return check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in another process.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    # Step 1: Compile kernel once before running tests
+    logger.log("compile", "start")
+    compile_success, compile_error = pool.apply(_compile_kernel_once)
+    if not compile_success:
+        logger.log("compile", "fail")
+        logger.log("compile.error", compile_error)
+        return 112
+    logger.log("compile", "pass")
+    
+    # Step 2: Run all tests with compiled kernel
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if not good:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _compile_kernel_once():
+    """
+    Compile the kernel once before any benchmarking.
+    This ensures compilation time is not included in benchmark results.
+    """
+    from submission import compile_kernel
+    
+    try:
+        # Trigger compilation (will be cached)
+        compile_kernel()
+        torch.cuda.synchronize()
+        return True, None
+    except OpError as E:
+        return False, f"Compilation failed: {E}"
+    except Exception as E:
+        return False, f"Compilation failed: {E}"
+
+
+def _run_single_benchmark(
+    test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float
+) -> Stats | Any:
+    """
+    Runs one benchmark. Do not call directly.
+    """
+    from submission import custom_kernel, compile_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data)
+    
+    # Ensure kernel is compiled before any timing (compilation is cached)
+    try:
+        compile_kernel()
+        torch.cuda.synchronize()
+    except OpError as E:
+        return f"Compilation failed: {E}"
+    except Exception as E:
+        return f"Compilation failed: {E}"
+    
+    #  first, one obligatory correctness check
+    try:
+        output = custom_kernel(_clone_data(data))
+    except OpError as E:
+        return f"Encountered {E}"
+    good, message = check_implementation(check_copy, output)
+    if not good:
+        return message
+
+    # now, do multiple timing runs (correctness is re-tested only when `recheck` is set)
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until we either measure more than `max_time_ns` in total,
+    # exceed 2 minutes of wallclock time, or the relative error of the mean is below 0.1%.
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data)
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        clear_l2_cache()
+
+        start_event.record()
+        output = custom_kernel(data)
+        end_event.record()
+        torch.cuda.synchronize()
+        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
+
+        if recheck:
+            good, message = check_implementation(check_copy, output)
+            if not good:
+                return message
+
+        del output
+        durations.append(duration)
+
+        if i > 1:
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if (
+                stats.err / stats.mean < 0.001
+                or stats.mean * stats.runs > max_time_ns
+                or total_bm_duration > 120e9
+            ):
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(
+    pool: multiprocessing.Pool,
+    test: TestCase,
+    recheck: bool,
+    max_repeats: int,
+    max_time_ns: float,
+):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param pool: Process on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+
+
+def run_benchmarking(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Process on which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    # Step 1: Compile kernel once (outside of timing)
+    logger.log("compile", "start")
+    compile_success, compile_error = pool.apply(_compile_kernel_once)
+    if not compile_success:
+        logger.log("compile", "fail")
+        logger.log("compile.error", compile_error)
+        return 112
+    logger.log("compile", "pass")
+    
+    # Step 2: Warm up with compiled kernel
+    run_single_benchmark(pool, tests[0], False, 200, 10e7)
+
+    # Step 3: Run benchmarks (compilation time excluded)
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 200, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def run_single_profile(test: TestCase) -> str:
+    """
+    Profiles one run of the submission on a test case and returns the profiler table as text.
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, ProfilerActivity
+
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        custom_kernel(_clone_data(data))  # the output is not checked here; only the trace is used
+        torch.cuda.synchronize()  # wait for the launched work while still inside the profiled region
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test)
+        logger.log(
+            f"benchmark.{idx}.report",
+            base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"),
+        )
+    logger.log("check", "pass")
+    return 0
+
+
+def main():
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+
+    filename = None
+
+    with tempfile.NamedTemporaryFile(delete=False) as tmp:
+
+        def build_test_string(tests: list[dict]):
+            as_str = ""
+            for test in tests:
+                kvs = []
+                for k, v in test.items():
+                    kvs.append(f"{k}: {v}")
+                as_str += "; ".join(kvs) + "\n"
+            return as_str
+
+        import yaml
+
+        yaml_content = yaml.safe_load(open(sys.argv[2], "r"))
+        if mode == "test":
+            tests_str = build_test_string(yaml_content.get("tests", []))
+        elif mode in ("benchmark", "leaderboard", "profile"):
+            tests_str = build_test_string(yaml_content.get("benchmarks", []))
+
+        tmp.write(tests_str.encode("utf-8"))
+        tmp.flush()
+        filename = tmp.name
+
+    tests = get_test_cases(filename, seed)
+
+    os.unlink(filename)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+
+        mp_context = multiprocessing.get_context("spawn")
+        with mp_context.Pool(1) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                # Step 1: Compile kernel once (outside of timing)
+                logger.log("compile", "start")
+                compile_success, compile_error = pool.apply(_compile_kernel_once)
+                if not compile_success:
+                    logger.log("compile", "fail")
+                    logger.log("compile.error", compile_error)
+                    return 112
+                logger.log("compile", "pass")
+                
+                # Step 2: Warmup with compiled kernel
+                run_single_benchmark(pool, tests[0], False, 200, 1e7)
+                
+                # Step 3: Run leaderboard benchmarks (compilation time excluded)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 200, 30e9)
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(
+                                f"benchmark.{i}.{field.name}",
+                                getattr(result, field.name),
+                            )
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(
+                            f"benchmark.{i}.error", str(result)
+                        )  # TODO: Make sure result implements __str__?
+                        break
+
+                logger.log("check", "pass" if passed else "fail")
+            elif mode == "profile":
+                run_profiling(logger, tests)
+            else:
+                # TODO: Implement script mode
+                return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/problems/nvidia/nvfp4_gemm/reference.py b/problems/nvidia/nvfp4_gemm/reference.py
new file mode 100644
index 00000000..6853098c
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemm/reference.py
@@ -0,0 +1,161 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+# Scaling factor vector size
+sf_vec_size = 16
+
+# Helper function for ceiling division
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+# Helper function to convert scale factor tensor to blocked format
+def to_blocked(input_matrix):
+    rows, cols = input_matrix.shape
+
+    # Please ensure rows and cols are multiples of 128 and 4 respectively
+    n_row_blocks = ceil_div(rows, 128)
+    n_col_blocks = ceil_div(cols, 4)
+
+    padded = input_matrix
+    blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
+    rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
+
+    return rearranged.flatten()
+
+
+def ref_kernel(
+    data: input_t,
+) -> output_t:
+    """
+    PyTorch reference implementation of NVFP4 block-scaled GEMM.
+    """
+    a_ref, b_ref, sfa_ref_cpu, sfb_ref_cpu, _, _, c_ref = data
+    
+    # Get dimensions from MxNxL layout
+    _, _, l = c_ref.shape
+
+    # Call torch._scaled_mm to compute the GEMM result
+    for l_idx in range(l):
+        # Convert the scale factor tensor to blocked format
+        scale_a = to_blocked(sfa_ref_cpu[:, :, l_idx])
+        scale_b = to_blocked(sfb_ref_cpu[:, :, l_idx])
+        # (m, k) @ (n, k).T -> (m, n)
+        res = torch._scaled_mm(
+            a_ref[:, :, l_idx],
+            b_ref[:, :, l_idx].transpose(0, 1),
+            scale_a.cuda(),
+            scale_b.cuda(),
+            bias=None,
+            out_dtype=torch.float16,
+        )
+        c_ref[:, :, l_idx] = res
+    return c_ref
+
+
+def generate_input(
+    m: int,
+    n: int,
+    k: int,
+    l: int,
+    seed: int,
+):
+    """
+    Generate input tensors for NVFP4 block-scaled GEMM.
+    
+    Args:
+        m: Number of rows in matrix A
+        n: Number of columns in matrix B
+        k: Number of columns in A and rows of B
+        l: Batch size
+        seed: Random seed for reproducibility
+    
+    Returns:
+        Tuple of (a, b, scale_a, scale_b, scale_a_permuted, scale_b_permuted, c) where:
+            a: [m, k, l] - Input matrix in torch.float4e2m1fn_x2 data type
+            b: [n, k, l] - Input matrix in torch.float4e2m1fn_x2 data type
+            scale_a: [m, sf_k, l] with sf_k = ceil(k / 16) - Input scale factors in torch.float8e4m3fn data type
+            scale_b: [n, sf_k, l] with sf_k = ceil(k / 16) - Input scale factors in torch.float8e4m3fn data type
+            scale_a_permuted: [32, 4, rest_m, 4, rest_k, l] - Input scale factors in torch.float8e4m3fn data type
+            scale_b_permuted: [32, 4, rest_n, 4, rest_k, l] - Input scale factors in torch.float8e4m3fn data type
+            c: [m, n, l] - Output matrix in torch.float16 data type
+    """
+    torch.manual_seed(seed)
+    
+    # Generate a random int8 tensor, then reinterpret it as the float4_e2m1fn_x2 data type via view()
+    a_ref = torch.randint(
+        -6, 6, (l, m, k // 2), dtype=torch.int8, device="cuda"
+    ).permute(1, 2, 0)
+    b_ref = torch.randint(
+        -6, 6, (l, n, k // 2), dtype=torch.int8, device="cuda"
+    ).permute(1, 2, 0)
+    a_ref = a_ref.view(torch.float4_e2m1fn_x2)
+    b_ref = b_ref.view(torch.float4_e2m1fn_x2)
+
+    # Create float16 output tensor
+    c_ref = torch.randn((l, m, n), dtype=torch.float16, device="cuda").permute(
+        1, 2, 0
+    )
+    
+    # Helper function to prepare the scale factor tensors for both reference
+    # kernel and customize kernel. The customized data layout can be found in:
+    # https://docs.nvidia.com/cuda/cublas/index.html?highlight=fp4#d-block-scaling-factors-layout
+    def create_scale_factor_tensors(l, mn, sf_k):
+        # Create the reference scale factor tensor (mn, sf_k, l); it is built on the GPU and copied to CPU on return.
+        ref_shape = (l, mn, sf_k)
+        ref_permute_order = (1, 2, 0)
+        # Init with a random int8 tensor, then convert to float8_e4m3fn
+        ref_f8_random_int = torch.randint(-3, 3, ref_shape, dtype=torch.int8, device='cuda')
+        ref_f8_torch_tensor = ref_f8_random_int.to(dtype=torch.float8_e4m3fn)
+        # permute to match ref_permute_order
+        ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)
+
+        atom_m = (32, 4)
+        atom_k = 4
+        mma_shape = (
+            l,  # batch size
+            ceil_div(mn, atom_m[0] * atom_m[1]),
+            ceil_div(sf_k, atom_k),
+            atom_m[0],
+            atom_m[1],
+            atom_k,
+        )
+
+        # Reorder scale factor tensor to (32, 4, rest_m, 4, rest_k, l) layout
+        # Which is needed by the CuTe customized kernel
+        mma_permute_order = (3, 4, 1, 5, 2, 0)
+        # Generate a random int8 tensor, then convert to float8_e4m3fn
+        rand_int_tensor = torch.randint(-3, 3, mma_shape, dtype=torch.int8, device='cuda')
+        reordered_f8_torch_tensor = rand_int_tensor.to(dtype=torch.float8_e4m3fn)
+        # Permute according to mma_permute_order
+        reordered_f8_torch_tensor = reordered_f8_torch_tensor.permute(*mma_permute_order)
+
+        # GPU-side vectorized reordering (replaces slow CPU nested loops)
+        # Create index grids for all dimensions
+        i_idx = torch.arange(mn, device='cuda')
+        j_idx = torch.arange(sf_k, device='cuda')
+        b_idx = torch.arange(l, device='cuda')
+        
+        # Create meshgrid for all combinations of (i, j, b)
+        i_grid, j_grid, b_grid = torch.meshgrid(i_idx, j_idx, b_idx, indexing='ij')
+        
+        # Calculate target indices in vectorized manner
+        mm = i_grid // (atom_m[0] * atom_m[1])
+        mm32 = i_grid % atom_m[0]
+        mm4 = (i_grid % 128) // atom_m[0]
+        kk = j_grid // atom_k
+        kk4 = j_grid % atom_k
+        
+        # Perform the reordering with advanced indexing (all on GPU)
+        reordered_f8_torch_tensor[mm32, mm4, mm, kk4, kk, b_grid] = ref_f8_torch_tensor_permuted[i_grid, j_grid, b_grid]
+        
+        return ref_f8_torch_tensor_permuted.cpu(), reordered_f8_torch_tensor
+
+    sf_k = ceil_div(k, sf_vec_size)
+    sfa_ref_cpu, sfa_ref_permuted = create_scale_factor_tensors(l, m, sf_k)
+    sfb_ref_cpu, sfb_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
+
+    return (a_ref, b_ref, sfa_ref_cpu, sfb_ref_cpu, sfa_ref_permuted, sfb_ref_permuted, c_ref)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-03, atol=1e-03)
diff --git a/problems/nvidia/nvfp4_gemm/submission.py b/problems/nvidia/nvfp4_gemm/submission.py
new file mode 100644
index 00000000..c2f37d92
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemm/submission.py
@@ -0,0 +1,761 @@
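+from torch._higher_order_ops.torchbind import call_torchbind_fake  # NOTE(review): not referenced anywhere in this file; looks like an accidental auto-import from a private torch module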
+import cuda.bindings.driver as cuda
+
+import torch
+from task import input_t, output_t
+
+import cutlass
+import cutlass.cute as cute
+import cutlass.utils as utils
+import cutlass.pipeline as pipeline
+from cutlass.cute.nvgpu import cpasync, tcgen05
+import cutlass.torch as cutlass_torch
+import cutlass.utils.blackwell_helpers as sm100_utils
+import cutlass.utils.blockscaled_layout as blockscaled_utils
+from cutlass.cute.runtime import make_ptr
+
+# Kernel configuration parameters
+# Per-CTA tile sizes along M, N and K (the launch grid is ceil(m / 128) x ceil(n / 128) x l)
+mma_tiler_mnk = (128, 128, 256)  
+# Shape of the K dimension for the MMA instruction
+mma_inst_shape_k = 64
+# FP4 data type for A and B
+ab_dtype = cutlass.Float4E2M1FN  
+# FP8 data type for scale factors
+sf_dtype = cutlass.Float8E4M3FN  
+# FP16 output type
+c_dtype = cutlass.Float16  
+# Scale factor block size (16 elements share one scale)
+sf_vec_size = 16  
+# Number of threads per CUDA thread block
+threads_per_cta = 128  
+# Pipeline stage counts: accumulator (tmem) stages, then A/B/SFA/SFB shared-memory stages
+num_acc_stage = 1
+num_ab_stage = 1
+# Number of tensor-memory (tmem) columns the kernel allocates
+num_tmem_alloc_cols = 512
+
+
+# Integer ceiling division: smallest q with q * b >= a (valid for positive b)
+def ceil_div(a, b):
+    return (a + b - 1) // b  # adding b - 1 makes the floor division round up
+
+
+# The CuTe reference implementation for NVFP4 block-scaled GEMM
+@cute.kernel
+def kernel(
+    tiled_mma: cute.TiledMma,
+    tma_atom_a: cute.CopyAtom,
+    mA_mkl: cute.Tensor,
+    tma_atom_b: cute.CopyAtom,
+    mB_nkl: cute.Tensor,
+    tma_atom_sfa: cute.CopyAtom,
+    mSFA_mkl: cute.Tensor,
+    tma_atom_sfb: cute.CopyAtom,
+    mSFB_nkl: cute.Tensor,
+    mC_mnl: cute.Tensor,
+    a_smem_layout_staged: cute.ComposedLayout,
+    b_smem_layout_staged: cute.ComposedLayout,
+    sfa_smem_layout_staged: cute.Layout,
+    sfb_smem_layout_staged: cute.Layout,
+    num_tma_load_bytes: cutlass.Constexpr[int],
+):
+    """
+    GPU device kernel performing the batched GEMM computation; each CTA works on the MMA tile coordinate derived from its block index.
+    """
+    warp_idx = cute.arch.warp_idx()
+    warp_idx = cute.arch.make_warp_uniform(warp_idx)
+    tidx = cute.arch.thread_idx()  # NOTE(review): rebound below via tuple unpacking, so this assignment looks redundant
+
+    #
+    # Setup cta/thread coordinates
+    #
+    # CTA (thread block) index in the launch grid
+    bidx, bidy, bidz = cute.arch.block_idx()
+
+    # MMA tile coordinate (M, N, L) handled by this CTA
+    cta_coord = (bidx, bidy, bidz)
+    mma_tile_coord_mnl = (
+        cta_coord[0] // cute.size(tiled_mma.thr_id.shape),
+        cta_coord[1],
+        cta_coord[2],
+    )
+    # Coord inside cta
+    tidx, _, _ = cute.arch.thread_idx()
+
+    #
+    # Define shared storage for kernel
+    #
+    @cute.struct
+    class SharedStorage:
+        ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_ab_stage * 2]
+        acc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_acc_stage * 2]
+        tmem_holding_buf: cutlass.Int32
+
+    smem = utils.SmemAllocator()
+    storage = smem.allocate(SharedStorage)
+    # (MMA, MMA_M, MMA_K, STAGE)
+    sA = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=a_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=a_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sB = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=b_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=b_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_M, MMA_K, STAGE)
+    sSFA = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfa_smem_layout_staged,
+        byte_alignment=128,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sSFB = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfb_smem_layout_staged,
+        byte_alignment=128,
+    )
+
+    #
+    # Initialize mainloop ab_pipeline, acc_pipeline and their states
+    #
+    ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
+    ab_pipeline_consumer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, 1)
+    ab_producer, ab_consumer = pipeline.PipelineTmaUmma.create(
+        barrier_storage=storage.ab_mbar_ptr.data_ptr(),
+        num_stages=num_ab_stage,
+        producer_group=ab_pipeline_producer_group,
+        consumer_group=ab_pipeline_consumer_group,
+        tx_count=num_tma_load_bytes,
+    ).make_participants()
+    acc_producer, acc_consumer = pipeline.PipelineUmmaAsync.create(
+        barrier_storage=storage.acc_mbar_ptr.data_ptr(),
+        num_stages=num_acc_stage,
+        producer_group=ab_pipeline_producer_group,
+        consumer_group=pipeline.CooperativeGroup(
+            pipeline.Agent.Thread,
+            threads_per_cta,
+        ),
+    ).make_participants()
+
+    #
+    # Local_tile partition global tensors
+    #
+    # (bM, bK, RestM, RestK, RestL)
+    gA_mkl = cute.local_tile(
+        mA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gB_nkl = cute.local_tile(
+        mB_nkl, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    gSFA_mkl = cute.local_tile(
+        mSFA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    gSFB_nkl = cute.local_tile(
+        mSFB_nkl, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # (bM, bN, RestM, RestN, RestL)
+    gC_mnl = cute.local_tile(
+        mC_mnl, cute.slice_(mma_tiler_mnk, (None, None, 0)), (None, None, None)
+    )
+    k_tile_cnt = cute.size(gA_mkl, mode=[3])  # number of K tiles (the RestK mode of gA_mkl)
+
+    #
+    # Partition global tensor for TiledMMA_A/B/SFA/SFB/C
+    #
+    # (MMA, MMA_M, MMA_K, RestK)
+    thr_mma = tiled_mma.get_slice(0)
+    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
+    tCgA = thr_mma.partition_A(gA_mkl)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgB = thr_mma.partition_B(gB_nkl)
+    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
+    tCgSFA = thr_mma.partition_A(gSFA_mkl)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgSFB = thr_mma.partition_B(gSFB_nkl)
+    # (MMA, MMA_M, MMA_N, RestM, RestN, RestL)
+    tCgC = thr_mma.partition_C(gC_mnl)
+
+    #
+    # Partition global/shared tensor for TMA load A/B/SFA/SFB
+    #
+    # TMA Partition_S/D for A
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestM, RestK, RestL)
+    tAsA, tAgA = cpasync.tma_partition(
+        tma_atom_a,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sA, 0, 3),
+        cute.group_modes(tCgA, 0, 3),
+    )
+    # TMA Partition_S/D for B
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsB, tBgB = cpasync.tma_partition(
+        tma_atom_b,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sB, 0, 3),
+        cute.group_modes(tCgB, 0, 3),
+    )
+    #  TMA Partition_S/D for SFA
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestM, RestK, RestL)
+    tAsSFA, tAgSFA = cpasync.tma_partition(
+        tma_atom_sfa,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFA, 0, 3),
+        cute.group_modes(tCgSFA, 0, 3),
+    )
+    tAsSFA = cute.filter_zeros(tAsSFA)
+    tAgSFA = cute.filter_zeros(tAgSFA)
+    # TMA Partition_S/D for SFB
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsSFB, tBgSFB = cpasync.tma_partition(
+        tma_atom_sfb,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFB, 0, 3),
+        cute.group_modes(tCgSFB, 0, 3),
+    )
+    tBsSFB = cute.filter_zeros(tBsSFB)
+    tBgSFB = cute.filter_zeros(tBgSFB)
+
+    #
+    # Partition shared/tensor memory tensor for TiledMMA_A/B/C
+    #
+    # (MMA, MMA_M, MMA_K, STAGE)
+    tCrA = tiled_mma.make_fragment_A(sA)
+    # (MMA, MMA_N, MMA_K, STAGE)
+    tCrB = tiled_mma.make_fragment_B(sB)
+    # (MMA, MMA_M, MMA_N)
+    acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2])
+    # (MMA, MMA_M, MMA_N)
+    tCtAcc_fake = tiled_mma.make_fragment_C(acc_shape)
+
+    #
+    # Alloc tensor memory buffer
+    #
+    tmem_alloc_barrier = pipeline.NamedBarrier(
+        barrier_id=1,
+        num_threads=threads_per_cta,
+    )
+    tmem = utils.TmemAllocator(
+        storage.tmem_holding_buf,
+        barrier_for_retrieve=tmem_alloc_barrier,
+    )
+    tmem.allocate(num_tmem_alloc_cols)
+    tmem.wait_for_alloc()
+    acc_tmem_ptr = tmem.retrieve_ptr(cutlass.Float32)
+    tCtAcc = cute.make_tensor(acc_tmem_ptr, tCtAcc_fake.layout)
+
+    #
+    # Make SFA/SFB tmem tensor
+    #
+    # Get SFA tmem ptr (placed at the column offset right after the accumulator)
+    sfa_tmem_ptr = cute.recast_ptr(
+        acc_tmem_ptr + tcgen05.find_tmem_tensor_col_offset(tCtAcc),
+        dtype=sf_dtype,
+    )
+    # (MMA, MMA_M, MMA_K)
+    tCtSFA_layout = blockscaled_utils.make_tmem_layout_sfa(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        cute.slice_(sfa_smem_layout_staged, (None, None, None, 0)),
+    )
+    tCtSFA = cute.make_tensor(sfa_tmem_ptr, tCtSFA_layout)
+    # Get SFB tmem ptr (placed after the accumulator and SFA column offsets)
+    sfb_tmem_ptr = cute.recast_ptr(
+        acc_tmem_ptr
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc)
+        + tcgen05.find_tmem_tensor_col_offset(tCtSFA),
+        dtype=sf_dtype,
+    )
+    # (MMA, MMA_N, MMA_K)
+    tCtSFB_layout = blockscaled_utils.make_tmem_layout_sfb(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        cute.slice_(sfb_smem_layout_staged, (None, None, None, 0)),
+    )
+    tCtSFB = cute.make_tensor(sfb_tmem_ptr, tCtSFB_layout)
+
+    #
+    # Partition for S2T copy of SFA/SFB
+    #
+    # Make S2T CopyAtom
+    copy_atom_s2t = cute.make_copy_atom(
+        tcgen05.Cp4x32x128bOp(tcgen05.CtaGroup.ONE),
+        sf_dtype,
+    )
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact = cute.filter_zeros(sSFA)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFA_compact = cute.filter_zeros(tCtSFA)
+    tiled_copy_s2t_sfa = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFA_compact)
+    thr_copy_s2t_sfa = tiled_copy_s2t_sfa.get_slice(0)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact_s2t_ = thr_copy_s2t_sfa.partition_S(tCsSFA_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfa, tCsSFA_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFA_compact_s2t = thr_copy_s2t_sfa.partition_D(tCtSFA_compact)
+
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFB_compact = cute.filter_zeros(sSFB)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFB_compact = cute.filter_zeros(tCtSFB)
+    tiled_copy_s2t_sfb = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFB_compact)
+    thr_copy_s2t_sfb = tiled_copy_s2t_sfb.get_slice(0)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB_compact_s2t_ = thr_copy_s2t_sfb.partition_S(tCsSFB_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfb, tCsSFB_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFB_compact_s2t = thr_copy_s2t_sfb.partition_D(tCtSFB_compact)
+
+    #
+    # Slice to per mma tile index
+    #
+    # ((atom_v, rest_v), RestK)
+    tAgA = tAgA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgB = tBgB[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tAgSFA = tAgSFA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgSFB = tBgSFB[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+
+    #
+    # Execute Data copy and Math computation in the k_tile loop
+    #
+    if warp_idx == 0:  # only warp 0 issues the TMA loads and MMAs; the epilogue below is outside this branch
+        # Wait for accumulator buffer empty
+        acc_empty = acc_producer.acquire_and_advance()
+        # Set ACCUMULATE field to False for the first k_tile iteration
+        tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
+        # Execute k_tile loop
+        for k_tile in range(k_tile_cnt):
+            # Wait for AB buffer empty
+            ab_empty = ab_producer.acquire_and_advance()
+
+            #  TMA load A/B/SFA/SFB to shared memory
+            cute.copy(
+                tma_atom_a,
+                tAgA[(None, k_tile)],
+                tAsA[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_b,
+                tBgB[(None, k_tile)],
+                tBsB[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_sfa,
+                tAgSFA[(None, k_tile)],
+                tAsSFA[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_sfb,
+                tBgSFB[(None, k_tile)],
+                tBsSFB[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+
+            # Wait for AB buffer full (the same warp is both producer and consumer of this pipeline)
+            ab_full = ab_consumer.wait_and_advance()
+
+            # Copy SFA/SFB from shared memory to TMEM
+            s2t_stage_coord = (None, None, None, None, ab_full.index)
+            tCsSFA_compact_s2t_staged = tCsSFA_compact_s2t[s2t_stage_coord]
+            tCsSFB_compact_s2t_staged = tCsSFB_compact_s2t[s2t_stage_coord]
+            cute.copy(
+                tiled_copy_s2t_sfa,
+                tCsSFA_compact_s2t_staged,
+                tCtSFA_compact_s2t,
+            )
+            cute.copy(
+                tiled_copy_s2t_sfb,
+                tCsSFB_compact_s2t_staged,
+                tCtSFB_compact_s2t,
+            )
+
+            # tCtAcc += tCrA * tCrSFA * tCrB * tCrSFB
+            num_kblocks = cute.size(tCrA, mode=[2])
+            for kblock_idx in cutlass.range(num_kblocks, unroll_full=True):
+                kblock_coord = (
+                    None,
+                    None,
+                    kblock_idx,
+                    ab_full.index,
+                )
+
+                # Set SFA/SFB tensor to tiled_mma
+                sf_kblock_coord = (None, None, kblock_idx)
+                tiled_mma.set(
+                    tcgen05.Field.SFA,
+                    tCtSFA[sf_kblock_coord].iterator,
+                )
+                tiled_mma.set(
+                    tcgen05.Field.SFB,
+                    tCtSFB[sf_kblock_coord].iterator,
+                )
+
+                cute.gemm(
+                    tiled_mma,
+                    tCtAcc,
+                    tCrA[kblock_coord],
+                    tCrB[kblock_coord],
+                    tCtAcc,
+                )
+                # Enable accumulate on tCtAcc after first kblock
+                tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+
+            # Async arrive AB buffer empty
+            ab_full.release()
+        acc_empty.commit()
+
+    #
+    # Epilogue
+    # Partition for epilogue
+    #
+    op = tcgen05.Ld32x32bOp(tcgen05.Repetition.x128, tcgen05.Pack.NONE)
+    copy_atom_t2r = cute.make_copy_atom(op, cutlass.Float32)
+    tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tCtAcc)
+    thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_tAcc = thr_copy_t2r.partition_S(tCtAcc)
+    # (T2R_M, T2R_N, EPI_M, EPI_N, RestM, RestN, RestL)
+    tTR_gC = thr_copy_t2r.partition_D(tCgC)
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_rAcc = cute.make_rmem_tensor(
+        tTR_gC[None, None, None, None, 0, 0, 0].shape, cutlass.Float32
+    )
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_rC = cute.make_rmem_tensor(
+        tTR_gC[None, None, None, None, 0, 0, 0].shape, c_dtype
+    )
+    # STG Atom
+    simt_atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), c_dtype)
+    tTR_gC = tTR_gC[(None, None, None, None, *mma_tile_coord_mnl)]
+
+    # Wait for accumulator buffer full
+    acc_full = acc_consumer.wait_and_advance()
+
+    # Copy accumulator to register
+    cute.copy(tiled_copy_t2r, tTR_tAcc, tTR_rAcc)
+    acc_vec = tTR_rAcc.load().to(c_dtype)  # convert the Float32 accumulator values to the output dtype
+    tTR_rC.store(acc_vec)
+    # Store C to global memory
+    cute.copy(simt_atom, tTR_rC, tTR_gC)
+
+    acc_full.release()
+
+    # Deallocate TMEM
+    cute.arch.barrier()
+    tmem.free(acc_tmem_ptr)
+
+    return
+
+
+@cute.jit
+def my_kernel(
+    a_ptr: cute.Pointer,
+    b_ptr: cute.Pointer,
+    sfa_ptr: cute.Pointer,
+    sfb_ptr: cute.Pointer,
+    c_ptr: cute.Pointer,
+    problem_size: tuple,
+):
+    """
+    Host-side JIT function to prepare tensors and launch GPU kernel; problem_size is (m, n, k, l).
+    """
+    m, n, k, l = problem_size
+
+    # Build global tensors: A is (m, k, l) and B is (n, k, l), both with unit stride along k; C is (m, n, l) with unit stride along n
+    a_tensor = cute.make_tensor(
+        a_ptr,
+        cute.make_layout(
+            (m, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(m * k, 32)),
+        ),
+    )
+    b_tensor = cute.make_tensor(
+        b_ptr,
+        cute.make_layout(
+            (n, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(n * k, 32)),
+        ),
+    )
+    c_tensor = cute.make_tensor(
+        c_ptr, cute.make_layout((cute.assume(m, 32), n, l), stride=(n, 1, m * n))
+    )
+    # Setup sfa/sfb tensor by filling A/B tensor to scale factor atom layout
+    # ((Atom_M, Rest_M),(Atom_K, Rest_K),RestL)
+    sfa_layout = blockscaled_utils.tile_atom_to_shape_SF(
+        a_tensor.shape, sf_vec_size
+    )
+    sfa_tensor = cute.make_tensor(sfa_ptr, sfa_layout)
+
+    # ((Atom_N, Rest_N),(Atom_K, Rest_K),RestL)
+    sfb_layout = blockscaled_utils.tile_atom_to_shape_SF(
+        b_tensor.shape, sf_vec_size
+    )
+    sfb_tensor = cute.make_tensor(sfb_ptr, sfb_layout)
+
+    mma_op = tcgen05.MmaMXF4NVF4Op(
+        sf_dtype,
+        (mma_tiler_mnk[0], mma_tiler_mnk[1], mma_inst_shape_k),
+        tcgen05.CtaGroup.ONE,
+        tcgen05.OperandSource.SMEM,
+    )
+    tiled_mma = cute.make_tiled_mma(mma_op)
+
+    cluster_layout_vmnk = cute.tiled_divide(
+        cute.make_layout((1, 1, 1)),
+        (tiled_mma.thr_id.shape,),
+    )
+
+    # Compute A/B/SFA/SFB shared memory layouts (no shared memory layout is built for C)
+    a_smem_layout_staged = sm100_utils.make_smem_layout_a(
+        tiled_mma,
+        mma_tiler_mnk,
+        ab_dtype,
+        num_ab_stage,
+    )
+    b_smem_layout_staged = sm100_utils.make_smem_layout_b(
+        tiled_mma,
+        mma_tiler_mnk,
+        ab_dtype,
+        num_ab_stage,
+    )
+    sfa_smem_layout_staged = blockscaled_utils.make_smem_layout_sfa(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        num_ab_stage,
+    )
+    sfb_smem_layout_staged = blockscaled_utils.make_smem_layout_sfb(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        num_ab_stage,
+    )
+
+    atom_thr_size = cute.size(tiled_mma.thr_id.shape)
+
+    # Setup TMA for A
+    a_smem_layout = cute.slice_(a_smem_layout_staged, (None, None, None, 0))
+    tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        a_tensor,
+        a_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk.shape,
+    )
+    # Setup TMA for B
+    b_smem_layout = cute.slice_(b_smem_layout_staged, (None, None, None, 0))
+    tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        b_tensor,
+        b_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk.shape,
+    )
+    # Setup TMA for SFA
+    sfa_smem_layout = cute.slice_(
+        sfa_smem_layout_staged, (None, None, None, 0)
+    )
+    tma_atom_sfa, tma_tensor_sfa = cute.nvgpu.make_tiled_tma_atom_A(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        sfa_tensor,
+        sfa_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk.shape,
+        internal_type=cutlass.Int16,
+    )
+    # Setup TMA for SFB
+    sfb_smem_layout = cute.slice_(
+        sfb_smem_layout_staged, (None, None, None, 0)
+    )
+    tma_atom_sfb, tma_tensor_sfb = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        sfb_tensor,
+        sfb_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk.shape,
+        internal_type=cutlass.Int16,
+    )
+
+    # Compute TMA load bytes: one stage of A + B + SFA + SFB, scaled by the MMA atom's thread-id size
+    a_copy_size = cute.size_in_bytes(ab_dtype, a_smem_layout)
+    b_copy_size = cute.size_in_bytes(ab_dtype, b_smem_layout)
+    sfa_copy_size = cute.size_in_bytes(sf_dtype, sfa_smem_layout)
+    sfb_copy_size = cute.size_in_bytes(sf_dtype, sfb_smem_layout)
+    num_tma_load_bytes = (
+        a_copy_size + b_copy_size + sfa_copy_size + sfb_copy_size
+    ) * atom_thr_size
+
+    # Compute grid size: one CTA per (M tile, N tile, batch index) of C
+    grid = (
+        cute.ceil_div(c_tensor.shape[0], mma_tiler_mnk[0]),
+        cute.ceil_div(c_tensor.shape[1], mma_tiler_mnk[1]),
+        c_tensor.shape[2],
+    )
+
+    # Launch the kernel
+    kernel(
+        # MMA (Matrix Multiply-Accumulate) configuration
+        tiled_mma,                  # Tiled MMA object defining NVFP4 GEMM compute pattern
+        
+        # TMA (Tensor Memory Accelerator) atoms and tensors for input matrix A
+        tma_atom_a,                 # TMA copy atom defining how to load A from global memory
+        tma_tensor_a,               # Tensor descriptor for A matrix (m, k, l)
+        
+        # TMA atoms and tensors for input matrix B
+        tma_atom_b,                 # TMA copy atom defining how to load B from global memory
+        tma_tensor_b,               # Tensor descriptor for B matrix (n, k, l)
+        
+        # TMA atoms and tensors for scale factor A
+        tma_atom_sfa,               # TMA copy atom for loading scale factors for A
+        tma_tensor_sfa,             # Tensor descriptor for SFA (block scale factors for A)
+        
+        # TMA atoms and tensors for scale factor B
+        tma_atom_sfb,               # TMA copy atom for loading scale factors for B
+        tma_tensor_sfb,             # Tensor descriptor for SFB (block scale factors for B)
+        
+        # Output tensor C
+        c_tensor,                   # Output tensor C where result will be stored (m, n, l)
+        
+        # Shared memory layouts with staging for pipelined execution
+        a_smem_layout_staged,       # Staged shared memory layout for A (includes stage dimension)
+        b_smem_layout_staged,       # Staged shared memory layout for B (includes stage dimension)
+        sfa_smem_layout_staged,     # Staged shared memory layout for SFA (includes stage dimension)
+        sfb_smem_layout_staged,     # Staged shared memory layout for SFB (includes stage dimension)
+        
+        # Pipeline synchronization parameter
+        num_tma_load_bytes,         # Bytes of A+B+SFA+SFB per pipeline stage (used as tx_count of the TMA barrier)
+    ).launch(
+        grid=grid,
+        block=[threads_per_cta, 1, 1],
+        cluster=(1, 1, 1),
+    )
+    return
+
+
+# Global cache for compiled kernel
+_compiled_kernel_cache = None
+# This function is used to compile the kernel once and cache it and then allow users to 
+# run the kernel multiple times to get more accurate timing results.
+def compile_kernel():
+    """
+    Compile the kernel once and cache it.
+    This should be called before any timing measurements.
+
+    Returns:
+        The compiled kernel function
+    """
+    global _compiled_kernel_cache
+    
+    if _compiled_kernel_cache is not None:
+        return _compiled_kernel_cache
+    
+
+    # Placeholder CuTe pointers (address 0) for A/B/C/SFA/SFB; no real tensor is needed to compile
+    a_ptr = make_ptr(
+        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    b_ptr = make_ptr(
+        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    c_ptr = make_ptr(
+        c_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    sfa_ptr = make_ptr(
+        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb_ptr = make_ptr(
+        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
+    )
+
+    # Compile with a placeholder (0, 0, 0, 0) problem size; custom_kernel passes the real (m, n, k, l) on each call
+    _compiled_kernel_cache = cute.compile(my_kernel, a_ptr, b_ptr, sfa_ptr, sfb_ptr, c_ptr, (0, 0, 0, 0))
+    
+    return _compiled_kernel_cache
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Execute the block-scaled GEMM kernel.
+    
+    This is the main entry point called by the evaluation framework.
+    It wraps the PyTorch tensors' data pointers as CuTe pointers, launches the
+    cached compiled kernel, and returns the output tensor c.
+    
+    Args:
+        data: Tuple of (a, b, sfa_ref, sfb_ref, sfa_permuted, sfb_permuted, c) PyTorch tensors
+            a: [m, k, l] - Input matrix in float4e2m1fn (stored packed, so a.shape[1] is k // 2)
+            b: [n, k, l] - Input matrix in float4e2m1fn (presumably packed like a)
+            sfa_ref: [m, k // 16, l] - Scale factors in float8_e4m3fn, used by reference implementation (ignored here)
+            sfb_ref: [n, k // 16, l] - Scale factors in float8_e4m3fn, used by reference implementation (ignored here)
+            sfa_permuted: [32, 4, rest_m, 4, rest_k, l] - Scale factors in float8_e4m3fn 
+            sfb_permuted: [32, 4, rest_n, 4, rest_k, l] - Scale factors in float8_e4m3fn 
+            c: [m, n, l] - Output matrix in float16
+    
+    Returns:
+        Output tensor c with computed results
+    """
+    a, b, _, _, sfa_permuted, sfb_permuted, c = data
+    
+    # Ensure kernel is compiled (will use cached version if available)
+    # To avoid the compilation overhead, we compile the kernel once and cache it.
+    compiled_func = compile_kernel()
+
+    # Get dimensions from MxKxL layout
+    m, k, l = a.shape
+    n, _, _ = b.shape
+    # Torch uses the packed e2m1_x2 data type (two FP4 values per element), so the stored k is halved; recover the logical k
+    k = k * 2 
+
+    # Create CuTe pointers for A/B/C/SFA/SFB via torch tensor data pointer
+    a_ptr = make_ptr(
+        ab_dtype, a.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    b_ptr = make_ptr(
+        ab_dtype, b.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    c_ptr = make_ptr(
+        c_dtype, c.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    sfa_ptr = make_ptr(
+        sf_dtype, sfa_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb_ptr = make_ptr(
+        sf_dtype, sfb_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+
+    # Execute the compiled kernel
+    compiled_func(a_ptr, b_ptr, sfa_ptr, sfb_ptr, c_ptr, (m, n, k, l))
+
+    return c
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_gemm/task.py b/problems/nvidia/nvfp4_gemm/task.py
new file mode 100644
index 00000000..66db7351
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemm/task.py
@@ -0,0 +1,11 @@
+import torch
+from typing import TypedDict, TypeVar
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])  # 7 tensors; unpacked as (a, b, sfa, sfb, sfa_permuted, sfb_permuted, c)
+output_t = TypeVar("output_t", bound=torch.Tensor)  # the output tensor
+class TestSpec(TypedDict):  # keys describing one test case
+    m: int  # presumably rows of A and C — TODO confirm against generate_input
+    n: int  # presumably rows of B / columns of C — TODO confirm
+    k: int  # presumably the shared reduction dimension — TODO confirm
+    l: int  # presumably the batch count — TODO confirm
+    seed: int  # presumably the RNG seed for input generation — TODO confirm
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_gemm/template.py b/problems/nvidia/nvfp4_gemm/template.py
new file mode 100644
index 00000000..3855d694
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemm/template.py
@@ -0,0 +1,25 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Submission template for the block-scaled fp4 gemm problem
+    Args:
+        data: Tuple that expands to:
+            a: torch.Tensor[float4e2m1fn] of shape [m, k, l],
+            b: torch.Tensor[float4e2m1fn] of shape [n, k, l],
+            sfa: torch.Tensor[float8_e4m3fn] of shape [m, k // 16, l],
+            sfb: torch.Tensor[float8_e4m3fn] of shape [n, k // 16, l],
+            sfa_permuted: torch.Tensor[float8_e4m3fn] of shape [32, 4, rest_m, 4, rest_k, l],
+            sfb_permuted: torch.Tensor[float8_e4m3fn] of shape [32, 4, rest_n, 4, rest_k, l],
+            c: torch.Tensor[float16] of shape [m, n, l]
+    Returns:
+        Tensor containing output in float16
+        c: torch.Tensor[float16] of shape [m, n, l]
+    """
+    # c: [m, n, l] is pre-allocated memory to avoid timing allocation overhead.
+    a, b, sfa, sfb, sfa_permuted, sfb_permuted, c = data
+
+    # Your implementation here
+
+    return c
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_gemm/utils.py b/problems/nvidia/nvfp4_gemm/utils.py
new file mode 100644
index 00000000..d9b3a69e
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemm/utils.py
@@ -0,0 +1,172 @@
+import os
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):  # seed every RNG used here: Python's random, NumPy and torch
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():  # CUDA generators are only seeded when a CUDA device is present
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Return cuda if available, else mps, else cpu (after printing a notice); always cpu when use_cuda is False."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")  # reached when use_cuda is False or no GPU backend is available
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Check that two tensors are element-wise equal within a tolerance and describe any mismatches (does not raise).
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+    Returns:
+    list[str]: Empty if everything matches; ["SIZE MISMATCH"] if shapes differ; otherwise a mismatch count plus up to max_print details.
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received - expected)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements (NaN in exactly one of the two tensors)
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Check that two tensors are element-wise perfectly equal and describe any mismatches (does not raise).
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings with detailed error information
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function: (True, '') if output matches reference(data) within rtol/atol, else (False, reason).
+    """
+    expected = reference(data)  # run the reference on the same input data
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return False, "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return True, ''
+
+
+def make_match_reference(reference: callable, **kwargs):  # returns a check(data, output) callable with `reference` and kwargs bound
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)  # kwargs are forwarded unchanged (e.g. rtol, atol)
+    return wrapped
+
+
+class DeterministicContext:  # context manager that turns on deterministic torch/cuDNN settings for the duration of a `with` block
+    def __init__(self):
+        self.allow_tf32 = None  # torch.backends.cudnn.allow_tf32 as it was on entry
+        self.deterministic = None  # torch.backends.cudnn.deterministic as it was on entry
+        self.cublas = None  # CUBLAS_WORKSPACE_CONFIG as it was on entry ('' if it was unset)
+
+    def __enter__(self):
+        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '')  # only read here; nothing in __enter__ changes it
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        torch.use_deterministic_algorithms(True)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+        torch.use_deterministic_algorithms(False)  # NOTE(review): always disables instead of restoring the setting that was active on entry
+        os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas  # NOTE(review): creates the variable with value '' if it was unset on entry
+
+def clear_l2_cache():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor (32 * 1024 * 1024 int64 elements = 256 MiB) on the GPU
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # write to every element; presumably this is what pushes previously cached data out of L2
+    dummy.fill_(42)
+    del dummy

From 1bec480faab3ddcd6b864dccac32443ebbc55369 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Fri, 28 Nov 2025 08:35:33 +0100
Subject: [PATCH 151/207] fix: remove duplicate env var

---
 problems/nvidia.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 64ca9f71..4788237c 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -7,6 +7,6 @@ description: "NVIDIA Blackwell NVFP4 Kernel Hackathon"
 problems:
   - directory: nvidia/nvfp4_gemv
     name: nvfp4_gemv
-    deadline: "2025-11-28"
+    deadline: "2025-11-30"
     gpus:
       - NVIDIA

From e3f9699bb301377de6c689de6e73d4f3fa9d31f9 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Fri, 28 Nov 2025 08:41:03 +0100
Subject: [PATCH 152/207] Fix: nvidia deadline

---
 problems/nvidia.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 4788237c..ecbaa837 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -7,6 +7,6 @@ description: "NVIDIA Blackwell NVFP4 Kernel Hackathon"
 problems:
   - directory: nvidia/nvfp4_gemv
     name: nvfp4_gemv
-    deadline: "2025-11-30"
+    deadline: "2025-11-30 6:59"
     gpus:
       - NVIDIA

From 31f138027f33e4660feecdc3ba0141470c41ddbd Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Fri, 28 Nov 2025 03:10:40 -0500
Subject: [PATCH 153/207] Add new problem nvfp4_gemm to nvidia.yaml

---
 problems/nvidia.yaml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 64ca9f71..e5c8a5f6 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -10,3 +10,10 @@ problems:
     deadline: "2025-11-28"
     gpus:
       - NVIDIA
+
+problems:
+  - directory: nvidia/nvfp4_gemm
+    name: nvfp4_gemm
+    deadline: "2025-12-19"
+    gpus:
+      - NVIDIA

From 6b098a13432d885fe47dcd24d55a8377b4ce727a Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Fri, 28 Nov 2025 03:18:35 -0500
Subject: [PATCH 154/207] add task yaml

---
 problems/nvidia/nvfp4_gemm/task.yml | 55 +++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 problems/nvidia/nvfp4_gemm/task.yml

diff --git a/problems/nvidia/nvfp4_gemm/task.yml b/problems/nvidia/nvfp4_gemm/task.yml
new file mode 100644
index 00000000..5f4c712f
--- /dev/null
+++ b/problems/nvidia/nvfp4_gemm/task.yml
@@ -0,0 +1,55 @@
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  
+  You will implement a batched matrix-matrix multiplication kernel optimized for NVIDIA B200.
+  To be explicit, you will be given a tuple of tensors:
+  ```
+  (a, b, sfa, sfb, sfa_permuted, sfb_permuted, c)
+  ```
+  where:
+  * `a` is M x K x L in K-major order in nvfp4(e2m1)
+  * `b` is N x K x L in K-major order in nvfp4(e2m1)
+  * `sfa` is M x (K // 16) x L in K-major order in fp8(e4m3fnuz)
+  * `sfb` is N x (K // 16) x L in K-major order in fp8(e4m3fnuz)
+  * `sfa_permuted` is 32 x 4 x rest_m x 4 x rest_k x L in fp8(e4m3fnuz)
+  * `sfb_permuted` is 32 x 4 x rest_n x 4 x rest_k x L in fp8(e4m3fnuz)
+  * `c` is M x N x L in fp16
+  
+  Matrix sizes `M` and `N` are divisible by mma_tiler_mn[0] defined in the kernel, `K` is divisible by 64.
+  The ranking criteria is the geometric mean of the benchmark results.
+  For the grand price, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand price.
+
+config:
+  main: "eval.py"
+
+templates:
+  Python: "template.py"
+
+tests:
+  - {"m": 128, "n": 256, "k": 256, "l": 1, "seed": 1111}
+  - {"m": 128, "n": 1536, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 128, "n": 3072, "k": 1536, "l": 1, "seed": 1111}
+  - {"m": 256, "n": 7168, "k": 256, "l": 1, "seed": 1111}
+  - {"m": 256, "n": 7168, "k": 2048, "l": 1, "seed": 1111}
+  - {"m": 2304, "n": 4608, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 384, "n": 7168, "k": 2304, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 512, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 4096, "k": 512, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 1536, "k": 7168, "l": 1, "seed": 1111}
+
+benchmarks:
+  - {"m": 7168, "n": 128, "k": 16384, "l": 1, "seed": 1111}
+  - {"m": 4096, "n": 128, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 7168, "n": 128, "k": 2048, "l": 1, "seed": 1111}
+
+ranking_by: "geom"
+

From e7d01366fb2d4998ec3e88449219b202e8f1d351 Mon Sep 17 00:00:00 2001
From: Alex Zhang <alex.lx.zhang@gmail.com>
Date: Fri, 28 Nov 2025 03:24:36 -0500
Subject: [PATCH 155/207] update to correct task yml

---
 problems/nvidia/nvfp4_gemm/task.yml | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/problems/nvidia/nvfp4_gemm/task.yml b/problems/nvidia/nvfp4_gemm/task.yml
index 5f4c712f..06388bbc 100644
--- a/problems/nvidia/nvfp4_gemm/task.yml
+++ b/problems/nvidia/nvfp4_gemm/task.yml
@@ -1,3 +1,5 @@
+# name: nvfp4-block-scaled-gemm
+
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}
@@ -9,25 +11,29 @@ lang: "py"
 
 description: |
   
-  You will implement a batched matrix-matrix multiplication kernel optimized for NVIDIA B200.
+  You will implement a block scaled matrix-matrix multiplication kernel optimized for NVIDIA B200.
   To be explicit, you will be given a tuple of tensors:
   ```
-  (a, b, sfa, sfb, sfa_permuted, sfb_permuted, c)
+  (a, b, sfa, sfb, c)
   ```
   where:
   * `a` is M x K x L in K-major order in nvfp4(e2m1)
   * `b` is N x K x L in K-major order in nvfp4(e2m1)
   * `sfa` is M x (K // 16) x L in K-major order in fp8(e4m3fnuz)
   * `sfb` is N x (K // 16) x L in K-major order in fp8(e4m3fnuz)
-  * `sfa_permuted` is 32 x 4 x rest_m x 4 x rest_k x L in fp8(e4m3fnuz)
-  * `sfb_permuted` is 32 x 4 x rest_n x 4 x rest_k x L in fp8(e4m3fnuz)
   * `c` is M x N x L in fp16
   
-  Matrix sizes `M` and `N` are divisible by mma_tiler_mn[0] defined in the kernel, `K` is divisible by 64.
+  Matrix sizes `M` is divisible by mma_tiler_mn[0], `N` is divisible by mma_tiler_mn[1], `K` is divisible by 256.
   The ranking criteria is the geometric mean of the benchmark results.
   For the grand price, your kernel will be evaluated against the speed of light analysis
   and the solution closest to the speed of light will be awarded the grand price.
-
+  ```
+  The speed of light analysis is based on the max(FP4 Tensor Core math throughput, DRAM memory throughput) of B200 and tested under a 1.5 GHz clock:
+  M   N    K   L time[us]
+  128 7168 16384 1 8.994
+  128 4096 7168  1 2.354
+  128 7168 2048  1 1.333
+  ```
 config:
   main: "eval.py"
 
@@ -51,5 +57,4 @@ benchmarks:
   - {"m": 4096, "n": 128, "k": 7168, "l": 1, "seed": 1111}
   - {"m": 7168, "n": 128, "k": 2048, "l": 1, "seed": 1111}
 
-ranking_by: "geom"
-
+ranking_by: "geom"
\ No newline at end of file

From 28d05d4fe7fe475be6303efc4b28156529eecab6 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 29 Nov 2025 19:31:20 +0100
Subject: [PATCH 156/207] Fixes

---
 problems/nvidia/nvfp4_gemm/reference.py | 2 +-
 problems/nvidia/nvfp4_gemm/task.yml     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/problems/nvidia/nvfp4_gemm/reference.py b/problems/nvidia/nvfp4_gemm/reference.py
index 6853098c..c7cb96f9 100644
--- a/problems/nvidia/nvfp4_gemm/reference.py
+++ b/problems/nvidia/nvfp4_gemm/reference.py
@@ -155,7 +155,7 @@ def create_scale_factor_tensors(l, mn, sf_k):
     sfa_ref_cpu, sfa_ref_permuted = create_scale_factor_tensors(l, m, sf_k)
     sfb_ref_cpu, sfb_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
 
-    return (a_ref, b_ref, sfa_ref_cpu, sfb_ref_cpu, sfa_ref_permuted, sfb_ref_permuted, c_ref)
+    return (a_ref, b_ref, sfa_ref_cpu.to("cuda"), sfb_ref_cpu.to("cuda"), sfa_ref_permuted, sfb_ref_permuted, c_ref)
 
 
 check_implementation = make_match_reference(ref_kernel, rtol=1e-03, atol=1e-03)
diff --git a/problems/nvidia/nvfp4_gemm/task.yml b/problems/nvidia/nvfp4_gemm/task.yml
index 06388bbc..137dd861 100644
--- a/problems/nvidia/nvfp4_gemm/task.yml
+++ b/problems/nvidia/nvfp4_gemm/task.yml
@@ -5,7 +5,7 @@ files:
   - {"name": "task.py", "source": "task.py"}
   - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "../eval.py"}
+  - {"name": "eval.py", "source": "eval.py"}
 
 lang: "py"
 
@@ -57,4 +57,4 @@ benchmarks:
   - {"m": 4096, "n": 128, "k": 7168, "l": 1, "seed": 1111}
   - {"m": 7168, "n": 128, "k": 2048, "l": 1, "seed": 1111}
 
-ranking_by: "geom"
\ No newline at end of file
+ranking_by: "geom"

From a069e39f9be2db08dbb5bda2666bbcabd03cc51f Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 29 Nov 2025 19:38:22 +0100
Subject: [PATCH 157/207] Fixes

---
 problems/nvidia/nvfp4_gemm/task.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/problems/nvidia/nvfp4_gemm/task.yml b/problems/nvidia/nvfp4_gemm/task.yml
index 137dd861..544ca17f 100644
--- a/problems/nvidia/nvfp4_gemm/task.yml
+++ b/problems/nvidia/nvfp4_gemm/task.yml
@@ -1,5 +1,3 @@
-# name: nvfp4-block-scaled-gemm
-
 files:
   - {"name": "submission.py", "source": "@SUBMISSION@"}
   - {"name": "task.py", "source": "task.py"}

From 24d914620cb7a2b200ce71ed2d91701c33de788f Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 29 Nov 2025 19:46:16 +0100
Subject: [PATCH 158/207] Fixes

---
 problems/nvidia.yaml                | 2 +-
 problems/nvidia/nvfp4_gemm/task.yml | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index e5c8a5f6..d14f2798 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -14,6 +14,6 @@ problems:
 problems:
   - directory: nvidia/nvfp4_gemm
     name: nvfp4_gemm
-    deadline: "2025-12-19"
+    deadline: "2025-12-20 6:59"
     gpus:
       - NVIDIA
diff --git a/problems/nvidia/nvfp4_gemm/task.yml b/problems/nvidia/nvfp4_gemm/task.yml
index 544ca17f..37e7e9ee 100644
--- a/problems/nvidia/nvfp4_gemm/task.yml
+++ b/problems/nvidia/nvfp4_gemm/task.yml
@@ -3,12 +3,12 @@ files:
   - {"name": "task.py", "source": "task.py"}
   - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 
 description: |
-  
+
   You will implement a block scaled matrix-matrix multiplication kernel optimized for NVIDIA B200.
   To be explicit, you will be given a tuple of tensors:
   ```
@@ -20,7 +20,7 @@ description: |
   * `sfa` is M x (K // 16) x L in K-major order in fp8(e4m3fnuz)
   * `sfb` is N x (K // 16) x L in K-major order in fp8(e4m3fnuz)
   * `c` is M x N x L in fp16
-  
+
   Matrix sizes `M` is divisible by mma_tiler_mn[0], `N` is divisible by mma_tiler_mn[1], `K` is divisible by 256.
   The ranking criteria is the geometric mean of the benchmark results.
   For the grand price, your kernel will be evaluated against the speed of light analysis

From d26c5900c0edfb61bf816060f08c2cd1e0e7e973 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Sat, 29 Nov 2025 21:01:15 +0100
Subject: [PATCH 159/207] Fix: flip back gemm shapes

---
 problems/nvidia/nvfp4_gemm/task.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/problems/nvidia/nvfp4_gemm/task.yml b/problems/nvidia/nvfp4_gemm/task.yml
index 37e7e9ee..27875155 100644
--- a/problems/nvidia/nvfp4_gemm/task.yml
+++ b/problems/nvidia/nvfp4_gemm/task.yml
@@ -51,8 +51,8 @@ tests:
   - {"m": 512, "n": 1536, "k": 7168, "l": 1, "seed": 1111}
 
 benchmarks:
-  - {"m": 7168, "n": 128, "k": 16384, "l": 1, "seed": 1111}
-  - {"m": 4096, "n": 128, "k": 7168, "l": 1, "seed": 1111}
-  - {"m": 7168, "n": 128, "k": 2048, "l": 1, "seed": 1111}
+  - {"m": 128, "n": 7168, "k": 16384, "l": 1, "seed": 1111}
+  - {"m": 128, "n": 4096, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 128, "n": 7168, "k": 2048, "l": 1, "seed": 1111}
 
 ranking_by: "geom"

From 38be59362f6463dfbe546e472ac08ebd90814835 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Sun, 30 Nov 2025 09:52:31 +0100
Subject: [PATCH 160/207] Update deadline for nvfp4_gemv problem

---
 problems/nvidia.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 1087fd13..e9536874 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -7,7 +7,7 @@ description: "NVIDIA Blackwell NVFP4 Kernel Hackathon"
 problems:
   - directory: nvidia/nvfp4_gemv
     name: nvfp4_gemv
-    deadline: "2025-11-30 6:59"
+    deadline: "2025-11-29 6:59"
     gpus:
       - NVIDIA
 

From 58a9f4e2cd947b66259d5fb5fa0d68521863b268 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 30 Nov 2025 14:46:14 +0100
Subject: [PATCH 161/207] Feat: improve eval.py

---
 problems/nvidia/eval_better_bench.py | 491 +++++++++++++++++++++++++++
 problems/nvidia/nvfp4_gemm/task.yml  |   2 +-
 problems/nvidia/nvfp4_gemv/task.yml  |   2 +-
 3 files changed, 493 insertions(+), 2 deletions(-)
 create mode 100644 problems/nvidia/eval_better_bench.py

diff --git a/problems/nvidia/eval_better_bench.py b/problems/nvidia/eval_better_bench.py
new file mode 100644
index 00000000..d8bdeced
--- /dev/null
+++ b/problems/nvidia/eval_better_bench.py
@@ -0,0 +1,491 @@
+import base64
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+
+import torch.cuda
+from cutlass.cute.nvgpu.common import OpError
+from torch.cuda.nvtx import range as nvtx_range
+
+from utils import set_seed, clear_l2_cache
+
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, "w")
+        os.set_inheritable(fd, False)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a + b) * (a + b + 1) // 2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        exit(113)
+
+    tests = []
+    lines = content.splitlines()
+    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
+    for line in lines:
+        parts = line.split(";")
+        case = {}
+        for part in parts:
+            matched = re.match(match, part)
+            if not re.fullmatch(match, part):
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                exit(113)
+            key = matched[1]
+            val = matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                pass
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a list of durations.
+
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    variance = sum(map(lambda x: (x - avg) ** 2, durations))
+    std = math.sqrt(variance / (runs - 1))
+    err = std / math.sqrt(runs)
+
+    return Stats(
+        runs=runs, mean=avg, std=std, err=err, best=float(best), worst=float(worst)
+    )
+
+
+def _clone_data(data):
+    """
+    Recursively goes through data and clones all tensors.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        return data.clone()
+    else:
+        return data
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    try:
+        submission_output = custom_kernel(_clone_data(data))
+
+    except OpError as E:
+        print(f"Encountered {E}", file=sys.stderr)
+        return False, str(E)
+    torch.cuda.synchronize()
+    return check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in another process.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if not good:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_benchmark(
+    test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float
+) -> Stats | Any:
+    """
+    Runs one benchmark. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    num_iterations_per_benchmark = 50
+    data_list = []
+    # generate input data once
+
+    for i in range(num_iterations_per_benchmark):
+        if "seed" in test.args:
+            test.args["seed"] += 42
+        data = generate_input(**test.args)
+        data_list.append(data)
+
+    check_copy = _clone_data(data_list)
+
+    #  first, one obligatory correctness check
+    outputs = []
+    try:
+        for data in data_list:
+            output = custom_kernel(_clone_data(data))
+            outputs.append(output)
+    except OpError as E:
+        return f"Encountered {E}"
+    for out_idx, output in enumerate(outputs):
+        good, message = check_implementation(check_copy[out_idx], output)
+        if not good:
+            return message
+
+    # now, do multiple timing runs without further correctness testing
+    # there is an upper bound of max_repeats runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until mean duration * runs exceeds max_time_ns,
+    # 2 minutes of wallclock pass, or the relative error of the mean is below 0.1%.
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        torch.cuda.synchronize()
+        
+        clear_l2_cache()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+        for data in data_list:
+            output = custom_kernel(data)
+        end_event.record()
+        torch.cuda.synchronize()
+        duration = (start_event.elapsed_time(end_event) / num_iterations_per_benchmark) * 1e6  # Convert ms to ns
+
+        if recheck:
+            for out_idx, output in enumerate(outputs):
+                good, message = check_implementation(check_copy[out_idx], output)
+            if not good:
+                return message
+
+        durations.append(duration)
+
+        total_bm_duration = time.perf_counter_ns() - bm_start_time
+        if i > 1 and total_bm_duration > 1e8:       # at least 2 runs, and at least 100 ms total time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if (
+                stats.err / stats.mean < 0.001
+                or stats.mean * stats.runs > max_time_ns
+                or total_bm_duration > 120e9
+            ):
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(
+    pool: multiprocessing.Pool,
+    test: TestCase,
+    recheck: bool,
+    max_repeats: int,
+    max_time_ns: float,
+):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param pool: Process on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+
+
+def run_benchmarking(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Process on which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+
+    run_single_benchmark(pool, tests[0], False, 200, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 200, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_profile_torch(test: TestCase) -> str:
+    """
+    Profiles a single benchmark using the torch profiler.
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, ProfilerActivity
+
+    with nvtx_range("generate input"):
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+
+    cloned = _clone_data(data)
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        with nvtx_range("custom_kernel"):
+            submission_output = custom_kernel(cloned)
+            torch.cuda.synchronize()
+
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def _run_single_profile_ncu(test: TestCase) -> str:
+    """
+    Profiles a single benchmark using ncu. Note: this does not
+    invoke NCU; instead, it is expected that eval is launched
+    under NCU, and this function will run the kernel exactly
+    once in the 'custom_kernel' nvtx range.
+    """
+    from submission import custom_kernel
+
+    with nvtx_range("generate input"):
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+
+    cloned = _clone_data(data)
+    with nvtx_range("custom_kernel"):
+        submission_output = custom_kernel(cloned)
+        torch.cuda.synchronize()
+
+    return ""
+
+
+def _combine_traces(traces: list["EventList"]) -> "EventList":
+    """
+    Combine multiple event traces obtained from multiple (distributed) torch.profiler
+    activities. This function simply aggregates the data as like `prof.key_averages()`,
+    except over multiple traces. Most of this function is reimplemented
+    from `torch.autograd.profiler_util.EventList.key_averages()`.
+    """
+    from torch.autograd.profiler_util import FunctionEventAvg, EventList
+    from collections import defaultdict
+
+    def get_key(event) -> tuple[str, ...]:
+        return (
+            str(event.key),
+            str(event.node_id),
+            str(event.device_type),
+            str(event.is_legacy),
+            str(event.is_user_annotation),
+        )
+
+    stats: dict[tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)
+
+    for events in traces:
+        for event in events:
+            stats[get_key(event)].add(event)
+
+    avg_list = EventList(stats.values())
+    for event in avg_list:
+        event.stack = []
+        event.input_shapes = ""
+        event.overload_name = ""
+
+    return avg_list
+
+
+def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
+    """
+    Runs a single profiling activity (NCU or torch profiler) in another process.
+    """
+    # compare the value: bool() of any non-empty string, even the default "0", is True
+    if os.getenv("POPCORN_NCU", "0") not in ("", "0"):
+        return pool.apply(_run_single_profile_ncu, (test,))
+    return pool.apply(_run_single_profile_torch, (test,))
+
+
+def run_profiling(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test, pool)
+        logger.log(
+            f"benchmark.{idx}.report",
+            base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"),
+        )
+    logger.log("check", "pass")
+    return 0
+
+
+def main():
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+
+    tests = get_test_cases(sys.argv[2], seed)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+
+        mp_context = multiprocessing.get_context("spawn")
+        with mp_context.Pool(1) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                run_single_benchmark(pool, tests[0], False, 200, 1e7)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 200, 30e9)
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(
+                                f"benchmark.{i}.{field.name}",
+                                getattr(result, field.name),
+                            )
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(
+                            f"benchmark.{i}.error", str(result)
+                        )  # TODO: Make sure result implements __str__?
+                        break
+
+                logger.log("check", "pass" if passed else "fail")
+            elif mode == "profile":
+                run_profiling(logger, pool, tests)
+            else:
+                # TODO: Implement script mode
+                return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/problems/nvidia/nvfp4_gemm/task.yml b/problems/nvidia/nvfp4_gemm/task.yml
index 27875155..aca40fd6 100644
--- a/problems/nvidia/nvfp4_gemm/task.yml
+++ b/problems/nvidia/nvfp4_gemm/task.yml
@@ -3,7 +3,7 @@ files:
   - {"name": "task.py", "source": "task.py"}
   - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "../eval.py"}
+  - {"name": "eval.py", "source": "../eval_better_bench.py"}
 
 lang: "py"
 
diff --git a/problems/nvidia/nvfp4_gemv/task.yml b/problems/nvidia/nvfp4_gemv/task.yml
index 173914bd..8dac81e7 100644
--- a/problems/nvidia/nvfp4_gemv/task.yml
+++ b/problems/nvidia/nvfp4_gemv/task.yml
@@ -3,7 +3,7 @@ files:
   - {"name": "task.py", "source": "task.py"}
   - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "../eval.py"}
+  - {"name": "eval.py", "source": "../eval_new.py"}
 
 lang: "py"
 

From 2806e91935b4bda75550022ed6b37b631fe0dcde Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 30 Nov 2025 14:47:21 +0100
Subject: [PATCH 162/207] ref

---
 problems/nvidia/eval_better_bench.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/problems/nvidia/eval_better_bench.py b/problems/nvidia/eval_better_bench.py
index d8bdeced..42c269fd 100644
--- a/problems/nvidia/eval_better_bench.py
+++ b/problems/nvidia/eval_better_bench.py
@@ -22,6 +22,8 @@
 
 from reference import check_implementation, generate_input
 
+NUM_ITERATIONS_PER_BENCHMARK = 50
+
 
 class PopcornOutput:
     def __init__(self, fd: int):
@@ -208,11 +210,10 @@ def _run_single_benchmark(
     from submission import custom_kernel
 
     durations = []
-    num_iterations_per_benchmark = 50
     data_list = []
     # generate input data once
 
-    for i in range(num_iterations_per_benchmark):
+    for i in range(NUM_ITERATIONS_PER_BENCHMARK):
         if "seed" in test.args:
             test.args["seed"] += 42
         data = generate_input(**test.args)
@@ -241,7 +242,7 @@ def _run_single_benchmark(
     bm_start_time = time.perf_counter_ns()
     for i in range(max_repeats):
         torch.cuda.synchronize()
-        
+
         clear_l2_cache()
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
@@ -250,7 +251,9 @@ def _run_single_benchmark(
             output = custom_kernel(data)
         end_event.record()
         torch.cuda.synchronize()
-        duration = (start_event.elapsed_time(end_event) / num_iterations_per_benchmark) * 1e6  # Convert ms to ns
+        duration = (
+            start_event.elapsed_time(end_event) / NUM_ITERATIONS_PER_BENCHMARK
+        ) * 1e6  # Convert ms to ns
 
         if recheck:
             for out_idx, output in enumerate(outputs):
@@ -261,7 +264,9 @@ def _run_single_benchmark(
         durations.append(duration)
 
         total_bm_duration = time.perf_counter_ns() - bm_start_time
-        if i > 1 and total_bm_duration > 1e8:       # at least 2 runs, and at least 100 ms total time
+        if (
+            i > 1 and total_bm_duration > 1e8
+        ):  # at least 2 runs, and at least 100 ms total time
             stats = calculate_stats(durations)
             # stop if either
             # a) relative error dips below 0.1%

From 62c005f6b7c598d27201db0f8d4f3b6529e33a4e Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 30 Nov 2025 14:48:17 +0100
Subject: [PATCH 163/207] ref

---
 problems/nvidia/nvfp4_gemv/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/nvidia/nvfp4_gemv/task.yml b/problems/nvidia/nvfp4_gemv/task.yml
index 8dac81e7..173914bd 100644
--- a/problems/nvidia/nvfp4_gemv/task.yml
+++ b/problems/nvidia/nvfp4_gemv/task.yml
@@ -3,7 +3,7 @@ files:
   - {"name": "task.py", "source": "task.py"}
   - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "../eval_new.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
 
 lang: "py"
 

From 45d6a9a266819bb9243fb52016c3b00e4856c5a6 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 30 Nov 2025 16:04:26 +0100
Subject: [PATCH 164/207] Fix: correct recheck

---
 problems/nvidia/eval_better_bench.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/problems/nvidia/eval_better_bench.py b/problems/nvidia/eval_better_bench.py
index 42c269fd..20660a40 100644
--- a/problems/nvidia/eval_better_bench.py
+++ b/problems/nvidia/eval_better_bench.py
@@ -243,12 +243,14 @@ def _run_single_benchmark(
     for i in range(max_repeats):
         torch.cuda.synchronize()
 
+        outputs = []
         clear_l2_cache()
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
         for data in data_list:
             output = custom_kernel(data)
+            outputs.append(output)
         end_event.record()
         torch.cuda.synchronize()
         duration = (

From 952d71cdb49eaf24b2d9f3c43ebc6e05840d5caa Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 30 Nov 2025 16:13:55 +0100
Subject: [PATCH 165/207] More readable

---
 problems/nvidia/eval_better_bench.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/problems/nvidia/eval_better_bench.py b/problems/nvidia/eval_better_bench.py
index 20660a40..1aa9a5fd 100644
--- a/problems/nvidia/eval_better_bench.py
+++ b/problems/nvidia/eval_better_bench.py
@@ -229,8 +229,8 @@ def _run_single_benchmark(
             outputs.append(output)
     except OpError as E:
         return f"Encountered {E}"
-    for out_idx, output in enumerate(outputs):
-        good, message = check_implementation(check_copy[out_idx], output)
+    for reference_output, custom_output in zip(check_copy, outputs):
+        good, message = check_implementation(reference_output, custom_output)
         if not good:
             return message
 
@@ -258,8 +258,8 @@ def _run_single_benchmark(
         ) * 1e6  # Convert ms to ns
 
         if recheck:
-            for out_idx, output in enumerate(outputs):
-                good, message = check_implementation(check_copy[out_idx], output)
+            for reference_output, custom_output in zip(check_copy, outputs):
+                good, message = check_implementation(reference_output, custom_output)
             if not good:
                 return message
 

From 201e3fb90e01fa3379c63f87248803cdbab2001d Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 2 Dec 2025 19:55:17 +0100
Subject: [PATCH 166/207] Feat: larger l2 cache

---
 problems/nvidia/eval_better_bench.py |  2 +-
 problems/nvidia/utils.py             | 52 ++++++++++++++++++++--------
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/problems/nvidia/eval_better_bench.py b/problems/nvidia/eval_better_bench.py
index 1aa9a5fd..7754b370 100644
--- a/problems/nvidia/eval_better_bench.py
+++ b/problems/nvidia/eval_better_bench.py
@@ -13,7 +13,7 @@
 from cutlass.cute.nvgpu.common import OpError
 from torch.cuda.nvtx import range as nvtx_range
 
-from utils import set_seed, clear_l2_cache
+from utils import set_seed, clear_l2_cache_large
 
 try:
     from task import TestSpec
diff --git a/problems/nvidia/utils.py b/problems/nvidia/utils.py
index 7997d3db..b2859f09 100644
--- a/problems/nvidia/utils.py
+++ b/problems/nvidia/utils.py
@@ -28,11 +28,7 @@ def get_device(use_cuda: bool = True) -> torch.device:
 # Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
 @torch.no_grad()
 def verbose_allclose(
-        received: torch.Tensor,
-        expected: torch.Tensor,
-        rtol=1e-05,
-        atol=1e-08,
-        max_print=5
+    received: torch.Tensor, expected: torch.Tensor, rtol=1e-05, atol=1e-08, max_print=5
 ) -> list[str]:
     """
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
@@ -64,9 +60,13 @@ def verbose_allclose(
     nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
 
     # Find +inf mismatched elements
-    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    posinf_mismatched = torch.logical_xor(
+        torch.isposinf(received), torch.isposinf(expected)
+    )
     # Find -inf mismatched elements
-    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+    neginf_mismatched = torch.logical_xor(
+        torch.isneginf(received), torch.isneginf(expected)
+    )
 
     # Find all mismatched elements
     mismatched = torch.logical_or(
@@ -87,14 +87,18 @@ def verbose_allclose(
             i = tuple(index.tolist())
             mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+            mismatch_details.append(
+                f"... and {num_mismatched - max_print} more mismatched elements."
+            )
         return mismatch_details
 
     return []
 
 
 @torch.no_grad()
-def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+def verbose_allequal(
+    received: torch.Tensor, expected: torch.Tensor, max_print: int = 5
+):
     """
     Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
 
@@ -120,13 +124,17 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print:
             i = tuple(index.tolist())
             mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+            mismatch_details.append(
+                f"... and {num_mismatched - max_print} more mismatched elements."
+            )
         return mismatch_details
 
     return []
 
 
-def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
+def match_reference(
+    data, output, reference: callable, rtol=1e-05, atol=1e-08
+) -> tuple[bool, str]:
     """
     Convenient "default" implementation for tasks' `check_implementation` function.
     """
@@ -134,14 +142,19 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -
     reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
 
     if len(reasons) > 0:
-        return False, "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+        return (
+            False,
+            "mismatch found! custom implementation doesn't match reference: "
+            + " ".join(reasons),
+        )
 
-    return True, ''
+    return True, ""
 
 
 def make_match_reference(reference: callable, **kwargs):
     def wrapped(data, output):
         return match_reference(data, output, reference=reference, **kwargs)
+
     return wrapped
 
 
@@ -152,7 +165,7 @@ def __init__(self):
         self.cublas = None
 
     def __enter__(self):
-        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '')
+        self.cublas = os.environ.get("CUBLAS_WORKSPACE_CONFIG", "")
         self.allow_tf32 = torch.backends.cudnn.allow_tf32
         self.deterministic = torch.backends.cudnn.deterministic
         torch.backends.cudnn.allow_tf32 = False
@@ -164,7 +177,8 @@ def __exit__(self, exc_type, exc_value, traceback):
         torch.backends.cudnn.allow_tf32 = self.allow_tf32
         torch.backends.cudnn.deterministic = self.deterministic
         torch.use_deterministic_algorithms(False)
-        os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
+        os.environ["CUBLAS_WORKSPACE_CONFIG"] = self.cublas
+
 
 def clear_l2_cache():
     # import cupy as cp
@@ -172,3 +186,11 @@ def clear_l2_cache():
     # create a large dummy tensor
     dummy = torch.randn((1024, 1024, 1024), device="cuda")
     del dummy
+
+
+def clear_l2_cache_large():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.randn((16000, 1024, 1024), device="cuda")
+    del dummy

From c6bf4366f8b646408839a7bed89471c38966cda6 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Tue, 2 Dec 2025 19:56:39 +0100
Subject: [PATCH 167/207] Feat: larger l2 cache

---
 problems/nvidia/eval_better_bench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/nvidia/eval_better_bench.py b/problems/nvidia/eval_better_bench.py
index 7754b370..359f4c1b 100644
--- a/problems/nvidia/eval_better_bench.py
+++ b/problems/nvidia/eval_better_bench.py
@@ -13,7 +13,7 @@
 from cutlass.cute.nvgpu.common import OpError
 from torch.cuda.nvtx import range as nvtx_range
 
-from utils import set_seed, clear_l2_cache_large
+from utils import set_seed, clear_l2_cache_large as clear_l2_cache
 
 try:
     from task import TestSpec

From 47cec7104f48d24d0520034751f8aa3bf0ca90b3 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 3 Dec 2025 15:11:25 -0500
Subject: [PATCH 168/207] nvfp4_gemm: Update random tensor generation ranges in
 reference.py

---
 problems/nvidia/nvfp4_gemm/reference.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/problems/nvidia/nvfp4_gemm/reference.py b/problems/nvidia/nvfp4_gemm/reference.py
index c7cb96f9..421db5ef 100644
--- a/problems/nvidia/nvfp4_gemm/reference.py
+++ b/problems/nvidia/nvfp4_gemm/reference.py
@@ -84,10 +84,10 @@ def generate_input(
     
     # Generate uint8 tensor, then convert to float4e2m1fn_x2 data type
     a_ref = torch.randint(
-        -6, 6, (l, m, k // 2), dtype=torch.int8, device="cuda"
+        -128, 128, (l, m, k // 2), dtype=torch.int8, device="cuda"
     ).permute(1, 2, 0)
     b_ref = torch.randint(
-        -6, 6, (l, n, k // 2), dtype=torch.int8, device="cuda"
+        -128, 128, (l, n, k // 2), dtype=torch.int8, device="cuda"
     ).permute(1, 2, 0)
     a_ref = a_ref.view(torch.float4_e2m1fn_x2)
     b_ref = b_ref.view(torch.float4_e2m1fn_x2)
@@ -105,7 +105,7 @@ def create_scale_factor_tensors(l, mn, sf_k):
         ref_shape = (l, mn, sf_k)
         ref_permute_order = (1, 2, 0)
         # Init with uint8 tensor, then convert to float8_e4m3fn
-        ref_f8_random_int = torch.randint(-3, 3, ref_shape, dtype=torch.int8, device='cuda')
+        ref_f8_random_int = torch.randint(0, 4, ref_shape, dtype=torch.int8, device='cuda')
         ref_f8_torch_tensor = ref_f8_random_int.to(dtype=torch.float8_e4m3fn)
         # permute to match ref_permute_order
         ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)
@@ -125,7 +125,7 @@ def create_scale_factor_tensors(l, mn, sf_k):
         # Which is needed by the CuTe customized kernel
         mma_permute_order = (3, 4, 1, 5, 2, 0)
         # Generate a random int8 tensor, then convert to float8_e4m3fn
-        rand_int_tensor = torch.randint(-3, 3, mma_shape, dtype=torch.int8, device='cuda')
+        rand_int_tensor = torch.randint(0, 4, mma_shape, dtype=torch.int8, device='cuda')
         reordered_f8_torch_tensor = rand_int_tensor.to(dtype=torch.float8_e4m3fn)
         # Permute according to mma_permute_order
         reordered_f8_torch_tensor = reordered_f8_torch_tensor.permute(*mma_permute_order)

From 0b692668dfeac8ac94b44e62047327cba552c32d Mon Sep 17 00:00:00 2001
From: Vicki Wang <vickiw@nvidia.com>
Date: Mon, 15 Dec 2025 21:47:47 -0800
Subject: [PATCH 169/207] update task.yml

---
 problems/nvidia/nvfp4_dual_gemm/eval.py  | 20 +++-----
 problems/nvidia/nvfp4_dual_gemm/task.yml | 64 ++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 12 deletions(-)
 create mode 100644 problems/nvidia/nvfp4_dual_gemm/task.yml

diff --git a/problems/nvidia/nvfp4_dual_gemm/eval.py b/problems/nvidia/nvfp4_dual_gemm/eval.py
index e8bb5b21..ca325354 100644
--- a/problems/nvidia/nvfp4_dual_gemm/eval.py
+++ b/problems/nvidia/nvfp4_dual_gemm/eval.py
@@ -107,7 +107,6 @@ class Stats:
 def calculate_stats(durations: list[int]):
     """
     Calculate statistical data from a list of durations.
-
     @param durations: A list of durations in nanoseconds.
     @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
     """
@@ -172,7 +171,6 @@ def run_testing(
 ):
     """
     Executes the actual test case code and checks for correctness.
-
     @param logger: A PopcornOutput object used for logging test results.
     @param tests: A list of TestCase objects representing the test cases to be executed.
     @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
@@ -185,7 +183,7 @@ def run_testing(
         logger.log("compile.error", compile_error)
         return 112
     logger.log("compile", "pass")
-    
+
     # Step 2: Run all tests with compiled kernel
     passed = True
     logger.log("test-count", len(tests))
@@ -215,7 +213,7 @@ def _compile_kernel_once():
     This ensures compilation time is not included in benchmark results.
     """
     from submission import compile_kernel
-    
+
     try:
         # Trigger compilation (will be cached)
         compile_kernel()
@@ -239,7 +237,7 @@ def _run_single_benchmark(
     # generate input data once
     data = generate_input(**test.args)
     check_copy = _clone_data(data)
-    
+
     # Ensure kernel is compiled before any timing (compilation is cached)
     try:
         compile_kernel()
@@ -248,7 +246,7 @@ def _run_single_benchmark(
         return f"Compilation failed: {E}"
     except Exception as E:
         return f"Compilation failed: {E}"
-    
+
     #  first, one obligatory correctness check
     try:
         output = custom_kernel(_clone_data(data))
@@ -317,7 +315,6 @@ def run_single_benchmark(
 ):
     """
     For a particular test case, check correctness (if applicable) and grab runtime results.
-
     @param pool: Process on which the benchmark will be launched.
     @param test: TestCase object.
     @param recheck: Flag for whether to explicitly check functional correctness.
@@ -333,7 +330,6 @@ def run_benchmarking(
 ):
     """
     Executes benchmarking code for a CUDA Kernel and logs runtimes.
-
     @param logger: A PopcornOutput object used for logging benchmark results.
     @param pool: Process on which the benchmarks will be launched.
     @param tests: A list of TestCase objects representing the test cases to be benchmarked.
@@ -347,7 +343,7 @@ def run_benchmarking(
         logger.log("compile.error", compile_error)
         return 112
     logger.log("compile", "pass")
-    
+
     # Step 2: Warm up with compiled kernel
     run_single_benchmark(pool, tests[0], False, 200, 10e7)
 
@@ -464,10 +460,10 @@ def build_test_string(tests: list[dict]):
                     logger.log("compile.error", compile_error)
                     return 112
                 logger.log("compile", "pass")
-                
+
                 # Step 2: Warmup with compiled kernel
                 run_single_benchmark(pool, tests[0], False, 200, 1e7)
-                
+
                 # Step 3: Run leaderboard benchmarks (compilation time excluded)
                 logger.log("benchmark-count", len(tests))
                 passed = True
@@ -497,4 +493,4 @@ def build_test_string(tests: list[dict]):
 
 
 if __name__ == "__main__":
-    sys.exit(main())
+    sys.exit(main())
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_dual_gemm/task.yml b/problems/nvidia/nvfp4_dual_gemm/task.yml
new file mode 100644
index 00000000..b22a2233
--- /dev/null
+++ b/problems/nvidia/nvfp4_dual_gemm/task.yml
@@ -0,0 +1,64 @@
+# name: nvfp4-dual-gemm
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  
+  You will implement a block scaled dual matrix-matrix multiplication kernel with silu activation optimized for NVIDIA B200.
+  To be explicit, you will be given a tuple of tensors:
+  ```
+  (a, b1, b2, sfa, sfb1, sfb2, c)
+  ```
+  where:
+  * `a` is M x K x L in K-major order in nvfp4(e2m1)
+  * `b1` is N x K x L in K-major order in nvfp4(e2m1)
+  * `b2` is N x K x L in K-major order in nvfp4(e2m1)
+  * `sfa` is M x (K // 16) x L in K-major order in fp8(e4m3fnuz)
+  * `sfb1` is N x (K // 16) x L in K-major order in fp8(e4m3fnuz)
+  * `sfb2` is N x (K // 16) x L in K-major order in fp8(e4m3fnuz)
+  * `c` is M x N x L in fp16
+  
+  Matrix sizes: `M` is divisible by mma_tiler_mn[0], `N` is divisible by mma_tiler_mn[1], and `K` is divisible by 256.
+  The ranking criterion is the geometric mean of the benchmark results.
+  For the grand prize, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand prize.
+  ```
+  The speed of light analysis is based on the max(FP4 Tensor Core math throughput, DRAM memory throughput) of B200 and tested under a 1.5GHz clock:
+    M   N   K   L time[us] 
+  256 4096 7168 1 4.708
+  512 4096 7168 1 8.714
+  256 3072 4096 1 2.125
+  512 3072 7168 1 6.535
+  ```
+config:
+  main: "eval.py"
+
+templates:
+  Python: "template.py"
+
+tests:
+  - {"m": 1536, "n": 512, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 256, "n": 512, "k": 256, "l": 1, "seed": 1111}
+  - {"m": 1536, "n": 512, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 3072, "n": 1024, "k": 1536, "l": 1, "seed": 1111}
+  - {"m": 7168, "n": 1024, "k": 256, "l": 1, "seed": 1111}
+  - {"m": 7168, "n": 2304, "k": 2048, "l": 1, "seed": 1111}
+  - {"m": 4608, "n": 384, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 7168, "n": 384, "k": 2304, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 768, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 4096, "n": 768, "k": 512, "l": 1, "seed": 1111}
+  
+benchmarks:
+  - {"m": 256, "n": 4096, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 4096, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 256, "n": 3072, "k": 4096, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 3072, "k": 7168, "l": 1, "seed": 1111}
+
+ranking_by: "geom"
\ No newline at end of file

From 8cdfecd5d07845da25647f4d4be3b82c0f42b518 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Fri, 19 Dec 2025 18:24:34 +0100
Subject: [PATCH 170/207] Fix: increase modal timeout to 5min

---
 problems/pmpp_v2/conv2d_py/task.yml    | 2 +-
 problems/pmpp_v2/grayscale_py/task.yml | 2 +-
 problems/pmpp_v2/histogram_py/task.yml | 2 +-
 problems/pmpp_v2/matmul_py/task.yml    | 2 +-
 problems/pmpp_v2/prefixsum_py/task.yml | 2 +-
 problems/pmpp_v2/sort_py/task.yml      | 2 +-
 problems/pmpp_v2/vectoradd_py/task.yml | 2 +-
 problems/pmpp_v2/vectorsum_py/task.yml | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/problems/pmpp_v2/conv2d_py/task.yml b/problems/pmpp_v2/conv2d_py/task.yml
index 55adc532..f754ce7f 100644
--- a/problems/pmpp_v2/conv2d_py/task.yml
+++ b/problems/pmpp_v2/conv2d_py/task.yml
@@ -44,4 +44,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 180
+ranked_timeout: 300
diff --git a/problems/pmpp_v2/grayscale_py/task.yml b/problems/pmpp_v2/grayscale_py/task.yml
index cada0257..9f191b01 100644
--- a/problems/pmpp_v2/grayscale_py/task.yml
+++ b/problems/pmpp_v2/grayscale_py/task.yml
@@ -38,4 +38,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 180
+ranked_timeout: 300
diff --git a/problems/pmpp_v2/histogram_py/task.yml b/problems/pmpp_v2/histogram_py/task.yml
index 489a98b6..16136a95 100644
--- a/problems/pmpp_v2/histogram_py/task.yml
+++ b/problems/pmpp_v2/histogram_py/task.yml
@@ -40,4 +40,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 180
+ranked_timeout: 300
diff --git a/problems/pmpp_v2/matmul_py/task.yml b/problems/pmpp_v2/matmul_py/task.yml
index 6924764b..05b160c4 100644
--- a/problems/pmpp_v2/matmul_py/task.yml
+++ b/problems/pmpp_v2/matmul_py/task.yml
@@ -41,4 +41,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 180
+ranked_timeout: 300
diff --git a/problems/pmpp_v2/prefixsum_py/task.yml b/problems/pmpp_v2/prefixsum_py/task.yml
index a91d1496..f8561e47 100644
--- a/problems/pmpp_v2/prefixsum_py/task.yml
+++ b/problems/pmpp_v2/prefixsum_py/task.yml
@@ -54,4 +54,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 180
+ranked_timeout: 300
diff --git a/problems/pmpp_v2/sort_py/task.yml b/problems/pmpp_v2/sort_py/task.yml
index 5c702e29..e8b6e6a0 100644
--- a/problems/pmpp_v2/sort_py/task.yml
+++ b/problems/pmpp_v2/sort_py/task.yml
@@ -38,4 +38,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 180
+ranked_timeout: 300
diff --git a/problems/pmpp_v2/vectoradd_py/task.yml b/problems/pmpp_v2/vectoradd_py/task.yml
index 6906a313..1545915e 100644
--- a/problems/pmpp_v2/vectoradd_py/task.yml
+++ b/problems/pmpp_v2/vectoradd_py/task.yml
@@ -38,4 +38,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 180
+ranked_timeout: 300
diff --git a/problems/pmpp_v2/vectorsum_py/task.yml b/problems/pmpp_v2/vectorsum_py/task.yml
index 8b3ddbb7..699fa30b 100644
--- a/problems/pmpp_v2/vectorsum_py/task.yml
+++ b/problems/pmpp_v2/vectorsum_py/task.yml
@@ -38,4 +38,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 180
+ranked_timeout: 300

From 5d25aebebc45fda5923141e58b27a51bb469b9e8 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Fri, 19 Dec 2025 18:44:08 +0100
Subject: [PATCH 171/207] Fix: increase modal timeout to 7min

---
 problems/pmpp_v2/conv2d_py/task.yml    | 2 +-
 problems/pmpp_v2/grayscale_py/task.yml | 2 +-
 problems/pmpp_v2/histogram_py/task.yml | 2 +-
 problems/pmpp_v2/matmul_py/task.yml    | 2 +-
 problems/pmpp_v2/prefixsum_py/task.yml | 2 +-
 problems/pmpp_v2/sort_py/task.yml      | 2 +-
 problems/pmpp_v2/vectoradd_py/task.yml | 2 +-
 problems/pmpp_v2/vectorsum_py/task.yml | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/problems/pmpp_v2/conv2d_py/task.yml b/problems/pmpp_v2/conv2d_py/task.yml
index f754ce7f..9b44b2b7 100644
--- a/problems/pmpp_v2/conv2d_py/task.yml
+++ b/problems/pmpp_v2/conv2d_py/task.yml
@@ -44,4 +44,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 300
+ranked_timeout: 420
diff --git a/problems/pmpp_v2/grayscale_py/task.yml b/problems/pmpp_v2/grayscale_py/task.yml
index 9f191b01..d1cbb30a 100644
--- a/problems/pmpp_v2/grayscale_py/task.yml
+++ b/problems/pmpp_v2/grayscale_py/task.yml
@@ -38,4 +38,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 300
+ranked_timeout: 420
diff --git a/problems/pmpp_v2/histogram_py/task.yml b/problems/pmpp_v2/histogram_py/task.yml
index 16136a95..419529ab 100644
--- a/problems/pmpp_v2/histogram_py/task.yml
+++ b/problems/pmpp_v2/histogram_py/task.yml
@@ -40,4 +40,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 300
+ranked_timeout: 420
diff --git a/problems/pmpp_v2/matmul_py/task.yml b/problems/pmpp_v2/matmul_py/task.yml
index 05b160c4..864ba171 100644
--- a/problems/pmpp_v2/matmul_py/task.yml
+++ b/problems/pmpp_v2/matmul_py/task.yml
@@ -41,4 +41,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 300
+ranked_timeout: 420
diff --git a/problems/pmpp_v2/prefixsum_py/task.yml b/problems/pmpp_v2/prefixsum_py/task.yml
index f8561e47..734546d3 100644
--- a/problems/pmpp_v2/prefixsum_py/task.yml
+++ b/problems/pmpp_v2/prefixsum_py/task.yml
@@ -54,4 +54,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 300
+ranked_timeout: 420
diff --git a/problems/pmpp_v2/sort_py/task.yml b/problems/pmpp_v2/sort_py/task.yml
index e8b6e6a0..7e78a156 100644
--- a/problems/pmpp_v2/sort_py/task.yml
+++ b/problems/pmpp_v2/sort_py/task.yml
@@ -38,4 +38,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 300
+ranked_timeout: 420
diff --git a/problems/pmpp_v2/vectoradd_py/task.yml b/problems/pmpp_v2/vectoradd_py/task.yml
index 1545915e..f662f16a 100644
--- a/problems/pmpp_v2/vectoradd_py/task.yml
+++ b/problems/pmpp_v2/vectoradd_py/task.yml
@@ -38,4 +38,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 300
+ranked_timeout: 420
diff --git a/problems/pmpp_v2/vectorsum_py/task.yml b/problems/pmpp_v2/vectorsum_py/task.yml
index 699fa30b..fc752a83 100644
--- a/problems/pmpp_v2/vectorsum_py/task.yml
+++ b/problems/pmpp_v2/vectorsum_py/task.yml
@@ -38,4 +38,4 @@ benchmarks:
 
 test_timeout: 180
 benchmark_timeout: 180
-ranked_timeout: 300
+ranked_timeout: 420

From 05524276a2b137ed93deb85be29af8a32631c641 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Fri, 19 Dec 2025 20:57:45 +0100
Subject: [PATCH 172/207] Update deadline for nvfp4_gemm problem

---
 problems/nvidia.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index e9536874..60128771 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -14,6 +14,6 @@ problems:
 problems:
   - directory: nvidia/nvfp4_gemm
     name: nvfp4_gemm
-    deadline: "2025-12-20 6:59"
+    deadline: "2025-12-21 6:59"
     gpus:
       - NVIDIA

From 89da6492fe0941844910a1c10eec26ccdda116de Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Sat, 20 Dec 2025 01:05:50 +0100
Subject: [PATCH 173/207] Update deadline for nvfp4_gemm problem

---
 problems/nvidia.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 60128771..6066cc58 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -14,6 +14,6 @@ problems:
 problems:
   - directory: nvidia/nvfp4_gemm
     name: nvfp4_gemm
-    deadline: "2025-12-21 6:59"
+    deadline: "2025-12-21 7:59"
     gpus:
       - NVIDIA

From 9832f6db20cc9ccbfc8fd4e15b9247b23ba7bc28 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 20 Dec 2025 14:42:14 +0100
Subject: [PATCH 174/207] Feat: prepare dual gemm

---
 problems/nvidia/nvfp4_dual_gemm/eval.py      | 496 -------------------
 problems/nvidia/nvfp4_dual_gemm/reference.py |   4 +-
 problems/nvidia/nvfp4_dual_gemm/task.yml     |   4 +-
 3 files changed, 4 insertions(+), 500 deletions(-)
 delete mode 100644 problems/nvidia/nvfp4_dual_gemm/eval.py

diff --git a/problems/nvidia/nvfp4_dual_gemm/eval.py b/problems/nvidia/nvfp4_dual_gemm/eval.py
deleted file mode 100644
index ca325354..00000000
--- a/problems/nvidia/nvfp4_dual_gemm/eval.py
+++ /dev/null
@@ -1,496 +0,0 @@
-import base64
-import dataclasses
-import multiprocessing
-import re
-import time
-import os
-import sys
-import math
-from pathlib import Path
-from typing import Any, Optional
-import tempfile
-
-import torch.cuda
-from cutlass.cute.nvgpu.common import OpError
-
-from utils import set_seed, clear_l2_cache
-
-try:
-    from task import TestSpec
-except ImportError:
-    TestSpec = dict
-
-from reference import check_implementation, generate_input
-
-
-class PopcornOutput:
-    def __init__(self, fd: int):
-        self.file = os.fdopen(fd, "w")
-        os.set_inheritable(fd, False)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.file.close()
-
-    def print(self, *args, **kwargs):
-        print(*args, **kwargs, file=self.file, flush=True)
-
-    def log(self, key, value):
-        self.print(f"{key}: {value}")
-
-
-@dataclasses.dataclass
-class TestCase:
-    args: dict
-    spec: str
-
-
-def _combine(a: int, b: int) -> int:
-    # combine two integers into one:
-    # we need this to generate a secret seed based on the test-level seed and
-    # the global secret seed.
-    # the test-level seeds are public knowledge, and typically relatively small numbers,
-    # so we need to make sure they don't provide any useful info for the full seed.
-    # This Cantor construction ensures that if the secret seed is a large number,
-    # then so is the overall seed.
-    return int(a + (a + b) * (a + b + 1) // 2)
-
-
-def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
-    try:
-        content = Path(file_name).read_text()
-    except Exception as E:
-        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
-        exit(113)
-
-    tests = []
-    lines = content.splitlines()
-    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
-    for line in lines:
-        parts = line.split(";")
-        case = {}
-        for part in parts:
-            matched = re.match(match, part)
-            if not re.fullmatch(match, part):
-                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
-                exit(113)
-            key = matched[1]
-            val = matched[2]
-            try:
-                val = int(val)
-            except ValueError:
-                pass
-
-            case[key] = val
-        tests.append(TestCase(spec=line, args=case))
-
-    if seed is not None:
-        for test in tests:
-            if "seed" in test.args:
-                test.args["seed"] = _combine(test.args["seed"], seed)
-
-    return tests
-
-
-@dataclasses.dataclass
-class Stats:
-    runs: int
-    mean: float
-    std: float
-    err: float
-    best: float
-    worst: float
-
-
-def calculate_stats(durations: list[int]):
-    """
-    Calculate statistical data from a list of durations.
-    @param durations: A list of durations in nanoseconds.
-    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
-    """
-    runs = len(durations)
-    total = sum(durations)
-    best = min(durations)
-    worst = max(durations)
-
-    avg = total / runs
-    variance = sum(map(lambda x: (x - avg) ** 2, durations))
-    std = math.sqrt(variance / (runs - 1))
-    err = std / math.sqrt(runs)
-
-    return Stats(
-        runs=runs, mean=avg, std=std, err=err, best=float(best), worst=float(worst)
-    )
-
-
-def _clone_data(data):
-    """
-    Recursively goes through data and clones all tensors.
-    """
-    if isinstance(data, tuple):
-        return tuple(_clone_data(x) for x in data)
-    elif isinstance(data, list):
-        return [_clone_data(x) for x in data]
-    elif isinstance(data, dict):
-        return {k: _clone_data(v) for k, v in data.items()}
-    elif isinstance(data, torch.Tensor):
-        return data.clone()
-    else:
-        return data
-
-
-def _run_single_test(test: TestCase):
-    """
-    Runs a single test case. Do not call directly
-    """
-    from submission import custom_kernel
-
-    data = generate_input(**test.args)
-    torch.cuda.synchronize()
-    try:
-        submission_output = custom_kernel(_clone_data(data))
-
-    except OpError as E:
-        print(f"Encountered {E}", file=sys.stderr)
-        return False, str(E)
-    torch.cuda.synchronize()
-    return check_implementation(data, submission_output)
-
-
-def run_single_test(pool: multiprocessing.Pool, test: TestCase):
-    """
-    Runs a single test in another process.
-    """
-    return pool.apply(_run_single_test, (test,))
-
-
-def run_testing(
-    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
-):
-    """
-    Executes the actual test case code and checks for correctness.
-    @param logger: A PopcornOutput object used for logging test results.
-    @param tests: A list of TestCase objects representing the test cases to be executed.
-    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
-    """
-    # Step 1: Compile kernel once before running tests
-    logger.log("compile", "start")
-    compile_success, compile_error = pool.apply(_compile_kernel_once)
-    if not compile_success:
-        logger.log("compile", "fail")
-        logger.log("compile.error", compile_error)
-        return 112
-    logger.log("compile", "pass")
-
-    # Step 2: Run all tests with compiled kernel
-    passed = True
-    logger.log("test-count", len(tests))
-    for idx, test in enumerate(tests):
-        logger.log(f"test.{idx}.spec", test.spec)
-        good, message = run_single_test(pool, test)
-        if not good:
-            logger.log(f"test.{idx}.status", "fail")
-            logger.log(f"test.{idx}.error", message)
-            passed = False
-        else:
-            logger.log(f"test.{idx}.status", "pass")
-            if message:
-                logger.log(f"test.{idx}.message", message)
-
-    if passed:
-        logger.log("check", "pass")
-        return 0
-    else:
-        logger.log("check", "fail")
-        return 112
-
-
-def _compile_kernel_once():
-    """
-    Compile the kernel once before any benchmarking.
-    This ensures compilation time is not included in benchmark results.
-    """
-    from submission import compile_kernel
-
-    try:
-        # Trigger compilation (will be cached)
-        compile_kernel()
-        torch.cuda.synchronize()
-        return True, None
-    except OpError as E:
-        return False, f"Compilation failed: {E}"
-    except Exception as E:
-        return False, f"Compilation failed: {E}"
-
-
-def _run_single_benchmark(
-    test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float
-) -> Stats | Any:
-    """
-    Runs one benchmark. Do not call directly.
-    """
-    from submission import custom_kernel, compile_kernel
-
-    durations = []
-    # generate input data once
-    data = generate_input(**test.args)
-    check_copy = _clone_data(data)
-
-    # Ensure kernel is compiled before any timing (compilation is cached)
-    try:
-        compile_kernel()
-        torch.cuda.synchronize()
-    except OpError as E:
-        return f"Compilation failed: {E}"
-    except Exception as E:
-        return f"Compilation failed: {E}"
-
-    #  first, one obligatory correctness check
-    try:
-        output = custom_kernel(_clone_data(data))
-    except OpError as E:
-        return f"Encountered {E}"
-    good, message = check_implementation(check_copy, output)
-    if not good:
-        return message
-
-    # now, do multiple timing runs without further correctness testing
-    # there is an upper bound of 200 runs, and a lower bound of 3 runs;
-    # otherwise, we repeat until we either measure at least 10 full seconds,
-    # or the relative error of the mean is below 1%.
-
-    bm_start_time = time.perf_counter_ns()
-    for i in range(max_repeats):
-        if recheck:
-            # ensure we use a different seed for every benchmark
-            if "seed" in test.args:
-                test.args["seed"] += 13
-
-            data = generate_input(**test.args)
-            check_copy = _clone_data(data)
-        torch.cuda.synchronize()
-        start_event = torch.cuda.Event(enable_timing=True)
-        end_event = torch.cuda.Event(enable_timing=True)
-        clear_l2_cache()
-
-        start_event.record()
-        output = custom_kernel(data)
-        end_event.record()
-        torch.cuda.synchronize()
-        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
-
-        if recheck:
-            good, message = check_implementation(check_copy, output)
-            if not good:
-                return message
-
-        del output
-        durations.append(duration)
-
-        if i > 1:
-            total_bm_duration = time.perf_counter_ns() - bm_start_time
-            stats = calculate_stats(durations)
-            # stop if either
-            # a) relative error dips below 0.1%
-            # b) we exceed the total time limit for benchmarking the kernel
-            # c) we exceed 2 minutes of total wallclock time.
-            if (
-                stats.err / stats.mean < 0.001
-                or stats.mean * stats.runs > max_time_ns
-                or total_bm_duration > 120e9
-            ):
-                break
-
-    return calculate_stats(durations)
-
-
-def run_single_benchmark(
-    pool: multiprocessing.Pool,
-    test: TestCase,
-    recheck: bool,
-    max_repeats: int,
-    max_time_ns: float,
-):
-    """
-    For a particular test case, check correctness (if applicable) and grab runtime results.
-    @param pool: Process on which the benchmark will be launched.
-    @param test: TestCase object.
-    @param recheck: Flag for whether to explicitly check functional correctness.
-    @param max_repeats: Number of trials to repeat.
-    @param max_time_ns: Timeout time in nanoseconds.
-    @return: A Stats object for this particular benchmark case or an error if the test fails.
-    """
-    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
-
-
-def run_benchmarking(
-    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
-):
-    """
-    Executes benchmarking code for a CUDA Kernel and logs runtimes.
-    @param logger: A PopcornOutput object used for logging benchmark results.
-    @param pool: Process on which the benchmarks will be launched.
-    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
-    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
-    """
-    # Step 1: Compile kernel once (outside of timing)
-    logger.log("compile", "start")
-    compile_success, compile_error = pool.apply(_compile_kernel_once)
-    if not compile_success:
-        logger.log("compile", "fail")
-        logger.log("compile.error", compile_error)
-        return 112
-    logger.log("compile", "pass")
-
-    # Step 2: Warm up with compiled kernel
-    run_single_benchmark(pool, tests[0], False, 200, 10e7)
-
-    # Step 3: Run benchmarks (compilation time excluded)
-    passed = True
-    logger.log("benchmark-count", len(tests))
-    for idx, test in enumerate(tests):
-        logger.log(f"benchmark.{idx}.spec", test.spec)
-        result = run_single_benchmark(pool, test, False, 200, 10e9)
-        if isinstance(result, Stats):
-            for field in dataclasses.fields(Stats):
-                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
-        else:
-            passed = False
-            logger.log(f"benchmark.{idx}.status", "fail")
-            logger.log(f"benchmark.{idx}.error", result)
-
-    if passed:
-        logger.log("check", "pass")
-        return 0
-    else:
-        logger.log("check", "fail")
-        return 112
-
-
-def run_single_profile(test: TestCase) -> str:
-    """
-    Runs a single test case. Do not call directly
-    """
-    from submission import custom_kernel
-    from torch.profiler import profile, record_function, ProfilerActivity
-
-    data = generate_input(**test.args)
-    torch.cuda.synchronize()
-
-    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
-        submission_output = custom_kernel(_clone_data(data))
-        torch.cuda.synchronize()
-    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
-
-
-def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
-    logger.log("benchmark-count", len(tests))
-    for idx, test in enumerate(tests):
-        logger.log(f"benchmark.{idx}.spec", test.spec)
-        report = run_single_profile(test)
-        logger.log(
-            f"benchmark.{idx}.report",
-            base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"),
-        )
-    logger.log("check", "pass")
-    return 0
-
-
-def main():
-    fd = os.getenv("POPCORN_FD")
-    if not fd:
-        return 111
-
-    if len(sys.argv) < 3:
-        return 2
-
-    mode = sys.argv[1]
-    seed = os.getenv("POPCORN_SEED")
-    os.unsetenv("POPCORN_SEED")
-    seed = int(seed) if seed else None
-    set_seed(seed or 42)
-
-    filename = None
-
-    with tempfile.NamedTemporaryFile(delete=False) as tmp:
-
-        def build_test_string(tests: list[dict]):
-            as_str = ""
-            for test in tests:
-                kvs = []
-                for k, v in test.items():
-                    kvs.append(f"{k}: {v}")
-                as_str += "; ".join(kvs) + "\n"
-            return as_str
-
-        import yaml
-
-        yaml_content = yaml.safe_load(open(sys.argv[2], "r"))
-        if mode == "test":
-            tests_str = build_test_string(yaml_content.get("tests", []))
-        elif mode in ("benchmark", "leaderboard", "profile"):
-            tests_str = build_test_string(yaml_content.get("benchmarks", []))
-
-        tmp.write(tests_str.encode("utf-8"))
-        tmp.flush()
-        filename = tmp.name
-
-    tests = get_test_cases(filename, seed)
-
-    os.unlink(filename)
-
-    with PopcornOutput(int(fd)) as logger:
-        import multiprocessing
-
-        mp_context = multiprocessing.get_context("spawn")
-        with mp_context.Pool(1) as pool:
-            if mode == "test":
-                return run_testing(logger, pool, tests)
-            if mode == "benchmark":
-                return run_benchmarking(logger, pool, tests)
-
-            if mode == "leaderboard":
-                # Step 1: Compile kernel once (outside of timing)
-                logger.log("compile", "start")
-                compile_success, compile_error = pool.apply(_compile_kernel_once)
-                if not compile_success:
-                    logger.log("compile", "fail")
-                    logger.log("compile.error", compile_error)
-                    return 112
-                logger.log("compile", "pass")
-
-                # Step 2: Warmup with compiled kernel
-                run_single_benchmark(pool, tests[0], False, 200, 1e7)
-
-                # Step 3: Run leaderboard benchmarks (compilation time excluded)
-                logger.log("benchmark-count", len(tests))
-                passed = True
-                for i in range(len(tests)):
-                    result = run_single_benchmark(pool, tests[i], True, 200, 30e9)
-                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
-                    if isinstance(result, Stats):
-                        for field in dataclasses.fields(Stats):
-                            logger.log(
-                                f"benchmark.{i}.{field.name}",
-                                getattr(result, field.name),
-                            )
-                    else:
-                        passed = False
-                        logger.log(f"benchmark.{i}.status", "fail")
-                        logger.log(
-                            f"benchmark.{i}.error", str(result)
-                        )  # TODO: Make sure result implements __str__?
-                        break
-
-                logger.log("check", "pass" if passed else "fail")
-            elif mode == "profile":
-                run_profiling(logger, tests)
-            else:
-                # TODO: Implement script mode
-                return 2
-
-
-if __name__ == "__main__":
-    sys.exit(main())
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_dual_gemm/reference.py b/problems/nvidia/nvfp4_dual_gemm/reference.py
index e4835979..c99b791f 100644
--- a/problems/nvidia/nvfp4_dual_gemm/reference.py
+++ b/problems/nvidia/nvfp4_dual_gemm/reference.py
@@ -188,7 +188,7 @@ def create_scale_factor_tensors(l, mn, sf_k):
     sfb1_ref_cpu, sfb1_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
     sfb2_ref_cpu, sfb2_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
 
-    return (a_ref, b1_ref, b2_ref, sfa_ref_cpu, sfb1_ref_cpu, sfb2_ref_cpu, sfa_ref_permuted, sfb1_ref_permuted, sfb2_ref_permuted, c_ref)
+    return (a_ref, b1_ref, b2_ref, sfa_ref_cpu.to("cuda"), sfb1_ref_cpu.to("cuda"), sfb2_ref_cpu.to("cuda"), sfa_ref_permuted, sfb1_ref_permuted, sfb2_ref_permuted, c_ref)
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-03, atol=1e-03)
\ No newline at end of file
+check_implementation = make_match_reference(ref_kernel, rtol=1e-03, atol=1e-03)
diff --git a/problems/nvidia/nvfp4_dual_gemm/task.yml b/problems/nvidia/nvfp4_dual_gemm/task.yml
index b22a2233..4d36bde6 100644
--- a/problems/nvidia/nvfp4_dual_gemm/task.yml
+++ b/problems/nvidia/nvfp4_dual_gemm/task.yml
@@ -5,7 +5,7 @@ files:
   - {"name": "task.py", "source": "task.py"}
   - {"name": "utils.py", "source": "../utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "../eval.py"}
+  - {"name": "eval.py", "source": "../eval_better_bench.py"}
 
 lang: "py"
 
@@ -61,4 +61,4 @@ benchmarks:
   - {"m": 256, "n": 3072, "k": 4096, "l": 1, "seed": 1111}
   - {"m": 512, "n": 3072, "k": 7168, "l": 1, "seed": 1111}
 
-ranking_by: "geom"
\ No newline at end of file
+ranking_by: "geom"

From 1b3868562c099c51dda0902cf2ed522e7cfd3332 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sat, 20 Dec 2025 23:20:02 +0100
Subject: [PATCH 175/207] Feat: dual gemm into yaml

---
 problems/nvidia.yaml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 6066cc58..cf8b68f7 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -10,10 +10,14 @@ problems:
     deadline: "2025-11-29 6:59"
     gpus:
       - NVIDIA
-
-problems:
   - directory: nvidia/nvfp4_gemm
     name: nvfp4_gemm
     deadline: "2025-12-21 7:59"
     gpus:
       - NVIDIA
+  - directory: nvidia/nvfp4_dual_gemm
+    name: nvfp4_dual_gemm
+    deadline: "2026-01-17 7:59"
+    gpus:
+      - NVIDIA
+  

From 4937a7c146964eeadae98d47738db3162fb08bbd Mon Sep 17 00:00:00 2001
From: Vicki Wang <vickiw@nvidia.com>
Date: Tue, 23 Dec 2025 05:20:34 -0800
Subject: [PATCH 176/207] reduce randint range and clamp scaling exponents to
 negatives.

---
 problems/nvidia/nvfp4_dual_gemm/reference.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/problems/nvidia/nvfp4_dual_gemm/reference.py b/problems/nvidia/nvfp4_dual_gemm/reference.py
index c99b791f..e6faa246 100644
--- a/problems/nvidia/nvfp4_dual_gemm/reference.py
+++ b/problems/nvidia/nvfp4_dual_gemm/reference.py
@@ -74,6 +74,7 @@ def ref_kernel(
         ref2[:, :, l_idx] = res2
     # Do silu on the first GEMM result and multiply with the second GEMM result
     c_ref = (torch.nn.functional.silu(ref1) * ref2).to(torch.float16)
+    print("First 10 elements of c_ref:", c_ref.flatten()[:10])
     return c_ref
 
 
@@ -112,13 +113,13 @@ def generate_input(
     
     # Generate uint8 tensor, then convert to float4e2m1fn_x2 data type
     a_ref = torch.randint(
-        -6, 6, (l, m, k // 2), dtype=torch.int8, device="cuda"
+        0, 2, (l, m, k // 2), dtype=torch.int8, device="cuda"
     ).permute(1, 2, 0)
     b1_ref = torch.randint(
-        -6, 6, (l, n, k // 2), dtype=torch.int8, device="cuda"
+        0, 2, (l, n, k // 2), dtype=torch.int8, device="cuda"
     ).permute(1, 2, 0)
     b2_ref = torch.randint(
-        -6, 6, (l, n, k // 2), dtype=torch.int8, device="cuda"
+        0, 2, (l, n, k // 2), dtype=torch.int8, device="cuda"
     ).permute(1, 2, 0)
     a_ref = a_ref.view(torch.float4_e2m1fn_x2)
     b1_ref = b1_ref.view(torch.float4_e2m1fn_x2)
@@ -137,7 +138,8 @@ def create_scale_factor_tensors(l, mn, sf_k):
         ref_shape = (l, mn, sf_k)
         ref_permute_order = (1, 2, 0)
         # Init with uint8 tensor, then convert to float8_e4m3fn
-        ref_f8_random_int = torch.randint(-3, 3, ref_shape, dtype=torch.int8, device='cuda')
+        ref_f8_random_int = torch.randint(1, 3, ref_shape, dtype=torch.int8, device='cuda')
+        ref_f8_random_int = torch.clamp(ref_f8_random_int, -3, -1)
         ref_f8_torch_tensor = ref_f8_random_int.to(dtype=torch.float8_e4m3fn)
         # permute to match ref_permute_order
         ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)
@@ -157,7 +159,7 @@ def create_scale_factor_tensors(l, mn, sf_k):
         # Which is needed by the CuTe customized kernel
         mma_permute_order = (3, 4, 1, 5, 2, 0)
         # Generate a random int8 tensor, then convert to float8_e4m3fn
-        rand_int_tensor = torch.randint(-3, 3, mma_shape, dtype=torch.int8, device='cuda')
+        rand_int_tensor = torch.empty(mma_shape, dtype=torch.int8, device='cuda')
         reordered_f8_torch_tensor = rand_int_tensor.to(dtype=torch.float8_e4m3fn)
         # Permute according to mma_permute_order
         reordered_f8_torch_tensor = reordered_f8_torch_tensor.permute(*mma_permute_order)

From 7ccca9a230c1381b68fe318dc38863e2712b6dc1 Mon Sep 17 00:00:00 2001
From: Vicki Wang <vickiw@nvidia.com>
Date: Tue, 23 Dec 2025 05:21:59 -0800
Subject: [PATCH 177/207] remove debug print.

---
 problems/nvidia/nvfp4_dual_gemm/reference.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/problems/nvidia/nvfp4_dual_gemm/reference.py b/problems/nvidia/nvfp4_dual_gemm/reference.py
index e6faa246..0e251f6e 100644
--- a/problems/nvidia/nvfp4_dual_gemm/reference.py
+++ b/problems/nvidia/nvfp4_dual_gemm/reference.py
@@ -74,7 +74,6 @@ def ref_kernel(
         ref2[:, :, l_idx] = res2
     # Do silu on the first GEMM result and multiply with the second GEMM result
     c_ref = (torch.nn.functional.silu(ref1) * ref2).to(torch.float16)
-    print("First 10 elements of c_ref:", c_ref.flatten()[:10])
     return c_ref
 
 

From e03761b21135562255d55effa3dd784f4634620b Mon Sep 17 00:00:00 2001
From: Vicki Wang <vickiw@nvidia.com>
Date: Tue, 23 Dec 2025 06:30:50 -0800
Subject: [PATCH 178/207] avoid using the same value for scaling factor

---
 problems/nvidia/nvfp4_dual_gemm/reference.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/problems/nvidia/nvfp4_dual_gemm/reference.py b/problems/nvidia/nvfp4_dual_gemm/reference.py
index 0e251f6e..b378f789 100644
--- a/problems/nvidia/nvfp4_dual_gemm/reference.py
+++ b/problems/nvidia/nvfp4_dual_gemm/reference.py
@@ -137,8 +137,7 @@ def create_scale_factor_tensors(l, mn, sf_k):
         ref_shape = (l, mn, sf_k)
         ref_permute_order = (1, 2, 0)
         # Init with uint8 tensor, then convert to float8_e4m3fn
-        ref_f8_random_int = torch.randint(1, 3, ref_shape, dtype=torch.int8, device='cuda')
-        ref_f8_random_int = torch.clamp(ref_f8_random_int, -3, -1)
+        ref_f8_random_int = torch.randint(-1, 2, ref_shape, dtype=torch.int8, device='cuda')
         ref_f8_torch_tensor = ref_f8_random_int.to(dtype=torch.float8_e4m3fn)
         # permute to match ref_permute_order
         ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)

From f861b2eb2b2c751bc44b26367c394ed66aacb749 Mon Sep 17 00:00:00 2001
From: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Date: Tue, 23 Dec 2025 16:21:54 +0100
Subject: [PATCH 179/207] Change random integer range for reference tensor

---
 problems/nvidia/nvfp4_dual_gemm/reference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/nvidia/nvfp4_dual_gemm/reference.py b/problems/nvidia/nvfp4_dual_gemm/reference.py
index b378f789..7c3666fa 100644
--- a/problems/nvidia/nvfp4_dual_gemm/reference.py
+++ b/problems/nvidia/nvfp4_dual_gemm/reference.py
@@ -137,7 +137,7 @@ def create_scale_factor_tensors(l, mn, sf_k):
         ref_shape = (l, mn, sf_k)
         ref_permute_order = (1, 2, 0)
         # Init with uint8 tensor, then convert to float8_e4m3fn
-        ref_f8_random_int = torch.randint(-1, 2, ref_shape, dtype=torch.int8, device='cuda')
+        ref_f8_random_int = torch.randint(0, 3, ref_shape, dtype=torch.int8, device='cuda')
         ref_f8_torch_tensor = ref_f8_random_int.to(dtype=torch.float8_e4m3fn)
         # permute to match ref_permute_order
         ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)

From dfd6bfff91ee56f34cbc78677c5a5e6922e9676e Mon Sep 17 00:00:00 2001
From: Thien Tran <gau.nernst@yahoo.com.sg>
Date: Tue, 23 Dec 2025 23:49:46 +0800
Subject: [PATCH 180/207] update

---
 problems/nvidia/nvfp4_dual_gemm/reference.py | 28 +++++++++++---------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/problems/nvidia/nvfp4_dual_gemm/reference.py b/problems/nvidia/nvfp4_dual_gemm/reference.py
index 7c3666fa..699c4990 100644
--- a/problems/nvidia/nvfp4_dual_gemm/reference.py
+++ b/problems/nvidia/nvfp4_dual_gemm/reference.py
@@ -110,16 +110,20 @@ def generate_input(
     """
     torch.manual_seed(seed)
     
+    def create_fp4_tensors(l, mn, k):
+        # generate int8 tensor, then convert to float4e2m1fn_x2 data type
+        ref_i8 = torch.randint(255, size=(l, mn, k // 2), dtype=torch.uint8, device="cuda")
+
+        # for each nibble, only keep the sign bit and 2 LSBs
+        # the possible values are [-1.5, -1, -0.5, 0, +0.5, +1, +1.5]
+        ref_i8 = ref_i8 & 0b1011_1011
+
+        return ref_i8.permute(1, 2, 0).view(torch.float4_e2m1fn_x2)
+
     # Generate uint8 tensor, then convert to float4e2m1fn_x2 data type
-    a_ref = torch.randint(
-        0, 2, (l, m, k // 2), dtype=torch.int8, device="cuda"
-    ).permute(1, 2, 0)
-    b1_ref = torch.randint(
-        0, 2, (l, n, k // 2), dtype=torch.int8, device="cuda"
-    ).permute(1, 2, 0)
-    b2_ref = torch.randint(
-        0, 2, (l, n, k // 2), dtype=torch.int8, device="cuda"
-    ).permute(1, 2, 0)
+    a_ref = create_fp4_tensors(l, m, k)
+    b1_ref = create_fp4_tensors(l, n, k)
+    b2_ref = create_fp4_tensors(l, n, k)
     a_ref = a_ref.view(torch.float4_e2m1fn_x2)
     b1_ref = b1_ref.view(torch.float4_e2m1fn_x2)
     b2_ref = b2_ref.view(torch.float4_e2m1fn_x2)
@@ -136,9 +140,9 @@ def create_scale_factor_tensors(l, mn, sf_k):
         # Create the reference scale factor tensor (mn, sf_k, l) on CPU.
         ref_shape = (l, mn, sf_k)
         ref_permute_order = (1, 2, 0)
-        # Init with uint8 tensor, then convert to float8_e4m3fn
-        ref_f8_random_int = torch.randint(0, 3, ref_shape, dtype=torch.int8, device='cuda')
-        ref_f8_torch_tensor = ref_f8_random_int.to(dtype=torch.float8_e4m3fn)
+        # Init with fp32 tensor in [0,1), then convert to float8_e4m3fn
+        ref_f8_random_fp32 = torch.rand(ref_shape, dtype=torch.float32, device='cuda')
+        ref_f8_torch_tensor = ref_f8_random_fp32.to(dtype=torch.float8_e4m3fn)
         # permute to match ref_permute_order
         ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)
 

From 1b252833a07df0d80c9513789e7c548ec2b6d0af Mon Sep 17 00:00:00 2001
From: Thien Tran <gau.nernst@yahoo.com.sg>
Date: Tue, 23 Dec 2025 23:54:42 +0800
Subject: [PATCH 181/207] update comment

---
 problems/nvidia/nvfp4_dual_gemm/reference.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/problems/nvidia/nvfp4_dual_gemm/reference.py b/problems/nvidia/nvfp4_dual_gemm/reference.py
index 699c4990..95c6aacd 100644
--- a/problems/nvidia/nvfp4_dual_gemm/reference.py
+++ b/problems/nvidia/nvfp4_dual_gemm/reference.py
@@ -111,7 +111,8 @@ def generate_input(
     torch.manual_seed(seed)
     
     def create_fp4_tensors(l, mn, k):
-        # generate int8 tensor, then convert to float4e2m1fn_x2 data type
+        # generate uint8 tensor, then convert to float4e2m1fn_x2 data type
+        # generate all bit patterns
         ref_i8 = torch.randint(255, size=(l, mn, k // 2), dtype=torch.uint8, device="cuda")
 
         # for each nibble, only keep the sign bit and 2 LSBs

From 145d34a28005973c5abc61b2922da1beae98b58d Mon Sep 17 00:00:00 2001
From: az <alex.lx.zhang@gmail.com>
Date: Tue, 13 Jan 2026 18:11:44 -0500
Subject: [PATCH 182/207] Extend deadline for trimul problem to 2026-09-30

---
 problems/bioml.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/bioml.yaml b/problems/bioml.yaml
index 3761aea6..73e320ee 100644
--- a/problems/bioml.yaml
+++ b/problems/bioml.yaml
@@ -8,7 +8,7 @@ description: "Popular and important kernels for BioML models like AlphaFold3"
 problems:
   - directory: bioml/trimul
     name: trimul
-    deadline: "2025-09-30"
+    deadline: "2026-09-30"
     gpus:
       - B200
       - H100

From 64137f632ab2d654891cc3c584a382d1a8fdae0a Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 13 Jan 2026 18:54:19 -0800
Subject: [PATCH 183/207] more loops (#90)

* more loops

* Increase loops to hopefully decrease measurement noise

* Remove main function for standalone execution

Removed the main function and its associated logic for running the dual GEMM kernel locally.
---
 problems/nvidia/eval.py                       | 4 ++--
 problems/nvidia/eval_better_bench.py          | 4 ++--
 problems/nvidia/nvfp4_dual_gemm/submission.py | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/problems/nvidia/eval.py b/problems/nvidia/eval.py
index 1fe58d33..252f35e4 100644
--- a/problems/nvidia/eval.py
+++ b/problems/nvidia/eval.py
@@ -453,11 +453,11 @@ def main():
                 return run_benchmarking(logger, pool, tests)
 
             if mode == "leaderboard":
-                run_single_benchmark(pool, tests[0], False, 200, 1e7)
+                run_single_benchmark(pool, tests[0], False, 1000, 5e8)
                 logger.log("benchmark-count", len(tests))
                 passed = True
                 for i in range(len(tests)):
-                    result = run_single_benchmark(pool, tests[i], True, 200, 30e9)
+                    result = run_single_benchmark(pool, tests[i], True, 1000, 30e9)
                     logger.log(f"benchmark.{i}.spec", tests[i].spec)
                     if isinstance(result, Stats):
                         for field in dataclasses.fields(Stats):
diff --git a/problems/nvidia/eval_better_bench.py b/problems/nvidia/eval_better_bench.py
index 359f4c1b..2e591e2b 100644
--- a/problems/nvidia/eval_better_bench.py
+++ b/problems/nvidia/eval_better_bench.py
@@ -466,11 +466,11 @@ def main():
                 return run_benchmarking(logger, pool, tests)
 
             if mode == "leaderboard":
-                run_single_benchmark(pool, tests[0], False, 200, 1e7)
+                run_single_benchmark(pool, tests[0], False, 1000, 5e8)
                 logger.log("benchmark-count", len(tests))
                 passed = True
                 for i in range(len(tests)):
-                    result = run_single_benchmark(pool, tests[i], True, 200, 30e9)
+                    result = run_single_benchmark(pool, tests[i], True, 1000, 30e9)
                     logger.log(f"benchmark.{i}.spec", tests[i].spec)
                     if isinstance(result, Stats):
                         for field in dataclasses.fields(Stats):
diff --git a/problems/nvidia/nvfp4_dual_gemm/submission.py b/problems/nvidia/nvfp4_dual_gemm/submission.py
index f733212d..739cc5a0 100644
--- a/problems/nvidia/nvfp4_dual_gemm/submission.py
+++ b/problems/nvidia/nvfp4_dual_gemm/submission.py
@@ -954,4 +954,4 @@ def custom_kernel(data: input_t) -> output_t:
     # Execute the compiled kernel
     compiled_func(a_ptr, b1_ptr, b2_ptr, sfa_ptr, sfb1_ptr, sfb2_ptr, c_ptr, (m, n, k, l))
 
-    return c
\ No newline at end of file
+    return c

From 8227254330e138e33d583a32c62e786aa652a4f2 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 13 Jan 2026 19:01:07 -0800
Subject: [PATCH 184/207] disable cute dsl cache

---
 problems/nvidia/eval_better_bench.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/problems/nvidia/eval_better_bench.py b/problems/nvidia/eval_better_bench.py
index 2e591e2b..0d9a55c8 100644
--- a/problems/nvidia/eval_better_bench.py
+++ b/problems/nvidia/eval_better_bench.py
@@ -6,6 +6,10 @@
 import os
 import sys
 import math
+
+# Disable CuTe DSL file caching for more stable benchmarking
+os.environ["CUTE_DSL_DISABLE_FILE_CACHING"] = "True"
+
 from pathlib import Path
 from typing import Any, Optional
 

From cf6b3bd08851a4cb01ea9c4e237303b430c4da61 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 13 Jan 2026 19:14:29 -0800
Subject: [PATCH 185/207] warmup all shapes and init env over all processes

---
 problems/nvidia/eval_better_bench.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/problems/nvidia/eval_better_bench.py b/problems/nvidia/eval_better_bench.py
index 0d9a55c8..007781ed 100644
--- a/problems/nvidia/eval_better_bench.py
+++ b/problems/nvidia/eval_better_bench.py
@@ -8,7 +8,13 @@
 import math
 
 # Disable CuTe DSL file caching for more stable benchmarking
-os.environ["CUTE_DSL_DISABLE_FILE_CACHING"] = "True"
+os.environ["CUTE_DSL_DISABLE_FILE_CACHING"] = "1"
+
+
+def _init_worker():
+    """Pool initializer: set CUTE_DSL_DISABLE_FILE_CACHING=1 inside the worker process."""
+    os.environ["CUTE_DSL_DISABLE_FILE_CACHING"] = "1"
+
 
 from pathlib import Path
 from typing import Any, Optional
@@ -463,14 +469,16 @@ def main():
         import multiprocessing
 
         mp_context = multiprocessing.get_context("spawn")
-        with mp_context.Pool(1) as pool:
+        with mp_context.Pool(1, initializer=_init_worker) as pool:
             if mode == "test":
                 return run_testing(logger, pool, tests)
             if mode == "benchmark":
                 return run_benchmarking(logger, pool, tests)
 
             if mode == "leaderboard":
-                run_single_benchmark(pool, tests[0], False, 1000, 5e8)
+                # Warmup all test shapes to ensure consistent benchmarking
+                for test in tests:
+                    run_single_benchmark(pool, test, False, 1000, 5e8)
                 logger.log("benchmark-count", len(tests))
                 passed = True
                 for i in range(len(tests)):

From b5934a744dcde49fb40531f3a5e1065af129ef41 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Thu, 15 Jan 2026 16:54:00 -0800
Subject: [PATCH 186/207] change nvidia dual gem fp4 problem name

---
 problems/nvidia.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index cf8b68f7..bfb0df7a 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -16,8 +16,8 @@ problems:
     gpus:
       - NVIDIA
   - directory: nvidia/nvfp4_dual_gemm
-    name: nvfp4_dual_gemm
-    deadline: "2026-01-17 7:59"
+    name: do_not_submit_nvfp4_dual_gem
+    deadline: "2026-02-20 7:59"
     gpus:
       - NVIDIA
   

From b9a6f89b609bb267b3fe39695a7223e602d28d40 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Thu, 15 Jan 2026 17:00:05 -0800
Subject: [PATCH 187/207] update

---
 problems/nvidia.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index bfb0df7a..42658f65 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -16,7 +16,7 @@ problems:
     gpus:
       - NVIDIA
   - directory: nvidia/nvfp4_dual_gemm
-    name: do_not_submit_nvfp4_dual_gem
+    name: fake_nvfp4_dual_gem
     deadline: "2026-02-20 7:59"
     gpus:
       - NVIDIA

From 1d795e67ec4fa36b9daf6c64cad42641b7c9f8bd Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Thu, 15 Jan 2026 17:05:37 -0800
Subject: [PATCH 188/207] renaming problems doesn't actually work

---
 problems/nvidia.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 42658f65..42dd7a1e 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -16,7 +16,7 @@ problems:
     gpus:
       - NVIDIA
   - directory: nvidia/nvfp4_dual_gemm
-    name: fake_nvfp4_dual_gem
+    name: nvfp4_dual_gemm
     deadline: "2026-02-20 7:59"
     gpus:
       - NVIDIA

From 930d48de7fcf2b61452d05f234cba135d6648b46 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 17 Jan 2026 11:40:42 -0800
Subject: [PATCH 189/207] re-release of dual gemm problem

---
 problems/nvidia.yaml                          |   7 +-
 .../nvidia/final_nvfp4_dual_gemm/reference.py | 199 ++++
 .../final_nvfp4_dual_gemm/submission.py       | 957 ++++++++++++++++++
 problems/nvidia/final_nvfp4_dual_gemm/task.py |  11 +
 .../nvidia/final_nvfp4_dual_gemm/task.yml     |  64 ++
 .../nvidia/final_nvfp4_dual_gemm/template.py  |  28 +
 .../nvidia/final_nvfp4_dual_gemm/utils.py     | 172 ++++
 7 files changed, 1437 insertions(+), 1 deletion(-)
 create mode 100644 problems/nvidia/final_nvfp4_dual_gemm/reference.py
 create mode 100644 problems/nvidia/final_nvfp4_dual_gemm/submission.py
 create mode 100644 problems/nvidia/final_nvfp4_dual_gemm/task.py
 create mode 100644 problems/nvidia/final_nvfp4_dual_gemm/task.yml
 create mode 100644 problems/nvidia/final_nvfp4_dual_gemm/template.py
 create mode 100644 problems/nvidia/final_nvfp4_dual_gemm/utils.py

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 42dd7a1e..6ff186bb 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -20,4 +20,9 @@ problems:
     deadline: "2026-02-20 7:59"
     gpus:
       - NVIDIA
-  
+  - directory: nvidia/final_nvfp4_dual_gemm
+    name: final_nvfp4_dual_gemm
+    deadline: "2026-01-20 7:59"
+    gpus:
+      - NVIDIA
+
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/reference.py b/problems/nvidia/final_nvfp4_dual_gemm/reference.py
new file mode 100644
index 00000000..95c6aacd
--- /dev/null
+++ b/problems/nvidia/final_nvfp4_dual_gemm/reference.py
@@ -0,0 +1,199 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+# Scaling factor vector size
+sf_vec_size = 16
+
+# Helper function for ceiling division: returns ceil(a / b) for integers with b > 0, using integer math only
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+# Helper function to convert a 2D scale factor tensor to blocked format, returned flattened to 1D
+def to_blocked(input_matrix):
+    rows, cols = input_matrix.shape
+
+    # No padding is applied here: rows must be a multiple of 128 and cols a multiple of 4, or the view below fails
+    n_row_blocks = ceil_div(rows, 128)
+    n_col_blocks = ceil_div(cols, 4)
+
+    padded = input_matrix
+    blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
+    rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
+
+    return rearranged.flatten()
+
+
+def ref_kernel(
+    data: input_t,
+) -> output_t:
+    """
+    PyTorch reference implementation of NVFP4 block-scaled dual GEMM with silu activation,
+    C = silu(A @ B1) * (A @ B2).
+    """
+    a_ref, b1_ref, b2_ref, sfa_ref_cpu, sfb1_ref_cpu, sfb2_ref_cpu, _, _, _, c_ref = data
+    
+    # Get dimensions from the MxNxL layout of the output tensor
+    m, n, l = c_ref.shape
+
+    # Allocate fp32 (M, N, L) buffers for the two GEMM results computed by torch._scaled_mm below
+    ref1 = torch.empty(
+        (l, m, n),
+        dtype=torch.float32,
+        device="cuda",
+    ).permute(1, 2, 0)
+    ref2 = torch.empty(
+        (l, m, n),
+        dtype=torch.float32,
+        device="cuda",
+    ).permute(1, 2, 0)
+    for l_idx in range(l):
+        # Convert the scale factor tensors of this batch to blocked format
+        scale_a = to_blocked(sfa_ref_cpu[:, :, l_idx])
+        scale_b1 = to_blocked(sfb1_ref_cpu[:, :, l_idx])
+        scale_b2 = to_blocked(sfb2_ref_cpu[:, :, l_idx])
+        # (m, k) @ (n, k).T -> (m, n)
+        res1 = torch._scaled_mm(
+            a_ref[:, :, l_idx],
+            b1_ref[:, :, l_idx].transpose(0, 1),
+            scale_a.cuda(),
+            scale_b1.cuda(),
+            bias=None,
+            out_dtype=torch.float32,
+        )
+        ref1[:, :, l_idx] = res1
+
+        res2 = torch._scaled_mm(
+            a_ref[:, :, l_idx],
+            b2_ref[:, :, l_idx].transpose(0, 1),
+            scale_a.cuda(),
+            scale_b2.cuda(),
+            bias=None,
+            out_dtype=torch.float32,
+        )
+        ref2[:, :, l_idx] = res2
+    # Do silu on the first GEMM result, multiply with the second GEMM result, and cast to float16
+    c_ref = (torch.nn.functional.silu(ref1) * ref2).to(torch.float16)
+    return c_ref
+
+
+def generate_input(
+    m: int,
+    n: int,
+    k: int,
+    l: int,
+    seed: int,
+):
+    """
+    Generate input tensors for NVFP4 block-scaled dual GEMM with silu activation,
+    C = silu(A @ B1) * (A @ B2).
+    
+    Args:
+        m: Number of rows in matrix A
+        n: Number of columns in matrix B1 and B2
+        k: Number of columns in A and rows of B1 and B2
+        l: Batch size
+        seed: Random seed for reproducibility
+    
+    Returns:
+        Tuple of (a, b1, b2, scale_a, scale_b1, scale_b2, scale_a_permuted, scale_b1_permuted, scale_b2_permuted, c) where:
+            a: [m, k // 2, l] - Input matrix in torch.float4_e2m1fn_x2 data type (two fp4 values packed per element)
+            b1: [n, k // 2, l] - Input matrix in torch.float4_e2m1fn_x2 data type (two fp4 values packed per element)
+            b2: [n, k // 2, l] - Input matrix in torch.float4_e2m1fn_x2 data type (two fp4 values packed per element)
+            scale_a: [m, ceil(k / 16), l] - Input scale factors in torch.float8_e4m3fn data type
+            scale_b1: [n, ceil(k / 16), l] - Input scale factors in torch.float8_e4m3fn data type
+            scale_b2: [n, ceil(k / 16), l] - Input scale factors in torch.float8_e4m3fn data type
+            scale_a_permuted: [32, 4, rest_m, 4, rest_k, l] - Input scale factors in torch.float8_e4m3fn data type
+            scale_b1_permuted: [32, 4, rest_n, 4, rest_k, l] - Input scale factors in torch.float8_e4m3fn data type
+            scale_b2_permuted: [32, 4, rest_n, 4, rest_k, l] - Input scale factors in torch.float8_e4m3fn data type
+            c: [m, n, l] - Output matrix in torch.float16 data type
+    """
+    torch.manual_seed(seed)
+    
+    def create_fp4_tensors(l, mn, k):
+        # generate uint8 tensor, then convert to float4e2m1fn_x2 data type
+        # generate random bytes in [0, 255) (randint's upper bound is exclusive)
+        ref_i8 = torch.randint(255, size=(l, mn, k // 2), dtype=torch.uint8, device="cuda")
+
+        # for each nibble, only keep the sign bit and 2 LSBs
+        # the possible values are [-1.5, -1, -0.5, 0, +0.5, +1, +1.5]
+        ref_i8 = ref_i8 & 0b1011_1011
+
+        return ref_i8.permute(1, 2, 0).view(torch.float4_e2m1fn_x2)
+
+    # Generate uint8 tensor, then convert to float4e2m1fn_x2 data type
+    a_ref = create_fp4_tensors(l, m, k)
+    b1_ref = create_fp4_tensors(l, n, k)
+    b2_ref = create_fp4_tensors(l, n, k)
+    a_ref = a_ref.view(torch.float4_e2m1fn_x2)
+    b1_ref = b1_ref.view(torch.float4_e2m1fn_x2)
+    b2_ref = b2_ref.view(torch.float4_e2m1fn_x2)
+
+    # Create float16 output tensor
+    c_ref = torch.randn((l, m, n), dtype=torch.float16, device="cuda").permute(
+        1, 2, 0
+    )
+    
+    # Helper function to prepare the scale factor tensors for both the reference
+    # kernel and the custom kernel. The customized data layout can be found in:
+    # https://docs.nvidia.com/cuda/cublas/index.html?highlight=fp4#d-block-scaling-factors-layout
+    def create_scale_factor_tensors(l, mn, sf_k):
+        # Create the reference scale factor tensor (mn, sf_k, l) on the GPU (copied to CPU on return).
+        ref_shape = (l, mn, sf_k)
+        ref_permute_order = (1, 2, 0)
+        # Init with fp32 tensor in [0,1), then convert to float8_e4m3fn
+        ref_f8_random_fp32 = torch.rand(ref_shape, dtype=torch.float32, device='cuda')
+        ref_f8_torch_tensor = ref_f8_random_fp32.to(dtype=torch.float8_e4m3fn)
+        # permute to match ref_permute_order
+        ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)
+
+        atom_m = (32, 4)
+        atom_k = 4
+        mma_shape = (
+            l,  # batch size
+            ceil_div(mn, atom_m[0] * atom_m[1]),
+            ceil_div(sf_k, atom_k),
+            atom_m[0],
+            atom_m[1],
+            atom_k,
+        )
+
+        # Reorder scale factor tensor to (32, 4, rest_m, 4, rest_k, l) layout
+        # Which is needed by the CuTe customized kernel
+        mma_permute_order = (3, 4, 1, 5, 2, 0)
+        # Allocate an uninitialized int8 tensor (torch.empty), then convert to float8_e4m3fn
+        rand_int_tensor = torch.empty(mma_shape, dtype=torch.int8, device='cuda')
+        reordered_f8_torch_tensor = rand_int_tensor.to(dtype=torch.float8_e4m3fn)
+        # Permute according to mma_permute_order
+        reordered_f8_torch_tensor = reordered_f8_torch_tensor.permute(*mma_permute_order)
+
+        # GPU-side vectorized reordering (replaces slow CPU nested loops)
+        # Create index grids for all dimensions
+        i_idx = torch.arange(mn, device='cuda')
+        j_idx = torch.arange(sf_k, device='cuda')
+        b_idx = torch.arange(l, device='cuda')
+        
+        # Create meshgrid for all combinations of (i, j, b)
+        i_grid, j_grid, b_grid = torch.meshgrid(i_idx, j_idx, b_idx, indexing='ij')
+        
+        # Calculate target indices in vectorized manner
+        mm = i_grid // (atom_m[0] * atom_m[1])
+        mm32 = i_grid % atom_m[0]
+        mm4 = (i_grid % 128) // atom_m[0]
+        kk = j_grid // atom_k
+        kk4 = j_grid % atom_k
+        
+        # Perform the reordering with advanced indexing (all on GPU)
+        reordered_f8_torch_tensor[mm32, mm4, mm, kk4, kk, b_grid] = ref_f8_torch_tensor_permuted[i_grid, j_grid, b_grid]
+        
+        return ref_f8_torch_tensor_permuted.cpu(), reordered_f8_torch_tensor
+
+    sf_k = ceil_div(k, sf_vec_size)
+    sfa_ref_cpu, sfa_ref_permuted = create_scale_factor_tensors(l, m, sf_k)
+    sfb1_ref_cpu, sfb1_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
+    sfb2_ref_cpu, sfb2_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
+
+    return (a_ref, b1_ref, b2_ref, sfa_ref_cpu.to("cuda"), sfb1_ref_cpu.to("cuda"), sfb2_ref_cpu.to("cuda"), sfa_ref_permuted, sfb1_ref_permuted, sfb2_ref_permuted, c_ref)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-03, atol=1e-03)
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/submission.py b/problems/nvidia/final_nvfp4_dual_gemm/submission.py
new file mode 100644
index 00000000..739cc5a0
--- /dev/null
+++ b/problems/nvidia/final_nvfp4_dual_gemm/submission.py
@@ -0,0 +1,957 @@
+from torch._higher_order_ops.torchbind import call_torchbind_fake
+import cuda.bindings.driver as cuda
+
+import torch
+from task import input_t, output_t
+
+import cutlass
+import cutlass.cute as cute
+import cutlass.utils as utils
+import cutlass.pipeline as pipeline
+from cutlass.cute.nvgpu import cpasync, tcgen05
+import cutlass.torch as cutlass_torch
+import cutlass.utils.blackwell_helpers as sm100_utils
+import cutlass.utils.blockscaled_layout as blockscaled_utils
+from cutlass.cute.runtime import make_ptr
+
+# Kernel configuration parameters
+# Tile sizes for M, N, K dimensions
+mma_tiler_mnk= (128, 128, 256)  
+# Shape of the K dimension for the MMA instruction
+mma_inst_shape_k = 64
+# FP4 data type for A and B
+ab_dtype = cutlass.Float4E2M1FN  
+# FP8 data type for scale factors
+sf_dtype = cutlass.Float8E4M3FN  
+# FP16 output type
+c_dtype = cutlass.Float16  
+# Scale factor block size (16 elements share one scale)
+sf_vec_size = 16  
+# Number of threads per CUDA thread block
+threads_per_cta = 128  
+# Stage numbers of shared memory and tmem
+num_acc_stage = 1
+num_ab_stage = 1
+# Total number of columns in tmem
+num_tmem_alloc_cols = 512
+
+
+# Helper function for ceiling division: returns ceil(a / b) for integers with b > 0, using integer math only
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+
+#  GPU device kernel
+@cute.kernel
+def kernel(
+    tiled_mma: cute.TiledMma,
+    tma_atom_a: cute.CopyAtom,
+    mA_mkl: cute.Tensor,
+    tma_atom_b1: cute.CopyAtom,
+    mB_nkl1: cute.Tensor,
+    tma_atom_b2: cute.CopyAtom,
+    mB_nkl2: cute.Tensor,
+    tma_atom_sfa: cute.CopyAtom,
+    mSFA_mkl: cute.Tensor,
+    tma_atom_sfb1: cute.CopyAtom,
+    mSFB_nkl1: cute.Tensor,
+    tma_atom_sfb2: cute.CopyAtom,
+    mSFB_nkl2: cute.Tensor,
+    mC_mnl: cute.Tensor,
+    a_smem_layout_staged: cute.ComposedLayout,
+    b_smem_layout_staged: cute.ComposedLayout,
+    sfa_smem_layout_staged: cute.Layout,
+    sfb_smem_layout_staged: cute.Layout,
+    num_tma_load_bytes: cutlass.Constexpr[int],
+    epilogue_op: cutlass.Constexpr = lambda x: x
+    * (1.0 / (1.0 + cute.math.exp(-x, fastmath=True))),
+):
+    """
+    GPU device kernel performing the batched GEMM computation.
+    """
+    warp_idx = cute.arch.warp_idx()
+    warp_idx = cute.arch.make_warp_uniform(warp_idx)
+    tidx = cute.arch.thread_idx()
+
+    #
+    # Setup cta/thread coordinates
+    #
+    # Coords inside cluster
+    bidx, bidy, bidz = cute.arch.block_idx()
+    mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape)
+
+    # Coords outside cluster
+    cta_coord = (bidx, bidy, bidz)
+    mma_tile_coord_mnl = (
+        cta_coord[0] // cute.size(tiled_mma.thr_id.shape),
+        cta_coord[1],
+        cta_coord[2],
+    )
+    # Coord inside cta
+    tidx, _, _ = cute.arch.thread_idx()
+
+    #
+    # Define shared storage for kernel
+    #
+    @cute.struct
+    class SharedStorage:
+        ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_ab_stage * 2]
+        acc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_acc_stage * 2]
+        tmem_holding_buf: cutlass.Int32
+
+    smem = utils.SmemAllocator()
+    storage = smem.allocate(SharedStorage)
+    # (MMA, MMA_M, MMA_K, STAGE)
+    sA = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=a_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=a_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sB1 = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=b_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=b_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sB2 = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=b_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=b_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_M, MMA_K, STAGE)
+    sSFA = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfa_smem_layout_staged,
+        byte_alignment=128,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sSFB1 = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfb_smem_layout_staged,
+        byte_alignment=128,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sSFB2 = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfb_smem_layout_staged,
+        byte_alignment=128,
+    )
+
+    #
+    # Initialize mainloop ab_pipeline, acc_pipeline and their states
+    #
+    ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
+    ab_pipeline_consumer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, 1)
+    ab_producer, ab_consumer = pipeline.PipelineTmaUmma.create(
+        barrier_storage=storage.ab_mbar_ptr.data_ptr(),
+        num_stages=num_ab_stage,
+        producer_group=ab_pipeline_producer_group,
+        consumer_group=ab_pipeline_consumer_group,
+        tx_count=num_tma_load_bytes,
+    ).make_participants()
+    acc_producer, acc_consumer = pipeline.PipelineUmmaAsync.create(
+        barrier_storage=storage.acc_mbar_ptr.data_ptr(),
+        num_stages=num_acc_stage,
+        producer_group=ab_pipeline_producer_group,
+        consumer_group=pipeline.CooperativeGroup(
+            pipeline.Agent.Thread,
+            threads_per_cta,
+        ),
+    ).make_participants()
+
+    #
+    # Local_tile partition global tensors
+    #
+    # (bM, bK, RestM, RestK, RestL)
+    gA_mkl = cute.local_tile(
+        mA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gB_nkl1 = cute.local_tile(
+        mB_nkl1, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gB_nkl2 = cute.local_tile(
+        mB_nkl2, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    gSFA_mkl = cute.local_tile(
+        mSFA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    gSFB_nkl1 = cute.local_tile(
+        mSFB_nkl1, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gSFB_nkl2 = cute.local_tile(
+        mSFB_nkl2, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # (bM, bN, RestM, RestN, RestL)
+    gC_mnl = cute.local_tile(
+        mC_mnl, cute.slice_(mma_tiler_mnk, (None, None, 0)), (None, None, None)
+    )
+    k_tile_cnt = cute.size(gA_mkl, mode=[3])
+
+    #
+    # Partition global tensor for TiledMMA_A/B/SFA/SFB/C
+    #
+    # (MMA, MMA_M, MMA_K, RestK)
+    thr_mma = tiled_mma.get_slice(mma_tile_coord_v)
+    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
+    tCgA = thr_mma.partition_A(gA_mkl)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgB1 = thr_mma.partition_B(gB_nkl1)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgB2 = thr_mma.partition_B(gB_nkl2)
+    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
+    tCgSFA = thr_mma.partition_A(gSFA_mkl)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgSFB1 = thr_mma.partition_B(gSFB_nkl1)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgSFB2 = thr_mma.partition_B(gSFB_nkl2)
+    # (MMA, MMA_M, MMA_N, RestM, RestN, RestL)
+    tCgC = thr_mma.partition_C(gC_mnl)
+
+    #
+    # Partition global/shared tensor for TMA load A/B/SFA/SFB
+    #
+    # TMA Partition_S/D for A
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestM, RestK, RestL)
+    tAsA, tAgA = cpasync.tma_partition(
+        tma_atom_a,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sA, 0, 3),
+        cute.group_modes(tCgA, 0, 3),
+    )
+    # TMA Partition_S/D for B1
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsB1, tBgB1 = cpasync.tma_partition(
+        tma_atom_b1,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sB1, 0, 3),
+        cute.group_modes(tCgB1, 0, 3),
+    )
+    # TMA Partition_S/D for B2
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsB2, tBgB2 = cpasync.tma_partition(
+        tma_atom_b2,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sB2, 0, 3),
+        cute.group_modes(tCgB2, 0, 3),
+    )
+    #  TMA Partition_S/D for SFA
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestM, RestK, RestL)
+    tAsSFA, tAgSFA = cpasync.tma_partition(
+        tma_atom_sfa,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFA, 0, 3),
+        cute.group_modes(tCgSFA, 0, 3),
+    )
+    tAsSFA = cute.filter_zeros(tAsSFA)
+    tAgSFA = cute.filter_zeros(tAgSFA)
+    # TMA Partition_S/D for SFB1
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsSFB1, tBgSFB1 = cpasync.tma_partition(
+        tma_atom_sfb1,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFB1, 0, 3),
+        cute.group_modes(tCgSFB1, 0, 3),
+    )
+    tBsSFB1 = cute.filter_zeros(tBsSFB1)
+    tBgSFB1 = cute.filter_zeros(tBgSFB1)
+    # TMA Partition_S/D for SFB2
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsSFB2, tBgSFB2 = cpasync.tma_partition(
+        tma_atom_sfb2,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFB2, 0, 3),
+        cute.group_modes(tCgSFB2, 0, 3),
+    )
+    tBsSFB2 = cute.filter_zeros(tBsSFB2)
+    tBgSFB2 = cute.filter_zeros(tBgSFB2)
+
+    #
+    # Partition shared/tensor memory tensor for TiledMMA_A/B/C
+    #
+    # (MMA, MMA_M, MMA_K, STAGE)
+    tCrA = tiled_mma.make_fragment_A(sA)
+    # (MMA, MMA_N, MMA_K, STAGE)
+    tCrB1 = tiled_mma.make_fragment_B(sB1)
+    # (MMA, MMA_N, MMA_K, STAGE)
+    tCrB2 = tiled_mma.make_fragment_B(sB2)
+    # (MMA, MMA_M, MMA_N)
+    acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2])
+    # (MMA, MMA_M, MMA_N)
+    tCtAcc_fake = tiled_mma.make_fragment_C(acc_shape)
+
+    #
+    # Alloc tensor memory buffer
+    # Make ACC1 and ACC2 tmem tensor
+    # ACC1 += A @ B1
+    # ACC2 += A @ B2
+    #
+    tmem_alloc_barrier = pipeline.NamedBarrier(
+        barrier_id=1,
+        num_threads=threads_per_cta,
+    )
+    tmem = utils.TmemAllocator(
+        storage.tmem_holding_buf,
+        barrier_for_retrieve=tmem_alloc_barrier,
+    )
+    tmem.allocate(num_tmem_alloc_cols)
+    tmem.wait_for_alloc()
+    acc_tmem_ptr = tmem.retrieve_ptr(cutlass.Float32)
+    tCtAcc1 = cute.make_tensor(acc_tmem_ptr, tCtAcc_fake.layout)
+    acc_tmem_ptr1 = cute.recast_ptr(
+        acc_tmem_ptr + tcgen05.find_tmem_tensor_col_offset(tCtAcc1),
+        dtype=cutlass.Float32,
+    )
+    tCtAcc2 = cute.make_tensor(acc_tmem_ptr1, tCtAcc_fake.layout)
+
+    #
+    # Make SFA/SFB1/SFB2 tmem tensor
+    #
+    # SFA tmem layout: (MMA, MMA_M, MMA_K)
+    tCtSFA_layout = blockscaled_utils.make_tmem_layout_sfa(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        cute.slice_(sfa_smem_layout_staged, (None, None, None, 0)),
+    )
+    # Get SFA tmem ptr
+    sfa_tmem_ptr = cute.recast_ptr(
+        acc_tmem_ptr
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2),
+        dtype=sf_dtype,
+    )
+    tCtSFA = cute.make_tensor(sfa_tmem_ptr, tCtSFA_layout)
+
+    # SFB1, SFB2 tmem layout: (MMA, MMA_N, MMA_K)
+    tCtSFB_layout = blockscaled_utils.make_tmem_layout_sfb(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        cute.slice_(sfb_smem_layout_staged, (None, None, None, 0)),
+    )
+    # Get SFB1 tmem ptr
+    sfb_tmem_ptr1 = cute.recast_ptr(
+        acc_tmem_ptr
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2)
+        + tcgen05.find_tmem_tensor_col_offset(tCtSFA),
+        dtype=sf_dtype,
+    )
+    tCtSFB1 = cute.make_tensor(sfb_tmem_ptr1, tCtSFB_layout)
+    # Get SFB2 tmem ptr
+    sfb_tmem_ptr2 = cute.recast_ptr(
+        acc_tmem_ptr
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2)
+        + tcgen05.find_tmem_tensor_col_offset(tCtSFA)
+        + tcgen05.find_tmem_tensor_col_offset(tCtSFB1),
+        dtype=sf_dtype,
+    )
+    tCtSFB2 = cute.make_tensor(sfb_tmem_ptr2, tCtSFB_layout)
+
+    #
+    # Partition for S2T copy of SFA/SFB1/SFB2
+    #
+    # Make S2T CopyAtom
+    copy_atom_s2t = cute.make_copy_atom(
+        tcgen05.Cp4x32x128bOp(tcgen05.CtaGroup.ONE),
+        sf_dtype,
+    )
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact = cute.filter_zeros(sSFA)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFA_compact = cute.filter_zeros(tCtSFA)
+    tiled_copy_s2t_sfa = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFA_compact)
+    thr_copy_s2t_sfa = tiled_copy_s2t_sfa.get_slice(0)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact_s2t_ = thr_copy_s2t_sfa.partition_S(tCsSFA_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfa, tCsSFA_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFA_compact_s2t = thr_copy_s2t_sfa.partition_D(tCtSFA_compact)
+
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFB1_compact = cute.filter_zeros(sSFB1)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFB1_compact = cute.filter_zeros(tCtSFB1)
+    tiled_copy_s2t_sfb = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFB1_compact)
+    thr_copy_s2t_sfb = tiled_copy_s2t_sfb.get_slice(0)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB1_compact_s2t_ = thr_copy_s2t_sfb.partition_S(tCsSFB1_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB1_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfb, tCsSFB1_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFB1_compact_s2t = thr_copy_s2t_sfb.partition_D(tCtSFB1_compact)
+
+    # SFB2 S2T copy and partition
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFB2_compact = cute.filter_zeros(sSFB2)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFB2_compact = cute.filter_zeros(tCtSFB2)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB2_compact_s2t_ = thr_copy_s2t_sfb.partition_S(tCsSFB2_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB2_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfb, tCsSFB2_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFB2_compact_s2t = thr_copy_s2t_sfb.partition_D(tCtSFB2_compact)
+
+    #
+    # Slice to per mma tile index
+    #
+    # ((atom_v, rest_v), RestK)
+    tAgA = tAgA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgB1 = tBgB1[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgB2 = tBgB2[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tAgSFA = tAgSFA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgSFB1 = tBgSFB1[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgSFB2 = tBgSFB2[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+
+    #
+    # Execute Data copy and Math computation in the k_tile loop
+    #
+    if warp_idx == 0:
+        # Wait for accumulator buffer empty
+        acc_empty = acc_producer.acquire_and_advance()
+        # Set ACCUMULATE field to False for the first k_tile iteration
+        tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
+        # Execute k_tile loop
+        for k_tile in range(k_tile_cnt):
+            # Wait for AB buffer empty
+            ab_empty = ab_producer.acquire_and_advance()
+
+            #  TMA load A/B1/B2/SFA/SFB1/SFB2 to shared memory
+            cute.copy(
+                tma_atom_a,
+                tAgA[(None, ab_empty.count)],
+                tAsA[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_b1,
+                tBgB1[(None, ab_empty.count)],
+                tBsB1[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_b2,
+                tBgB2[(None, ab_empty.count)],
+                tBsB2[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_sfa,
+                tAgSFA[(None, ab_empty.count)],
+                tAsSFA[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_sfb1,
+                tBgSFB1[(None, ab_empty.count)],
+                tBsSFB1[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_sfb2,
+                tBgSFB2[(None, ab_empty.count)],
+                tBsSFB2[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+
+            # Wait for AB buffer full
+            ab_full = ab_consumer.wait_and_advance()
+
+            #  Copy SFA/SFB1/SFB2 to tmem
+            s2t_stage_coord = (None, None, None, None, ab_full.index)
+            tCsSFA_compact_s2t_staged = tCsSFA_compact_s2t[s2t_stage_coord]
+            tCsSFB1_compact_s2t_staged = tCsSFB1_compact_s2t[s2t_stage_coord]
+            tCsSFB2_compact_s2t_staged = tCsSFB2_compact_s2t[s2t_stage_coord]
+            cute.copy(
+                tiled_copy_s2t_sfa,
+                tCsSFA_compact_s2t_staged,
+                tCtSFA_compact_s2t,
+            )
+            cute.copy(
+                tiled_copy_s2t_sfb,
+                tCsSFB1_compact_s2t_staged,
+                tCtSFB1_compact_s2t,
+            )
+            cute.copy(
+                tiled_copy_s2t_sfb,
+                tCsSFB2_compact_s2t_staged,
+                tCtSFB2_compact_s2t,
+            )
+
+            # tCtAcc1 += tCrA * tCrSFA * tCrB1 * tCrSFB1
+            # tCtAcc2 += tCrA * tCrSFA * tCrB2 * tCrSFB2
+            num_kblocks = cute.size(tCrA, mode=[2])
+            for kblock_idx in cutlass.range(num_kblocks, unroll_full=True):
+                kblock_coord = (
+                    None,
+                    None,
+                    kblock_idx,
+                    ab_full.index,
+                )
+
+                # Set SFA/SFB tensor to tiled_mma
+                sf_kblock_coord = (None, None, kblock_idx)
+                tiled_mma.set(
+                    tcgen05.Field.SFA,
+                    tCtSFA[sf_kblock_coord].iterator,
+                )
+                tiled_mma.set(
+                    tcgen05.Field.SFB,
+                    tCtSFB1[sf_kblock_coord].iterator,
+                )
+                cute.gemm(
+                    tiled_mma,
+                    tCtAcc1,
+                    tCrA[kblock_coord],
+                    tCrB1[kblock_coord],
+                    tCtAcc1,
+                )
+
+                tiled_mma.set(
+                    tcgen05.Field.SFB,
+                    tCtSFB2[sf_kblock_coord].iterator,
+                )
+                cute.gemm(
+                    tiled_mma,
+                    tCtAcc2,
+                    tCrA[kblock_coord],
+                    tCrB2[kblock_coord],
+                    tCtAcc2,
+                )
+
+                # Enable accumulate on tCtAcc1/tCtAcc2 after first kblock
+                tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+
+            # Async arrive AB buffer empty
+            ab_full.release()
+        acc_empty.commit()
+
+    #
+    # Epilogue
+    # Partition for epilogue
+    #
+    op = tcgen05.Ld32x32bOp(tcgen05.Repetition.x128, tcgen05.Pack.NONE)
+    copy_atom_t2r = cute.make_copy_atom(op, cutlass.Float32)
+    tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tCtAcc1)
+    thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_tAcc1 = thr_copy_t2r.partition_S(tCtAcc1)
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_tAcc2 = thr_copy_t2r.partition_S(tCtAcc2)
+    # (T2R_M, T2R_N, EPI_M, EPI_N, RestM, RestN, RestL)
+    tTR_gC = thr_copy_t2r.partition_D(tCgC)
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_rAcc1 = cute.make_rmem_tensor(
+        tTR_gC[None, None, None, None, 0, 0, 0].shape, cutlass.Float32
+    )
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_rAcc2 = cute.make_rmem_tensor(
+        tTR_gC[None, None, None, None, 0, 0, 0].shape, cutlass.Float32
+    )
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_rC = cute.make_rmem_tensor(
+        tTR_gC[None, None, None, None, 0, 0, 0].shape, c_dtype
+    )
+    # STG Atom
+    simt_atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), c_dtype)
+    tTR_gC = tTR_gC[(None, None, None, None, *mma_tile_coord_mnl)]
+
+    # Wait for accumulator buffer full
+    acc_full = acc_consumer.wait_and_advance()
+
+    # Copy accumulator to register
+    cute.copy(tiled_copy_t2r, tTR_tAcc1, tTR_rAcc1)
+    cute.copy(tiled_copy_t2r, tTR_tAcc2, tTR_rAcc2)
+
+    # Silu activation on acc1 and multiply with acc2
+    acc_vec1 = epilogue_op(tTR_rAcc1.load())
+    acc_vec2 = tTR_rAcc2.load()
+    acc_vec = acc_vec1 * acc_vec2
+
+    tTR_rC.store(acc_vec.to(c_dtype))
+    # Store C to global memory
+    cute.copy(simt_atom, tTR_rC, tTR_gC)
+
+    acc_full.release()
+    # Deallocate TMEM
+    cute.arch.barrier()
+    tmem.free(acc_tmem_ptr)
+    return
+
+
+@cute.jit
+def my_kernel(
+    a_ptr: cute.Pointer,
+    b1_ptr: cute.Pointer,
+    b2_ptr: cute.Pointer,
+    sfa_ptr: cute.Pointer,
+    sfb1_ptr: cute.Pointer,
+    sfb2_ptr: cute.Pointer,
+    c_ptr: cute.Pointer,
+    problem_size: tuple,
+    epilogue_op: cutlass.Constexpr = lambda x: x
+    * (1.0 / (1.0 + cute.math.exp(-x, fastmath=True))),
+):
+    """
+    Host-side JIT function to prepare tensors and launch GPU kernel.
+    """
+    m, n, k, l = problem_size
+
+    # Setup attributes that depend on gemm inputs
+    a_tensor = cute.make_tensor(
+        a_ptr,
+        cute.make_layout(
+            (m, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(m * k, 32)),
+        ),
+    )
+    b_tensor1 = cute.make_tensor(
+        b1_ptr,
+        cute.make_layout(
+            (n, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(n * k, 32)),
+        ),
+    )
+    b_tensor2 = cute.make_tensor(
+        b2_ptr,
+        cute.make_layout(
+            (n, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(n * k, 32)),
+        ),
+    )
+    c_tensor = cute.make_tensor(
+        c_ptr, cute.make_layout((cute.assume(m, 32), n, l), stride=(n, 1, m * n))
+    )
+    # Setup sfa/sfb tensor by filling A/B tensor to scale factor atom layout
+    # ((Atom_M, Rest_M),(Atom_K, Rest_K),RestL)
+    sfa_layout = blockscaled_utils.tile_atom_to_shape_SF(
+        a_tensor.shape, sf_vec_size
+    )
+    sfa_tensor = cute.make_tensor(sfa_ptr, sfa_layout)
+
+    # ((Atom_N, Rest_N),(Atom_K, Rest_K),RestL)
+    sfb_layout = blockscaled_utils.tile_atom_to_shape_SF(
+        b_tensor1.shape, sf_vec_size
+    )
+    sfb_tensor1 = cute.make_tensor(sfb1_ptr, sfb_layout)
+    sfb_tensor2 = cute.make_tensor(sfb2_ptr, sfb_layout)
+
+    mma_op = tcgen05.MmaMXF4NVF4Op(
+        sf_dtype,
+        (mma_tiler_mnk[0], mma_tiler_mnk[1], mma_inst_shape_k),
+        tcgen05.CtaGroup.ONE,
+        tcgen05.OperandSource.SMEM,
+    )
+    tiled_mma = cute.make_tiled_mma(mma_op)
+
+    cluster_layout_vmnk  = cute.tiled_divide(
+        cute.make_layout((1, 1, 1)),
+        (tiled_mma.thr_id.shape,),
+    )
+
+    # Compute A/B/SFA/SFB/C shared memory layout
+    a_smem_layout_staged = sm100_utils.make_smem_layout_a(
+        tiled_mma,
+        mma_tiler_mnk,
+        ab_dtype,
+        num_ab_stage,
+    )
+    # B1 and B2 have the same size thus share the same smem layout
+    b_smem_layout_staged = sm100_utils.make_smem_layout_b(
+        tiled_mma,
+        mma_tiler_mnk,
+        ab_dtype,
+        num_ab_stage,
+    )
+    sfa_smem_layout_staged = blockscaled_utils.make_smem_layout_sfa(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        num_ab_stage,
+    )
+    # SFB1 and SFB2 have the same size thus share the same smem layout
+    sfb_smem_layout_staged = blockscaled_utils.make_smem_layout_sfb(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        num_ab_stage,
+    )
+    atom_thr_size = cute.size(tiled_mma.thr_id.shape)
+
+    # Setup TMA for A
+    a_smem_layout = cute.slice_(a_smem_layout_staged, (None, None, None, 0))
+    tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        a_tensor,
+        a_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+    )
+    # Setup TMA for B1
+    b_smem_layout = cute.slice_(b_smem_layout_staged, (None, None, None, 0))
+    tma_atom_b1, tma_tensor_b1 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        b_tensor1,
+        b_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+    )
+    # Setup TMA for B2
+    tma_atom_b2, tma_tensor_b2 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        b_tensor2,
+        b_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+    )
+    # Setup TMA for SFA
+    sfa_smem_layout = cute.slice_(
+        sfa_smem_layout_staged , (None, None, None, 0)
+    )
+    tma_atom_sfa, tma_tensor_sfa = cute.nvgpu.make_tiled_tma_atom_A(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        sfa_tensor,
+        sfa_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+        internal_type=cutlass.Int16,
+    )
+    # Setup TMA for SFB1
+    sfb_smem_layout = cute.slice_(
+        sfb_smem_layout_staged , (None, None, None, 0)
+    )
+    tma_atom_sfb1, tma_tensor_sfb1 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        sfb_tensor1,
+        sfb_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+        internal_type=cutlass.Int16,
+    )
+    # Setup TMA for SFB2
+    tma_atom_sfb2, tma_tensor_sfb2 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        sfb_tensor2,
+        sfb_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+        internal_type=cutlass.Int16,
+    )
+
+    # Compute TMA load bytes
+    a_copy_size = cute.size_in_bytes(ab_dtype, a_smem_layout)
+    b_copy_size = cute.size_in_bytes(ab_dtype, b_smem_layout)
+    sfa_copy_size = cute.size_in_bytes(sf_dtype, sfa_smem_layout)
+    sfb_copy_size = cute.size_in_bytes(sf_dtype, sfb_smem_layout)
+    num_tma_load_bytes = (
+        a_copy_size + b_copy_size * 2 + sfa_copy_size + sfb_copy_size * 2
+    ) * atom_thr_size
+
+    # Compute grid size
+    grid = (
+        cute.ceil_div(c_tensor.shape[0], mma_tiler_mnk[0]),
+        cute.ceil_div(c_tensor.shape[1], mma_tiler_mnk[1]),
+        c_tensor.shape[2],
+    )
+
+    # Launch the kernel.
+    kernel(
+        # MMA (Matrix Multiply-Accumulate) configuration
+        tiled_mma,                  # Tiled MMA object defining NVFP4 GEMM compute pattern
+        
+        # TMA (Tensor Memory Accelerator) atoms and tensors for shared input matrix A
+        tma_atom_a,                 # TMA copy atom defining how to load A from global memory
+        tma_tensor_a,               # Tensor descriptor for A matrix (m, k, l) - shared by both GEMMs
+        
+        # TMA atoms and tensors for first B matrix (B1)
+        tma_atom_b1,                # TMA copy atom defining how to load B1 from global memory
+        tma_tensor_b1,              # Tensor descriptor for B1 matrix (n, k, l) - first GEMM
+        
+        # TMA atoms and tensors for second B matrix (B2)
+        tma_atom_b2,                # TMA copy atom defining how to load B2 from global memory
+        tma_tensor_b2,              # Tensor descriptor for B2 matrix (n, k, l) - second GEMM
+        
+        # TMA atoms and tensors for scale factor A (shared)
+        tma_atom_sfa,               # TMA copy atom for loading scale factors for A
+        tma_tensor_sfa,             # Tensor descriptor for SFA (block scale factors for A) - shared
+        
+        # TMA atoms and tensors for scale factor B1
+        tma_atom_sfb1,              # TMA copy atom for loading scale factors for B1
+        tma_tensor_sfb1,            # Tensor descriptor for SFB1 (block scale factors for B1)
+        
+        # TMA atoms and tensors for scale factor B2
+        tma_atom_sfb2,              # TMA copy atom for loading scale factors for B2
+        tma_tensor_sfb2,            # Tensor descriptor for SFB2 (block scale factors for B2)
+        
+        # Output tensor C (stores the fused result silu(A @ B1) * (A @ B2))
+        c_tensor,                   # Output tensor where the fused dual-GEMM result will be stored (m, n, l)
+        
+        # Shared memory layouts with staging for pipelined execution
+        a_smem_layout_staged,       # Staged shared memory layout for A (includes stage dimension)
+        b_smem_layout_staged,       # Staged shared memory layout for B1/B2 (includes stage dimension)
+        sfa_smem_layout_staged,     # Staged shared memory layout for SFA (includes stage dimension)
+        sfb_smem_layout_staged,     # Staged shared memory layout for SFB1/SFB2 (includes stage dimension)
+        
+        # Pipeline synchronization parameter
+        num_tma_load_bytes,         # Total bytes to load per TMA transaction (for barrier setup)
+        
+        # Epilogue operation
+        epilogue_op,                # Epilogue operation to apply to output (e.g., element-wise ops)
+    ).launch(
+        grid=grid,
+        block=[threads_per_cta, 1, 1],
+        cluster=(1, 1, 1),
+    )
+    return
+
+
+# Global cache for compiled kernel
+_compiled_kernel_cache = None
+# This function is used to compile the kernel once and cache it and then allow users to 
+# run the kernel multiple times to get more accurate timing results.
+def compile_kernel():
+    """
+    Compile the kernel once and cache it.
+    This should be called before any timing measurements.
+    
+    Returns:
+        The compiled kernel function
+    """
+    global _compiled_kernel_cache
+    
+    if _compiled_kernel_cache is not None:
+        return _compiled_kernel_cache
+    
+
+    # Create placeholder CuTe pointers (address 0) for A/B/C/SFA/SFB; they stand in for real buffers at compile time
+    a_ptr = make_ptr(
+        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    b1_ptr = make_ptr(
+        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    b2_ptr = make_ptr(
+        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    c_ptr = make_ptr(
+        c_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    sfa_ptr = make_ptr(
+        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb1_ptr = make_ptr(
+        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb2_ptr = make_ptr(
+        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
+    )
+
+    # Compile the kernel
+    _compiled_kernel_cache = cute.compile(my_kernel, a_ptr, b1_ptr, b2_ptr, sfa_ptr, sfb1_ptr, sfb2_ptr, c_ptr, (0, 0, 0, 0))
+    
+    return _compiled_kernel_cache
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Execute the block-scaled dual GEMM kernel with silu activation,
+    C = silu(A @ B1) * (A @ B2).
+    
+    This is the main entry point called by the evaluation framework.
+    It converts PyTorch tensors to CuTe tensors, launches the kernel,
+    and returns the result.
+    
+    Args:
+        data: Tuple of (a, b1, b2, sfa_cpu, sfb1_cpu, sfb2_cpu, sfa_permuted, sfb1_permuted, sfb2_permuted, c) PyTorch tensors
+            a: [m, k, l] - Input matrix in float4e2m1fn
+            b1: [n, k, l] - Input matrix in float4e2m1fn
+            b2: [n, k, l] - Input matrix in float4e2m1fn
+            sfa_cpu: [m, k // 16, l] - Scale factors in float8_e4m3fn, used by reference implementation (ignored here)
+            sfb1_cpu: [n, k // 16, l] - Scale factors in float8_e4m3fn, used by reference implementation (ignored here)
+            sfb2_cpu: [n, k // 16, l] - Scale factors in float8_e4m3fn, used by reference implementation (ignored here)
+            sfa_permuted: [32, 4, rest_m, 4, rest_k, l] - Scale factors in float8_e4m3fn
+            sfb1_permuted: [32, 4, rest_n, 4, rest_k, l] - Scale factors in float8_e4m3fn
+            sfb2_permuted: [32, 4, rest_n, 4, rest_k, l] - Scale factors in float8_e4m3fn
+            c: [m, n, l] - Output matrix in float16
+    
+    Returns:
+        Output tensor c with computed results
+    """
+    a, b1, b2, _, _, _, sfa_permuted, sfb1_permuted, sfb2_permuted, c = data
+    
+    # Ensure kernel is compiled (will use cached version if available)
+    # To avoid the compilation overhead, we compile the kernel once and cache it.
+    compiled_func = compile_kernel()
+
+    # Get dimensions from MxKxL layout
+    _, k, _ = a.shape
+    m, n, l = c.shape
+    # Torch stores two e2m1 values per element (e2m1_x2), so a.shape reports K / 2; double it to get the logical K
+    k = k * 2 
+
+    # Create CuTe pointers for A/B/C/SFA/SFB via torch tensor data pointer
+    a_ptr = make_ptr(
+        ab_dtype, a.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    b1_ptr = make_ptr(
+        ab_dtype, b1.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    b2_ptr = make_ptr(
+        ab_dtype, b2.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    c_ptr = make_ptr(
+        c_dtype, c.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    sfa_ptr = make_ptr(
+        sf_dtype, sfa_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb1_ptr = make_ptr(
+        sf_dtype, sfb1_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb2_ptr = make_ptr(
+        sf_dtype, sfb2_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+
+    # Execute the compiled kernel
+    compiled_func(a_ptr, b1_ptr, b2_ptr, sfa_ptr, sfb1_ptr, sfb2_ptr, c_ptr, (m, n, k, l))
+
+    return c
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/task.py b/problems/nvidia/final_nvfp4_dual_gemm/task.py
new file mode 100644
index 00000000..8facfb07
--- /dev/null
+++ b/problems/nvidia/final_nvfp4_dual_gemm/task.py
@@ -0,0 +1,11 @@
+import torch
+from typing import TypedDict, TypeVar
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+class TestSpec(TypedDict):
+    m: int
+    n: int
+    k: int
+    l: int
+    seed: int
\ No newline at end of file
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/task.yml b/problems/nvidia/final_nvfp4_dual_gemm/task.yml
new file mode 100644
index 00000000..4d36bde6
--- /dev/null
+++ b/problems/nvidia/final_nvfp4_dual_gemm/task.yml
@@ -0,0 +1,64 @@
+# name: nvfp4-dual-gemm
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval_better_bench.py"}
+
+lang: "py"
+
+description: |
+  
+  You will implement a block scaled dual matrix-matrix multiplication kernel with silu activation optimized for NVIDIA B200.
+  To be explicit, you will be given a tuple of tensors:
+  ```
+  (a, b1, b2, sfa, sfb1, sfb2, c)
+  ```
+  where:
+  * `a` is M x K x L in K-major order in nvfp4(e2m1)
+  * `b1` is N x K x L in K-major order in nvfp4(e2m1)
+  * `b2` is N x K x L in K-major order in nvfp4(e2m1)
+  * `sfa` is M x (K // 16) x L in K-major order in fp8(e4m3fnuz)
+  * `sfb1` is N x (K // 16) x L in K-major order in fp8(e4m3fnuz)
+  * `sfb2` is N x (K // 16) x L in K-major order in fp8(e4m3fnuz)
+  * `c` is M x N x L in fp16
+  
+  Matrix size `M` is divisible by mma_tiler_mn[0], `N` is divisible by mma_tiler_mn[1], and `K` is divisible by 256.
+  The ranking criterion is the geometric mean of the benchmark results.
+  For the grand prize, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand prize.
+  ```
+  The speed of light analysis is based on the max(FP4 Tensor Core math throughput, DRAM memory throughput) of B200 and tested under a 1.5GHz clock:
+    M   N   K   L time[us] 
+  256 4096 7168 1 4.708
+  512 4096 7168 1 8.714
+  256 3072 4096 1 2.125
+  512 3072 7168 1 6.535
+  ```
+config:
+  main: "eval.py"
+
+templates:
+  Python: "template.py"
+
+tests:
+  - {"m": 1536, "n": 512, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 256, "n": 512, "k": 256, "l": 1, "seed": 1111}
+  - {"m": 1536, "n": 512, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 3072, "n": 1024, "k": 1536, "l": 1, "seed": 1111}
+  - {"m": 7168, "n": 1024, "k": 256, "l": 1, "seed": 1111}
+  - {"m": 7168, "n": 2304, "k": 2048, "l": 1, "seed": 1111}
+  - {"m": 4608, "n": 384, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 7168, "n": 384, "k": 2304, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 768, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 4096, "n": 768, "k": 512, "l": 1, "seed": 1111}
+  
+benchmarks:
+  - {"m": 256, "n": 4096, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 4096, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 256, "n": 3072, "k": 4096, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 3072, "k": 7168, "l": 1, "seed": 1111}
+
+ranking_by: "geom"
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/template.py b/problems/nvidia/final_nvfp4_dual_gemm/template.py
new file mode 100644
index 00000000..d8985df5
--- /dev/null
+++ b/problems/nvidia/final_nvfp4_dual_gemm/template.py
@@ -0,0 +1,28 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Template for a custom implementation of block-scaled fp4 dual GEMM with SiLU activation
+    Args:
+        data: Tuple that expands to:
+            a: torch.Tensor[float4e2m1fn] of shape [m, k, l],
+            b1: torch.Tensor[float4e2m1fn] of shape [n, k, l],
+            b2: torch.Tensor[float4e2m1fn] of shape [n, k, l],
+            sfa: torch.Tensor[float8_e4m3fnuz] of shape [m, k // 16, l], used by reference implementation
+            sfb1: torch.Tensor[float8_e4m3fnuz] of shape [n, k // 16, l], used by reference implementation
+            sfb2: torch.Tensor[float8_e4m3fnuz] of shape [n, k // 16, l], used by reference implementation
+            sfa_permuted: torch.Tensor[float8_e4m3fnuz] of shape [32, 4, rest_m, 4, rest_k, l],
+            sfb1_permuted: torch.Tensor[float8_e4m3fnuz] of shape [32, 4, rest_n, 4, rest_k, l],
+            sfb2_permuted: torch.Tensor[float8_e4m3fnuz] of shape [32, 4, rest_n, 4, rest_k, l],
+            c: torch.Tensor[float16] of shape [m, n, l]
+    Returns:
+        Tensor containing output in float16
+        c: torch.Tensor[float16] of shape [m, n, l]
+    """
+    # c: [m, n, l] is pre-allocated memory to avoid timing allocation overhead.
+    a, b1, b2, sfa, sfb1, sfb2, sfa_permuted, sfb1_permuted, sfb2_permuted, c = data
+
+    # Your implementation here
+
+    return c
\ No newline at end of file
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/utils.py b/problems/nvidia/final_nvfp4_dual_gemm/utils.py
new file mode 100644
index 00000000..d9b3a69e
--- /dev/null
+++ b/problems/nvidia/final_nvfp4_dual_gemm/utils.py
@@ -0,0 +1,172 @@
+import os
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Check whether two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to report.
+    Returns:
+    list[str]: Mismatch descriptions (["SIZE MISMATCH"] if the shapes differ); empty if the tensors match.
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received - expected)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Check whether two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to report.
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings with detailed error information
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return False, "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return True, ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
+
+
+class DeterministicContext:
+    def __init__(self):
+        self.allow_tf32 = None
+        self.deterministic = None
+        self.cublas = None
+
+    def __enter__(self):
+        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '')
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        torch.use_deterministic_algorithms(True)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+        torch.use_deterministic_algorithms(False)
+        os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
+
+def clear_l2_cache():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # write to it so the buffer is actually touched (presumably evicting previously cached data from L2)
+    dummy.fill_(42)
+    del dummy

From 41c6af4a42bd3e10a3f78197ecbf400fed4fbea5 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 17 Jan 2026 17:05:44 -0800
Subject: [PATCH 190/207] add modal as an option to submit the dual gemm
 problem

---
 problems/nvidia.yaml                          |   5 +
 .../nvidia/modal_nvfp4_dual_gemm/reference.py | 199 ++++
 .../modal_nvfp4_dual_gemm/submission.py       | 957 ++++++++++++++++++
 problems/nvidia/modal_nvfp4_dual_gemm/task.py |  11 +
 .../nvidia/modal_nvfp4_dual_gemm/task.yml     |  64 ++
 .../nvidia/modal_nvfp4_dual_gemm/template.py  |  28 +
 .../nvidia/modal_nvfp4_dual_gemm/utils.py     | 172 ++++
 7 files changed, 1436 insertions(+)
 create mode 100644 problems/nvidia/modal_nvfp4_dual_gemm/reference.py
 create mode 100644 problems/nvidia/modal_nvfp4_dual_gemm/submission.py
 create mode 100644 problems/nvidia/modal_nvfp4_dual_gemm/task.py
 create mode 100644 problems/nvidia/modal_nvfp4_dual_gemm/task.yml
 create mode 100644 problems/nvidia/modal_nvfp4_dual_gemm/template.py
 create mode 100644 problems/nvidia/modal_nvfp4_dual_gemm/utils.py

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 6ff186bb..8f868a9a 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -25,4 +25,9 @@ problems:
     deadline: "2026-01-20 7:59"
     gpus:
       - NVIDIA
+  - directory: nvidia/modal_nvfp4_dual_gemm
+    name: modal_nvfp4_dual_gemm
+    deadline: "2026-01-20 7:59"
+    gpus:
+      - B200
 
diff --git a/problems/nvidia/modal_nvfp4_dual_gemm/reference.py b/problems/nvidia/modal_nvfp4_dual_gemm/reference.py
new file mode 100644
index 00000000..95c6aacd
--- /dev/null
+++ b/problems/nvidia/modal_nvfp4_dual_gemm/reference.py
@@ -0,0 +1,199 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+# Scale factor vector size: number of K elements covered by one scale factor
+sf_vec_size = 16
+
+# Helper function for integer ceiling division: rounds a / b up (assumes b > 0)
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+# Rearrange a 2-D scale factor matrix into the flattened blocked layout
+def to_blocked(input_matrix):
+    n_rows, n_cols = input_matrix.shape
+
+    # n_rows must be a multiple of 128 and n_cols a multiple of 4 (no padding is applied)
+    row_tiles = ceil_div(n_rows, 128)
+    col_tiles = ceil_div(n_cols, 4)
+
+    # (row_tiles, 128, col_tiles, 4) -> (row_tiles, col_tiles, 128, 4)
+    tiles = input_matrix.view(row_tiles, 128, col_tiles, 4).permute(0, 2, 1, 3)
+    swizzled = tiles.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
+
+    return swizzled.flatten()
+
+
+def ref_kernel(
+    data: input_t,
+) -> output_t:
+    """
+    PyTorch reference implementation of NVFP4 block-scaled dual GEMM with silu activation,
+    C = silu(A @ B1) * (A @ B2). Takes the 10-tuple built by generate_input; returns (m, n, l) float16.
+    """
+    a_ref, b1_ref, b2_ref, sfa_ref_cpu, sfb1_ref_cpu, sfb2_ref_cpu, _, _, _, c_ref = data
+    
+    # Get dimensions from MxNxL layout
+    m, n, l = c_ref.shape
+
+    # Call torch._scaled_mm once per batch for each GEMM; results go into (m, n, l) views of (l, m, n) buffers
+    ref1 = torch.empty(
+        (l, m, n),
+        dtype=torch.float32,
+        device="cuda",
+    ).permute(1, 2, 0)
+    ref2 = torch.empty(
+        (l, m, n),
+        dtype=torch.float32,
+        device="cuda",
+    ).permute(1, 2, 0)
+    for l_idx in range(l):
+        # Convert the scale factor tensor to blocked format
+        scale_a = to_blocked(sfa_ref_cpu[:, :, l_idx])
+        scale_b1 = to_blocked(sfb1_ref_cpu[:, :, l_idx])
+        scale_b2 = to_blocked(sfb2_ref_cpu[:, :, l_idx])
+        # (m, k) @ (n, k).T -> (m, n)
+        res1 = torch._scaled_mm(
+            a_ref[:, :, l_idx],
+            b1_ref[:, :, l_idx].transpose(0, 1),
+            scale_a.cuda(),  # generate_input already returns the scale factors on CUDA despite the *_cpu names
+            scale_b1.cuda(),
+            bias=None,
+            out_dtype=torch.float32,
+        )
+        ref1[:, :, l_idx] = res1
+
+        res2 = torch._scaled_mm(
+            a_ref[:, :, l_idx],
+            b2_ref[:, :, l_idx].transpose(0, 1),
+            scale_a.cuda(),
+            scale_b2.cuda(),
+            bias=None,
+            out_dtype=torch.float32,
+        )
+        ref2[:, :, l_idx] = res2
+    # Do silu on the first GEMM result and multiply with the second GEMM result
+    c_ref = (torch.nn.functional.silu(ref1) * ref2).to(torch.float16)
+    return c_ref
+
+
+def generate_input(
+    m: int,
+    n: int,
+    k: int,
+    l: int,
+    seed: int,
+):
+    """
+    Generate input tensors for NVFP4 block-scaled dual GEMM with silu activation,
+    C = silu(A @ B1) * (A @ B2).
+    
+    Args:
+        m: Number of rows in matrix A
+        n: Number of columns in matrix B1 and B2
+        k: Number of columns in A and rows of B1 and B2
+        l: Batch size
+        seed: Random seed for reproducibility
+    
+    Returns:
+        Tuple of 10 CUDA tensors (a, b1, b2, scale_a, scale_b1, scale_b2, scale_a_permuted, scale_b1_permuted, scale_b2_permuted, c) where:
+            a: [m, k // 2, l] - Input matrix in torch.float4_e2m1fn_x2 data type (two FP4 values packed per element)
+            b1: [n, k // 2, l] - Input matrix in torch.float4_e2m1fn_x2 data type (two FP4 values packed per element)
+            b2: [n, k // 2, l] - Input matrix in torch.float4_e2m1fn_x2 data type (two FP4 values packed per element)
+            scale_a: [m, sf_k, l] - Input scale factors in torch.float8_e4m3fn data type, sf_k = ceil_div(k, sf_vec_size)
+            scale_b1: [n, sf_k, l] - Input scale factors in torch.float8_e4m3fn data type
+            scale_b2: [n, sf_k, l] - Input scale factors in torch.float8_e4m3fn data type
+            scale_a_permuted: [32, 4, rest_m, 4, rest_k, l] - Input scale factors in torch.float8_e4m3fn data type
+            scale_b1_permuted: [32, 4, rest_n, 4, rest_k, l] - Input scale factors in torch.float8_e4m3fn data type
+            scale_b2_permuted: [32, 4, rest_n, 4, rest_k, l] - Input scale factors in torch.float8_e4m3fn data type
+            c: [m, n, l] - Output matrix in torch.float16 data type
+    """
+    torch.manual_seed(seed)
+    
+    def create_fp4_tensors(l, mn, k):
+        # generate uint8 tensor, then convert to float4e2m1fn_x2 data type
+        # draw random bytes in [0, 255): torch.randint's upper bound is exclusive, so 255 itself is never drawn
+        ref_i8 = torch.randint(255, size=(l, mn, k // 2), dtype=torch.uint8, device="cuda")
+
+        # for each nibble, only keep the sign bit and 2 LSBs
+        # the possible values are [-1.5, -1, -0.5, 0, +0.5, +1, +1.5]
+        ref_i8 = ref_i8 & 0b1011_1011
+
+        return ref_i8.permute(1, 2, 0).view(torch.float4_e2m1fn_x2)
+
+    # Generate uint8 tensor, then convert to float4e2m1fn_x2 data type
+    a_ref = create_fp4_tensors(l, m, k)
+    b1_ref = create_fp4_tensors(l, n, k)
+    b2_ref = create_fp4_tensors(l, n, k)
+    a_ref = a_ref.view(torch.float4_e2m1fn_x2)  # create_fp4_tensors already returns this dtype; these three views do not change it
+    b1_ref = b1_ref.view(torch.float4_e2m1fn_x2)
+    b2_ref = b2_ref.view(torch.float4_e2m1fn_x2)
+
+    # Create float16 output tensor (random placeholder; ref_kernel only reads its shape)
+    c_ref = torch.randn((l, m, n), dtype=torch.float16, device="cuda").permute(
+        1, 2, 0
+    )
+    
+    # Helper function to prepare the scale factor tensors for both the reference
+    # kernel and the custom kernel. The customized data layout can be found in:
+    # https://docs.nvidia.com/cuda/cublas/index.html?highlight=fp4#d-block-scaling-factors-layout
+    def create_scale_factor_tensors(l, mn, sf_k):
+        # Create the reference scale factor tensor (mn, sf_k, l) on the GPU; a CPU copy is returned below.
+        ref_shape = (l, mn, sf_k)
+        ref_permute_order = (1, 2, 0)
+        # Init with fp32 tensor in [0,1), then convert to float8_e4m3fn
+        ref_f8_random_fp32 = torch.rand(ref_shape, dtype=torch.float32, device='cuda')
+        ref_f8_torch_tensor = ref_f8_random_fp32.to(dtype=torch.float8_e4m3fn)
+        # permute to match ref_permute_order
+        ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)
+
+        atom_m = (32, 4)
+        atom_k = 4
+        mma_shape = (
+            l,  # batch size
+            ceil_div(mn, atom_m[0] * atom_m[1]),
+            ceil_div(sf_k, atom_k),
+            atom_m[0],
+            atom_m[1],
+            atom_k,
+        )
+
+        # Reorder scale factor tensor to (32, 4, rest_m, 4, rest_k, l) layout
+        # Which is needed by the CuTe customized kernel
+        mma_permute_order = (3, 4, 1, 5, 2, 0)
+        # Uninitialized int8 buffer converted to float8_e4m3fn; fully overwritten below only if mn % 128 == 0 and sf_k % 4 == 0
+        rand_int_tensor = torch.empty(mma_shape, dtype=torch.int8, device='cuda')
+        reordered_f8_torch_tensor = rand_int_tensor.to(dtype=torch.float8_e4m3fn)
+        # Permute according to mma_permute_order
+        reordered_f8_torch_tensor = reordered_f8_torch_tensor.permute(*mma_permute_order)
+
+        # GPU-side vectorized reordering (replaces slow CPU nested loops)
+        # Create index grids for all dimensions
+        i_idx = torch.arange(mn, device='cuda')
+        j_idx = torch.arange(sf_k, device='cuda')
+        b_idx = torch.arange(l, device='cuda')
+        
+        # Create meshgrid for all combinations of (i, j, b)
+        i_grid, j_grid, b_grid = torch.meshgrid(i_idx, j_idx, b_idx, indexing='ij')
+        
+        # Calculate target indices in vectorized manner
+        mm = i_grid // (atom_m[0] * atom_m[1])
+        mm32 = i_grid % atom_m[0]
+        mm4 = (i_grid % 128) // atom_m[0]
+        kk = j_grid // atom_k
+        kk4 = j_grid % atom_k
+        
+        # Perform the reordering with advanced indexing (all on GPU)
+        reordered_f8_torch_tensor[mm32, mm4, mm, kk4, kk, b_grid] = ref_f8_torch_tensor_permuted[i_grid, j_grid, b_grid]
+        
+        return ref_f8_torch_tensor_permuted.cpu(), reordered_f8_torch_tensor  # (CPU copy in reference layout, GPU tensor in reordered layout)
+
+    sf_k = ceil_div(k, sf_vec_size)
+    sfa_ref_cpu, sfa_ref_permuted = create_scale_factor_tensors(l, m, sf_k)
+    sfb1_ref_cpu, sfb1_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
+    sfb2_ref_cpu, sfb2_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
+
+    return (a_ref, b1_ref, b2_ref, sfa_ref_cpu.to("cuda"), sfb1_ref_cpu.to("cuda"), sfb2_ref_cpu.to("cuda"), sfa_ref_permuted, sfb1_ref_permuted, sfb2_ref_permuted, c_ref)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-03, atol=1e-03)  # presumably the checker comparing a submission against ref_kernel with these tolerances; confirm in utils
diff --git a/problems/nvidia/modal_nvfp4_dual_gemm/submission.py b/problems/nvidia/modal_nvfp4_dual_gemm/submission.py
new file mode 100644
index 00000000..739cc5a0
--- /dev/null
+++ b/problems/nvidia/modal_nvfp4_dual_gemm/submission.py
@@ -0,0 +1,957 @@
+from torch._higher_order_ops.torchbind import call_torchbind_fake
+import cuda.bindings.driver as cuda
+
+import torch
+from task import input_t, output_t
+
+import cutlass
+import cutlass.cute as cute
+import cutlass.utils as utils
+import cutlass.pipeline as pipeline
+from cutlass.cute.nvgpu import cpasync, tcgen05
+import cutlass.torch as cutlass_torch
+import cutlass.utils.blackwell_helpers as sm100_utils
+import cutlass.utils.blockscaled_layout as blockscaled_utils
+from cutlass.cute.runtime import make_ptr
+
+# Kernel configuration parameters
+# Tile sizes for the M, N, K dimensions handled per MMA tile
+mma_tiler_mnk= (128, 128, 256)  
+# Shape of the K dimension for the MMA instruction
+mma_inst_shape_k = 64
+# FP4 data type for A and B
+ab_dtype = cutlass.Float4E2M1FN  
+# FP8 data type for scale factors
+sf_dtype = cutlass.Float8E4M3FN  
+# FP16 output type
+c_dtype = cutlass.Float16  
+# Scale factor block size (16 elements share one scale)
+sf_vec_size = 16  
+# Number of threads per CUDA thread block
+threads_per_cta = 128  
+# Number of pipeline stages: TMEM accumulator (acc) and shared-memory operand buffers (ab)
+num_acc_stage = 1
+num_ab_stage = 1
+# Number of TMEM columns requested via tmem.allocate in the kernel
+num_tmem_alloc_cols = 512
+
+
+# Helper function for integer ceiling division: rounds a / b up (assumes b > 0)
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+
+# GPU device kernel
+@cute.kernel
+def kernel(
+    tiled_mma: cute.TiledMma,
+    tma_atom_a: cute.CopyAtom,
+    mA_mkl: cute.Tensor,
+    tma_atom_b1: cute.CopyAtom,
+    mB_nkl1: cute.Tensor,
+    tma_atom_b2: cute.CopyAtom,
+    mB_nkl2: cute.Tensor,
+    tma_atom_sfa: cute.CopyAtom,
+    mSFA_mkl: cute.Tensor,
+    tma_atom_sfb1: cute.CopyAtom,
+    mSFB_nkl1: cute.Tensor,
+    tma_atom_sfb2: cute.CopyAtom,
+    mSFB_nkl2: cute.Tensor,
+    mC_mnl: cute.Tensor,
+    a_smem_layout_staged: cute.ComposedLayout,
+    b_smem_layout_staged: cute.ComposedLayout,
+    sfa_smem_layout_staged: cute.Layout,
+    sfb_smem_layout_staged: cute.Layout,
+    num_tma_load_bytes: cutlass.Constexpr[int],
+    epilogue_op: cutlass.Constexpr = lambda x: x
+    * (1.0 / (1.0 + cute.math.exp(-x, fastmath=True))),  # default epilogue: SiLU, x * sigmoid(x)
+):
+    """
+    GPU device kernel: each CTA computes one M x N tile of C = epilogue_op(A @ B1) * (A @ B2) for one batch index.
+    """
+    warp_idx = cute.arch.warp_idx()
+    warp_idx = cute.arch.make_warp_uniform(warp_idx)
+    tidx = cute.arch.thread_idx()  # overwritten below by the unpacked first component
+
+    #
+    # Setup cta/thread coordinates
+    #
+    # Coords inside cluster
+    bidx, bidy, bidz = cute.arch.block_idx()
+    mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape)
+
+    # Coords outside cluster
+    cta_coord = (bidx, bidy, bidz)
+    mma_tile_coord_mnl = (
+        cta_coord[0] // cute.size(tiled_mma.thr_id.shape),
+        cta_coord[1],
+        cta_coord[2],
+    )
+    # Coord inside cta
+    tidx, _, _ = cute.arch.thread_idx()
+
+    #
+    # Define shared storage for kernel
+    #
+    @cute.struct
+    class SharedStorage:
+        ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_ab_stage * 2]
+        acc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_acc_stage * 2]
+        tmem_holding_buf: cutlass.Int32
+
+    smem = utils.SmemAllocator()
+    storage = smem.allocate(SharedStorage)
+    # (MMA, MMA_M, MMA_K, STAGE)
+    sA = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=a_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=a_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sB1 = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=b_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=b_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sB2 = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=b_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=b_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_M, MMA_K, STAGE)
+    sSFA = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfa_smem_layout_staged,
+        byte_alignment=128,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sSFB1 = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfb_smem_layout_staged,
+        byte_alignment=128,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sSFB2 = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfb_smem_layout_staged,
+        byte_alignment=128,
+    )
+
+    #
+    # Initialize mainloop ab_pipeline, acc_pipeline and their states
+    #
+    ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
+    ab_pipeline_consumer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, 1)
+    ab_producer, ab_consumer = pipeline.PipelineTmaUmma.create(
+        barrier_storage=storage.ab_mbar_ptr.data_ptr(),
+        num_stages=num_ab_stage,
+        producer_group=ab_pipeline_producer_group,
+        consumer_group=ab_pipeline_consumer_group,
+        tx_count=num_tma_load_bytes,
+    ).make_participants()
+    acc_producer, acc_consumer = pipeline.PipelineUmmaAsync.create(
+        barrier_storage=storage.acc_mbar_ptr.data_ptr(),
+        num_stages=num_acc_stage,
+        producer_group=ab_pipeline_producer_group,
+        consumer_group=pipeline.CooperativeGroup(
+            pipeline.Agent.Thread,
+            threads_per_cta,
+        ),
+    ).make_participants()
+
+    #
+    # Local_tile partition global tensors
+    #
+    # (bM, bK, RestM, RestK, RestL)
+    gA_mkl = cute.local_tile(
+        mA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gB_nkl1 = cute.local_tile(
+        mB_nkl1, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gB_nkl2 = cute.local_tile(
+        mB_nkl2, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    gSFA_mkl = cute.local_tile(
+        mSFA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    gSFB_nkl1 = cute.local_tile(
+        mSFB_nkl1, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gSFB_nkl2 = cute.local_tile(
+        mSFB_nkl2, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # (bM, bN, RestM, RestN, RestL)
+    gC_mnl = cute.local_tile(
+        mC_mnl, cute.slice_(mma_tiler_mnk, (None, None, 0)), (None, None, None)
+    )
+    k_tile_cnt = cute.size(gA_mkl, mode=[3])
+
+    #
+    # Partition global tensor for TiledMMA_A/B/SFA/SFB/C
+    #
+    # MMA slice for this CTA's coordinate; used below to partition the global tensors
+    thr_mma = tiled_mma.get_slice(mma_tile_coord_v)
+    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
+    tCgA = thr_mma.partition_A(gA_mkl)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgB1 = thr_mma.partition_B(gB_nkl1)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgB2 = thr_mma.partition_B(gB_nkl2)
+    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
+    tCgSFA = thr_mma.partition_A(gSFA_mkl)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgSFB1 = thr_mma.partition_B(gSFB_nkl1)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgSFB2 = thr_mma.partition_B(gSFB_nkl2)
+    # (MMA, MMA_M, MMA_N, RestM, RestN, RestL)
+    tCgC = thr_mma.partition_C(gC_mnl)
+
+    #
+    # Partition global/shared tensor for TMA load A/B/SFA/SFB
+    #
+    # TMA Partition_S/D for A
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestM, RestK, RestL)
+    tAsA, tAgA = cpasync.tma_partition(
+        tma_atom_a,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sA, 0, 3),
+        cute.group_modes(tCgA, 0, 3),
+    )
+    # TMA Partition_S/D for B1
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsB1, tBgB1 = cpasync.tma_partition(
+        tma_atom_b1,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sB1, 0, 3),
+        cute.group_modes(tCgB1, 0, 3),
+    )
+    # TMA Partition_S/D for B2
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsB2, tBgB2 = cpasync.tma_partition(
+        tma_atom_b2,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sB2, 0, 3),
+        cute.group_modes(tCgB2, 0, 3),
+    )
+    # TMA Partition_S/D for SFA
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestM, RestK, RestL)
+    tAsSFA, tAgSFA = cpasync.tma_partition(
+        tma_atom_sfa,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFA, 0, 3),
+        cute.group_modes(tCgSFA, 0, 3),
+    )
+    tAsSFA = cute.filter_zeros(tAsSFA)
+    tAgSFA = cute.filter_zeros(tAgSFA)
+    # TMA Partition_S/D for SFB1
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsSFB1, tBgSFB1 = cpasync.tma_partition(
+        tma_atom_sfb1,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFB1, 0, 3),
+        cute.group_modes(tCgSFB1, 0, 3),
+    )
+    tBsSFB1 = cute.filter_zeros(tBsSFB1)
+    tBgSFB1 = cute.filter_zeros(tBgSFB1)
+    # TMA Partition_S/D for SFB2
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsSFB2, tBgSFB2 = cpasync.tma_partition(
+        tma_atom_sfb2,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFB2, 0, 3),
+        cute.group_modes(tCgSFB2, 0, 3),
+    )
+    tBsSFB2 = cute.filter_zeros(tBsSFB2)
+    tBgSFB2 = cute.filter_zeros(tBgSFB2)
+
+    #
+    # Partition shared/tensor memory tensor for TiledMMA_A/B/C
+    #
+    # (MMA, MMA_M, MMA_K, STAGE)
+    tCrA = tiled_mma.make_fragment_A(sA)
+    # (MMA, MMA_N, MMA_K, STAGE)
+    tCrB1 = tiled_mma.make_fragment_B(sB1)
+    # (MMA, MMA_N, MMA_K, STAGE)
+    tCrB2 = tiled_mma.make_fragment_B(sB2)
+    # (MMA, MMA_M, MMA_N)
+    acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2])
+    # (MMA, MMA_M, MMA_N)
+    tCtAcc_fake = tiled_mma.make_fragment_C(acc_shape)
+
+    #
+    # Alloc tensor memory buffer
+    # Make ACC1 and ACC2 tmem tensor
+    # ACC1 += A @ B1
+    # ACC2 += A @ B2
+    #
+    tmem_alloc_barrier = pipeline.NamedBarrier(
+        barrier_id=1,
+        num_threads=threads_per_cta,
+    )
+    tmem = utils.TmemAllocator(
+        storage.tmem_holding_buf,
+        barrier_for_retrieve=tmem_alloc_barrier,
+    )
+    tmem.allocate(num_tmem_alloc_cols)
+    tmem.wait_for_alloc()
+    acc_tmem_ptr = tmem.retrieve_ptr(cutlass.Float32)
+    tCtAcc1 = cute.make_tensor(acc_tmem_ptr, tCtAcc_fake.layout)
+    acc_tmem_ptr1 = cute.recast_ptr(
+        acc_tmem_ptr + tcgen05.find_tmem_tensor_col_offset(tCtAcc1),
+        dtype=cutlass.Float32,
+    )
+    tCtAcc2 = cute.make_tensor(acc_tmem_ptr1, tCtAcc_fake.layout)
+
+    #
+    # Make SFA/SFB1/SFB2 tmem tensor
+    #
+    # SFA tmem layout: (MMA, MMA_M, MMA_K)
+    tCtSFA_layout = blockscaled_utils.make_tmem_layout_sfa(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        cute.slice_(sfa_smem_layout_staged, (None, None, None, 0)),
+    )
+    # Get SFA tmem ptr
+    sfa_tmem_ptr = cute.recast_ptr(
+        acc_tmem_ptr
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2),
+        dtype=sf_dtype,
+    )
+    tCtSFA = cute.make_tensor(sfa_tmem_ptr, tCtSFA_layout)
+
+    # SFB1, SFB2 tmem layout: (MMA, MMA_N, MMA_K)
+    tCtSFB_layout = blockscaled_utils.make_tmem_layout_sfb(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        cute.slice_(sfb_smem_layout_staged, (None, None, None, 0)),
+    )
+    # Get SFB1 tmem ptr
+    sfb_tmem_ptr1 = cute.recast_ptr(
+        acc_tmem_ptr
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2)
+        + tcgen05.find_tmem_tensor_col_offset(tCtSFA),
+        dtype=sf_dtype,
+    )
+    tCtSFB1 = cute.make_tensor(sfb_tmem_ptr1, tCtSFB_layout)
+    # Get SFB2 tmem ptr
+    sfb_tmem_ptr2 = cute.recast_ptr(
+        acc_tmem_ptr
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2)
+        + tcgen05.find_tmem_tensor_col_offset(tCtSFA)
+        + tcgen05.find_tmem_tensor_col_offset(tCtSFB1),
+        dtype=sf_dtype,
+    )
+    tCtSFB2 = cute.make_tensor(sfb_tmem_ptr2, tCtSFB_layout)
+
+    #
+    # Partition for S2T copy of SFA/SFB1/SFB2
+    #
+    # Make S2T CopyAtom
+    copy_atom_s2t = cute.make_copy_atom(
+        tcgen05.Cp4x32x128bOp(tcgen05.CtaGroup.ONE),
+        sf_dtype,
+    )
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact = cute.filter_zeros(sSFA)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFA_compact = cute.filter_zeros(tCtSFA)
+    tiled_copy_s2t_sfa = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFA_compact)
+    thr_copy_s2t_sfa = tiled_copy_s2t_sfa.get_slice(0)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact_s2t_ = thr_copy_s2t_sfa.partition_S(tCsSFA_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfa, tCsSFA_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFA_compact_s2t = thr_copy_s2t_sfa.partition_D(tCtSFA_compact)
+
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFB1_compact = cute.filter_zeros(sSFB1)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFB1_compact = cute.filter_zeros(tCtSFB1)
+    tiled_copy_s2t_sfb = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFB1_compact)
+    thr_copy_s2t_sfb = tiled_copy_s2t_sfb.get_slice(0)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB1_compact_s2t_ = thr_copy_s2t_sfb.partition_S(tCsSFB1_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB1_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfb, tCsSFB1_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFB1_compact_s2t = thr_copy_s2t_sfb.partition_D(tCtSFB1_compact)
+
+    # SFB2 S2T copy and partition
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFB2_compact = cute.filter_zeros(sSFB2)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFB2_compact = cute.filter_zeros(tCtSFB2)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB2_compact_s2t_ = thr_copy_s2t_sfb.partition_S(tCsSFB2_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB2_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfb, tCsSFB2_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFB2_compact_s2t = thr_copy_s2t_sfb.partition_D(tCtSFB2_compact)
+
+    #
+    # Slice to per mma tile index
+    #
+    # ((atom_v, rest_v), RestK)
+    tAgA = tAgA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgB1 = tBgB1[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgB2 = tBgB2[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tAgSFA = tAgSFA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgSFB1 = tBgSFB1[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgSFB2 = tBgSFB2[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+
+    #
+    # Execute Data copy and Math computation in the k_tile loop
+    #
+    if warp_idx == 0:  # warp 0 drives both the TMA loads and the MMA issue
+        # Wait for accumulator buffer empty
+        acc_empty = acc_producer.acquire_and_advance()
+        # Set ACCUMULATE field to False for the first k_tile iteration
+        tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
+        # Execute k_tile loop
+        for k_tile in range(k_tile_cnt):
+            # Wait for AB buffer empty
+            ab_empty = ab_producer.acquire_and_advance()
+
+            # TMA load A/B1/B2/SFA/SFB1/SFB2 to shared memory
+            cute.copy(
+                tma_atom_a,
+                tAgA[(None, ab_empty.count)],
+                tAsA[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_b1,
+                tBgB1[(None, ab_empty.count)],
+                tBsB1[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_b2,
+                tBgB2[(None, ab_empty.count)],
+                tBsB2[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_sfa,
+                tAgSFA[(None, ab_empty.count)],
+                tAsSFA[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_sfb1,
+                tBgSFB1[(None, ab_empty.count)],
+                tBsSFB1[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+            cute.copy(
+                tma_atom_sfb2,
+                tBgSFB2[(None, ab_empty.count)],
+                tBsSFB2[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+            )
+
+            # Wait for AB buffer full
+            ab_full = ab_consumer.wait_and_advance()
+
+            # Copy SFA/SFB1/SFB2 to tmem
+            s2t_stage_coord = (None, None, None, None, ab_full.index)
+            tCsSFA_compact_s2t_staged = tCsSFA_compact_s2t[s2t_stage_coord]
+            tCsSFB1_compact_s2t_staged = tCsSFB1_compact_s2t[s2t_stage_coord]
+            tCsSFB2_compact_s2t_staged = tCsSFB2_compact_s2t[s2t_stage_coord]
+            cute.copy(
+                tiled_copy_s2t_sfa,
+                tCsSFA_compact_s2t_staged,
+                tCtSFA_compact_s2t,
+            )
+            cute.copy(
+                tiled_copy_s2t_sfb,
+                tCsSFB1_compact_s2t_staged,
+                tCtSFB1_compact_s2t,
+            )
+            cute.copy(
+                tiled_copy_s2t_sfb,
+                tCsSFB2_compact_s2t_staged,
+                tCtSFB2_compact_s2t,
+            )
+
+            # tCtAcc1 += tCrA * tCrSFA * tCrB1 * tCrSFB1
+            # tCtAcc2 += tCrA * tCrSFA * tCrB2 * tCrSFB2
+            num_kblocks = cute.size(tCrA, mode=[2])
+            for kblock_idx in cutlass.range(num_kblocks, unroll_full=True):
+                kblock_coord = (
+                    None,
+                    None,
+                    kblock_idx,
+                    ab_full.index,
+                )
+
+                # Set SFA/SFB tensor to tiled_mma
+                sf_kblock_coord = (None, None, kblock_idx)
+                tiled_mma.set(
+                    tcgen05.Field.SFA,
+                    tCtSFA[sf_kblock_coord].iterator,
+                )
+                tiled_mma.set(
+                    tcgen05.Field.SFB,
+                    tCtSFB1[sf_kblock_coord].iterator,
+                )
+                cute.gemm(
+                    tiled_mma,
+                    tCtAcc1,
+                    tCrA[kblock_coord],
+                    tCrB1[kblock_coord],
+                    tCtAcc1,
+                )
+
+                tiled_mma.set(
+                    tcgen05.Field.SFB,
+                    tCtSFB2[sf_kblock_coord].iterator,
+                )
+                cute.gemm(
+                    tiled_mma,
+                    tCtAcc2,
+                    tCrA[kblock_coord],
+                    tCrB2[kblock_coord],
+                    tCtAcc2,
+                )
+
+                # Enable accumulate on tCtAcc1/tCtAcc2 after first kblock
+                tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+
+            # Async arrive AB buffer empty
+            ab_full.release()
+        acc_empty.commit()
+
+    #
+    # Epilogue
+    # Partition for epilogue
+    #
+    op = tcgen05.Ld32x32bOp(tcgen05.Repetition.x128, tcgen05.Pack.NONE)
+    copy_atom_t2r = cute.make_copy_atom(op, cutlass.Float32)
+    tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tCtAcc1)
+    thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_tAcc1 = thr_copy_t2r.partition_S(tCtAcc1)
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_tAcc2 = thr_copy_t2r.partition_S(tCtAcc2)
+    # (T2R_M, T2R_N, EPI_M, EPI_N, RestM, RestN, RestL)
+    tTR_gC = thr_copy_t2r.partition_D(tCgC)
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_rAcc1 = cute.make_rmem_tensor(
+        tTR_gC[None, None, None, None, 0, 0, 0].shape, cutlass.Float32
+    )
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_rAcc2 = cute.make_rmem_tensor(
+        tTR_gC[None, None, None, None, 0, 0, 0].shape, cutlass.Float32
+    )
+    # (T2R_M, T2R_N, EPI_M, EPI_N)
+    tTR_rC = cute.make_rmem_tensor(
+        tTR_gC[None, None, None, None, 0, 0, 0].shape, c_dtype
+    )
+    # STG Atom
+    simt_atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), c_dtype)
+    tTR_gC = tTR_gC[(None, None, None, None, *mma_tile_coord_mnl)]
+
+    # Wait for accumulator buffer full
+    acc_full = acc_consumer.wait_and_advance()
+
+    # Copy accumulator to register
+    cute.copy(tiled_copy_t2r, tTR_tAcc1, tTR_rAcc1)
+    cute.copy(tiled_copy_t2r, tTR_tAcc2, tTR_rAcc2)
+
+    # Silu activation on acc1 and multiply with acc2
+    acc_vec1 = epilogue_op(tTR_rAcc1.load())
+    acc_vec2 = tTR_rAcc2.load()
+    acc_vec = acc_vec1 * acc_vec2
+
+    tTR_rC.store(acc_vec.to(c_dtype))
+    # Store C to global memory
+    cute.copy(simt_atom, tTR_rC, tTR_gC)
+
+    acc_full.release()
+    # Deallocate TMEM
+    cute.arch.barrier()
+    tmem.free(acc_tmem_ptr)
+    return
+
+
+@cute.jit
+def my_kernel(
+    a_ptr: cute.Pointer,
+    b1_ptr: cute.Pointer,
+    b2_ptr: cute.Pointer,
+    sfa_ptr: cute.Pointer,
+    sfb1_ptr: cute.Pointer,
+    sfb2_ptr: cute.Pointer,
+    c_ptr: cute.Pointer,
+    problem_size: tuple,
+    epilogue_op: cutlass.Constexpr = lambda x: x
+    * (1.0 / (1.0 + cute.math.exp(-x, fastmath=True))),
+):
+    """
+    Host-side JIT function to prepare tensors and launch GPU kernel.
+    """
+    m, n, k, l = problem_size
+
+    # Setup attributes that depend on gemm inputs
+    a_tensor = cute.make_tensor(
+        a_ptr,
+        cute.make_layout(
+            (m, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(m * k, 32)),
+        ),
+    )
+    b_tensor1 = cute.make_tensor(
+        b1_ptr,
+        cute.make_layout(
+            (n, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(n * k, 32)),
+        ),
+    )
+    b_tensor2 = cute.make_tensor(
+        b2_ptr,
+        cute.make_layout(
+            (n, cute.assume(k, 32), l),
+            stride=(cute.assume(k, 32), 1, cute.assume(n * k, 32)),
+        ),
+    )
+    c_tensor = cute.make_tensor(
+        c_ptr, cute.make_layout((cute.assume(m, 32), n, l), stride=(n, 1, m * n))
+    )
+    # Setup sfa/sfb tensor by filling A/B tensor to scale factor atom layout
+    # ((Atom_M, Rest_M),(Atom_K, Rest_K),RestL)
+    sfa_layout = blockscaled_utils.tile_atom_to_shape_SF(
+        a_tensor.shape, sf_vec_size
+    )
+    sfa_tensor = cute.make_tensor(sfa_ptr, sfa_layout)
+
+    # ((Atom_N, Rest_N),(Atom_K, Rest_K),RestL)
+    sfb_layout = blockscaled_utils.tile_atom_to_shape_SF(
+        b_tensor1.shape, sf_vec_size
+    )
+    sfb_tensor1 = cute.make_tensor(sfb1_ptr, sfb_layout)
+    sfb_tensor2 = cute.make_tensor(sfb2_ptr, sfb_layout)
+
+    mma_op = tcgen05.MmaMXF4NVF4Op(
+        sf_dtype,
+        (mma_tiler_mnk[0], mma_tiler_mnk[1], mma_inst_shape_k),
+        tcgen05.CtaGroup.ONE,
+        tcgen05.OperandSource.SMEM,
+    )
+    tiled_mma = cute.make_tiled_mma(mma_op)
+
+    cluster_layout_vmnk  = cute.tiled_divide(
+        cute.make_layout((1, 1, 1)),
+        (tiled_mma.thr_id.shape,),
+    )
+
+    # Compute A/B/SFA/SFB/C shared memory layout
+    a_smem_layout_staged = sm100_utils.make_smem_layout_a(
+        tiled_mma,
+        mma_tiler_mnk,
+        ab_dtype,
+        num_ab_stage,
+    )
+    # B1 and B2 have the same size thus share the same smem layout
+    b_smem_layout_staged = sm100_utils.make_smem_layout_b(
+        tiled_mma,
+        mma_tiler_mnk,
+        ab_dtype,
+        num_ab_stage,
+    )
+    sfa_smem_layout_staged = blockscaled_utils.make_smem_layout_sfa(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        num_ab_stage,
+    )
+    # SFB1 and SFB2 have the same size thus share the same smem layout
+    sfb_smem_layout_staged = blockscaled_utils.make_smem_layout_sfb(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        num_ab_stage,
+    )
+    atom_thr_size = cute.size(tiled_mma.thr_id.shape)
+
+    # Setup TMA for A
+    a_smem_layout = cute.slice_(a_smem_layout_staged, (None, None, None, 0))
+    tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        a_tensor,
+        a_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+    )
+    # Setup TMA for B1
+    b_smem_layout = cute.slice_(b_smem_layout_staged, (None, None, None, 0))
+    tma_atom_b1, tma_tensor_b1 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        b_tensor1,
+        b_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+    )
+    # Setup TMA for B2
+    tma_atom_b2, tma_tensor_b2 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        b_tensor2,
+        b_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+    )
+    # Setup TMA for SFA
+    sfa_smem_layout = cute.slice_(
+        sfa_smem_layout_staged , (None, None, None, 0)
+    )
+    tma_atom_sfa, tma_tensor_sfa = cute.nvgpu.make_tiled_tma_atom_A(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        sfa_tensor,
+        sfa_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+        internal_type=cutlass.Int16,
+    )
+    # Setup TMA for SFB1
+    sfb_smem_layout = cute.slice_(
+        sfb_smem_layout_staged , (None, None, None, 0)
+    )
+    tma_atom_sfb1, tma_tensor_sfb1 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        sfb_tensor1,
+        sfb_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+        internal_type=cutlass.Int16,
+    )
+    # Setup TMA for SFB2
+    tma_atom_sfb2, tma_tensor_sfb2 = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        sfb_tensor2,
+        sfb_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk .shape,
+        internal_type=cutlass.Int16,
+    )
+
+    # Compute TMA load bytes
+    a_copy_size = cute.size_in_bytes(ab_dtype, a_smem_layout)
+    b_copy_size = cute.size_in_bytes(ab_dtype, b_smem_layout)
+    sfa_copy_size = cute.size_in_bytes(sf_dtype, sfa_smem_layout)
+    sfb_copy_size = cute.size_in_bytes(sf_dtype, sfb_smem_layout)
+    num_tma_load_bytes = (
+        a_copy_size + b_copy_size * 2 + sfa_copy_size + sfb_copy_size * 2
+    ) * atom_thr_size
+
+    # Compute grid size
+    grid = (
+        cute.ceil_div(c_tensor.shape[0], mma_tiler_mnk[0]),
+        cute.ceil_div(c_tensor.shape[1], mma_tiler_mnk[1]),
+        c_tensor.shape[2],
+    )
+
+    # Launch the kernel.
+    kernel(
+        # MMA (Matrix Multiply-Accumulate) configuration
+        tiled_mma,                  # Tiled MMA object defining NVFP4 GEMM compute pattern
+        
+        # TMA (Tensor Memory Accelerator) atoms and tensors for shared input matrix A
+        tma_atom_a,                 # TMA copy atom defining how to load A from global memory
+        tma_tensor_a,               # Tensor descriptor for A matrix (m, k, l) - shared by both GEMMs
+        
+        # TMA atoms and tensors for first B matrix (B1)
+        tma_atom_b1,                # TMA copy atom defining how to load B1 from global memory
+        tma_tensor_b1,              # Tensor descriptor for B1 matrix (n, k, l) - first GEMM
+        
+        # TMA atoms and tensors for second B matrix (B2)
+        tma_atom_b2,                # TMA copy atom defining how to load B2 from global memory
+        tma_tensor_b2,              # Tensor descriptor for B2 matrix (n, k, l) - second GEMM
+        
+        # TMA atoms and tensors for scale factor A (shared)
+        tma_atom_sfa,               # TMA copy atom for loading scale factors for A
+        tma_tensor_sfa,             # Tensor descriptor for SFA (block scale factors for A) - shared
+        
+        # TMA atoms and tensors for scale factor B1
+        tma_atom_sfb1,              # TMA copy atom for loading scale factors for B1
+        tma_tensor_sfb1,            # Tensor descriptor for SFB1 (block scale factors for B1)
+        
+        # TMA atoms and tensors for scale factor B2
+        tma_atom_sfb2,              # TMA copy atom for loading scale factors for B2
+        tma_tensor_sfb2,            # Tensor descriptor for SFB2 (block scale factors for B2)
+        
+        # Output tensor C (stores both C1 and C2 results)
+        c_tensor,                   # Output tensor where both GEMM results will be stored (m, n, l)
+        
+        # Shared memory layouts with staging for pipelined execution
+        a_smem_layout_staged,       # Staged shared memory layout for A (includes stage dimension)
+        b_smem_layout_staged,       # Staged shared memory layout for B1/B2 (includes stage dimension)
+        sfa_smem_layout_staged,     # Staged shared memory layout for SFA (includes stage dimension)
+        sfb_smem_layout_staged,     # Staged shared memory layout for SFB1/SFB2 (includes stage dimension)
+        
+        # Pipeline synchronization parameter
+        num_tma_load_bytes,         # Total bytes to load per TMA transaction (for barrier setup)
+        
+        # Epilogue operation
+        epilogue_op,                # Epilogue operation to apply to output (e.g., element-wise ops)
+    ).launch(
+        grid=grid,
+        block=[threads_per_cta, 1, 1],
+        cluster=(1, 1, 1),
+    )
+    return
+
+
+# Global cache for compiled kernel
+_compiled_kernel_cache = None
+# This function is used to compile the kernel once and cache it and then allow users to 
+# run the kernel multiple times to get more accurate timing results.
+def compile_kernel():
+    """
+    Compile the kernel once and cache it.
+    This should be called before any timing measurements.
+    
+    Returns:
+        The compiled kernel function
+    """
+    global _compiled_kernel_cache
+    
+    if _compiled_kernel_cache is not None:
+        return _compiled_kernel_cache
+    
+
+    # Create placeholder CuTe pointers (address 0) for A/B/C/SFA/SFB; they are only used to compile the kernel
+    a_ptr = make_ptr(
+        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    b1_ptr = make_ptr(
+        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    b2_ptr = make_ptr(
+        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    c_ptr = make_ptr(
+        c_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
+    )
+    sfa_ptr = make_ptr(
+        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb1_ptr = make_ptr(
+        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb2_ptr = make_ptr(
+        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
+    )
+
+    # Compile the kernel
+    _compiled_kernel_cache = cute.compile(my_kernel, a_ptr, b1_ptr, b2_ptr, sfa_ptr, sfb1_ptr, sfb2_ptr, c_ptr, (0, 0, 0, 0))
+    
+    return _compiled_kernel_cache
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Execute the block-scaled dual GEMM kernel with silu activation,
+    C = silu(A @ B1) * (A @ B2).
+    
+    This is the main entry point called by the evaluation framework.
+    It converts PyTorch tensors to CuTe tensors, launches the kernel,
+    and returns the result.
+    
+    Args:
+        data: Tuple of (a, b1, b2, sfa_cpu, sfb1_cpu, sfb2_cpu, sfa_permuted, sfb1_permuted, sfb2_permuted, c) PyTorch tensors
+            a: [m, k, l] - Input matrix in float4e2m1fn 
+            b1: [n, k, l] - Input matrix in float4e2m1fn 
+            b2: [n, k, l] - Input matrix in float4e2m1fn 
+            sfa_cpu: [m, k // 16, l] - Scale factors in float8_e4m3fn, used by reference implementation
+            sfb1_cpu: [n, k // 16, l] - Scale factors in float8_e4m3fn, used by reference implementation
+            sfb2_cpu: [n, k // 16, l] - Scale factors in float8_e4m3fn, used by reference implementation
+            sfa_permuted: [32, 4, rest_m, 4, rest_k, l] - Scale factors in float8_e4m3fn
+            sfb1_permuted: [32, 4, rest_n, 4, rest_k, l] - Scale factors in float8_e4m3fn
+            sfb2_permuted: [32, 4, rest_n, 4, rest_k, l] - Scale factors in float8_e4m3fn
+            c: [m, n, l] - Output matrix in float16
+    
+    Returns:
+        Output tensor c with computed results
+    """
+    a, b1, b2, _, _, _, sfa_permuted, sfb1_permuted, sfb2_permuted, c = data
+    
+    # Ensure kernel is compiled (will use cached version if available)
+    # To avoid the compilation overhead, we compile the kernel once and cache it.
+    compiled_func = compile_kernel()
+
+    # Get dimensions from MxKxL layout
+    _, k, _ = a.shape
+    m, n, l = c.shape
+    # Torch stores FP4 as e2m1_x2 (two values packed per element), so a.shape reports half of the logical K
+    k = k * 2 
+
+    # Create CuTe pointers for A/B/C/SFA/SFB via torch tensor data pointer
+    a_ptr = make_ptr(
+        ab_dtype, a.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    b1_ptr = make_ptr(
+        ab_dtype, b1.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    b2_ptr = make_ptr(
+        ab_dtype, b2.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    c_ptr = make_ptr(
+        c_dtype, c.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
+    )
+    sfa_ptr = make_ptr(
+        sf_dtype, sfa_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb1_ptr = make_ptr(
+        sf_dtype, sfb1_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+    sfb2_ptr = make_ptr(
+        sf_dtype, sfb2_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
+    )
+
+    # Execute the compiled kernel
+    compiled_func(a_ptr, b1_ptr, b2_ptr, sfa_ptr, sfb1_ptr, sfb2_ptr, c_ptr, (m, n, k, l))
+
+    return c
diff --git a/problems/nvidia/modal_nvfp4_dual_gemm/task.py b/problems/nvidia/modal_nvfp4_dual_gemm/task.py
new file mode 100644
index 00000000..8facfb07
--- /dev/null
+++ b/problems/nvidia/modal_nvfp4_dual_gemm/task.py
@@ -0,0 +1,11 @@
+import torch
+from typing import TypedDict, TypeVar
+
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
+output_t = TypeVar("output_t", bound=torch.Tensor)
+class TestSpec(TypedDict):
+    m: int
+    n: int
+    k: int
+    l: int
+    seed: int
\ No newline at end of file
diff --git a/problems/nvidia/modal_nvfp4_dual_gemm/task.yml b/problems/nvidia/modal_nvfp4_dual_gemm/task.yml
new file mode 100644
index 00000000..4d36bde6
--- /dev/null
+++ b/problems/nvidia/modal_nvfp4_dual_gemm/task.yml
@@ -0,0 +1,64 @@
+# name: nvfp4-dual-gemm
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "../utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval_better_bench.py"}
+
+lang: "py"
+
+description: |
+  
+  You will implement a block scaled dual matrix-matrix multiplication kernel with silu activation optimized for NVIDIA B200.
+  To be explicit, you will be given a tuple of tensors:
+  ```
+  (a, b1, b2, sfa, sfb1, sfb2, sfa_permuted, sfb1_permuted, sfb2_permuted, c)
+  ```
+  where:
+  * `a` is M x K x L in K-major order in nvfp4(e2m1)
+  * `b1` is N x K x L in K-major order in nvfp4(e2m1)
+  * `b2` is N x K x L in K-major order in nvfp4(e2m1)
+  * `sfa` is M x (K // 16) x L in K-major order in fp8(e4m3fn)
+  * `sfb1` is N x (K // 16) x L in K-major order in fp8(e4m3fn)
+  * `sfb2` is N x (K // 16) x L in K-major order in fp8(e4m3fn)
+  * `c` is M x N x L in fp16
+  
+  Matrix size `M` is divisible by mma_tiler_mn[0], `N` is divisible by mma_tiler_mn[1], and `K` is divisible by 256.
+  The ranking criteria is the geometric mean of the benchmark results.
+  For the grand prize, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand prize.
+  ```
+  The speed of light analysis is based on the max(FP4 Tensor Core math throughput, DRAM memory throughput) of B200 and tested under 1.5GHz clock:
+    M   N   K   L time[us] 
+  256 4096 7168 1 4.708
+  512 4096 7168 1 8.714
+  256 3072 4096 1 2.125
+  512 3072 7168 1 6.535
+  ```
+config:
+  main: "eval.py"
+
+templates:
+  Python: "template.py"
+
+tests:
+  - {"m": 1536, "n": 512, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 256, "n": 512, "k": 256, "l": 1, "seed": 1111}
+  - {"m": 1536, "n": 512, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 3072, "n": 1024, "k": 1536, "l": 1, "seed": 1111}
+  - {"m": 7168, "n": 1024, "k": 256, "l": 1, "seed": 1111}
+  - {"m": 7168, "n": 2304, "k": 2048, "l": 1, "seed": 1111}
+  - {"m": 4608, "n": 384, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 7168, "n": 384, "k": 2304, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 768, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 4096, "n": 768, "k": 512, "l": 1, "seed": 1111}
+  
+benchmarks:
+  - {"m": 256, "n": 4096, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 4096, "k": 7168, "l": 1, "seed": 1111}
+  - {"m": 256, "n": 3072, "k": 4096, "l": 1, "seed": 1111}
+  - {"m": 512, "n": 3072, "k": 7168, "l": 1, "seed": 1111}
+
+ranking_by: "geom"
diff --git a/problems/nvidia/modal_nvfp4_dual_gemm/template.py b/problems/nvidia/modal_nvfp4_dual_gemm/template.py
new file mode 100644
index 00000000..d8985df5
--- /dev/null
+++ b/problems/nvidia/modal_nvfp4_dual_gemm/template.py
@@ -0,0 +1,28 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Custom implementation of block-scaled fp4 dual GEMM with silu activation
+    Args:
+        data: Tuple that expands to:
+            a: torch.Tensor[float4e2m1fn] of shape [m, k, l],
+            b1: torch.Tensor[float4e2m1fn] of shape [n, k, l],
+            b2: torch.Tensor[float4e2m1fn] of shape [n, k, l],
+            sfa: torch.Tensor[float8_e4m3fn] of shape [m, k // 16, l], used by reference implementation
+            sfb1: torch.Tensor[float8_e4m3fn] of shape [n, k // 16, l], used by reference implementation
+            sfb2: torch.Tensor[float8_e4m3fn] of shape [n, k // 16, l], used by reference implementation
+            sfa_permuted: torch.Tensor[float8_e4m3fn] of shape [32, 4, rest_m, 4, rest_k, l],
+            sfb1_permuted: torch.Tensor[float8_e4m3fn] of shape [32, 4, rest_n, 4, rest_k, l],
+            sfb2_permuted: torch.Tensor[float8_e4m3fn] of shape [32, 4, rest_n, 4, rest_k, l],
+            c: torch.Tensor[float16] of shape [m, n, l]
+    Returns:
+        Tensor containing output in float16
+        c: torch.Tensor[float16] of shape [m, n, l]
+    """
+    # c: [m, n, l] is pre-allocated memory to avoid timing allocation overhead.
+    a, b1, b2, sfa, sfb1, sfb2, sfa_permuted, sfb1_permuted, sfb2_permuted, c = data
+
+    # Your implementation here
+
+    return c
\ No newline at end of file
diff --git a/problems/nvidia/modal_nvfp4_dual_gemm/utils.py b/problems/nvidia/modal_nvfp4_dual_gemm/utils.py
new file mode 100644
index 00000000..d9b3a69e
--- /dev/null
+++ b/problems/nvidia/modal_nvfp4_dual_gemm/utils.py
@@ -0,0 +1,172 @@
+import os
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Check whether two tensors are element-wise equal within a tolerance, returning detailed information about mismatches.
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+    Returns:
+    list[str]: Empty if the tensors are all close within the given tolerance; otherwise messages describing the mismatches.
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received - expected)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Check whether two tensors are element-wise perfectly equal, returning detailed information about mismatches.
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+    Returns:
+         Empty list if tensors are equal, otherwise a list of strings describing the mismatches
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
+
+    if len(reasons) > 0:
+        return False, "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
+
+    return True, ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
+
+
+class DeterministicContext:
+    def __init__(self):
+        self.allow_tf32 = None
+        self.deterministic = None
+        self.cublas = None
+
+    def __enter__(self):
+        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '')
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        torch.use_deterministic_algorithms(True)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+        torch.use_deterministic_algorithms(False)
+        os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
+
+def clear_l2_cache():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # write to the dummy tensor (presumably to flush prior contents out of L2)
+    dummy.fill_(42)
+    del dummy

From cc4fed55817c08563455cac2ada80d56702c67f5 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sun, 18 Jan 2026 08:52:49 -0800
Subject: [PATCH 191/207] remove final nvfp4 problem version

---
 problems/nvidia.yaml                          |   5 -
 .../nvidia/final_nvfp4_dual_gemm/reference.py | 199 ----
 .../final_nvfp4_dual_gemm/submission.py       | 957 ------------------
 problems/nvidia/final_nvfp4_dual_gemm/task.py |  11 -
 .../nvidia/final_nvfp4_dual_gemm/task.yml     |  64 --
 .../nvidia/final_nvfp4_dual_gemm/template.py  |  28 -
 .../nvidia/final_nvfp4_dual_gemm/utils.py     | 172 ----
 7 files changed, 1436 deletions(-)
 delete mode 100644 problems/nvidia/final_nvfp4_dual_gemm/reference.py
 delete mode 100644 problems/nvidia/final_nvfp4_dual_gemm/submission.py
 delete mode 100644 problems/nvidia/final_nvfp4_dual_gemm/task.py
 delete mode 100644 problems/nvidia/final_nvfp4_dual_gemm/task.yml
 delete mode 100644 problems/nvidia/final_nvfp4_dual_gemm/template.py
 delete mode 100644 problems/nvidia/final_nvfp4_dual_gemm/utils.py

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 8f868a9a..2bc74568 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -17,11 +17,6 @@ problems:
       - NVIDIA
   - directory: nvidia/nvfp4_dual_gemm
     name: nvfp4_dual_gemm
-    deadline: "2026-02-20 7:59"
-    gpus:
-      - NVIDIA
-  - directory: nvidia/final_nvfp4_dual_gemm
-    name: final_nvfp4_dual_gemm
     deadline: "2026-01-20 7:59"
     gpus:
       - NVIDIA
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/reference.py b/problems/nvidia/final_nvfp4_dual_gemm/reference.py
deleted file mode 100644
index 95c6aacd..00000000
--- a/problems/nvidia/final_nvfp4_dual_gemm/reference.py
+++ /dev/null
@@ -1,199 +0,0 @@
-import torch
-from task import input_t, output_t
-from utils import make_match_reference
-
-# Scaling factor vector size
-sf_vec_size = 16
-
-# Helper function for ceiling division
-def ceil_div(a, b):
-    return (a + b - 1) // b
-
-# Helper function to convert scale factor tensor to blocked format
-def to_blocked(input_matrix):
-    rows, cols = input_matrix.shape
-
-    # Please ensure rows and cols are multiples of 128 and 4 respectively
-    n_row_blocks = ceil_div(rows, 128)
-    n_col_blocks = ceil_div(cols, 4)
-
-    padded = input_matrix
-    blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
-    rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
-
-    return rearranged.flatten()
-
-
-def ref_kernel(
-    data: input_t,
-) -> output_t:
-    """
-    PyTorch reference implementation of NVFP4 block-scaled dual GEMM with silu activation,
-    C = silu(A @ B1) * (A @ B2).
-    """
-    a_ref, b1_ref, b2_ref, sfa_ref_cpu, sfb1_ref_cpu, sfb2_ref_cpu, _, _, _, c_ref = data
-    
-    # Get dimensions from MxNxL layout
-    m, n, l = c_ref.shape
-
-    # Call torch._scaled_mm to compute the GEMV result
-    ref1 = torch.empty(
-        (l, m, n),
-        dtype=torch.float32,
-        device="cuda",
-    ).permute(1, 2, 0)
-    ref2 = torch.empty(
-        (l, m, n),
-        dtype=torch.float32,
-        device="cuda",
-    ).permute(1, 2, 0)
-    for l_idx in range(l):
-        # Convert the scale factor tensor to blocked format
-        scale_a = to_blocked(sfa_ref_cpu[:, :, l_idx])
-        scale_b1 = to_blocked(sfb1_ref_cpu[:, :, l_idx])
-        scale_b2 = to_blocked(sfb2_ref_cpu[:, :, l_idx])
-        # (m, k) @ (n, k).T -> (m, n)
-        res1 = torch._scaled_mm(
-            a_ref[:, :, l_idx],
-            b1_ref[:, :, l_idx].transpose(0, 1),
-            scale_a.cuda(),
-            scale_b1.cuda(),
-            bias=None,
-            out_dtype=torch.float32,
-        )
-        ref1[:, :, l_idx] = res1
-
-        res2 = torch._scaled_mm(
-            a_ref[:, :, l_idx],
-            b2_ref[:, :, l_idx].transpose(0, 1),
-            scale_a.cuda(),
-            scale_b2.cuda(),
-            bias=None,
-            out_dtype=torch.float32,
-        )
-        ref2[:, :, l_idx] = res2
-    # Do silu on the first GEMM result and multiply with the second GEMM result
-    c_ref = (torch.nn.functional.silu(ref1) * ref2).to(torch.float16)
-    return c_ref
-
-
-def generate_input(
-    m: int,
-    n: int,
-    k: int,
-    l: int,
-    seed: int,
-):
-    """
-    Generate input tensors for NVFP4 block-scaled dual GEMM with silu activation,
-    C = silu(A @ B1) * (A @ B2).
-    
-    Args:
-        m: Number of rows in matrix A
-        n: Number of columns in matrix B1 and B2
-        k: Number of columns in A and rows of B1 and B2
-        l: Batch size
-        seed: Random seed for reproducibility
-    
-    Returns:
-        Tuple of (a, b, scale_a, scale_b, c) where:
-            a: [m, k, l] - Input matrix in torch.float4e2m1fn_x2 data type
-            b1: [n, k, l] - Input matrix in torch.float4e2m1fn_x2 data type
-            b2: [n, k, l] - Input matrix in torch.float4e2m1fn_x2 data type
-            scale_a: [m, k, l] - Input scale factors in torch.float8e4m3fn data type
-            scale_b1: [n, k, l] - Input scale factors in torch.float8e4m3fn data type
-            scale_b2: [n, k, l] - Input scale factors in torch.float8e4m3fn data type
-            scale_a_permuted: [32, 4, rest_m, 4, rest_k, l] - Input scale factors in torch.float8e4m3fn data type
-            scale_b1_permuted: [32, 4, rest_n, 4, rest_k, l] - Input scale factors in torch.float8e4m3fn data type
-            scale_b2_permuted: [32, 4, rest_n, 4, rest_k, l] - Input scale factors in torch.float8e4m3fn data type
-            c: [m, n, l] - Output matrix in torch.float16 data type
-    """
-    torch.manual_seed(seed)
-    
-    def create_fp4_tensors(l, mn, k):
-        # generate uint8 tensor, then convert to float4e2m1fn_x2 data type
-        # generate all bit patterns
-        ref_i8 = torch.randint(255, size=(l, mn, k // 2), dtype=torch.uint8, device="cuda")
-
-        # for each nibble, only keep the sign bit and 2 LSBs
-        # the possible values are [-1.5, -1, -0.5, 0, +0.5, +1, +1.5]
-        ref_i8 = ref_i8 & 0b1011_1011
-
-        return ref_i8.permute(1, 2, 0).view(torch.float4_e2m1fn_x2)
-
-    # Generate uint8 tensor, then convert to float4e2m1fn_x2 data type
-    a_ref = create_fp4_tensors(l, m, k)
-    b1_ref = create_fp4_tensors(l, n, k)
-    b2_ref = create_fp4_tensors(l, n, k)
-    a_ref = a_ref.view(torch.float4_e2m1fn_x2)
-    b1_ref = b1_ref.view(torch.float4_e2m1fn_x2)
-    b2_ref = b2_ref.view(torch.float4_e2m1fn_x2)
-
-    # Create float16 output tensor
-    c_ref = torch.randn((l, m, n), dtype=torch.float16, device="cuda").permute(
-        1, 2, 0
-    )
-    
-    # Helper function to prepare the scale factor tensors for both reference
-    # kernel and customize kernel. The customized data layout can be found in:
-    # https://docs.nvidia.com/cuda/cublas/index.html?highlight=fp4#d-block-scaling-factors-layout
-    def create_scale_factor_tensors(l, mn, sf_k):
-        # Create the reference scale factor tensor (mn, sf_k, l) on CPU.
-        ref_shape = (l, mn, sf_k)
-        ref_permute_order = (1, 2, 0)
-        # Init with fp32 tensor in [0,1), then convert to float8_e4m3fn
-        ref_f8_random_fp32 = torch.rand(ref_shape, dtype=torch.float32, device='cuda')
-        ref_f8_torch_tensor = ref_f8_random_fp32.to(dtype=torch.float8_e4m3fn)
-        # permute to match ref_permute_order
-        ref_f8_torch_tensor_permuted = ref_f8_torch_tensor.permute(*ref_permute_order)
-
-        atom_m = (32, 4)
-        atom_k = 4
-        mma_shape = (
-            l,  # batch size
-            ceil_div(mn, atom_m[0] * atom_m[1]),
-            ceil_div(sf_k, atom_k),
-            atom_m[0],
-            atom_m[1],
-            atom_k,
-        )
-
-        # Reorder scale factor tensor to (32, 4, rest_m, 4, rest_k, l) layout
-        # Which is needed by the CuTe customized kernel
-        mma_permute_order = (3, 4, 1, 5, 2, 0)
-        # Generate a random int8 tensor, then convert to float8_e4m3fn
-        rand_int_tensor = torch.empty(mma_shape, dtype=torch.int8, device='cuda')
-        reordered_f8_torch_tensor = rand_int_tensor.to(dtype=torch.float8_e4m3fn)
-        # Permute according to mma_permute_order
-        reordered_f8_torch_tensor = reordered_f8_torch_tensor.permute(*mma_permute_order)
-
-        # GPU-side vectorized reordering (replaces slow CPU nested loops)
-        # Create index grids for all dimensions
-        i_idx = torch.arange(mn, device='cuda')
-        j_idx = torch.arange(sf_k, device='cuda')
-        b_idx = torch.arange(l, device='cuda')
-        
-        # Create meshgrid for all combinations of (i, j, b)
-        i_grid, j_grid, b_grid = torch.meshgrid(i_idx, j_idx, b_idx, indexing='ij')
-        
-        # Calculate target indices in vectorized manner
-        mm = i_grid // (atom_m[0] * atom_m[1])
-        mm32 = i_grid % atom_m[0]
-        mm4 = (i_grid % 128) // atom_m[0]
-        kk = j_grid // atom_k
-        kk4 = j_grid % atom_k
-        
-        # Perform the reordering with advanced indexing (all on GPU)
-        reordered_f8_torch_tensor[mm32, mm4, mm, kk4, kk, b_grid] = ref_f8_torch_tensor_permuted[i_grid, j_grid, b_grid]
-        
-        return ref_f8_torch_tensor_permuted.cpu(), reordered_f8_torch_tensor
-
-    sf_k = ceil_div(k, sf_vec_size)
-    sfa_ref_cpu, sfa_ref_permuted = create_scale_factor_tensors(l, m, sf_k)
-    sfb1_ref_cpu, sfb1_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
-    sfb2_ref_cpu, sfb2_ref_permuted = create_scale_factor_tensors(l, n, sf_k)
-
-    return (a_ref, b1_ref, b2_ref, sfa_ref_cpu.to("cuda"), sfb1_ref_cpu.to("cuda"), sfb2_ref_cpu.to("cuda"), sfa_ref_permuted, sfb1_ref_permuted, sfb2_ref_permuted, c_ref)
-
-
-check_implementation = make_match_reference(ref_kernel, rtol=1e-03, atol=1e-03)
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/submission.py b/problems/nvidia/final_nvfp4_dual_gemm/submission.py
deleted file mode 100644
index 739cc5a0..00000000
--- a/problems/nvidia/final_nvfp4_dual_gemm/submission.py
+++ /dev/null
@@ -1,957 +0,0 @@
-from torch._higher_order_ops.torchbind import call_torchbind_fake
-import cuda.bindings.driver as cuda
-
-import torch
-from task import input_t, output_t
-
-import cutlass
-import cutlass.cute as cute
-import cutlass.utils as utils
-import cutlass.pipeline as pipeline
-from cutlass.cute.nvgpu import cpasync, tcgen05
-import cutlass.torch as cutlass_torch
-import cutlass.utils.blackwell_helpers as sm100_utils
-import cutlass.utils.blockscaled_layout as blockscaled_utils
-from cutlass.cute.runtime import make_ptr
-
-# Kernel configuration parameters
-# Tile sizes for M, N, K dimensions
-mma_tiler_mnk= (128, 128, 256)  
-# Shape of the K dimension for the MMA instruction
-mma_inst_shape_k = 64
-# FP4 data type for A and B
-ab_dtype = cutlass.Float4E2M1FN  
-# FP8 data type for scale factors
-sf_dtype = cutlass.Float8E4M3FN  
-# FP16 output type
-c_dtype = cutlass.Float16  
-# Scale factor block size (16 elements share one scale)
-sf_vec_size = 16  
-# Number of threads per CUDA thread block
-threads_per_cta = 128  
-# Stage numbers of shared memory and tmem
-num_acc_stage = 1
-num_ab_stage = 1
-# Total number of columns in tmem
-num_tmem_alloc_cols = 512
-
-
-# Helper function for ceiling division
-def ceil_div(a, b):
-    return (a + b - 1) // b
-
-
-#  GPU device kernel
-@cute.kernel
-def kernel(
-    tiled_mma: cute.TiledMma,
-    tma_atom_a: cute.CopyAtom,
-    mA_mkl: cute.Tensor,
-    tma_atom_b1: cute.CopyAtom,
-    mB_nkl1: cute.Tensor,
-    tma_atom_b2: cute.CopyAtom,
-    mB_nkl2: cute.Tensor,
-    tma_atom_sfa: cute.CopyAtom,
-    mSFA_mkl: cute.Tensor,
-    tma_atom_sfb1: cute.CopyAtom,
-    mSFB_nkl1: cute.Tensor,
-    tma_atom_sfb2: cute.CopyAtom,
-    mSFB_nkl2: cute.Tensor,
-    mC_mnl: cute.Tensor,
-    a_smem_layout_staged: cute.ComposedLayout,
-    b_smem_layout_staged: cute.ComposedLayout,
-    sfa_smem_layout_staged: cute.Layout,
-    sfb_smem_layout_staged: cute.Layout,
-    num_tma_load_bytes: cutlass.Constexpr[int],
-    epilogue_op: cutlass.Constexpr = lambda x: x
-    * (1.0 / (1.0 + cute.math.exp(-x, fastmath=True))),
-):
-    """
-    GPU device kernel computing the batched dual GEMM C = silu(A @ B1) * (A @ B2).
-    """
-    warp_idx = cute.arch.warp_idx()
-    warp_idx = cute.arch.make_warp_uniform(warp_idx)
-    tidx = cute.arch.thread_idx()
-
-    #
-    # Setup cta/thread coordinates
-    #
-    # Coords inside cluster
-    bidx, bidy, bidz = cute.arch.block_idx()
-    mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape)
-
-    # Coords outside cluster
-    cta_coord = (bidx, bidy, bidz)
-    mma_tile_coord_mnl = (
-        cta_coord[0] // cute.size(tiled_mma.thr_id.shape),
-        cta_coord[1],
-        cta_coord[2],
-    )
-    # Coord inside cta
-    tidx, _, _ = cute.arch.thread_idx()
-
-    #
-    # Define shared storage for kernel
-    #
-    @cute.struct
-    class SharedStorage:
-        ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_ab_stage * 2]
-        acc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_acc_stage * 2]
-        tmem_holding_buf: cutlass.Int32
-
-    smem = utils.SmemAllocator()
-    storage = smem.allocate(SharedStorage)
-    # (MMA, MMA_M, MMA_K, STAGE)
-    sA = smem.allocate_tensor(
-        element_type=ab_dtype,
-        layout=a_smem_layout_staged.outer,
-        byte_alignment=128,
-        swizzle=a_smem_layout_staged.inner,
-    )
-    # (MMA, MMA_N, MMA_K, STAGE)
-    sB1 = smem.allocate_tensor(
-        element_type=ab_dtype,
-        layout=b_smem_layout_staged.outer,
-        byte_alignment=128,
-        swizzle=b_smem_layout_staged.inner,
-    )
-    # (MMA, MMA_N, MMA_K, STAGE)
-    sB2 = smem.allocate_tensor(
-        element_type=ab_dtype,
-        layout=b_smem_layout_staged.outer,
-        byte_alignment=128,
-        swizzle=b_smem_layout_staged.inner,
-    )
-    # (MMA, MMA_M, MMA_K, STAGE)
-    sSFA = smem.allocate_tensor(
-        element_type=sf_dtype,
-        layout=sfa_smem_layout_staged,
-        byte_alignment=128,
-    )
-    # (MMA, MMA_N, MMA_K, STAGE)
-    sSFB1 = smem.allocate_tensor(
-        element_type=sf_dtype,
-        layout=sfb_smem_layout_staged,
-        byte_alignment=128,
-    )
-    # (MMA, MMA_N, MMA_K, STAGE)
-    sSFB2 = smem.allocate_tensor(
-        element_type=sf_dtype,
-        layout=sfb_smem_layout_staged,
-        byte_alignment=128,
-    )
-
-    #
-    # Initialize mainloop ab_pipeline, acc_pipeline and their states
-    #
-    ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
-    ab_pipeline_consumer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, 1)
-    ab_producer, ab_consumer = pipeline.PipelineTmaUmma.create(
-        barrier_storage=storage.ab_mbar_ptr.data_ptr(),
-        num_stages=num_ab_stage,
-        producer_group=ab_pipeline_producer_group,
-        consumer_group=ab_pipeline_consumer_group,
-        tx_count=num_tma_load_bytes,
-    ).make_participants()
-    acc_producer, acc_consumer = pipeline.PipelineUmmaAsync.create(
-        barrier_storage=storage.acc_mbar_ptr.data_ptr(),
-        num_stages=num_acc_stage,
-        producer_group=ab_pipeline_producer_group,
-        consumer_group=pipeline.CooperativeGroup(
-            pipeline.Agent.Thread,
-            threads_per_cta,
-        ),
-    ).make_participants()
-
-    #
-    # Local_tile partition global tensors
-    #
-    # (bM, bK, RestM, RestK, RestL)
-    gA_mkl = cute.local_tile(
-        mA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
-    )
-    # (bN, bK, RestN, RestK, RestL)
-    gB_nkl1 = cute.local_tile(
-        mB_nkl1, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
-    )
-    # (bN, bK, RestN, RestK, RestL)
-    gB_nkl2 = cute.local_tile(
-        mB_nkl2, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
-    )
-    gSFA_mkl = cute.local_tile(
-        mSFA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
-    )
-    gSFB_nkl1 = cute.local_tile(
-        mSFB_nkl1, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
-    )
-    # (bN, bK, RestN, RestK, RestL)
-    gSFB_nkl2 = cute.local_tile(
-        mSFB_nkl2, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
-    )
-    # (bM, bN, RestM, RestN, RestL)
-    gC_mnl = cute.local_tile(
-        mC_mnl, cute.slice_(mma_tiler_mnk, (None, None, 0)), (None, None, None)
-    )
-    k_tile_cnt = cute.size(gA_mkl, mode=[3])
-
-    #
-    # Partition global tensor for TiledMMA_A/B/SFA/SFB/C
-    #
-    # (MMA, MMA_M, MMA_K, RestK)
-    thr_mma = tiled_mma.get_slice(mma_tile_coord_v)
-    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
-    tCgA = thr_mma.partition_A(gA_mkl)
-    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
-    tCgB1 = thr_mma.partition_B(gB_nkl1)
-    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
-    tCgB2 = thr_mma.partition_B(gB_nkl2)
-    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
-    tCgSFA = thr_mma.partition_A(gSFA_mkl)
-    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
-    tCgSFB1 = thr_mma.partition_B(gSFB_nkl1)
-    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
-    tCgSFB2 = thr_mma.partition_B(gSFB_nkl2)
-    # (MMA, MMA_M, MMA_N, RestM, RestN, RestL)
-    tCgC = thr_mma.partition_C(gC_mnl)
-
-    #
-    # Partition global/shared tensor for TMA load A/B/SFA/SFB
-    #
-    # TMA Partition_S/D for A
-    # ((atom_v, rest_v), STAGE)
-    # ((atom_v, rest_v), RestM, RestK, RestL)
-    tAsA, tAgA = cpasync.tma_partition(
-        tma_atom_a,
-        0,
-        cute.make_layout(1),
-        cute.group_modes(sA, 0, 3),
-        cute.group_modes(tCgA, 0, 3),
-    )
-    # TMA Partition_S/D for B1
-    # ((atom_v, rest_v), STAGE)
-    # ((atom_v, rest_v), RestN, RestK, RestL)
-    tBsB1, tBgB1 = cpasync.tma_partition(
-        tma_atom_b1,
-        0,
-        cute.make_layout(1),
-        cute.group_modes(sB1, 0, 3),
-        cute.group_modes(tCgB1, 0, 3),
-    )
-    # TMA Partition_S/D for B2
-    # ((atom_v, rest_v), STAGE)
-    # ((atom_v, rest_v), RestN, RestK, RestL)
-    tBsB2, tBgB2 = cpasync.tma_partition(
-        tma_atom_b2,
-        0,
-        cute.make_layout(1),
-        cute.group_modes(sB2, 0, 3),
-        cute.group_modes(tCgB2, 0, 3),
-    )
-    #  TMA Partition_S/D for SFA
-    # ((atom_v, rest_v), STAGE)
-    # ((atom_v, rest_v), RestM, RestK, RestL)
-    tAsSFA, tAgSFA = cpasync.tma_partition(
-        tma_atom_sfa,
-        0,
-        cute.make_layout(1),
-        cute.group_modes(sSFA, 0, 3),
-        cute.group_modes(tCgSFA, 0, 3),
-    )
-    tAsSFA = cute.filter_zeros(tAsSFA)
-    tAgSFA = cute.filter_zeros(tAgSFA)
-    # TMA Partition_S/D for SFB1
-    # ((atom_v, rest_v), STAGE)
-    # ((atom_v, rest_v), RestN, RestK, RestL)
-    tBsSFB1, tBgSFB1 = cpasync.tma_partition(
-        tma_atom_sfb1,
-        0,
-        cute.make_layout(1),
-        cute.group_modes(sSFB1, 0, 3),
-        cute.group_modes(tCgSFB1, 0, 3),
-    )
-    tBsSFB1 = cute.filter_zeros(tBsSFB1)
-    tBgSFB1 = cute.filter_zeros(tBgSFB1)
-    # TMA Partition_S/D for SFB2
-    # ((atom_v, rest_v), STAGE)
-    # ((atom_v, rest_v), RestN, RestK, RestL)
-    tBsSFB2, tBgSFB2 = cpasync.tma_partition(
-        tma_atom_sfb2,
-        0,
-        cute.make_layout(1),
-        cute.group_modes(sSFB2, 0, 3),
-        cute.group_modes(tCgSFB2, 0, 3),
-    )
-    tBsSFB2 = cute.filter_zeros(tBsSFB2)
-    tBgSFB2 = cute.filter_zeros(tBgSFB2)
-
-    #
-    # Partition shared/tensor memory tensor for TiledMMA_A/B/C
-    #
-    # (MMA, MMA_M, MMA_K, STAGE)
-    tCrA = tiled_mma.make_fragment_A(sA)
-    # (MMA, MMA_N, MMA_K, STAGE)
-    tCrB1 = tiled_mma.make_fragment_B(sB1)
-    # (MMA, MMA_N, MMA_K, STAGE)
-    tCrB2 = tiled_mma.make_fragment_B(sB2)
-    # (MMA, MMA_M, MMA_N)
-    acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2])
-    # (MMA, MMA_M, MMA_N)
-    tCtAcc_fake = tiled_mma.make_fragment_C(acc_shape)
-
-    #
-    # Alloc tensor memory buffer
-    # Make ACC1 and ACC2 tmem tensor
-    # ACC1 += A @ B1
-    # ACC2 += A @ B2
-    #
-    tmem_alloc_barrier = pipeline.NamedBarrier(
-        barrier_id=1,
-        num_threads=threads_per_cta,
-    )
-    tmem = utils.TmemAllocator(
-        storage.tmem_holding_buf,
-        barrier_for_retrieve=tmem_alloc_barrier,
-    )
-    tmem.allocate(num_tmem_alloc_cols)
-    tmem.wait_for_alloc()
-    acc_tmem_ptr = tmem.retrieve_ptr(cutlass.Float32)
-    tCtAcc1 = cute.make_tensor(acc_tmem_ptr, tCtAcc_fake.layout)
-    acc_tmem_ptr1 = cute.recast_ptr(
-        acc_tmem_ptr + tcgen05.find_tmem_tensor_col_offset(tCtAcc1),
-        dtype=cutlass.Float32,
-    )
-    tCtAcc2 = cute.make_tensor(acc_tmem_ptr1, tCtAcc_fake.layout)
-
-    #
-    # Make SFA/SFB1/SFB2 tmem tensor
-    #
-    # SFA tmem layout: (MMA, MMA_M, MMA_K)
-    tCtSFA_layout = blockscaled_utils.make_tmem_layout_sfa(
-        tiled_mma,
-        mma_tiler_mnk,
-        sf_vec_size,
-        cute.slice_(sfa_smem_layout_staged, (None, None, None, 0)),
-    )
-    # Get SFA tmem ptr
-    sfa_tmem_ptr = cute.recast_ptr(
-        acc_tmem_ptr
-        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
-        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2),
-        dtype=sf_dtype,
-    )
-    tCtSFA = cute.make_tensor(sfa_tmem_ptr, tCtSFA_layout)
-
-    # SFB1, SFB2 tmem layout: (MMA, MMA_N, MMA_K)
-    tCtSFB_layout = blockscaled_utils.make_tmem_layout_sfb(
-        tiled_mma,
-        mma_tiler_mnk,
-        sf_vec_size,
-        cute.slice_(sfb_smem_layout_staged, (None, None, None, 0)),
-    )
-    # Get SFB1 tmem ptr
-    sfb_tmem_ptr1 = cute.recast_ptr(
-        acc_tmem_ptr
-        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
-        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2)
-        + tcgen05.find_tmem_tensor_col_offset(tCtSFA),
-        dtype=sf_dtype,
-    )
-    tCtSFB1 = cute.make_tensor(sfb_tmem_ptr1, tCtSFB_layout)
-    # Get SFB2 tmem ptr
-    sfb_tmem_ptr2 = cute.recast_ptr(
-        acc_tmem_ptr
-        + tcgen05.find_tmem_tensor_col_offset(tCtAcc1)
-        + tcgen05.find_tmem_tensor_col_offset(tCtAcc2)
-        + tcgen05.find_tmem_tensor_col_offset(tCtSFA)
-        + tcgen05.find_tmem_tensor_col_offset(tCtSFB1),
-        dtype=sf_dtype,
-    )
-    tCtSFB2 = cute.make_tensor(sfb_tmem_ptr2, tCtSFB_layout)
-
-    #
-    # Partition for S2T copy of SFA/SFB1/SFB2
-    #
-    # Make S2T CopyAtom
-    copy_atom_s2t = cute.make_copy_atom(
-        tcgen05.Cp4x32x128bOp(tcgen05.CtaGroup.ONE),
-        sf_dtype,
-    )
-    # (MMA, MMA_MN, MMA_K, STAGE)
-    tCsSFA_compact = cute.filter_zeros(sSFA)
-    # (MMA, MMA_MN, MMA_K)
-    tCtSFA_compact = cute.filter_zeros(tCtSFA)
-    tiled_copy_s2t_sfa = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFA_compact)
-    thr_copy_s2t_sfa = tiled_copy_s2t_sfa.get_slice(0)
-    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
-    tCsSFA_compact_s2t_ = thr_copy_s2t_sfa.partition_S(tCsSFA_compact)
-    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
-    tCsSFA_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
-        tiled_copy_s2t_sfa, tCsSFA_compact_s2t_
-    )
-    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
-    tCtSFA_compact_s2t = thr_copy_s2t_sfa.partition_D(tCtSFA_compact)
-
-    # (MMA, MMA_MN, MMA_K, STAGE)
-    tCsSFB1_compact = cute.filter_zeros(sSFB1)
-    # (MMA, MMA_MN, MMA_K)
-    tCtSFB1_compact = cute.filter_zeros(tCtSFB1)
-    tiled_copy_s2t_sfb = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFB1_compact)
-    thr_copy_s2t_sfb = tiled_copy_s2t_sfb.get_slice(0)
-    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
-    tCsSFB1_compact_s2t_ = thr_copy_s2t_sfb.partition_S(tCsSFB1_compact)
-    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
-    tCsSFB1_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
-        tiled_copy_s2t_sfb, tCsSFB1_compact_s2t_
-    )
-    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
-    tCtSFB1_compact_s2t = thr_copy_s2t_sfb.partition_D(tCtSFB1_compact)
-
-    # SFB2 S2T copy and partition
-    # (MMA, MMA_MN, MMA_K, STAGE)
-    tCsSFB2_compact = cute.filter_zeros(sSFB2)
-    # (MMA, MMA_MN, MMA_K)
-    tCtSFB2_compact = cute.filter_zeros(tCtSFB2)
-    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
-    tCsSFB2_compact_s2t_ = thr_copy_s2t_sfb.partition_S(tCsSFB2_compact)
-    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
-    tCsSFB2_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
-        tiled_copy_s2t_sfb, tCsSFB2_compact_s2t_
-    )
-    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
-    tCtSFB2_compact_s2t = thr_copy_s2t_sfb.partition_D(tCtSFB2_compact)
-
-    #
-    # Slice to per mma tile index
-    #
-    # ((atom_v, rest_v), RestK)
-    tAgA = tAgA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
-    # ((atom_v, rest_v), RestK)
-    tBgB1 = tBgB1[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
-    # ((atom_v, rest_v), RestK)
-    tBgB2 = tBgB2[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
-    # ((atom_v, rest_v), RestK)
-    tAgSFA = tAgSFA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
-    # ((atom_v, rest_v), RestK)
-    tBgSFB1 = tBgSFB1[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
-    # ((atom_v, rest_v), RestK)
-    tBgSFB2 = tBgSFB2[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
-
-    #
-    # Execute Data copy and Math computation in the k_tile loop
-    #
-    if warp_idx == 0:
-        # Wait for accumulator buffer empty
-        acc_empty = acc_producer.acquire_and_advance()
-        # Set ACCUMULATE field to False for the first k_tile iteration
-        tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
-        # Execute k_tile loop
-        for k_tile in range(k_tile_cnt):
-            # Wait for AB buffer empty
-            ab_empty = ab_producer.acquire_and_advance()
-
-            #  TMA load A/B1/B2/SFA/SFB1/SFB2 to shared memory
-            cute.copy(
-                tma_atom_a,
-                tAgA[(None, ab_empty.count)],
-                tAsA[(None, ab_empty.index)],
-                tma_bar_ptr=ab_empty.barrier,
-            )
-            cute.copy(
-                tma_atom_b1,
-                tBgB1[(None, ab_empty.count)],
-                tBsB1[(None, ab_empty.index)],
-                tma_bar_ptr=ab_empty.barrier,
-            )
-            cute.copy(
-                tma_atom_b2,
-                tBgB2[(None, ab_empty.count)],
-                tBsB2[(None, ab_empty.index)],
-                tma_bar_ptr=ab_empty.barrier,
-            )
-            cute.copy(
-                tma_atom_sfa,
-                tAgSFA[(None, ab_empty.count)],
-                tAsSFA[(None, ab_empty.index)],
-                tma_bar_ptr=ab_empty.barrier,
-            )
-            cute.copy(
-                tma_atom_sfb1,
-                tBgSFB1[(None, ab_empty.count)],
-                tBsSFB1[(None, ab_empty.index)],
-                tma_bar_ptr=ab_empty.barrier,
-            )
-            cute.copy(
-                tma_atom_sfb2,
-                tBgSFB2[(None, ab_empty.count)],
-                tBsSFB2[(None, ab_empty.index)],
-                tma_bar_ptr=ab_empty.barrier,
-            )
-
-            # Wait for AB buffer full
-            ab_full = ab_consumer.wait_and_advance()
-
-            #  Copy SFA/SFB1/SFB2 to tmem
-            s2t_stage_coord = (None, None, None, None, ab_full.index)
-            tCsSFA_compact_s2t_staged = tCsSFA_compact_s2t[s2t_stage_coord]
-            tCsSFB1_compact_s2t_staged = tCsSFB1_compact_s2t[s2t_stage_coord]
-            tCsSFB2_compact_s2t_staged = tCsSFB2_compact_s2t[s2t_stage_coord]
-            cute.copy(
-                tiled_copy_s2t_sfa,
-                tCsSFA_compact_s2t_staged,
-                tCtSFA_compact_s2t,
-            )
-            cute.copy(
-                tiled_copy_s2t_sfb,
-                tCsSFB1_compact_s2t_staged,
-                tCtSFB1_compact_s2t,
-            )
-            cute.copy(
-                tiled_copy_s2t_sfb,
-                tCsSFB2_compact_s2t_staged,
-                tCtSFB2_compact_s2t,
-            )
-
-            # tCtAcc1 += tCrA * tCrSFA * tCrB1 * tCrSFB1
-            # tCtAcc2 += tCrA * tCrSFA * tCrB2 * tCrSFB2
-            num_kblocks = cute.size(tCrA, mode=[2])
-            for kblock_idx in cutlass.range(num_kblocks, unroll_full=True):
-                kblock_coord = (
-                    None,
-                    None,
-                    kblock_idx,
-                    ab_full.index,
-                )
-
-                # Set SFA/SFB tensor to tiled_mma
-                sf_kblock_coord = (None, None, kblock_idx)
-                tiled_mma.set(
-                    tcgen05.Field.SFA,
-                    tCtSFA[sf_kblock_coord].iterator,
-                )
-                tiled_mma.set(
-                    tcgen05.Field.SFB,
-                    tCtSFB1[sf_kblock_coord].iterator,
-                )
-                cute.gemm(
-                    tiled_mma,
-                    tCtAcc1,
-                    tCrA[kblock_coord],
-                    tCrB1[kblock_coord],
-                    tCtAcc1,
-                )
-
-                tiled_mma.set(
-                    tcgen05.Field.SFB,
-                    tCtSFB2[sf_kblock_coord].iterator,
-                )
-                cute.gemm(
-                    tiled_mma,
-                    tCtAcc2,
-                    tCrA[kblock_coord],
-                    tCrB2[kblock_coord],
-                    tCtAcc2,
-                )
-
-                # Enable accumulate on tCtAcc1/tCtAcc2 after first kblock
-                tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
-
-            # Async arrive AB buffer empty
-            ab_full.release()
-        acc_empty.commit()
-
-    #
-    # Epilogue
-    # Partition for epilogue
-    #
-    op = tcgen05.Ld32x32bOp(tcgen05.Repetition.x128, tcgen05.Pack.NONE)
-    copy_atom_t2r = cute.make_copy_atom(op, cutlass.Float32)
-    tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tCtAcc1)
-    thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
-    # (T2R_M, T2R_N, EPI_M, EPI_N)
-    tTR_tAcc1 = thr_copy_t2r.partition_S(tCtAcc1)
-    # (T2R_M, T2R_N, EPI_M, EPI_N)
-    tTR_tAcc2 = thr_copy_t2r.partition_S(tCtAcc2)
-    # (T2R_M, T2R_N, EPI_M, EPI_N, RestM, RestN, RestL)
-    tTR_gC = thr_copy_t2r.partition_D(tCgC)
-    # (T2R_M, T2R_N, EPI_M, EPI_N)
-    tTR_rAcc1 = cute.make_rmem_tensor(
-        tTR_gC[None, None, None, None, 0, 0, 0].shape, cutlass.Float32
-    )
-    # (T2R_M, T2R_N, EPI_M, EPI_N)
-    tTR_rAcc2 = cute.make_rmem_tensor(
-        tTR_gC[None, None, None, None, 0, 0, 0].shape, cutlass.Float32
-    )
-    # (T2R_M, T2R_N, EPI_M, EPI_N)
-    tTR_rC = cute.make_rmem_tensor(
-        tTR_gC[None, None, None, None, 0, 0, 0].shape, c_dtype
-    )
-    # STG Atom
-    simt_atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), c_dtype)
-    tTR_gC = tTR_gC[(None, None, None, None, *mma_tile_coord_mnl)]
-
-    # Wait for accumulator buffer full
-    acc_full = acc_consumer.wait_and_advance()
-
-    # Copy accumulator to register
-    cute.copy(tiled_copy_t2r, tTR_tAcc1, tTR_rAcc1)
-    cute.copy(tiled_copy_t2r, tTR_tAcc2, tTR_rAcc2)
-
-    # Silu activation on acc1 and multiply with acc2
-    acc_vec1 = epilogue_op(tTR_rAcc1.load())
-    acc_vec2 = tTR_rAcc2.load()
-    acc_vec = acc_vec1 * acc_vec2
-
-    tTR_rC.store(acc_vec.to(c_dtype))
-    # Store C to global memory
-    cute.copy(simt_atom, tTR_rC, tTR_gC)
-
-    acc_full.release()
-    # Deallocate TMEM
-    cute.arch.barrier()
-    tmem.free(acc_tmem_ptr)
-    return
-
-
-@cute.jit
-def my_kernel(
-    a_ptr: cute.Pointer,
-    b1_ptr: cute.Pointer,
-    b2_ptr: cute.Pointer,
-    sfa_ptr: cute.Pointer,
-    sfb1_ptr: cute.Pointer,
-    sfb2_ptr: cute.Pointer,
-    c_ptr: cute.Pointer,
-    problem_size: tuple,
-    epilogue_op: cutlass.Constexpr = lambda x: x
-    * (1.0 / (1.0 + cute.math.exp(-x, fastmath=True))),
-):
-    """
-    Host-side JIT function to prepare tensors and launch GPU kernel.
-    """
-    m, n, k, l = problem_size
-
-    # Setup attributes that depend on gemm inputs
-    a_tensor = cute.make_tensor(
-        a_ptr,
-        cute.make_layout(
-            (m, cute.assume(k, 32), l),
-            stride=(cute.assume(k, 32), 1, cute.assume(m * k, 32)),
-        ),
-    )
-    b_tensor1 = cute.make_tensor(
-        b1_ptr,
-        cute.make_layout(
-            (n, cute.assume(k, 32), l),
-            stride=(cute.assume(k, 32), 1, cute.assume(n * k, 32)),
-        ),
-    )
-    b_tensor2 = cute.make_tensor(
-        b2_ptr,
-        cute.make_layout(
-            (n, cute.assume(k, 32), l),
-            stride=(cute.assume(k, 32), 1, cute.assume(n * k, 32)),
-        ),
-    )
-    c_tensor = cute.make_tensor(
-        c_ptr, cute.make_layout((cute.assume(m, 32), n, l), stride=(n, 1, m * n))
-    )
-    # Setup sfa/sfb tensor by filling A/B tensor to scale factor atom layout
-    # ((Atom_M, Rest_M),(Atom_K, Rest_K),RestL)
-    sfa_layout = blockscaled_utils.tile_atom_to_shape_SF(
-        a_tensor.shape, sf_vec_size
-    )
-    sfa_tensor = cute.make_tensor(sfa_ptr, sfa_layout)
-
-    # ((Atom_N, Rest_N),(Atom_K, Rest_K),RestL)
-    sfb_layout = blockscaled_utils.tile_atom_to_shape_SF(
-        b_tensor1.shape, sf_vec_size
-    )
-    sfb_tensor1 = cute.make_tensor(sfb1_ptr, sfb_layout)
-    sfb_tensor2 = cute.make_tensor(sfb2_ptr, sfb_layout)
-
-    mma_op = tcgen05.MmaMXF4NVF4Op(
-        sf_dtype,
-        (mma_tiler_mnk[0], mma_tiler_mnk[1], mma_inst_shape_k),
-        tcgen05.CtaGroup.ONE,
-        tcgen05.OperandSource.SMEM,
-    )
-    tiled_mma = cute.make_tiled_mma(mma_op)
-
-    cluster_layout_vmnk  = cute.tiled_divide(
-        cute.make_layout((1, 1, 1)),
-        (tiled_mma.thr_id.shape,),
-    )
-
-    # Compute A/B/SFA/SFB/C shared memory layout
-    a_smem_layout_staged = sm100_utils.make_smem_layout_a(
-        tiled_mma,
-        mma_tiler_mnk,
-        ab_dtype,
-        num_ab_stage,
-    )
-    # B1 and B2 have the same size thus share the same smem layout
-    b_smem_layout_staged = sm100_utils.make_smem_layout_b(
-        tiled_mma,
-        mma_tiler_mnk,
-        ab_dtype,
-        num_ab_stage,
-    )
-    sfa_smem_layout_staged = blockscaled_utils.make_smem_layout_sfa(
-        tiled_mma,
-        mma_tiler_mnk,
-        sf_vec_size,
-        num_ab_stage,
-    )
-    # SFB1 and SFB2 have the same size thus share the same smem layout
-    sfb_smem_layout_staged = blockscaled_utils.make_smem_layout_sfb(
-        tiled_mma,
-        mma_tiler_mnk,
-        sf_vec_size,
-        num_ab_stage,
-    )
-    atom_thr_size = cute.size(tiled_mma.thr_id.shape)
-
-    # Setup TMA for A
-    a_smem_layout = cute.slice_(a_smem_layout_staged, (None, None, None, 0))
-    tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
-        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
-        a_tensor,
-        a_smem_layout,
-        mma_tiler_mnk,
-        tiled_mma,
-        cluster_layout_vmnk .shape,
-    )
-    # Setup TMA for B1
-    b_smem_layout = cute.slice_(b_smem_layout_staged, (None, None, None, 0))
-    tma_atom_b1, tma_tensor_b1 = cute.nvgpu.make_tiled_tma_atom_B(
-        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
-        b_tensor1,
-        b_smem_layout,
-        mma_tiler_mnk,
-        tiled_mma,
-        cluster_layout_vmnk .shape,
-    )
-    # Setup TMA for B2
-    tma_atom_b2, tma_tensor_b2 = cute.nvgpu.make_tiled_tma_atom_B(
-        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
-        b_tensor2,
-        b_smem_layout,
-        mma_tiler_mnk,
-        tiled_mma,
-        cluster_layout_vmnk .shape,
-    )
-    # Setup TMA for SFA
-    sfa_smem_layout = cute.slice_(
-        sfa_smem_layout_staged , (None, None, None, 0)
-    )
-    tma_atom_sfa, tma_tensor_sfa = cute.nvgpu.make_tiled_tma_atom_A(
-        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
-        sfa_tensor,
-        sfa_smem_layout,
-        mma_tiler_mnk,
-        tiled_mma,
-        cluster_layout_vmnk .shape,
-        internal_type=cutlass.Int16,
-    )
-    # Setup TMA for SFB1
-    sfb_smem_layout = cute.slice_(
-        sfb_smem_layout_staged , (None, None, None, 0)
-    )
-    tma_atom_sfb1, tma_tensor_sfb1 = cute.nvgpu.make_tiled_tma_atom_B(
-        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
-        sfb_tensor1,
-        sfb_smem_layout,
-        mma_tiler_mnk,
-        tiled_mma,
-        cluster_layout_vmnk .shape,
-        internal_type=cutlass.Int16,
-    )
-    # Setup TMA for SFB2
-    tma_atom_sfb2, tma_tensor_sfb2 = cute.nvgpu.make_tiled_tma_atom_B(
-        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
-        sfb_tensor2,
-        sfb_smem_layout,
-        mma_tiler_mnk,
-        tiled_mma,
-        cluster_layout_vmnk .shape,
-        internal_type=cutlass.Int16,
-    )
-
-    # Compute TMA load bytes
-    a_copy_size = cute.size_in_bytes(ab_dtype, a_smem_layout)
-    b_copy_size = cute.size_in_bytes(ab_dtype, b_smem_layout)
-    sfa_copy_size = cute.size_in_bytes(sf_dtype, sfa_smem_layout)
-    sfb_copy_size = cute.size_in_bytes(sf_dtype, sfb_smem_layout)
-    num_tma_load_bytes = (
-        a_copy_size + b_copy_size * 2 + sfa_copy_size + sfb_copy_size * 2
-    ) * atom_thr_size
-
-    # Compute grid size
-    grid = (
-        cute.ceil_div(c_tensor.shape[0], mma_tiler_mnk[0]),
-        cute.ceil_div(c_tensor.shape[1], mma_tiler_mnk[1]),
-        c_tensor.shape[2],
-    )
-
-    # Launch the kernel.
-    kernel(
-        # MMA (Matrix Multiply-Accumulate) configuration
-        tiled_mma,                  # Tiled MMA object defining NVFP4 GEMM compute pattern
-        
-        # TMA (Tensor Memory Accelerator) atoms and tensors for shared input matrix A
-        tma_atom_a,                 # TMA copy atom defining how to load A from global memory
-        tma_tensor_a,               # Tensor descriptor for A matrix (m, k, l) - shared by both GEMMs
-        
-        # TMA atoms and tensors for first B matrix (B1)
-        tma_atom_b1,                # TMA copy atom defining how to load B1 from global memory
-        tma_tensor_b1,              # Tensor descriptor for B1 matrix (n, k, l) - first GEMM
-        
-        # TMA atoms and tensors for second B matrix (B2)
-        tma_atom_b2,                # TMA copy atom defining how to load B2 from global memory
-        tma_tensor_b2,              # Tensor descriptor for B2 matrix (n, k, l) - second GEMM
-        
-        # TMA atoms and tensors for scale factor A (shared)
-        tma_atom_sfa,               # TMA copy atom for loading scale factors for A
-        tma_tensor_sfa,             # Tensor descriptor for SFA (block scale factors for A) - shared
-        
-        # TMA atoms and tensors for scale factor B1
-        tma_atom_sfb1,              # TMA copy atom for loading scale factors for B1
-        tma_tensor_sfb1,            # Tensor descriptor for SFB1 (block scale factors for B1)
-        
-        # TMA atoms and tensors for scale factor B2
-        tma_atom_sfb2,              # TMA copy atom for loading scale factors for B2
-        tma_tensor_sfb2,            # Tensor descriptor for SFB2 (block scale factors for B2)
-        
-        # Output tensor C (stores the fused result silu(A @ B1) * (A @ B2))
-        c_tensor,                   # Output tensor where the fused dual-GEMM result is stored (m, n, l)
-        
-        # Shared memory layouts with staging for pipelined execution
-        a_smem_layout_staged,       # Staged shared memory layout for A (includes stage dimension)
-        b_smem_layout_staged,       # Staged shared memory layout for B1/B2 (includes stage dimension)
-        sfa_smem_layout_staged,     # Staged shared memory layout for SFA (includes stage dimension)
-        sfb_smem_layout_staged,     # Staged shared memory layout for SFB1/SFB2 (includes stage dimension)
-        
-        # Pipeline synchronization parameter
-        num_tma_load_bytes,         # Total bytes to load per TMA transaction (for barrier setup)
-        
-        # Epilogue operation
-        epilogue_op,                # Epilogue operation to apply to output (e.g., element-wise ops)
-    ).launch(
-        grid=grid,
-        block=[threads_per_cta, 1, 1],
-        cluster=(1, 1, 1),
-    )
-    return
-
-
-# Global cache for compiled kernel
-_compiled_kernel_cache = None
-# This function is used to compile the kernel once and cache it and then allow users to 
-# run the kernel multiple times to get more accurate timing results.
-def compile_kernel():
-    """
-    Compile the kernel once and cache it.
-    This should be called before any timing measurements.
-    
-    Returns:
-        The compiled kernel function
-    """
-    global _compiled_kernel_cache
-    
-    if _compiled_kernel_cache is not None:
-        return _compiled_kernel_cache
-    
-
-    # Create placeholder CuTe pointers (address 0) for A/B/C/SFA/SFB; they are only used to compile the kernel
-    a_ptr = make_ptr(
-        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
-    )
-    b1_ptr = make_ptr(
-        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
-    )
-    b2_ptr = make_ptr(
-        ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
-    )
-    c_ptr = make_ptr(
-        c_dtype, 0, cute.AddressSpace.gmem, assumed_align=16
-    )
-    sfa_ptr = make_ptr(
-        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
-    )
-    sfb1_ptr = make_ptr(
-        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
-    )
-    sfb2_ptr = make_ptr(
-        sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=32
-    )
-
-    # Compile the kernel
-    _compiled_kernel_cache = cute.compile(my_kernel, a_ptr, b1_ptr, b2_ptr, sfa_ptr, sfb1_ptr, sfb2_ptr, c_ptr, (0, 0, 0, 0))
-    
-    return _compiled_kernel_cache
-
-
-def custom_kernel(data: input_t) -> output_t:
-    """
-    Execute the block-scaled dual GEMM kernel with silu activation,
-    C = silu(A @ B1) * (A @ B2).
-    
-    This is the main entry point called by the evaluation framework.
-    It converts PyTorch tensors to CuTe tensors, launches the kernel,
-    and returns the result.
-    
-    Args:
-        data: Tuple of (a, b1, b2, sfa_cpu, sfb1_cpu, sfb2_cpu, sfa_permuted, sfb1_permuted, sfb2_permuted, c) PyTorch tensors
-            a: [m, k, l] - Input matrix in float4e2m1fn 
-            b1: [n, k, l] - Input matrix in float4e2m1fn 
-            b2: [n, k, l] - Input matrix in float4e2m1fn 
-            sfa_cpu: [m, k, l] - Scale factors in float8_e4m3fn, used by reference implementation
-            sfb1_cpu: [n, k, l] - Scale factors in float8_e4m3fn, used by reference implementation
-            sfb2_cpu: [n, k, l] - Scale factors in float8_e4m3fn, used by reference implementation
-            sfa_permuted: [32, 4, rest_m, 4, rest_k, l] - Scale factors in float8_e4m3fn
-            sfb1_permuted: [32, 4, rest_n, 4, rest_k, l] - Scale factors in float8_e4m3fn
-            sfb2_permuted: [32, 4, rest_n, 4, rest_k, l] - Scale factors in float8_e4m3fn
-            c: [m, n, l] - Output tensor in float16
-    
-    Returns:
-        Output tensor c with computed results
-    """
-    a, b1, b2, _, _, _, sfa_permuted, sfb1_permuted, sfb2_permuted, c = data
-    
-    # Ensure kernel is compiled (will use cached version if available)
-    # To avoid the compilation overhead, we compile the kernel once and cache it.
-    compiled_func = compile_kernel()
-
-    # Get dimensions from MxKxL layout
-    _, k, _ = a.shape
-    m, n, l = c.shape
-    # Torch uses the packed e2m1_x2 data type, so a.shape reports half of the logical k; double it
-    k = k * 2 
-
-    # Create CuTe pointers for A/B/C/SFA/SFB via torch tensor data pointer
-    a_ptr = make_ptr(
-        ab_dtype, a.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
-    )
-    b1_ptr = make_ptr(
-        ab_dtype, b1.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
-    )
-    b2_ptr = make_ptr(
-        ab_dtype, b2.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
-    )
-    c_ptr = make_ptr(
-        c_dtype, c.data_ptr(), cute.AddressSpace.gmem, assumed_align=16
-    )
-    sfa_ptr = make_ptr(
-        sf_dtype, sfa_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
-    )
-    sfb1_ptr = make_ptr(
-        sf_dtype, sfb1_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
-    )
-    sfb2_ptr = make_ptr(
-        sf_dtype, sfb2_permuted.data_ptr(), cute.AddressSpace.gmem, assumed_align=32
-    )
-
-    # Execute the compiled kernel
-    compiled_func(a_ptr, b1_ptr, b2_ptr, sfa_ptr, sfb1_ptr, sfb2_ptr, c_ptr, (m, n, k, l))
-
-    return c
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/task.py b/problems/nvidia/final_nvfp4_dual_gemm/task.py
deleted file mode 100644
index 8facfb07..00000000
--- a/problems/nvidia/final_nvfp4_dual_gemm/task.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import torch
-from typing import TypedDict, TypeVar
-
-input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor])
-output_t = TypeVar("output_t", bound=torch.Tensor)
-class TestSpec(TypedDict):
-    m: int
-    n: int
-    k: int
-    l: int
-    seed: int
\ No newline at end of file
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/task.yml b/problems/nvidia/final_nvfp4_dual_gemm/task.yml
deleted file mode 100644
index 4d36bde6..00000000
--- a/problems/nvidia/final_nvfp4_dual_gemm/task.yml
+++ /dev/null
@@ -1,64 +0,0 @@
-# name: nvfp4-dual-gemm
-
-files:
-  - {"name": "submission.py", "source": "@SUBMISSION@"}
-  - {"name": "task.py", "source": "task.py"}
-  - {"name": "utils.py", "source": "../utils.py"}
-  - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "../eval_better_bench.py"}
-
-lang: "py"
-
-description: |
-  
-  You will implement a block scaled dual matrix-matrix multiplication kernel with silu activation optimized for NVIDIA B200.
-  To be explicit, you will be given a tuple of tensors:
-  ```
-  (a, b1, b2, sfa, sfb1, sfb2, c)
-  ```
-  where:
-  * `a` is M x K x L in K-major order in nvfp4(e2m1)
-  * `b1` is N x K x L in K-major order in nvfp4(e2m1)
-  * `b2` is N x K x L in K-major order in nvfp4(e2m1)
-  * `sfa` is M x (K // 16) x L in K-major order in fp8(e4m3fnuz)
-  * `sfb1` is N x (K // 16) x L in K-major order in fp8(e4m3fnuz)
-  * `sfb2` is N x (K // 16) x L in K-major order in fp8(e4m3fnuz)
-  * `c` is M x N x L in fp16
-  
-  Matrix sizes `M` is divisible by mma_tiler_mn[0], `N` is divisible by mma_tiler_mn[1], `K` is divisible by 256.
-  The ranking criteria is the geometric mean of the benchmark results.
-  For the grand price, your kernel will be evaluated against the speed of light analysis
-  and the solution closest to the speed of light will be awarded the grand price.
-  ```
-  The speed of light analysis based on the max(FP4 Tensor Core math throughput, DRAM memory throughput) of B200 and tested under 1.5Ghz clock:
-    M   N   K   L time[us] 
-  256 4096 7168 1 4.708
-  512 4096 7168 1 8.714
-  256 3072 4096 1 2.125
-  512 3072 7168 1 6.535
-  ```
-config:
-  main: "eval.py"
-
-templates:
-  Python: "template.py"
-
-tests:
-  - {"m": 1536, "n": 512, "k": 7168, "l": 1, "seed": 1111}
-  - {"m": 256, "n": 512, "k": 256, "l": 1, "seed": 1111}
-  - {"m": 1536, "n": 512, "k": 7168, "l": 1, "seed": 1111}
-  - {"m": 3072, "n": 1024, "k": 1536, "l": 1, "seed": 1111}
-  - {"m": 7168, "n": 1024, "k": 256, "l": 1, "seed": 1111}
-  - {"m": 7168, "n": 2304, "k": 2048, "l": 1, "seed": 1111}
-  - {"m": 4608, "n": 384, "k": 7168, "l": 1, "seed": 1111}
-  - {"m": 7168, "n": 384, "k": 2304, "l": 1, "seed": 1111}
-  - {"m": 512, "n": 768, "k": 7168, "l": 1, "seed": 1111}
-  - {"m": 4096, "n": 768, "k": 512, "l": 1, "seed": 1111}
-  
-benchmarks:
-  - {"m": 256, "n": 4096, "k": 7168, "l": 1, "seed": 1111}
-  - {"m": 512, "n": 4096, "k": 7168, "l": 1, "seed": 1111}
-  - {"m": 256, "n": 3072, "k": 4096, "l": 1, "seed": 1111}
-  - {"m": 512, "n": 3072, "k": 7168, "l": 1, "seed": 1111}
-
-ranking_by: "geom"
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/template.py b/problems/nvidia/final_nvfp4_dual_gemm/template.py
deleted file mode 100644
index d8985df5..00000000
--- a/problems/nvidia/final_nvfp4_dual_gemm/template.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from task import input_t, output_t
-
-
-def custom_kernel(data: input_t) -> output_t:
-    """
-    Reference implementation of block-scale fp4 dual gemm with silu activation
-    Args:
-        data: Tuple that expands to:
-            a: torch.Tensor[float4e2m1fn] of shape [m, k, l],
-            b1: torch.Tensor[float4e2m1fn] of shape [n, k, l],
-            b2: torch.Tensor[float4e2m1fn] of shape [n, k, l],
-            sfa: torch.Tensor[float8_e4m3fnuz] of shape [m, k // 16, l], used by reference implementation
-            sfb1: torch.Tensor[float8_e4m3fnuz] of shape [n, k // 16, l], used by reference implementation
-            sfb2: torch.Tensor[float8_e4m3fnuz] of shape [n, k // 16, l], used by reference implementation
-            sfa_permuted: torch.Tensor[float8_e4m3fnuz] of shape [32, 4, rest_m, 4, rest_k, l],
-            sfb1_permuted: torch.Tensor[float8_e4m3fnuz] of shape [32, 4, rest_n, 4, rest_k, l],
-            sfb2_permuted: torch.Tensor[float8_e4m3fnuz] of shape [32, 4, rest_n, 4, rest_k, l],
-            c: torch.Tensor[float16] of shape [m, n, l]
-    Returns:
-        Tensor containing output in float16
-        c: torch.Tensor[float16] of shape [m, n, l]
-    """
-    # c: [m, n, l] is pre-allocated memory to avoid timing allocation overhead.
-    a, b1, b2, sfa, sfb1, sfb2, sfa_permuted, sfb1_permuted, sfb2_permuted, c = data
-
-    # Your implementation here
-
-    return c
\ No newline at end of file
diff --git a/problems/nvidia/final_nvfp4_dual_gemm/utils.py b/problems/nvidia/final_nvfp4_dual_gemm/utils.py
deleted file mode 100644
index d9b3a69e..00000000
--- a/problems/nvidia/final_nvfp4_dual_gemm/utils.py
+++ /dev/null
@@ -1,172 +0,0 @@
-import os
-import random
-import numpy as np
-import torch
-
-
-def set_seed(seed=42):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
-        torch.cuda.manual_seed_all(seed)
-
-
-def get_device(use_cuda: bool = True) -> torch.device:
-    """Get the appropriate device (GPU or CPU)."""
-    if use_cuda:
-        if torch.cuda.is_available():
-            return torch.device("cuda")
-        elif torch.backends.mps.is_available():
-            return torch.device("mps")
-        else:
-            print("No compatible GPU found. Falling back to CPU.")
-    return torch.device("cpu")
-
-
-# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
-@torch.no_grad()
-def verbose_allclose(
-        received: torch.Tensor,
-        expected: torch.Tensor,
-        rtol=1e-05,
-        atol=1e-08,
-        max_print=5
-) -> list[str]:
-    """
-    Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
-    Parameters:
-    received (torch.Tensor): Tensor we actually got.
-    expected (torch.Tensor): Tensor we expected to receive.
-    rtol (float): Relative tolerance; relative to expected
-    atol (float): Absolute tolerance.
-    max_print (int): Maximum number of mismatched elements to print.
-    Raises:
-    AssertionError: If the tensors are not all close within the given tolerance.
-    """
-    # Check if the shapes of the tensors match
-    if received.shape != expected.shape:
-        return ["SIZE MISMATCH"]
-
-    # Calculate the difference between the tensors
-    diff = torch.abs(received - expected)
-
-    # Determine the tolerance
-    tolerance = atol + rtol * torch.abs(expected)
-
-    # Find tolerance mismatched elements
-    tol_mismatched = diff > tolerance
-
-    # Find nan mismatched elements
-    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
-
-    # Find +inf mismatched elements
-    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
-    # Find -inf mismatched elements
-    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
-
-    # Find all mismatched elements
-    mismatched = torch.logical_or(
-        torch.logical_or(tol_mismatched, nan_mismatched),
-        torch.logical_or(posinf_mismatched, neginf_mismatched),
-    )
-
-    mismatched_indices = torch.nonzero(mismatched)
-
-    # Count the number of mismatched elements
-    num_mismatched = mismatched.count_nonzero().item()
-
-    # Generate detailed information if there are mismatches
-    if num_mismatched >= 1:
-        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
-
-        for index in mismatched_indices[:max_print]:
-            i = tuple(index.tolist())
-            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
-        if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
-        return mismatch_details
-
-    return []
-
-
-@torch.no_grad()
-def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
-    """
-    Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
-    Parameters:
-    received (torch.Tensor): Tensor we actually got.
-    expected (torch.Tensor): Tensor we expected to receive.
-    max_print (int): Maximum number of mismatched elements to print.
-    Returns:
-         Empty string if tensors are equal, otherwise detailed error information
-    """
-    mismatched = torch.not_equal(received, expected)
-    mismatched_indices = torch.nonzero(mismatched)
-
-    # Count the number of mismatched elements
-    num_mismatched = mismatched.count_nonzero().item()
-
-    # Generate detailed information if there are mismatches
-    if num_mismatched >= 1:
-        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
-
-        for index in mismatched_indices[:max_print]:
-            i = tuple(index.tolist())
-            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
-        if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
-        return mismatch_details
-
-    return []
-
-
-def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
-    """
-    Convenient "default" implementation for tasks' `check_implementation` function.
-    """
-    expected = reference(data)
-    reasons = verbose_allclose(output, expected, rtol=rtol, atol=atol)
-
-    if len(reasons) > 0:
-        return False, "mismatch found! custom implementation doesn't match reference: " + " ".join(reasons)
-
-    return True, ''
-
-
-def make_match_reference(reference: callable, **kwargs):
-    def wrapped(data, output):
-        return match_reference(data, output, reference=reference, **kwargs)
-    return wrapped
-
-
-class DeterministicContext:
-    def __init__(self):
-        self.allow_tf32 = None
-        self.deterministic = None
-        self.cublas = None
-
-    def __enter__(self):
-        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '')
-        self.allow_tf32 = torch.backends.cudnn.allow_tf32
-        self.deterministic = torch.backends.cudnn.deterministic
-        torch.backends.cudnn.allow_tf32 = False
-        torch.backends.cudnn.deterministic = True
-        torch.use_deterministic_algorithms(True)
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        torch.backends.cudnn.allow_tf32 = self.allow_tf32
-        torch.backends.cudnn.deterministic = self.deterministic
-        torch.use_deterministic_algorithms(False)
-        os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
-
-def clear_l2_cache():
-    # import cupy as cp
-    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
-    # create a large dummy tensor
-    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
-    # write stuff to
-    dummy.fill_(42)
-    del dummy

From 0f80d7aebe7d5a506b040a97eb452e659f1f6e6d Mon Sep 17 00:00:00 2001
From: vickiw973 <vickiw@nvidia.com>
Date: Tue, 20 Jan 2026 00:56:45 +0800
Subject: [PATCH 192/207] Add nvfp4 group gemm example. (#77)

* add nvfp4 group gemm example.

* extend reference code to support m and n are not multiple of 128 cases.

* support different problem sizes to match real group gemm use case.

* modify test

* reduce range to reduce inf
---
 problems/nvidia/nvfp4_group_gemm/eval.py      |  444 +++++++
 problems/nvidia/nvfp4_group_gemm/reference.py |  199 ++++
 .../nvidia/nvfp4_group_gemm/submission.py     | 1059 +++++++++++++++++
 problems/nvidia/nvfp4_group_gemm/task.py      |    8 +
 problems/nvidia/nvfp4_group_gemm/task.yml     |   65 +
 problems/nvidia/nvfp4_group_gemm/template.py  |   31 +
 problems/nvidia/nvfp4_group_gemm/utils.py     |  176 +++
 7 files changed, 1982 insertions(+)
 create mode 100644 problems/nvidia/nvfp4_group_gemm/eval.py
 create mode 100644 problems/nvidia/nvfp4_group_gemm/reference.py
 create mode 100644 problems/nvidia/nvfp4_group_gemm/submission.py
 create mode 100644 problems/nvidia/nvfp4_group_gemm/task.py
 create mode 100644 problems/nvidia/nvfp4_group_gemm/task.yml
 create mode 100644 problems/nvidia/nvfp4_group_gemm/template.py
 create mode 100644 problems/nvidia/nvfp4_group_gemm/utils.py

diff --git a/problems/nvidia/nvfp4_group_gemm/eval.py b/problems/nvidia/nvfp4_group_gemm/eval.py
new file mode 100644
index 00000000..09ecaa43
--- /dev/null
+++ b/problems/nvidia/nvfp4_group_gemm/eval.py
@@ -0,0 +1,444 @@
+import base64
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+from pathlib import Path
+from typing import Any, Optional
+import tempfile
+
+import torch.cuda
+from cutlass.cute.nvgpu.common import OpError
+
+from utils import set_seed, clear_l2_cache
+
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, "w")
+        os.set_inheritable(fd, False)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a + b) * (a + b + 1) // 2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    """Parse `key: value; ...` lines of `file_name` into TestCases, mixing `seed` into each "seed" arg."""
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        sys.exit(113)
+
+    tests = []
+    # A value is a (...) or [...] group, a bare word, or an optionally signed integer.
+    pattern = re.compile(r"\s*([a-zA-Z]+):\s*(\([^)]+\)|\[[^\]]+\]|[a-zA-Z]+|[+-]?[0-9]+)\s*")
+    for line in content.splitlines():
+        case = {}
+        for part in line.split(";"):
+            # A single fullmatch both validates the part and yields the groups read below.
+            matched = pattern.fullmatch(part)
+            if matched is None:
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                sys.exit(113)
+            key, val = matched[1], matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                # Not a plain integer: (...) / [...] values become int tuples, anything else stays a string.
+                if (val.startswith('(') and val.endswith(')')) or (val.startswith('[') and val.endswith(']')):
+                    try:
+                        val = tuple(int(x.strip()) for x in val[1:-1].split(','))
+                    except ValueError:
+                        pass
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[float]) -> Stats:
+    """
+    Calculate statistical data from a non-empty list of durations.
+    @param durations: A list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = sum(durations) / runs
+    variance = sum((x - avg) ** 2 for x in durations)
+    # The sample standard deviation is undefined for a single run; report 0 instead of dividing by zero.
+    std = math.sqrt(variance / (runs - 1)) if runs > 1 else 0.0
+    err = std / math.sqrt(runs)
+
+    return Stats(
+        runs=runs, mean=avg, std=std, err=err, best=float(best), worst=float(worst)
+    )
+
+
+def _clone_data(data):
+    """
+    Recursively goes through data and clones all tensors.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        return data.clone()
+    else:
+        return data
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    try:
+        submission_output = custom_kernel(_clone_data(data))
+
+    except OpError as E:
+        print(f"Encountered {E}", file=sys.stderr)
+        return False, str(E)
+    torch.cuda.synchronize()
+    return check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in another process.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Executes the actual test case code and checks for correctness.
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if not good:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_benchmark(
+    test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float
+) -> Stats | Any:
+    """
+    Runs one benchmark. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    # generate input data once
+    data = generate_input(**test.args)
+    check_copy = _clone_data(data)
+    #  first, one obligatory correctness check
+    try:
+        output = custom_kernel(_clone_data(data))
+    except OpError as E:
+        return f"Encountered {E}"
+    good, message = check_implementation(check_copy, output)
+    if not good:
+        return message
+
+    # now, do multiple timing runs (each output is re-validated only when `recheck` is set).
+    # there is an upper bound of `max_repeats` runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until the measured kernel time exceeds `max_time_ns`,
+    # 2 minutes of wallclock have passed, or the relative error of the mean is below 0.1%.
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        if recheck:
+            # ensure we use a different seed for every benchmark
+            if "seed" in test.args:
+                test.args["seed"] += 13
+
+            data = generate_input(**test.args)
+            check_copy = _clone_data(data)
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        clear_l2_cache()
+
+        start_event.record()
+        output = custom_kernel(data)
+        end_event.record()
+        torch.cuda.synchronize()
+        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
+
+        if recheck:
+            good, message = check_implementation(check_copy, output)
+            if not good:
+                return message
+
+        del output
+        durations.append(duration)
+
+        if i > 1:
+            total_bm_duration = time.perf_counter_ns() - bm_start_time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if (
+                stats.err / stats.mean < 0.001
+                or stats.mean * stats.runs > max_time_ns
+                or total_bm_duration > 120e9
+            ):
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(
+    pool: multiprocessing.Pool,
+    test: TestCase,
+    recheck: bool,
+    max_repeats: int,
+    max_time_ns: float,
+):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+    @param pool: Process on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+
+
+def run_benchmarking(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Process on which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+    # warm up
+    run_single_benchmark(pool, tests[0], False, 100, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def run_single_profile(test: TestCase) -> str:
+    """
+    Profiles a single test case with torch.profiler and returns the summary table as text.
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, record_function, ProfilerActivity
+
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        submission_output = custom_kernel(_clone_data(data))
+        torch.cuda.synchronize()
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test)
+        logger.log(
+            f"benchmark.{idx}.report",
+            base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"),
+        )
+    logger.log("check", "pass")
+    return 0
+
+
+def main():
+    """Run the mode given in argv[1] on the cases of the task YAML given in argv[2]; returns an exit code."""
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+
+    if mode not in ("test", "benchmark", "leaderboard", "profile"):
+        return 2  # unknown mode: stop before the temp file exists (tests_str would be unbound below)
+    with tempfile.NamedTemporaryFile(delete=False) as tmp:
+
+        def build_test_string(tests: list[dict]):
+            as_str = ""
+            for test in tests:
+                kvs = []
+                for k, v in test.items():
+                    kvs.append(f"{k}: {v}")
+                as_str += "; ".join(kvs) + "\n"
+            return as_str
+
+        import yaml
+        # read_text() closes the task file again, unlike a bare open() handed to safe_load
+        yaml_content = yaml.safe_load(Path(sys.argv[2]).read_text())
+        if mode == "test":
+            tests_str = build_test_string(yaml_content.get("tests", []))
+        elif mode in ("benchmark", "leaderboard", "profile"):
+            tests_str = build_test_string(yaml_content.get("benchmarks", []))
+
+        tmp.write(tests_str.encode("utf-8"))
+        tmp.flush()
+        filename = tmp.name
+
+    tests = get_test_cases(filename, seed)
+
+    os.unlink(filename)
+
+    with PopcornOutput(int(fd)) as logger:
+        mp_context = multiprocessing.get_context("spawn")
+        with mp_context.Pool(1) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                # warmup
+                run_single_benchmark(pool, tests[0], False, 100, 1e7)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 100, 30e9)
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(
+                                f"benchmark.{i}.{field.name}",
+                                getattr(result, field.name),
+                            )
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(
+                            f"benchmark.{i}.error", str(result)
+                        )  # TODO: Make sure result implements __str__?
+                        break
+
+                logger.log("check", "pass" if passed else "fail")
+                return 0 if passed else 112
+            elif mode == "profile":
+                return run_profiling(logger, tests)
+            else:
+                # TODO: Implement script mode
+                return 2
+
+
+if __name__ == "__main__":
+    print("main")
+    main()
+    print("main end")
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_group_gemm/reference.py b/problems/nvidia/nvfp4_group_gemm/reference.py
new file mode 100644
index 00000000..f12f504b
--- /dev/null
+++ b/problems/nvidia/nvfp4_group_gemm/reference.py
@@ -0,0 +1,199 @@
+import torch
+from task import input_t, output_t
+from utils import make_match_reference
+
+# Scaling factor vector size
+sf_vec_size = 16
+
+# Helper function for ceiling division
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+
+# Helper function to convert scale factor tensor to blocked format
+def to_blocked(input_matrix):
+    rows, cols = input_matrix.shape
+
+    # Rows and cols are rounded up to multiples of 128 and 4 respectively; any shortfall is zero-padded below
+    n_row_blocks = ceil_div(rows, 128)
+    n_col_blocks = ceil_div(cols, 4)
+    padded_rows = n_row_blocks * 128
+    padded_cols = n_col_blocks * 4
+
+    # Pad the input matrix if necessary
+    if padded_rows != rows or padded_cols != cols:
+        padded = torch.nn.functional.pad(
+            input_matrix,
+            (0, padded_cols - cols, 0, padded_rows - rows),
+            mode="constant",
+            value=0,
+        )
+    else:
+        padded = input_matrix
+    blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
+    rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
+
+    return rearranged.flatten()
+
+
+def ref_kernel(
+    data: input_t,
+) -> output_t:
+    """
+    PyTorch reference implementation of NVFP4 block-scaled group GEMM.
+
+    For every group, each batch slice of a and b is multiplied with torch._scaled_mm,
+    using the scale factors rearranged by to_blocked(); the float16 result is written
+    in place into that group's c tensor.
+
+    Returns:
+        List holding the c tensor of every group, in group order.
+    """
+    abc_tensors, sfasfb_tensors, _, problem_sizes = data
+
+    result_tensors = []
+    for (a_ref, b_ref, c_ref), (sfa_ref, sfb_ref), (_, _, _, l) in zip(
+        abc_tensors, sfasfb_tensors, problem_sizes
+    ):
+        for l_idx in range(l):
+            # Convert the scale factor tensor to blocked format
+            scale_a = to_blocked(sfa_ref[:, :, l_idx])
+            scale_b = to_blocked(sfb_ref[:, :, l_idx])
+            # (m, k) @ (n, k).T -> (m, n)
+            res = torch._scaled_mm(
+                a_ref[:, :, l_idx].view(torch.float4_e2m1fn_x2),
+                b_ref[:, :, l_idx].transpose(0, 1).view(torch.float4_e2m1fn_x2),
+                scale_a.cuda(),
+                scale_b.cuda(),
+                bias=None,
+                out_dtype=torch.float16,
+            )
+            # Store the slice result in the group's c tensor taken from the input
+            c_ref[:, :, l_idx] = res
+        result_tensors.append(c_ref)
+    return result_tensors
+
+
+# Helper function to prepare the scale factor tensors for both the reference
+# kernel and the custom kernel. The customized data layout is described in:
+# https://docs.nvidia.com/cuda/cublas/index.html?highlight=fp4#d-block-scaling-factors-layout
+def create_reordered_scale_factor_tensor(l, mn, k, ref_f8_tensor):
+    sf_k = ceil_div(k, sf_vec_size)
+    atom_m = (32, 4)
+    atom_k = 4
+    mma_shape = (
+        l,  # batch size
+        ceil_div(mn, atom_m[0] * atom_m[1]),
+        ceil_div(sf_k, atom_k),
+        atom_m[0],
+        atom_m[1],
+        atom_k,
+    )
+    # Create the reordered scale factor tensor (32, 4, rest_m, 4, rest_k, l) on GPU.
+    mma_permute_order = (3, 4, 1, 5, 2, 0)
+    # Generate a random int8 tensor, then convert to float8_e4m3fn
+    rand_int_tensor = torch.randint(1, 3, mma_shape, dtype=torch.int8, device='cuda')
+    reordered_f8_tensor = rand_int_tensor.to(dtype=torch.float8_e4m3fn)
+    # Permute according to mma_permute_order
+    reordered_f8_tensor = reordered_f8_tensor.permute(*mma_permute_order)
+
+    # Move ref_f8_tensor to GPU if not already there
+    if ref_f8_tensor.device.type == 'cpu':
+        ref_f8_tensor = ref_f8_tensor.cuda()
+
+    # GPU-side vectorized reordering (replaces slow CPU nested loops)
+    # Create index grids for all dimensions
+    i_idx = torch.arange(mn, device='cuda')
+    j_idx = torch.arange(sf_k, device='cuda')
+    b_idx = torch.arange(l, device='cuda')
+    
+    # Create meshgrid for all combinations of (i, j, b)
+    i_grid, j_grid, b_grid = torch.meshgrid(i_idx, j_idx, b_idx, indexing='ij')
+    
+    # Calculate target indices in vectorized manner
+    mm = i_grid // (atom_m[0] * atom_m[1])
+    mm32 = i_grid % atom_m[0]
+    mm4 = (i_grid % 128) // atom_m[0]
+    kk = j_grid // atom_k
+    kk4 = j_grid % atom_k
+    
+    # Perform the reordering with advanced indexing (all on GPU)
+    reordered_f8_tensor[mm32, mm4, mm, kk4, kk, b_grid] = ref_f8_tensor[i_grid, j_grid, b_grid]
+    
+    return reordered_f8_tensor
+
+
+def generate_input(
+    m: tuple,
+    n: tuple,
+    k: tuple,
+    g: int,
+    seed: int,
+):
+    """
+    Generate input tensors for NVFP4 block-scaled group GEMM. 
+    Each group can have different m, n, k, l.
+    
+    Args:
+        m: Tuple of per-group row counts of A (and of C), indexed by group
+        n: Tuple of per-group row counts of B (columns of C), indexed by group
+        k: Tuple of per-group reduction sizes (logical columns of A and of B)
+        g: Number of groups; entries 0..g-1 of m, n and k are used
+        seed: Random seed for reproducibility
+    Note:
+        The batch size l is not an argument; it is fixed to 1 for every group.
+    
+    Returns:
+        Tuple of (list(tuple(a, b, c)), list(tuple(sfa, sfb)), list(tuple(sfa_reordered, sfb_reordered)), list(tuple(m, n, k, l))) where each group has its own a, b, c, sfa, sfb.
+            a: [m, k, l] - Input matrix in torch.float4e2m1fn_x2 data type
+            b: [n, k, l] - Input matrix in torch.float4e2m1fn_x2 data type
+            sfa: [m, k // 16, l] - Input scale factors in torch.float8e4m3fn data type
+            sfb: [n, k // 16, l] - Input scale factors in torch.float8e4m3fn data type
+            sfa_reordered: [32, 4, rest_m, 4, rest_k, l] - Input scale factors in torch.float8e4m3fn data type
+            sfb_reordered: [32, 4, rest_n, 4, rest_k, l] - Input scale factors in torch.float8e4m3fn data type
+            c: [m, n, l] - Output matrix in torch.float16 data type
+    """
+    torch.manual_seed(seed)
+    
+    abc_tensors = []
+    sfasfb_tensors = []
+    sfasfb_reordered_tensors = []
+    problem_sizes = []
+    l = 1
+    # Generate a, b, c, sfa, sfb tensors for all groups
+    for group_idx in range(g):
+        mi = m[group_idx]
+        ni = n[group_idx]
+        ki = k[group_idx]
+        a_ref = torch.randint(
+            -1, 2, (l, mi, ki // 2), dtype=torch.int8, device="cuda"
+        ).permute(1, 2, 0)
+        b_ref = torch.randint(
+            -1, 2, (l, ni, ki // 2), dtype=torch.int8, device="cuda"
+        ).permute(1, 2, 0)
+        a_ref = a_ref.view(torch.float4_e2m1fn_x2)
+        b_ref = b_ref.view(torch.float4_e2m1fn_x2)
+
+        c_ref = torch.randn((l, mi, ni), dtype=torch.float16, device="cuda").permute(
+            1, 2, 0
+        )
+
+        sf_k = ceil_div(ki, sf_vec_size)
+        sfa_ref_cpu = torch.randint(
+            1, 3, (l, mi, sf_k), dtype=torch.int8
+        ).to(dtype=torch.float8_e4m3fn).permute(1, 2, 0)
+        sfb_ref_cpu = torch.randint(
+            1, 3, (l, ni, sf_k), dtype=torch.int8
+        ).to(dtype=torch.float8_e4m3fn).permute(1, 2, 0)
+
+        sfa_reordered = create_reordered_scale_factor_tensor(l, mi, ki, sfa_ref_cpu)
+        sfb_reordered = create_reordered_scale_factor_tensor(l, ni, ki, sfb_ref_cpu)
+
+        abc_tensors.append((a_ref, b_ref, c_ref))
+        sfasfb_tensors.append((sfa_ref_cpu, sfb_ref_cpu))
+        sfasfb_reordered_tensors.append((sfa_reordered, sfb_reordered))
+        problem_sizes.append((mi, ni, ki, l))
+    return (abc_tensors, sfasfb_tensors, sfasfb_reordered_tensors, problem_sizes)
+
+
+check_implementation = make_match_reference(ref_kernel, rtol=1e-03, atol=1e-03)
diff --git a/problems/nvidia/nvfp4_group_gemm/submission.py b/problems/nvidia/nvfp4_group_gemm/submission.py
new file mode 100644
index 00000000..439a0ea2
--- /dev/null
+++ b/problems/nvidia/nvfp4_group_gemm/submission.py
@@ -0,0 +1,1059 @@
+import cutlass
+import cutlass.cute as cute
+import cutlass.utils as utils
+import cutlass.pipeline as pipeline
+from cutlass.cute.nvgpu import cpasync, tcgen05
+import cutlass.utils.blackwell_helpers as sm100_utils
+import cutlass.utils.blockscaled_layout as blockscaled_utils
+from cutlass.cute.runtime import make_ptr
+
+import functools
+from typing import Tuple, List
+
+import torch
+from task import input_t, output_t
+
+# Kernel configuration parameters (module-level constants shared by host and device code)
+# Size of one TMA descriptor (tensormap) in bytes
+bytes_per_tensormap = 128
+# Number of tensormaps kept per CTA: a, b, sfa, sfb
+num_tensormaps = 4
+# Tile sizes for M, N, K dimensions handled by one CTA
+mma_tiler_mnk = (128, 128, 256)  
+# K extent of a single MMA instruction (passed to MmaMXF4NVF4Op together with the M/N tile sizes)
+mma_inst_shape_k = 64
+# FP4 data type for A and B
+ab_dtype = cutlass.Float4E2M1FN  
+# FP8 data type for scale factors
+sf_dtype = cutlass.Float8E4M3FN  
+# FP16 output type
+c_dtype = cutlass.Float16  
+# Scale factor block size (16 elements share one scale)
+sf_vec_size = 16  
+# Number of threads per CUDA thread block
+threads_per_cta = 128  
+# Pipeline stage counts: accumulator (tmem) stages, then A/B/SFA/SFB shared-memory stages
+num_acc_stage = 1
+num_ab_stage = 1
+# Number of tmem columns requested by tmem.allocate() in the kernel
+num_tmem_alloc_cols = 512
+
+
+# Helper function for ceiling division: rounds a / b up to the next integer (intended for integers with b > 0)
+def ceil_div(a, b):
+    return (a + b - 1) // b  # adding b - 1 before the floor division rounds any remainder up
+
+
+# The CuTe reference implementation of the NVFP4 block-scaled group GEMM device kernel
+@cute.kernel
+def kernel(
+    tiled_mma: cute.TiledMma,
+    tma_atom_a: cute.CopyAtom,
+    mA_mkl: cute.Tensor,
+    tma_atom_b: cute.CopyAtom,
+    mB_nkl: cute.Tensor,
+    tma_atom_sfa: cute.CopyAtom,
+    mSFA_mkl: cute.Tensor,
+    tma_atom_sfb: cute.CopyAtom,
+    mSFB_nkl: cute.Tensor,
+    tensor_of_abc_ptrs: cute.Tensor,  # read as [group, 0..2] -> A, B, C base addresses
+    tensor_of_sfasfb_ptrs: cute.Tensor,  # read as [group, 0..1] -> SFA, SFB base addresses
+    tensormaps: cute.Tensor,  # read as [bidz, 0..3, :] -> this CTA's A, B, SFA, SFB descriptor slots
+    tensor_of_problem_sizes: cute.Tensor,  # read as [group, 0..3] -> m, n, k, l
+    a_smem_layout_staged: cute.ComposedLayout,
+    b_smem_layout_staged: cute.ComposedLayout,
+    sfa_smem_layout_staged: cute.Layout,
+    sfb_smem_layout_staged: cute.Layout,
+    cta_mn_list: List[Tuple[int, int]],  # per-group (M tiles, N tiles); used to decode bidz
+    num_tma_load_bytes: cutlass.Constexpr[int],  # forwarded as tx_count of the A/B pipeline
+):
+    """
+    GPU device kernel performing the Group GEMM computation.
+    """
+    warp_idx = cute.arch.warp_idx()
+    warp_idx = cute.arch.make_warp_uniform(warp_idx)
+    tidx, _, _ = cute.arch.thread_idx()
+
+    # Delinearize bidz to coord_x, coord_y and group_idx for each CTA.
+    # bidz counts output tiles across all groups in list order; each group
+    # contributes cta_m * cta_n consecutive tiles.
+    bidx, bidy, bidz = cute.arch.block_idx()
+    group_idx = 0
+    find = False  # becomes True once the group owning this CTA has been located
+    coord_x = 0
+    coord_y = 0
+    cta_rest = bidz  # tiles still to skip before reaching this CTA's tile
+    for _, (cta_m, cta_n) in enumerate(cta_mn_list):
+        if cta_rest >= (cta_m * cta_n):
+            group_idx += 1
+            cta_rest -= cta_m * cta_n
+        else:
+            if not find:
+                coord_y = cta_rest // cta_m
+                coord_x = cta_rest % cta_m  # the M-tile index varies fastest within a group
+                cta_rest -= cta_m * cta_n  # now negative, so later groups land in the else branch and change nothing
+                find = True
+
+    #
+    # Construct C Tensor for each CTA from this group's pointer and sizes
+    #
+    mC_mnl_iter = cute.make_ptr(
+        c_dtype, tensor_of_abc_ptrs[group_idx, 2], cute.AddressSpace.gmem
+    ).align(32)
+    m = tensor_of_problem_sizes[group_idx, 0]
+    n = tensor_of_problem_sizes[group_idx, 1]
+    k = tensor_of_problem_sizes[group_idx, 2]
+    l = tensor_of_problem_sizes[group_idx, 3]
+
+    mC_mnl_layout = cute.make_layout(
+        (m, n, l),
+        stride=(cute.assume(n, 32), 1, cute.assume(m * n, 32),))  # N has stride 1; M steps by n, L by m * n
+    mC_mnl = cute.make_tensor(mC_mnl_iter, mC_mnl_layout)
+    # Local partition for global C Tensor
+    # (bM, bN, RestM, RestN, RestL)
+    gC_mnl = cute.local_tile(
+        mC_mnl, cute.slice_(mma_tiler_mnk, (None, None, 0)), (coord_x, coord_y, 0)
+    )
+
+    #
+    # Define shared storage for kernel
+    #
+    size_tensormap_in_i64 = (
+        num_tensormaps * bytes_per_tensormap // 8
+    )  # total descriptor bytes expressed as a count of 8-byte (Int64) words
+    @cute.struct
+    class SharedStorage:
+        tensormap_buffer: cute.struct.MemRange[
+            cutlass.Int64, size_tensormap_in_i64
+        ]  # shared-memory copies of the four descriptors, edited before use
+        ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_ab_stage * 2]  # barrier storage for the A/B pipeline
+        acc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, num_acc_stage * 2]  # barrier storage for the accumulator pipeline
+        tmem_holding_buf: cutlass.Int32  # handed to TmemAllocator below
+    smem = utils.SmemAllocator()
+    storage = smem.allocate(SharedStorage)
+
+    # Carve the descriptor buffer into four consecutive slots (a, b, sfa, sfb).
+    # NOTE(review): the offset bytes_per_tensormap // 8 assumes pointer arithmetic in Int64 elements — confirm.
+    tensormap_smem_ptr = storage.tensormap_buffer.data_ptr()
+    tensormap_a_smem_ptr = tensormap_smem_ptr
+    tensormap_b_smem_ptr = (
+        tensormap_a_smem_ptr
+        + bytes_per_tensormap // 8
+    )
+    tensormap_sfa_smem_ptr = (
+        tensormap_b_smem_ptr
+        + bytes_per_tensormap // 8
+    )
+    tensormap_sfb_smem_ptr = (
+        tensormap_sfa_smem_ptr
+        + bytes_per_tensormap // 8
+    )
+    # Setup smem tensor for A, B, SFA, SFB
+    # (MMA, MMA_M, MMA_K, STAGE)
+    sA = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=a_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=a_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sB = smem.allocate_tensor(
+        element_type=ab_dtype,
+        layout=b_smem_layout_staged.outer,
+        byte_alignment=128,
+        swizzle=b_smem_layout_staged.inner,
+    )
+    # (MMA, MMA_M, MMA_K, STAGE)
+    sSFA = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfa_smem_layout_staged,
+        byte_alignment=128,
+    )
+    # (MMA, MMA_N, MMA_K, STAGE)
+    sSFB = smem.allocate_tensor(
+        element_type=sf_dtype,
+        layout=sfb_smem_layout_staged,
+        byte_alignment=128,
+    )
+
+    # Initialize mainloop ab_pipeline, acc_pipeline and their states
+    ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
+    ab_pipeline_consumer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread, 1)
+    ab_producer, ab_consumer = pipeline.PipelineTmaUmma.create(
+        barrier_storage=storage.ab_mbar_ptr.data_ptr(),
+        num_stages=num_ab_stage,
+        producer_group=ab_pipeline_producer_group,
+        consumer_group=ab_pipeline_consumer_group,
+        tx_count=num_tma_load_bytes,
+    ).make_participants()
+    acc_producer, acc_consumer = pipeline.PipelineUmmaAsync.create(
+        barrier_storage=storage.acc_mbar_ptr.data_ptr(),
+        num_stages=num_acc_stage,
+        producer_group=pipeline.CooperativeGroup(pipeline.Agent.Thread),
+        consumer_group=pipeline.CooperativeGroup(
+            pipeline.Agent.Thread,
+            threads_per_cta,  # every thread of the CTA consumes the accumulator in the epilogue
+        ),
+    ).make_participants()
+
+    #
+    # Local_tile partition global tensors
+    #
+    # (bM, bK, RestM, RestK, RestL)
+    gA_mkl = cute.local_tile(
+        mA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gB_nkl = cute.local_tile(
+        mB_nkl, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    # (bM, bK, RestM, RestK, RestL)
+    gSFA_mkl = cute.local_tile(
+        mSFA_mkl, cute.slice_(mma_tiler_mnk, (None, 0, None)), (None, None, None)
+    )
+    # (bN, bK, RestN, RestK, RestL)
+    gSFB_nkl = cute.local_tile(
+        mSFB_nkl, cute.slice_(mma_tiler_mnk, (0, None, None)), (None, None, None)
+    )
+    #
+    # Partition global tensor for TiledMMA_A/B/C
+    #
+    thr_mma = tiled_mma.get_slice(tidx)
+    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
+    tCgA = thr_mma.partition_A(gA_mkl)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgB = thr_mma.partition_B(gB_nkl)
+    # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
+    tCgSFA = thr_mma.partition_A(gSFA_mkl)
+    # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
+    tCgSFB = thr_mma.partition_B(gSFB_nkl)
+    # (MMA, MMA_M, MMA_N, RestM, RestN, RestL)
+    tCgC = thr_mma.partition_C(gC_mnl)
+
+    # Update tma descriptor with the correct shapes and strides
+    tensormap_manager = utils.TensorMapManager(
+        utils.TensorMapUpdateMode.SMEM,
+        128,  # NOTE(review): presumably bytes per tensormap (same value as bytes_per_tensormap) — confirm
+    )
+    tensormap_a_gmem_ptr = tensormap_manager.get_tensormap_ptr(
+        tensormaps[(bidz, 0, None)].iterator
+    )
+    tensormap_b_gmem_ptr = tensormap_manager.get_tensormap_ptr(
+        tensormaps[(bidz, 1, None)].iterator
+    )
+    tensormap_sfa_gmem_ptr = tensormap_manager.get_tensormap_ptr(
+        tensormaps[(bidz, 2, None)].iterator
+    )
+    tensormap_sfb_gmem_ptr = tensormap_manager.get_tensormap_ptr(
+        tensormaps[(bidz, 3, None)].iterator
+    )
+
+    mA_mkl_iter = cute.make_ptr(
+        ab_dtype, tensor_of_abc_ptrs[group_idx, 0], cute.AddressSpace.gmem
+    ).align(32)
+    mB_nkl_iter = cute.make_ptr(
+        ab_dtype, tensor_of_abc_ptrs[group_idx, 1], cute.AddressSpace.gmem
+    ).align(32)
+    sfa_mkl_iter = cute.make_ptr(
+        sf_dtype, tensor_of_sfasfb_ptrs[group_idx, 0], cute.AddressSpace.gmem
+    ).align(32)
+    sfb_nkl_iter = cute.make_ptr(
+        sf_dtype, tensor_of_sfasfb_ptrs[group_idx, 1], cute.AddressSpace.gmem
+    ).align(32)
+    mA_mkl_layout = cute.make_layout(
+        (m, k, l), stride=(cute.assume(k, 32), 1, cute.assume(m * k, 32),))  # K has stride 1
+    mB_nkl_layout = cute.make_layout(
+        (n, k, l), stride=(cute.assume(k, 32), 1, cute.assume(n * k, 32),))  # K has stride 1
+
+    # SFA, SFB follows specialized layout defined in the following link:
+    # https://docs.nvidia.com/cuda/cublas/index.html?highlight=fp4#d-block-scaling-factors-layout
+    atom_shape = ((32, 4), (sf_vec_size, 4))
+    atom_stride = ((16, 4), (0, 1))  # stride 0 on the sf_vec_size mode: those elements share one scale factor
+    sfa_layout = cute.tile_to_shape(
+        cute.make_layout(atom_shape, stride=atom_stride),
+        mA_mkl_layout.shape,
+        (2, 1, 3),
+    )
+    sfb_layout = cute.tile_to_shape(
+        cute.make_layout(atom_shape, stride=atom_stride),
+        mB_nkl_layout.shape,
+        (2, 1, 3),
+    )
+    real_tensor_a = cute.make_tensor(mA_mkl_iter, mA_mkl_layout)
+    real_tensor_b = cute.make_tensor(mB_nkl_iter, mB_nkl_layout)
+    real_tensor_sfa = cute.make_tensor(sfa_mkl_iter, sfa_layout)
+    real_tensor_sfb = cute.make_tensor(sfb_nkl_iter, sfb_layout)
+
+    # Let warp 0 initialize the tensormaps from the atoms, then update them with this group's real tensors
+    if warp_idx == 0:
+        tensormap_manager.init_tensormap_from_atom(
+            tma_atom_a, tensormap_a_smem_ptr, 0
+        )
+        tensormap_manager.init_tensormap_from_atom(
+            tma_atom_b, tensormap_b_smem_ptr, 0
+        )
+        tensormap_manager.init_tensormap_from_atom(
+            tma_atom_sfa, tensormap_sfa_smem_ptr, 0
+        )
+        tensormap_manager.init_tensormap_from_atom(
+            tma_atom_sfb, tensormap_sfb_smem_ptr, 0
+        )
+        tensormap_manager.update_tensormap(
+            (
+                real_tensor_a,
+                real_tensor_b,
+                real_tensor_sfa,
+                real_tensor_sfb,
+            ),
+            (tma_atom_a, tma_atom_b, tma_atom_sfa, tma_atom_sfb),
+            (
+                tensormap_a_gmem_ptr,
+                tensormap_b_gmem_ptr,
+                tensormap_sfa_gmem_ptr,
+                tensormap_sfb_gmem_ptr,
+            ),
+            0,  # tma warp id
+            (
+                tensormap_a_smem_ptr,
+                tensormap_b_smem_ptr,
+                tensormap_sfa_smem_ptr,
+                tensormap_sfb_smem_ptr,
+            ),
+        )
+
+        tensormap_manager.fence_tensormap_update(tensormap_a_gmem_ptr)
+        tensormap_manager.fence_tensormap_update(tensormap_b_gmem_ptr)
+        tensormap_manager.fence_tensormap_update(tensormap_sfa_gmem_ptr)
+        tensormap_manager.fence_tensormap_update(tensormap_sfb_gmem_ptr)
+
+    cute.arch.barrier()  # the other warps wait here until warp 0 has finished the descriptor update
+
+    #
+    # Partition global/shared tensor for TMA load A/B/SFA/SFB
+    #
+    # TMA Partition_S/D for A
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestM, RestK, RestL)
+    tAsA, tAgA = cpasync.tma_partition(
+        tma_atom_a,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sA, 0, 3),
+        cute.group_modes(tCgA, 0, 3),
+    )
+    # TMA Partition_S/D for B
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsB, tBgB = cpasync.tma_partition(
+        tma_atom_b,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sB, 0, 3),
+        cute.group_modes(tCgB, 0, 3),
+    )
+    #  TMA Partition_S/D for SFA
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestM, RestK, RestL)
+    tAsSFA, tAgSFA = cpasync.tma_partition(
+        tma_atom_sfa,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFA, 0, 3),
+        cute.group_modes(tCgSFA, 0, 3),
+    )
+    tAsSFA = cute.filter_zeros(tAsSFA)
+    tAgSFA = cute.filter_zeros(tAgSFA)
+    # TMA Partition_S/D for SFB
+    # ((atom_v, rest_v), STAGE)
+    # ((atom_v, rest_v), RestN, RestK, RestL)
+    tBsSFB, tBgSFB = cpasync.tma_partition(
+        tma_atom_sfb,
+        0,
+        cute.make_layout(1),
+        cute.group_modes(sSFB, 0, 3),
+        cute.group_modes(tCgSFB, 0, 3),
+    )
+    tBsSFB = cute.filter_zeros(tBsSFB)
+    tBgSFB = cute.filter_zeros(tBgSFB)
+
+    #
+    # Partition shared/tensor memory tensor for TiledMMA_A/B/C
+    #
+    # (MMA, MMA_M, MMA_K, STAGE)
+    tCrA = tiled_mma.make_fragment_A(sA)
+    # (MMA, MMA_N, MMA_K, STAGE)
+    tCrB = tiled_mma.make_fragment_B(sB)
+    # (MMA, MMA_M, MMA_N)
+    acc_shape = tiled_mma.partition_shape_C(mma_tiler_mnk[:2])
+    # (MMA, MMA_M, MMA_N); only its layout is used, the real accumulator tensor is built on the tmem pointer below
+    tCtAcc_fake = tiled_mma.make_fragment_C(acc_shape)
+    #
+    # Alloc tensor memory buffer
+    #
+    tmem_alloc_barrier = pipeline.NamedBarrier(
+        barrier_id=1,
+        num_threads=threads_per_cta,
+    )
+    tmem = utils.TmemAllocator(
+        storage.tmem_holding_buf,
+        barrier_for_retrieve=tmem_alloc_barrier,
+    )
+    tmem.allocate(num_tmem_alloc_cols)
+    tmem.wait_for_alloc()
+    acc_tmem_ptr = tmem.retrieve_ptr(cutlass.Float32)
+    tCtAcc = cute.make_tensor(acc_tmem_ptr, tCtAcc_fake.layout)
+
+    #
+    # Make SFA/SFB tmem tensor
+    #
+    # Get SFA tmem ptr: placed right after the accumulator's columns
+    sfa_tmem_ptr = cute.recast_ptr(
+        acc_tmem_ptr + tcgen05.find_tmem_tensor_col_offset(tCtAcc),
+        dtype=sf_dtype,
+    )
+    # (MMA, MMA_M, MMA_K)
+    tCtSFA_layout = blockscaled_utils.make_tmem_layout_sfa(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        cute.slice_(sfa_smem_layout_staged, (None, None, None, 0)),
+    )
+    tCtSFA = cute.make_tensor(sfa_tmem_ptr, tCtSFA_layout)
+    # Get SFB tmem ptr: placed after the accumulator's and SFA's columns
+    sfb_tmem_ptr = cute.recast_ptr(
+        acc_tmem_ptr
+        + tcgen05.find_tmem_tensor_col_offset(tCtAcc)
+        + tcgen05.find_tmem_tensor_col_offset(tCtSFA),
+        dtype=sf_dtype,
+    )
+    # (MMA, MMA_N, MMA_K)
+    tCtSFB_layout = blockscaled_utils.make_tmem_layout_sfb(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        cute.slice_(sfb_smem_layout_staged, (None, None, None, 0)),
+    )
+    tCtSFB = cute.make_tensor(sfb_tmem_ptr, tCtSFB_layout)
+
+    #
+    # Partition for S2T copy of SFA/SFB
+    #
+    # Make S2T (shared memory to tensor memory) CopyAtom
+    copy_atom_s2t = cute.make_copy_atom(
+        tcgen05.Cp4x32x128bOp(tcgen05.CtaGroup.ONE),
+        sf_dtype,
+    )
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact = cute.filter_zeros(sSFA)
+    tCtSFA_compact = cute.filter_zeros(tCtSFA)
+    tiled_copy_s2t_sfa = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFA_compact)
+    thr_copy_s2t_sfa = tiled_copy_s2t_sfa.get_slice(0)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact_s2t_ = thr_copy_s2t_sfa.partition_S(tCsSFA_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFA_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfa, tCsSFA_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFA_compact_s2t = thr_copy_s2t_sfa.partition_D(tCtSFA_compact)
+
+    # (MMA, MMA_MN, MMA_K, STAGE)
+    tCsSFB_compact = cute.filter_zeros(sSFB)
+    # (MMA, MMA_MN, MMA_K)
+    tCtSFB_compact = cute.filter_zeros(tCtSFB)
+    tiled_copy_s2t_sfb = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSFB_compact)
+    thr_copy_s2t_sfb = tiled_copy_s2t_sfb.get_slice(0)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB_compact_s2t_ = thr_copy_s2t_sfb.partition_S(tCsSFB_compact)
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
+    tCsSFB_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
+        tiled_copy_s2t_sfb, tCsSFB_compact_s2t_
+    )
+    # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
+    tCtSFB_compact_s2t = thr_copy_s2t_sfb.partition_D(tCtSFB_compact)
+
+    # Number of K loops, derived from this group's real K extent (not the placeholder tensor)
+    k_tile_cnt = cute.ceil_div(real_tensor_a.shape[1], mma_tiler_mnk[2])
+
+    #
+    # Slice to per mma tile index
+    #
+    mma_tile_coord_mnl = (coord_x, coord_y, 0)
+    # ((atom_v, rest_v), RestK)
+    tAgA = tAgA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgB = tBgB[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tAgSFA = tAgSFA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])]
+    # ((atom_v, rest_v), RestK)
+    tBgSFB = tBgSFB[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])]
+
+    # Main loop: warp 0 issues both the TMA loads and the MMAs. Each k tile
+    # is loaded, waited for, and consumed before the next load is issued,
+    # so within this warp the load and the compute of a tile do not overlap.
+    if warp_idx == 0:
+        # Wait for accumulator buffer empty
+        acc_empty = acc_producer.acquire_and_advance()
+        # Set ACCUMULATE field to False for the first k_tile iteration
+        tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
+        # Execute k_tile loop
+        for k_tile in range(k_tile_cnt):
+            # Wait for AB buffer empty
+            ab_empty = ab_producer.acquire_and_advance()
+
+            #  TMA load A/B/SFA/SFB to shared memory, each through this CTA's updated descriptor
+            cute.copy(
+                tma_atom_a,
+                tAgA[(None, k_tile)],
+                tAsA[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+                tma_desc_ptr=tensormap_manager.get_tensormap_ptr(
+                    tensormap_a_gmem_ptr,
+                    cute.AddressSpace.generic,
+                ),
+            )
+            cute.copy(
+                tma_atom_b,
+                tBgB[(None, k_tile)],
+                tBsB[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+                tma_desc_ptr=tensormap_manager.get_tensormap_ptr(
+                    tensormap_b_gmem_ptr,
+                    cute.AddressSpace.generic,
+                ),
+            )
+            cute.copy(
+                tma_atom_sfa,
+                tAgSFA[(None, k_tile)],
+                tAsSFA[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+                tma_desc_ptr=tensormap_manager.get_tensormap_ptr(
+                    tensormap_sfa_gmem_ptr,
+                    cute.AddressSpace.generic,
+                ),
+            )
+            cute.copy(
+                tma_atom_sfb,
+                tBgSFB[(None, k_tile)],
+                tBsSFB[(None, ab_empty.index)],
+                tma_bar_ptr=ab_empty.barrier,
+                tma_desc_ptr=tensormap_manager.get_tensormap_ptr(
+                    tensormap_sfb_gmem_ptr,
+                    cute.AddressSpace.generic,
+                ),
+            )
+
+            # Wait for AB buffer full
+            ab_full = ab_consumer.wait_and_advance()
+
+            #  Copy SFA/SFB from shared memory to TMEM
+            s2t_stage_coord = (None, None, None, None, ab_full.index)
+            tCsSFA_compact_s2t_staged = tCsSFA_compact_s2t[s2t_stage_coord]
+            tCsSFB_compact_s2t_staged = tCsSFB_compact_s2t[s2t_stage_coord]
+            cute.copy(
+                tiled_copy_s2t_sfa,
+                tCsSFA_compact_s2t_staged,
+                tCtSFA_compact_s2t,
+            )
+            cute.copy(
+                tiled_copy_s2t_sfb,
+                tCsSFB_compact_s2t_staged,
+                tCtSFB_compact_s2t,
+            )
+
+            # tCtAcc += tCrA * tCrSFA * tCrB * tCrSFB
+            num_kblocks = cute.size(tCrA, mode=[2])
+            for kblock_idx in cutlass.range(num_kblocks, unroll_full=True):
+                kblock_coord = (
+                    None,
+                    None,
+                    kblock_idx,
+                    ab_full.index,
+                )
+
+                # Set SFA/SFB tensor to tiled_mma
+                sf_kblock_coord = (None, None, kblock_idx)
+                tiled_mma.set(
+                    tcgen05.Field.SFA,
+                    tCtSFA[sf_kblock_coord].iterator,
+                )
+                tiled_mma.set(
+                    tcgen05.Field.SFB,
+                    tCtSFB[sf_kblock_coord].iterator,
+                )
+
+                cute.gemm(
+                    tiled_mma,
+                    tCtAcc,
+                    tCrA[kblock_coord],
+                    tCrB[kblock_coord],
+                    tCtAcc,
+                )
+                # Enable accumulate on tCtAcc after first kblock
+                tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+
+            # Async arrive AB buffer empty
+            ab_full.release()
+        acc_empty.commit()  # issued once, after the whole k loop, to hand the accumulator to the epilogue
+
+    #
+    # Epilogue (outside the warp-0 branch, so executed by every thread of the CTA)
+    # Partition for epilogue
+    #
+    op = tcgen05.Ld32x32bOp(tcgen05.Repetition.x128, tcgen05.Pack.NONE)
+    copy_atom_t2r = cute.make_copy_atom(op, cutlass.Float32)
+    tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tCtAcc[None,0,0])
+    thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
+    # (TmemCpy, NumTmemCpy)
+    tDtAcc = thr_copy_t2r.partition_S(tCtAcc[None,0,0])
+    # (TmemCpy, NumTmemCpy)
+    tDgC = thr_copy_t2r.partition_D(tCgC[None,0,0])
+
+    # (TmemCpy, NumTmemCpy)
+    tDrAcc = cute.make_rmem_tensor(tDgC.shape, cutlass.Float32)
+    # (TmemCpy, NumTmemCpy)
+    tDrC = cute.make_rmem_tensor(tDgC.shape, c_dtype)
+
+    # Release TMEM allocation lock
+    tmem.relinquish_alloc_permit()
+    # Wait for accumulator buffer full
+    acc_full = acc_consumer.wait_and_advance()
+
+    # Copy accumulator to register, then convert Float32 -> c_dtype
+    cute.copy(tiled_copy_t2r, tDtAcc, tDrAcc)
+    acc_vec = tDrAcc.load()
+    tDrC.store(acc_vec.to(c_dtype))
+
+    # STG Atom, just to ensure functionality
+    # For performance optimization, better to use Tma store operation to
+    # reduce address calculation and predicate calculation instructions
+    simt_atom = cute.make_copy_atom(
+        cute.nvgpu.CopyUniversalOp(), c_dtype, num_bits_per_copy=16
+    )
+    thread_layout = cute.make_layout(
+        (1, threads_per_cta), stride=(threads_per_cta, 1))
+    value_layout = cute.make_layout((1, 1))
+    tiled_copy_r2g = cute.make_tiled_copy_tv(
+        simt_atom, thread_layout, value_layout
+    )
+    thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
+    cC = cute.make_identity_tensor(gC_mnl.shape)  # coordinate tensor used only to build the bounds predicate
+    # ((atom_v, rest_v), NumGmemCpy)
+    tDcC = thr_copy_r2g.partition_D(cC)
+
+    # ((atom_v, rest_v), NumGmemCpy)
+    tDpC = cute.make_rmem_tensor(tDrC.shape, cutlass.Boolean)
+    residue_m = mC_mnl.shape[0] - cutlass.Int32(coord_x) * mma_tiler_mnk[0]  # rows of C left from this tile's origin
+    residue_n = mC_mnl.shape[1] - cutlass.Int32(coord_y) * mma_tiler_mnk[1]  # columns of C left from this tile's origin
+    for i in range(cute.size(tDrC.shape)):
+        # Swap residue_m and residue_n to match the order of tDcC
+        tDpC[i] = cute.elem_less(tDcC[i], (residue_n, residue_m))
+    cute.copy(simt_atom, cute.flatten(tDrC), cute.flatten(tDgC), pred=cute.flatten(tDpC))  # predicated store of the tile to C
+
+    acc_full.release()
+    # Deallocate TMEM
+    cute.arch.barrier()
+    tmem.free(acc_tmem_ptr)
+    pass  # no-op; the kernel returns nothing
+
+
+# Host-side JIT function: wraps the metadata pointers as tensors, builds MMA/TMA/smem config and launches the kernel.
+@cute.jit
+def my_kernel(
+    ptr_of_tensor_of_problem_sizes: cute.Pointer,  # viewed below as (num_groups, 4)
+    ptr_of_tensor_of_abc_ptrs: cute.Pointer,  # viewed below as (num_groups, 3)
+    ptr_of_tensor_of_sfasfb_ptrs: cute.Pointer,  # viewed below as (num_groups, 2)
+    ptr_of_tensor_of_tensormap: cute.Pointer,  # viewed below as (total_num_clusters, 4, 16)
+    total_num_clusters: cutlass.Int32,  # used as the grid's z extent
+    problem_sizes: List[
+        Tuple[int, int, int, int]
+    ],  # Problem sizes (m, n, k, l) for each group
+    num_groups: cutlass.Int32,
+):
+
+    tensor_of_abc_ptrs = cute.make_tensor(
+        ptr_of_tensor_of_abc_ptrs, cute.make_layout((num_groups, 3), stride=(3, 1))
+    )
+    tensor_of_sfasfb_ptrs = cute.make_tensor(
+        ptr_of_tensor_of_sfasfb_ptrs, cute.make_layout((num_groups, 2), stride=(2, 1))
+    )
+    tensor_of_problem_sizes = cute.make_tensor(
+        ptr_of_tensor_of_problem_sizes, cute.make_layout((num_groups, 4), stride=(4, 1))
+    )
+    tensor_of_tensormap = cute.make_tensor(
+        ptr_of_tensor_of_tensormap, cute.make_layout((total_num_clusters, 4, 16), stride=(64, 16, 1))
+    )
+
+    # Use fake shape for initial Tma descriptor and atom setup
+    # The real Tma desc and atom will be updated during kernel execution.
+    min_a_shape = (cutlass.Int32(64), cutlass.Int32(64), cutlass.Int32(64), cutlass.Int32(1))  # indexed below as (m, n, k, l)
+    min_b_shape = (cutlass.Int32(64), cutlass.Int32(64), cutlass.Int32(64), cutlass.Int32(1))  # indexed below as (m, n, k, l)
+    initial_a = cute.make_tensor(
+        cute.make_ptr(ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16,),
+        cute.make_layout(
+            (min_a_shape[0], cute.assume(min_a_shape[2], 32), min_a_shape[3]),
+            stride=(
+                cute.assume(min_a_shape[2], 32),
+                1,
+                cute.assume(min_a_shape[0] * min_a_shape[2], 32),
+            ),
+        ),
+    )
+    initial_b = cute.make_tensor(
+        cute.make_ptr(ab_dtype, 0, cute.AddressSpace.gmem, assumed_align=16,),
+        cute.make_layout(
+            (min_b_shape[1], cute.assume(min_b_shape[2], 32), min_b_shape[3]),
+            stride=(
+                cute.assume(min_b_shape[2], 32),
+                1,
+                cute.assume(min_b_shape[1] * min_b_shape[2], 32),
+            ),
+        ),
+    )
+
+    # Setup sfa/sfb tensor by filling A/B tensor to scale factor atom layout
+    # ((Atom_M, Rest_M),(Atom_K, Rest_K),RestL)
+    sfa_layout = blockscaled_utils.tile_atom_to_shape_SF(
+        initial_a.shape, sf_vec_size
+    )
+    # ((Atom_N, Rest_N),(Atom_K, Rest_K),RestL)
+    sfb_layout = blockscaled_utils.tile_atom_to_shape_SF(
+        initial_b.shape, sf_vec_size
+    )
+    # Create initial SFA and SFB tensors with fake shape and null pointer.
+    initial_sfa = cute.make_tensor(
+        cute.make_ptr(sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=16,), sfa_layout)
+    initial_sfb = cute.make_tensor(
+        cute.make_ptr(sf_dtype, 0, cute.AddressSpace.gmem, assumed_align=16,), sfb_layout)
+
+    # Select MMA operation
+    mma_op = tcgen05.MmaMXF4NVF4Op(
+        sf_dtype,
+        (mma_tiler_mnk[0], mma_tiler_mnk[1], mma_inst_shape_k),
+        tcgen05.CtaGroup.ONE,
+        tcgen05.OperandSource.SMEM,
+    )
+    tiled_mma = cute.make_tiled_mma(mma_op)
+
+    cluster_layout_vmnk = cute.tiled_divide(
+        cute.make_layout((1, 1, 1)),
+        (tiled_mma.thr_id.shape,),
+    )
+
+    # Compute A/B/SFA/SFB shared memory layouts, each with num_ab_stage stages
+    a_smem_layout_staged = sm100_utils.make_smem_layout_a(
+        tiled_mma,
+        mma_tiler_mnk,
+        ab_dtype,
+        num_ab_stage,
+    )
+    b_smem_layout_staged = sm100_utils.make_smem_layout_b(
+        tiled_mma,
+        mma_tiler_mnk,
+        ab_dtype,
+        num_ab_stage,
+    )
+    sfa_smem_layout_staged = blockscaled_utils.make_smem_layout_sfa(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        num_ab_stage,
+    )
+    sfb_smem_layout_staged = blockscaled_utils.make_smem_layout_sfb(
+        tiled_mma,
+        mma_tiler_mnk,
+        sf_vec_size,
+        num_ab_stage,
+    )
+    atom_thr_size = cute.size(tiled_mma.thr_id.shape)
+
+    # Setup TMA for A (stage 0 slice of the staged layout describes one load)
+    a_smem_layout = cute.slice_(a_smem_layout_staged, (None, None, None, 0))
+    tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        initial_a,
+        a_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk.shape,
+    )
+    # Setup TMA for B
+    b_smem_layout = cute.slice_(b_smem_layout_staged, (None, None, None, 0))
+    tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        initial_b,
+        b_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk.shape,
+    )
+    # Setup TMA for SFA
+    sfa_smem_layout = cute.slice_(
+        sfa_smem_layout_staged, (None, None, None, 0)
+    )
+    tma_atom_sfa, tma_tensor_sfa = cute.nvgpu.make_tiled_tma_atom_A(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        initial_sfa,
+        sfa_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk.shape,
+        internal_type=cutlass.Int16,
+    )
+    # Setup TMA for SFB
+    sfb_smem_layout = cute.slice_(
+        sfb_smem_layout_staged, (None, None, None, 0)
+    )
+    tma_atom_sfb, tma_tensor_sfb = cute.nvgpu.make_tiled_tma_atom_B(
+        cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE),
+        initial_sfb,
+        sfb_smem_layout,
+        mma_tiler_mnk,
+        tiled_mma,
+        cluster_layout_vmnk.shape,
+        internal_type=cutlass.Int16,
+    )
+
+    # Compute TMA load bytes for one stage: A + B + SFA + SFB
+    a_copy_size = cute.size_in_bytes(ab_dtype, a_smem_layout)
+    b_copy_size = cute.size_in_bytes(ab_dtype, b_smem_layout)
+    sfa_copy_size = cute.size_in_bytes(sf_dtype, sfa_smem_layout)
+    sfb_copy_size = cute.size_in_bytes(sf_dtype, sfb_smem_layout)
+    num_tma_load_bytes = (
+        a_copy_size + b_copy_size + sfa_copy_size + sfb_copy_size
+    ) * atom_thr_size
+
+    # Store the per-group CTA tile counts (M tiles, N tiles) in a List
+    cta_mn_list = []
+    for group_idx, (m, n, k, l) in enumerate(problem_sizes):  # m, n, k, l are unpacked but not used in the body
+        x, y = cute.ceil_div(problem_sizes[group_idx][:2], mma_tiler_mnk[0:2])  # tile counts come from (m, n) only
+        cta_mn_list.append((x, y))
+
+    # Compute grid size: only the z extent is used; the kernel decodes bidz into (group, tile)
+    grid = (1, 1, total_num_clusters)
+
+    # Launch the kernel
+    kernel(
+        # MMA (Matrix Multiply-Accumulate) configuration
+        tiled_mma,                  # Tiled MMA object defining NVFP4 GEMM compute pattern
+        
+        # TMA (Tensor Memory Accelerator) atoms and tensors for input matrix A
+        tma_atom_a,                 # TMA copy atom defining how to load A from global memory
+        tma_tensor_a,               # TMA tensor for A (built from the fake-shape placeholder initial_a)
+        
+        # TMA atoms and tensors for input matrix B
+        tma_atom_b,                 # TMA copy atom defining how to load B from global memory
+        tma_tensor_b,               # TMA tensor for B (built from the fake-shape placeholder initial_b)
+        
+        # TMA atoms and tensors for scale factor A
+        tma_atom_sfa,               # TMA copy atom for loading scale factors for A
+        tma_tensor_sfa,             # TMA tensor for SFA (built from the placeholder initial_sfa)
+        
+        # TMA atoms and tensors for scale factor B
+        tma_atom_sfb,               # TMA copy atom for loading scale factors for B
+        tma_tensor_sfb,             # TMA tensor for SFB (built from the placeholder initial_sfb)
+        
+        # Runtime tensor metadata for dynamic group access
+        tensor_of_abc_ptrs,         # Device tensor containing pointers to A, B, C for all groups
+        tensor_of_sfasfb_ptrs,      # Device tensor containing pointers to SFA, SFB for all groups
+        tensor_of_tensormap,        # Pre-allocated buffer for tensormap descriptors, one row per CTA (indexed by bidz)
+        tensor_of_problem_sizes,    # Device tensor containing (m, n, k, l) for each group
+        
+        # Shared memory layouts with staging for pipelined execution
+        a_smem_layout_staged,       # Staged shared memory layout for A (includes stage dimension)
+        b_smem_layout_staged,       # Staged shared memory layout for B (includes stage dimension)
+        sfa_smem_layout_staged,     # Staged shared memory layout for SFA (includes stage dimension)
+        sfb_smem_layout_staged,     # Staged shared memory layout for SFB (includes stage dimension)
+        
+        # CTA grid configuration per group
+        cta_mn_list,                # List of (M_tiles, N_tiles) for each group
+        
+        # Pipeline synchronization parameter
+        num_tma_load_bytes,         # Bytes loaded per pipeline stage (A + B + SFA + SFB), used as the barrier tx_count
+    ).launch(
+        grid=grid,
+        block=[threads_per_cta, 1, 1],
+        cluster=(1, 1, 1),
+    )
+    return
+
+
+# Global cache for compiled kernels (keyed by group size)
+_compiled_kernel_cache = {}
+# This function is used to compile the kernel once and cache it and then allow users to 
+# run the kernel multiple times to get more accurate timing results.
+def compile_kernel(problem_sizes):
+    """
+    Compile the kernel once and cache it using the number of groups as the key.
+    This should be called before any timing measurements.
+
+    Returns:
+        The compiled kernel function
+    """
+    global _compiled_kernel_cache
+    
+    # Use the number of groups (as a string) as the dictionary key
+    cache_key = f"{len(problem_sizes)}"
+
+    # Check if we already have a compiled kernel for this number of groups
+    if cache_key in _compiled_kernel_cache:
+        return _compiled_kernel_cache[cache_key]
+
+    cute_ptr_of_tensor_of_problem_sizes = make_ptr(
+        cutlass.Int32, 0, cute.AddressSpace.gmem, assumed_align=16,
+    )
+    cute_ptr_of_tensor_of_abc_ptrs = make_ptr(
+        cutlass.Int64, 0, cute.AddressSpace.gmem, assumed_align=16,
+    )
+    cute_ptr_of_tensor_of_sfasfb_ptrs = make_ptr(
+        cutlass.Int64, 0, cute.AddressSpace.gmem, assumed_align=16,
+    )
+    # Fake cluster numbers for compile only.
+    total_num_clusters = cutlass.Int32(1)
+    num_groups = cutlass.Int32(len(problem_sizes))
+    # Each cluster needs its own set of tensormaps (one for A, B, SFA, SFB)
+    # Shape: (total_num_clusters, num_tensormaps=4, bytes_per_tensormap/8=16)
+    cute_ptr_of_tensor_of_tensormap = make_ptr(
+        cutlass.Int64, 0, cute.AddressSpace.gmem, assumed_align=16,
+    )
+    compiled_func = cute.compile(
+        my_kernel,
+        cute_ptr_of_tensor_of_problem_sizes,
+        cute_ptr_of_tensor_of_abc_ptrs,
+        cute_ptr_of_tensor_of_sfasfb_ptrs,
+        cute_ptr_of_tensor_of_tensormap,
+        total_num_clusters,
+        problem_sizes,
+        num_groups
+    )
+    # Store compiled kernel in cache with the number of groups as key
+    _compiled_kernel_cache[cache_key] = compiled_func
+    return compiled_func
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Execute the block-scaled group GEMM kernel.
+    
+    This is the main entry point called by the evaluation framework.
+    It converts PyTorch tensors to CuTe tensors, launches the kernel,
+    and returns the result.
+    
+    Args:
+        data: Tuple of (abc_tensors, sfasfb_tensors, sfasfb_reordered_tensors, problem_sizes) where:
+            abc_tensors: list of tuples (a, b, c) where 
+                a is torch.Tensor[float4e2m1fn_x2] of shape [m, k // 2, l]
+                b is torch.Tensor[float4e2m1fn_x2] of shape [n, k // 2, l]
+                c is torch.Tensor[float16] of shape [m, n, l]
+            sfasfb_tensors: list of tuples (sfa, sfb) where 
+                sfa is torch.Tensor[float8_e4m3fnuz] of shape [m, k // 16, l]
+                sfb is torch.Tensor[float8_e4m3fnuz] of shape [n, k // 16, l]
+            problem_sizes: list of tuples (m, n, k, l)
+            each group has its own a, b, c, sfa, sfb with different m, n, k, l problem sizes
+            l should always be 1 for each group.
+            list size is the number of groups.
+    
+    Returns:
+        list of c tensors where c is torch.Tensor[float16] of shape [m, n, l] for each group
+    """
+    abc_tensors, _, sfasfb_reordered_tensors, problem_sizes = data
+
+    compiled_func = compile_kernel(problem_sizes)
+
+    # Extract raw data pointers from all input tensors for each group
+    # These will be passed to the GPU kernel to access the actual tensor data
+    abc_ptrs = []
+    sfasfb_ptrs = []
+    for i, ((a, b, c), (sfa_reordered, sfb_reordered), (m, n, k, l)) in enumerate(zip(abc_tensors, sfasfb_reordered_tensors, problem_sizes)):
+        # Store pointers to A, B, and C matrices for this group
+        abc_ptrs.append((a.data_ptr(), b.data_ptr(), c.data_ptr()))
+        # Store pointers to scale factor tensors for this group
+        sfasfb_ptrs.append((sfa_reordered.data_ptr(), sfb_reordered.data_ptr()))
+
+    # Create torch tensor to store problem sizes for all groups
+    # Shape: (num_groups, 4) where each row contains (m, n, k, l) for that group
+    # Layout: (num_groups, 4):(4, 1) means row-major storage
+    tensor_of_problem_sizes = torch.tensor(
+        problem_sizes, dtype=torch.int32, device="cuda"
+    )
+
+    # Create torch tensors to store data pointers for all groups
+    # These allow the GPU kernel to dynamically access different tensors per group
+    # tensor_of_abc_ptrs: Shape (num_groups, 3) containing (a_ptr, b_ptr, c_ptr) per group
+    # tensor_of_sfasfb_ptrs: Shape (num_groups, 2) containing (sfa_ptr, sfb_ptr) per group
+    tensor_of_abc_ptrs = torch.tensor(abc_ptrs, dtype=torch.int64, device="cuda")
+    tensor_of_sfasfb_ptrs = torch.tensor(sfasfb_ptrs, dtype=torch.int64, device="cuda")
+
+    # Compute the tile shape for each CUDA Thread Block (CTA)
+    # cta_tile_shape_mn: [M_tile, N_tile] = [128, 128] for this kernel
+    cta_tile_shape_mn = [128, mma_tiler_mnk[1]]
+    # cluster_tile_shape_mn: Total tile shape per cluster (same as CTA since cluster is 1x1)
+    cluster_tile_shape_mn = tuple(
+        x * y for x, y in zip(cta_tile_shape_mn, (1, 1))
+    )
+    
+    # Compute total number of cluster tiles needed across all groups
+    # Each group's (m, n) dimensions are divided into tiles of size cluster_tile_shape_mn
+    # This determines the total grid size (bidz dimension) for kernel launch
+    total_num_clusters = 0
+    num_groups = len(problem_sizes)
+    for m, n, _, _ in problem_sizes:
+        # Calculate number of tiles needed in M and N dimensions for this group
+        num_clusters_mn = tuple(
+            (x + y - 1) // y for x, y in zip((m, n), cluster_tile_shape_mn)
+        )
+        # Multiply M_tiles * N_tiles to get total tiles for this group
+        total_num_clusters += functools.reduce(lambda x, y: x * y, num_clusters_mn)
+
+    # Allocate device memory for tensormap descriptors
+    # Each cluster needs its own set of tensormaps (one for A, B, SFA, SFB)
+    # Shape: (total_num_clusters, num_tensormaps=4, bytes_per_tensormap/8=16)
+    # Tensormaps are hardware descriptors used by TMA for efficient memory transfers
+    tensormap_shape = (
+        total_num_clusters,
+        num_tensormaps,
+        bytes_per_tensormap // 8,
+    )
+    tensor_of_tensormap = torch.empty(tensormap_shape, dtype=torch.int64, device="cuda")
+
+    # Create CuTe pointers to the metadata tensors that will be passed to the kernel
+    # These allow the GPU kernel to read problem sizes and tensor pointers
+    cute_ptr_of_tensor_of_abc_ptrs = make_ptr(
+        cutlass.Int64,
+        tensor_of_abc_ptrs.data_ptr(),
+        cute.AddressSpace.gmem,
+        assumed_align=16,
+    )
+    cute_ptr_of_tensor_of_sfasfb_ptrs = make_ptr(
+        cutlass.Int64,
+        tensor_of_sfasfb_ptrs.data_ptr(),
+        cute.AddressSpace.gmem,
+        assumed_align=16,
+    )
+    cute_ptr_of_tensor_of_problem_sizes = make_ptr(
+        cutlass.Int32,
+        tensor_of_problem_sizes.data_ptr(),
+        cute.AddressSpace.gmem,
+        assumed_align=16,
+    )
+    cute_ptr_of_tensor_of_tensormap = make_ptr(
+        cutlass.Int64,
+        tensor_of_tensormap.data_ptr(),
+        cute.AddressSpace.gmem,
+        assumed_align=16,
+    )
+
+    # Launch the JIT-compiled GPU kernel with all prepared data
+    # The kernel will perform block-scaled group GEMM: C = A * SFA * B * SFB for all groups
+    compiled_func(
+        cute_ptr_of_tensor_of_problem_sizes, # Pointer to problem sizes array
+        cute_ptr_of_tensor_of_abc_ptrs,      # Pointer to ABC tensor pointers array
+        cute_ptr_of_tensor_of_sfasfb_ptrs,   # Pointer to scale factor pointers array
+        cute_ptr_of_tensor_of_tensormap,     # Pointer to tensormap buffer
+        total_num_clusters,                  # Total number of CTAs to launch
+        problem_sizes,                       # Problem sizes list (for host-side processing)
+        num_groups,                          # Number of groups in this batch
+    )
+
+    res = []
+    for i in range(num_groups):
+        res.append(abc_tensors[i][2])
+    return res
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_group_gemm/task.py b/problems/nvidia/nvfp4_group_gemm/task.py
new file mode 100644
index 00000000..94c11435
--- /dev/null
+++ b/problems/nvidia/nvfp4_group_gemm/task.py
@@ -0,0 +1,8 @@
+import torch
+from typing import TypedDict, TypeVar
+
+input_t = TypeVar("input_t", bound=tuple[list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]], list[tuple[torch.Tensor, torch.Tensor]], list[tuple[torch.Tensor, torch.Tensor]], list[tuple[int, int, int, int]]])
+output_t = TypeVar("output_t", bound=list[torch.Tensor])
+class TestSpec(TypedDict):
+    problem_sizes: list[tuple[int, int, int, int]]
+    seed: int
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_group_gemm/task.yml b/problems/nvidia/nvfp4_group_gemm/task.yml
new file mode 100644
index 00000000..3bf08d93
--- /dev/null
+++ b/problems/nvidia/nvfp4_group_gemm/task.yml
@@ -0,0 +1,65 @@
+# name: nvfp4-block-scaled-gemm
+
+files:
+  - {"name": "submission.py", "source": "@SUBMISSION@"}
+  - {"name": "task.py", "source": "task.py"}
+  - {"name": "utils.py", "source": "utils.py"}
+  - {"name": "reference.py", "source": "reference.py"}
+  - {"name": "eval.py", "source": "../eval.py"}
+
+lang: "py"
+
+description: |
+  
+  You will implement a block scaled group matrix-matrix multiplication kernel optimized for NVIDIA B200.
+  To be explicit, you will be given a tuple of tensors:
+  ```
+  (abc_tensors, sfasfb_tensors, sfasfb_reordered_tensors, problem_sizes)
+  ```
+  where:
+  * `abc_tensors` is list of tuples (a, b, c) where 
+    a is torch.Tensor[float4e2m1fn_x2] of shape [M, K // 2, L]
+    b is torch.Tensor[float4e2m1fn_x2] of shape [N, K // 2, L]
+    c is torch.Tensor[float16] of shape [M, N, L]
+  * `sfasfb_tensors` is list of tuples (sfa, sfb) where 
+    sfa is torch.Tensor[float8_e4m3fnuz] of shape [M, K // 16, L]
+    sfb is torch.Tensor[float8_e4m3fnuz] of shape [N, K // 16, L]
+  * `problem_sizes` is list of tuples (M, N, K, L)
+  
+  Each group's matrix sizes `M` is divisible by mma_tiler_mn[0], `N` is divisible by mma_tiler_mn[1], `K` is divisible by 256.
+  The ranking criterion is the geometric mean of the benchmark results.
+  For the grand prize, your kernel will be evaluated against the speed of light analysis
+  and the solution closest to the speed of light will be awarded the grand prize.
+  ```
+  The speed of light analysis is based on the max(FP4 Tensor Core math throughput, DRAM memory throughput) of B200, tested under a 1.5 GHz clock with the average M, N, K values per group:
+  G  M_values    N_values    K_values  L time[us]
+  8 [80, 176, 128, 72, 64, 248, 96, 160] [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096] [7168, 7168, 7168, 7168, 7168, 7168, 7168, 7168] 1 18.833
+  8 [40, 76, 168, 72, 164, 148, 196, 160] [7168, 7168, 7168, 7168, 7168, 7168, 7168, 7168] [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] 1 10.667
+  2 [192, 320] [3072, 3072] [4096, 4096] 1 2.406
+  2 [128, 384] [4096, 4096] [1536, 1536] 1 1.525
+  ```
+config:
+  main: "eval.py"
+
+templates:
+  Python: "template.py"
+
+tests:
+  - {"m": [96, 128], "n": [128, 256], "k": [128, 512], "g": 2, "seed": 1111}
+  - {"m": [256, 72], "n": [512, 384], "k": [256, 256], "g": 2, "seed": 1111}
+  - {"m": [128, 128], "n": [128, 256], "k": [128, 512], "g": 2, "seed": 1111}
+  - {"m": [80, 128, 256], "n": [384, 256, 128], "k": [256, 512, 128], "g": 3, "seed": 1111}
+  - {"m": [64, 72, 96], "n": [128, 384, 512], "k": [384, 512, 128], "g": 3, "seed": 1111}
+  - {"m": [64, 256, 128], "n": [768, 128, 256], "k": [512, 512, 128], "g": 3, "seed": 1111}
+  - {"m": [128, 128, 64], "n": [256, 512, 512], "k": [768, 128, 768], "g": 3, "seed": 1111}
+  - {"m": [128, 128, 128, 128], "n": [128, 128, 128, 128], "k": [128, 128, 128, 128], "g": 4, "seed": 1111}
+  - {"m": [40, 56, 384, 512], "n": [512, 384, 256, 128], "k": [256, 256, 256, 256], "g": 4, "seed": 1111}
+  - {"m": [512, 384, 256, 128], "n": [256, 256, 256, 256], "k": [512, 128, 512, 128], "g": 4, "seed": 1111}
+
+benchmarks:
+  - {"m": [80, 176, 128, 72, 64, 248, 96, 160], "n": [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096], "k": [7168, 7168, 7168, 7168, 7168, 7168, 7168, 7168], "g": 8, "seed": 1111}
+  - {"m": [40, 76, 168, 72, 164, 148, 196, 160], "n": [7168, 7168, 7168, 7168, 7168, 7168, 7168, 7168], "k": [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048], "g": 8, "seed": 1111}
+  - {"m": [192, 320], "n": [3072, 3072], "k": [4096, 4096], "g": 2, "seed": 1111}
+  - {"m": [128, 384], "n": [4096, 4096], "k": [1536, 1536], "g": 2, "seed": 1111}
+
+ranking_by: "geom"
diff --git a/problems/nvidia/nvfp4_group_gemm/template.py b/problems/nvidia/nvfp4_group_gemm/template.py
new file mode 100644
index 00000000..b6005faa
--- /dev/null
+++ b/problems/nvidia/nvfp4_group_gemm/template.py
@@ -0,0 +1,31 @@
+from task import input_t, output_t
+
+
+def custom_kernel(data: input_t) -> output_t:
+    """
+    Template for a block-scaled fp4 group gemm submission
+    Args:
+        data: tuple (abc_tensors, sfasfb_tensors, sfasfb_reordered_tensors, problem_sizes) where:
+            abc_tensors: list of tuples (a, b, c) where 
+                a is torch.Tensor[float4e2m1fn_x2] of shape [m, k // 2, l]
+                b is torch.Tensor[float4e2m1fn_x2] of shape [n, k // 2, l]
+                c is torch.Tensor[float16] of shape [m, n, l]
+            sfasfb_tensors: list of tuples (sfa, sfb) where 
+                sfa is torch.Tensor[float8_e4m3fnuz] of shape [m, k // 16, l]
+                sfb is torch.Tensor[float8_e4m3fnuz] of shape [n, k // 16, l]
+            sfasfb_reordered_tensors: list of tuples (sfa_reordered, sfb_reordered) where 
+                sfa_reordered is torch.Tensor[float8_e4m3fnuz] of shape [32, 4, rest_m, 4, rest_k, l]
+                sfb_reordered is torch.Tensor[float8_e4m3fnuz] of shape [32, 4, rest_n, 4, rest_k, l]
+            problem_sizes: list of tuples (m, n, k, l)
+        each group has its own a, b, c, sfa, sfb with different m, n, k, l problem sizes
+        l should always be 1 for each group.
+    Returns:
+        list of c tensors, where each c is torch.Tensor[float16] of shape [m, n, l]
+    """
+    abc_tensors, sfasfb_tensors, sfasfb_reordered_tensors, problem_sizes = data
+    result_tensors = []
+    for i, ((a, b, c), (sfa_reordered, sfb_reordered), (m, n, k, l)) in enumerate(zip(abc_tensors, sfasfb_reordered_tensors, problem_sizes)):
+        # add your implementation here
+        result_tensors.append(c)
+
+    return result_tensors
\ No newline at end of file
diff --git a/problems/nvidia/nvfp4_group_gemm/utils.py b/problems/nvidia/nvfp4_group_gemm/utils.py
new file mode 100644
index 00000000..486116b8
--- /dev/null
+++ b/problems/nvidia/nvfp4_group_gemm/utils.py
@@ -0,0 +1,176 @@
+import os
+import random
+import numpy as np
+import torch
+
+
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+
+def get_device(use_cuda: bool = True) -> torch.device:
+    """Get the appropriate device (GPU or CPU)."""
+    if use_cuda:
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            print("No compatible GPU found. Falling back to CPU.")
+    return torch.device("cpu")
+
+
+# Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
+@torch.no_grad()
+def verbose_allclose(
+        received: torch.Tensor,
+        expected: torch.Tensor,
+        rtol=1e-05,
+        atol=1e-08,
+        max_print=5
+) -> list[str]:
+    """
+    Check whether two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    rtol (float): Relative tolerance; relative to expected
+    atol (float): Absolute tolerance.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+    list[str]: Empty list if the tensors match within tolerance; otherwise mismatch details (or ["SIZE MISMATCH"] if the shapes differ).
+    """
+    # Check if the shapes of the tensors match
+    if received.shape != expected.shape:
+        return ["SIZE MISMATCH"]
+
+    # Calculate the difference between the tensors
+    diff = torch.abs(received - expected)
+
+    # Determine the tolerance
+    tolerance = atol + rtol * torch.abs(expected)
+
+    # Find tolerance mismatched elements
+    tol_mismatched = diff > tolerance
+
+    # Find nan mismatched elements
+    nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+
+    # Find +inf mismatched elements
+    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    # Find -inf mismatched elements
+    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+
+    # Find all mismatched elements
+    mismatched = torch.logical_or(
+        torch.logical_or(tol_mismatched, nan_mismatched),
+        torch.logical_or(posinf_mismatched, neginf_mismatched),
+    )
+
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+@torch.no_grad()
+def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+    """
+    Check whether two tensors are element-wise perfectly equal, providing detailed information about mismatches.
+
+    Parameters:
+    received (torch.Tensor): Tensor we actually got.
+    expected (torch.Tensor): Tensor we expected to receive.
+    max_print (int): Maximum number of mismatched elements to print.
+
+    Returns:
+         Empty list if tensors are equal, otherwise a list of detailed error messages
+    """
+    mismatched = torch.not_equal(received, expected)
+    mismatched_indices = torch.nonzero(mismatched)
+
+    # Count the number of mismatched elements
+    num_mismatched = mismatched.count_nonzero().item()
+
+    # Generate detailed information if there are mismatches
+    if num_mismatched >= 1:
+        mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+
+        for index in mismatched_indices[:max_print]:
+            i = tuple(index.tolist())
+            mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
+        if num_mismatched > max_print:
+            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+        return mismatch_details
+
+    return []
+
+
+def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
+    """
+    Convenient "default" implementation for tasks' `check_implementation` function.
+    """
+    expected = reference(data)
+    for i, (output_i, expected_i) in enumerate(zip(output, expected)):
+        reasons = verbose_allclose(output_i, expected_i, rtol=rtol, atol=atol)
+        if len(reasons) > 0:
+            return False, f"mismatch found! custom implementation doesn't match reference: {i} {reasons}"
+
+    return True, ''
+
+
+def make_match_reference(reference: callable, **kwargs):
+    def wrapped(data, output):
+        return match_reference(data, output, reference=reference, **kwargs)
+    return wrapped
+
+
+class DeterministicContext:
+    def __init__(self):
+        self.allow_tf32 = None
+        self.deterministic = None
+        self.cublas = None
+
+    def __enter__(self):
+        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '')
+        self.allow_tf32 = torch.backends.cudnn.allow_tf32
+        self.deterministic = torch.backends.cudnn.deterministic
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cudnn.deterministic = True
+        torch.use_deterministic_algorithms(True)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        torch.backends.cudnn.allow_tf32 = self.allow_tf32
+        torch.backends.cudnn.deterministic = self.deterministic
+        torch.use_deterministic_algorithms(False)
+        os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
+
+def clear_l2_cache():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
+    # write to the dummy tensor (presumably to evict prior contents from the L2 cache)
+    dummy.fill_(42)
+    del dummy
\ No newline at end of file

From 250b00478f27047ee73aaa24944e8802b7fb76be Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 19 Jan 2026 08:59:33 -0800
Subject: [PATCH 193/207] Add nvfp4_group_gemm problem to nvidia.yaml (#92)

- Deadline: Feb 20, 2026
- Runners: B200 and NVIDIA
---
 problems/nvidia.yaml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/problems/nvidia.yaml b/problems/nvidia.yaml
index 2bc74568..8b1ebf7b 100644
--- a/problems/nvidia.yaml
+++ b/problems/nvidia.yaml
@@ -25,4 +25,9 @@ problems:
     deadline: "2026-01-20 7:59"
     gpus:
       - B200
-
+  - directory: nvidia/nvfp4_group_gemm
+    name: nvfp4_group_gemm
+    deadline: "2026-02-20"
+    gpus:
+      - B200
+      - NVIDIA

From 5c9d38e4f40b2f7e13616ddc488b754ff3b25de3 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 19 Jan 2026 09:07:35 -0800
Subject: [PATCH 194/207] Add nvfp4_group_gemm to nvidia.yaml + fix eval.py
 (#93)

* Add nvfp4_group_gemm problem to nvidia.yaml

- Deadline: Feb 20, 2026
- Runners: B200 and NVIDIA

* Fix eval.py to handle list values in test cases

Bypass text serialization and parse YAML directly to properly
handle list values for m, n, k in group GEMM test cases.
---
 problems/nvidia/nvfp4_group_gemm/eval.py | 62 ++++++++++++++----------
 1 file changed, 36 insertions(+), 26 deletions(-)

diff --git a/problems/nvidia/nvfp4_group_gemm/eval.py b/problems/nvidia/nvfp4_group_gemm/eval.py
index 09ecaa43..df94400e 100644
--- a/problems/nvidia/nvfp4_group_gemm/eval.py
+++ b/problems/nvidia/nvfp4_group_gemm/eval.py
@@ -355,6 +355,33 @@ def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
     return 0
 
 
+def get_test_cases_from_yaml(yaml_tests: list[dict], seed: Optional[int]) -> list[TestCase]:
+    """
+    Create TestCase objects directly from YAML test definitions.
+    This bypasses text serialization to properly handle list values.
+    """
+    tests = []
+    for test in yaml_tests:
+        # Convert lists to tuples for consistency
+        args = {}
+        spec_parts = []
+        for k, v in test.items():
+            if isinstance(v, list):
+                args[k] = tuple(v)
+            else:
+                args[k] = v
+            spec_parts.append(f"{k}: {v}")
+        spec = "; ".join(spec_parts)
+        tests.append(TestCase(spec=spec, args=args))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
 def main():
     fd = os.getenv("POPCORN_FD")
     if not fd:
@@ -369,34 +396,17 @@ def main():
     seed = int(seed) if seed else None
     set_seed(seed or 42)
 
-    filename = None
-
-    with tempfile.NamedTemporaryFile(delete=False) as tmp:
-
-        def build_test_string(tests: list[dict]):
-            as_str = ""
-            for test in tests:
-                kvs = []
-                for k, v in test.items():
-                    kvs.append(f"{k}: {v}")
-                as_str += "; ".join(kvs) + "\n"
-            return as_str
+    import yaml
 
-        import yaml
-
-        yaml_content = yaml.safe_load(open(sys.argv[2], "r"))
-        if mode == "test":
-            tests_str = build_test_string(yaml_content.get("tests", []))
-        elif mode in ("benchmark", "leaderboard", "profile"):
-            tests_str = build_test_string(yaml_content.get("benchmarks", []))
-
-        tmp.write(tests_str.encode("utf-8"))
-        tmp.flush()
-        filename = tmp.name
-
-    tests = get_test_cases(filename, seed)
+    yaml_content = yaml.safe_load(open(sys.argv[2], "r"))
+    if mode == "test":
+        yaml_tests = yaml_content.get("tests", [])
+    elif mode in ("benchmark", "leaderboard", "profile"):
+        yaml_tests = yaml_content.get("benchmarks", [])
+    else:
+        yaml_tests = []
 
-    os.unlink(filename)
+    tests = get_test_cases_from_yaml(yaml_tests, seed)
 
     with PopcornOutput(int(fd)) as logger:
         import multiprocessing

From 3f230476f114b907d155798b4a4debfe5c18f993 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 19 Jan 2026 09:11:54 -0800
Subject: [PATCH 195/207] Fix eval.py source path in task configuration

Updated the source path for eval.py in task.yml.
---
 problems/nvidia/nvfp4_group_gemm/task.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/nvidia/nvfp4_group_gemm/task.yml b/problems/nvidia/nvfp4_group_gemm/task.yml
index 3bf08d93..df25a7cb 100644
--- a/problems/nvidia/nvfp4_group_gemm/task.yml
+++ b/problems/nvidia/nvfp4_group_gemm/task.yml
@@ -5,7 +5,7 @@ files:
   - {"name": "task.py", "source": "task.py"}
   - {"name": "utils.py", "source": "utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "../eval.py"}
+  - {"name": "eval.py", "source": "eval.py"}
 
 lang: "py"
 

From 0942e96d3c5c1b5590e85a79703111f15714ac35 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 19 Jan 2026 09:36:31 -0800
Subject: [PATCH 196/207] Fix eval.py to properly parse list values in test
 cases (#94)

* Fix eval.py to properly parse list values in test cases

- Updated regex to use [^\]]* instead of [^\]]+ to handle edge cases
- Added underscores to key pattern [a-zA-Z_]+
- Skip empty lines and empty parts when parsing
- Use re.fullmatch directly instead of both re.match and re.fullmatch
- Handle empty tuples/lists in value parsing

* Fix eval.py to use text parsing instead of YAML

Kernelbot passes a text file with format like:
  m: [96, 128]; n: [128, 256]; k: [128, 512]; g: 2; seed: 1111

Use get_test_cases() to parse this text format directly.
Remove unused get_test_cases_from_yaml function.
---
 problems/nvidia/nvfp4_group_gemm/eval.py | 61 ++++++++----------------
 1 file changed, 19 insertions(+), 42 deletions(-)

diff --git a/problems/nvidia/nvfp4_group_gemm/eval.py b/problems/nvidia/nvfp4_group_gemm/eval.py
index df94400e..2f00f53d 100644
--- a/problems/nvidia/nvfp4_group_gemm/eval.py
+++ b/problems/nvidia/nvfp4_group_gemm/eval.py
@@ -67,13 +67,22 @@ def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
 
     tests = []
     lines = content.splitlines()
-    match = r"\s*([a-zA-Z]+):\s*(\([^)]+\)|\[[^\]]+\]|[a-zA-Z]+|[+-]?[0-9]+)\s*"
+    # Match key: value pairs where value can be:
+    # - a list like [1, 2, 3]
+    # - a tuple like (1, 2, 3)
+    # - an integer
+    # - an alphabetic string
+    match = r"\s*([a-zA-Z_]+)\s*:\s*(\[[^\]]*\]|\([^)]*\)|[a-zA-Z_]+|[+-]?[0-9]+)\s*"
     for line in lines:
+        if not line.strip():
+            continue
         parts = line.split(";")
         case = {}
         for part in parts:
-            matched = re.match(match, part)
-            if not re.fullmatch(match, part):
+            if not part.strip():
+                continue
+            matched = re.fullmatch(match, part)
+            if not matched:
                 print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
                 exit(113)
             key = matched[1]
@@ -84,7 +93,11 @@ def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
                 # Try parsing as tuple/list
                 if (val.startswith('(') and val.endswith(')')) or (val.startswith('[') and val.endswith(']')):
                     try:
-                        val = tuple(int(x.strip()) for x in val[1:-1].split(','))
+                        inner = val[1:-1].strip()
+                        if inner:
+                            val = tuple(int(x.strip()) for x in inner.split(','))
+                        else:
+                            val = tuple()
                     except ValueError:
                         pass
 
@@ -355,33 +368,6 @@ def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
     return 0
 
 
-def get_test_cases_from_yaml(yaml_tests: list[dict], seed: Optional[int]) -> list[TestCase]:
-    """
-    Create TestCase objects directly from YAML test definitions.
-    This bypasses text serialization to properly handle list values.
-    """
-    tests = []
-    for test in yaml_tests:
-        # Convert lists to tuples for consistency
-        args = {}
-        spec_parts = []
-        for k, v in test.items():
-            if isinstance(v, list):
-                args[k] = tuple(v)
-            else:
-                args[k] = v
-            spec_parts.append(f"{k}: {v}")
-        spec = "; ".join(spec_parts)
-        tests.append(TestCase(spec=spec, args=args))
-
-    if seed is not None:
-        for test in tests:
-            if "seed" in test.args:
-                test.args["seed"] = _combine(test.args["seed"], seed)
-
-    return tests
-
-
 def main():
     fd = os.getenv("POPCORN_FD")
     if not fd:
@@ -396,17 +382,8 @@ def main():
     seed = int(seed) if seed else None
     set_seed(seed or 42)
 
-    import yaml
-
-    yaml_content = yaml.safe_load(open(sys.argv[2], "r"))
-    if mode == "test":
-        yaml_tests = yaml_content.get("tests", [])
-    elif mode in ("benchmark", "leaderboard", "profile"):
-        yaml_tests = yaml_content.get("benchmarks", [])
-    else:
-        yaml_tests = []
-
-    tests = get_test_cases_from_yaml(yaml_tests, seed)
+    # Parse test cases from temp file (text format from kernelbot)
+    tests = get_test_cases(sys.argv[2], seed)
 
     with PopcornOutput(int(fd)) as logger:
         import multiprocessing

From aeee2ba37fa375c761ff05c779b8f44de03b0b31 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 20 Jan 2026 16:35:57 -0800
Subject: [PATCH 197/207] patch utils.py to avoid [] as a valid submission

---
 problems/nvidia/nvfp4_group_gemm/utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/problems/nvidia/nvfp4_group_gemm/utils.py b/problems/nvidia/nvfp4_group_gemm/utils.py
index 486116b8..18a1a420 100644
--- a/problems/nvidia/nvfp4_group_gemm/utils.py
+++ b/problems/nvidia/nvfp4_group_gemm/utils.py
@@ -131,6 +131,10 @@ def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -
     Convenient "default" implementation for tasks' `check_implementation` function.
     """
     expected = reference(data)
+
+    if len(output) != len(expected):
+        return False, f"output length mismatch: got {len(output)}, expected {len(expected)}"
+
     for i, (output_i, expected_i) in enumerate(zip(output, expected)):
         reasons = verbose_allclose(output_i, expected_i, rtol=rtol, atol=atol)
         if len(reasons) > 0:

From 66065c201cb4cf1b33e7f33c38b0b4ee84643b0e Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 25 Jan 2026 01:05:43 +0100
Subject: [PATCH 198/207] Fix: better configuration for grouped gemm launch

---
 .../nvidia/eval_better_bench_grouped_gemm.py  | 526 ++++++++++++++++++
 problems/nvidia/nvfp4_group_gemm/task.yml     |   4 +-
 2 files changed, 529 insertions(+), 1 deletion(-)
 create mode 100644 problems/nvidia/eval_better_bench_grouped_gemm.py

diff --git a/problems/nvidia/eval_better_bench_grouped_gemm.py b/problems/nvidia/eval_better_bench_grouped_gemm.py
new file mode 100644
index 00000000..d9cecc35
--- /dev/null
+++ b/problems/nvidia/eval_better_bench_grouped_gemm.py
@@ -0,0 +1,526 @@
+import base64
+import dataclasses
+import multiprocessing
+import re
+import time
+import os
+import sys
+import math
+
+# Disable CuTe DSL file caching for more stable benchmarking
+os.environ["CUTE_DSL_DISABLE_FILE_CACHING"] = "1"
+
+
+def _init_worker():
+    """Initialize worker process with correct env vars."""
+    os.environ["CUTE_DSL_DISABLE_FILE_CACHING"] = "1"
+
+
+from pathlib import Path
+from typing import Any, Optional
+
+import torch.cuda
+from cutlass.cute.nvgpu.common import OpError
+from torch.cuda.nvtx import range as nvtx_range
+
+from utils import set_seed, clear_l2_cache_large as clear_l2_cache
+
+try:
+    from task import TestSpec
+except ImportError:
+    TestSpec = dict
+
+from reference import check_implementation, generate_input
+
+NUM_ITERATIONS_PER_BENCHMARK = 15
+
+
+class PopcornOutput:
+    def __init__(self, fd: int):
+        self.file = os.fdopen(fd, "w")
+        os.set_inheritable(fd, False)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+
+    def print(self, *args, **kwargs):
+        print(*args, **kwargs, file=self.file, flush=True)
+
+    def log(self, key, value):
+        self.print(f"{key}: {value}")
+
+
+@dataclasses.dataclass
+class TestCase:
+    args: dict
+    spec: str
+
+
+def _combine(a: int, b: int) -> int:
+    # combine two integers into one:
+    # we need this to generate a secret seed based on the test-level seed and
+    # the global secret seed.
+    # the test-level seeds are public knowledge, and typically relatively small numbers,
+    # so we need to make sure they don't provide any useful info for the full seed.
+    # This Cantor construction ensures that if the secret seed is a large number,
+    # then so is the overall seed.
+    return int(a + (a + b) * (a + b + 1) // 2)
+
+
+def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
+    try:
+        content = Path(file_name).read_text()
+    except Exception as E:
+        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
+        exit(113)
+
+    tests = []
+    lines = content.splitlines()
+    # Match key: value pairs where value can be:
+    # - a list like [1, 2, 3] (needed for group gemm which has per-group dimensions)
+    # - a tuple like (1, 2, 3)
+    # - an integer
+    # - an alphabetic string
+    match = r"\s*([a-zA-Z_]+)\s*:\s*(\[[^\]]*\]|\([^)]*\)|[a-zA-Z_]+|[+-]?[0-9]+)\s*"
+    for line in lines:
+        parts = line.split(";")
+        case = {}
+        for part in parts:
+            matched = re.match(match, part)
+            if not re.fullmatch(match, part):
+                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
+                exit(113)
+            key = matched[1]
+            val = matched[2]
+            try:
+                val = int(val)
+            except ValueError:
+                # Try parsing as tuple/list (e.g., [1, 2, 3] for group gemm dimensions)
+                if (val.startswith("(") and val.endswith(")")) or (
+                    val.startswith("[") and val.endswith("]")
+                ):
+                    try:
+                        inner = val[1:-1].strip()
+                        if inner:
+                            val = tuple(int(x.strip()) for x in inner.split(","))
+                        else:
+                            val = tuple()
+                    except ValueError:
+                        pass
+
+            case[key] = val
+        tests.append(TestCase(spec=line, args=case))
+
+    if seed is not None:
+        for test in tests:
+            if "seed" in test.args:
+                test.args["seed"] = _combine(test.args["seed"], seed)
+
+    return tests
+
+
+@dataclasses.dataclass
+class Stats:
+    runs: int
+    mean: float
+    std: float
+    err: float
+    best: float
+    worst: float
+
+
+def calculate_stats(durations: list[int]):
+    """
+    Calculate statistical data from a list of durations.
+    A single sample has no spread, so std and err are reported as 0.0 for it.
+    @param durations: A non-empty list of durations in nanoseconds.
+    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
+    """
+    runs = len(durations)
+    total = sum(durations)
+    best = min(durations)
+    worst = max(durations)
+
+    avg = total / runs
+    variance = sum((x - avg) ** 2 for x in durations)  # sum of squared deviations
+    std = math.sqrt(variance / (runs - 1)) if runs > 1 else 0.0  # Bessel-corrected
+    err = std / math.sqrt(runs)
+
+    return Stats(
+        runs=runs, mean=avg, std=std, err=err, best=float(best), worst=float(worst)
+    )
+
+
+def _clone_data(data):
+    """
+    Recursively goes through data and clones all tensors.
+    """
+    if isinstance(data, tuple):
+        return tuple(_clone_data(x) for x in data)
+    elif isinstance(data, list):
+        return [_clone_data(x) for x in data]
+    elif isinstance(data, dict):
+        return {k: _clone_data(v) for k, v in data.items()}
+    elif isinstance(data, torch.Tensor):
+        return data.clone()
+    else:
+        return data
+
+
+def _run_single_test(test: TestCase):
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+    try:
+        submission_output = custom_kernel(_clone_data(data))
+
+    except OpError as E:
+        print(f"Encountered {E}", file=sys.stderr)
+        return False, str(E)
+    torch.cuda.synchronize()
+    return check_implementation(data, submission_output)
+
+
+def run_single_test(pool: multiprocessing.Pool, test: TestCase):
+    """
+    Runs a single test in another process.
+    """
+    return pool.apply(_run_single_test, (test,))
+
+
+def run_testing(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Executes the actual test case code and checks for correctness.
+
+    @param logger: A PopcornOutput object used for logging test results.
+    @param tests: A list of TestCase objects representing the test cases to be executed.
+    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
+    """
+    passed = True
+    logger.log("test-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"test.{idx}.spec", test.spec)
+        good, message = run_single_test(pool, test)
+        if not good:
+            logger.log(f"test.{idx}.status", "fail")
+            logger.log(f"test.{idx}.error", message)
+            passed = False
+        else:
+            logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_benchmark(
+    test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float
+) -> Stats | Any:
+    """
+    Runs one benchmark; returns Stats, or an error message on failure. Do not call directly.
+    """
+    from submission import custom_kernel
+
+    durations = []
+    data_list = []
+    # generate NUM_ITERATIONS_PER_BENCHMARK inputs once up front, bumping the
+    # seed (when the test has one) before each call to generate_input
+    for i in range(NUM_ITERATIONS_PER_BENCHMARK):
+        if "seed" in test.args:
+            test.args["seed"] += 42
+        data = generate_input(**test.args)
+        data_list.append(data)
+
+    check_copy = _clone_data(data_list)
+
+    #  first, one obligatory correctness check
+    outputs = []
+    try:
+        for data in data_list:
+            output = custom_kernel(_clone_data(data))
+            outputs.append(output)
+    except OpError as E:
+        return f"Encountered {E}"
+    for reference_output, custom_output in zip(check_copy, outputs):
+        good, message = check_implementation(reference_output, custom_output)
+        if not good:
+            return message
+
+    # now, do the timing runs (each run times one pass over all of data_list);
+    # there is an upper bound of max_repeats runs, and a lower bound of 3 runs;
+    # otherwise, we repeat until the stop conditions checked at the end of the
+    # loop body are met. outputs are only re-validated when `recheck` is set.
+
+    bm_start_time = time.perf_counter_ns()
+    for i in range(max_repeats):
+        torch.cuda.synchronize()
+
+        outputs = []
+        clear_l2_cache()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+        for data in data_list:
+            output = custom_kernel(data)
+            outputs.append(output)
+        end_event.record()
+        torch.cuda.synchronize()
+        duration = (
+            start_event.elapsed_time(end_event) / NUM_ITERATIONS_PER_BENCHMARK
+        ) * 1e6  # Convert ms to ns
+
+        if recheck:
+            for reference_output, custom_output in zip(check_copy, outputs):
+                good, message = check_implementation(reference_output, custom_output)
+                if not good:
+                    return message
+
+        durations.append(duration)
+
+        total_bm_duration = time.perf_counter_ns() - bm_start_time
+        if (
+            i > 1 and total_bm_duration > 1e8
+        ):  # at least 3 runs, and at least 100 ms total time
+            stats = calculate_stats(durations)
+            # stop if either
+            # a) relative error dips below 0.1%
+            # b) we exceed the total time limit for benchmarking the kernel
+            # c) we exceed 2 minutes of total wallclock time.
+            if (
+                stats.err / stats.mean < 0.001
+                or stats.mean * stats.runs > max_time_ns
+                or total_bm_duration > 120e9
+            ):
+                break
+
+    return calculate_stats(durations)
+
+
+def run_single_benchmark(
+    pool: multiprocessing.Pool,
+    test: TestCase,
+    recheck: bool,
+    max_repeats: int,
+    max_time_ns: float,
+):
+    """
+    For a particular test case, check correctness (if applicable) and grab runtime results.
+
+    @param pool: Process on which the benchmark will be launched.
+    @param test: TestCase object.
+    @param recheck: Flag for whether to explicitly check functional correctness.
+    @param max_repeats: Number of trials to repeat.
+    @param max_time_ns: Timeout time in nanoseconds.
+    @return: A Stats object for this particular benchmark case or an error if the test fails.
+    """
+    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
+
+
+def run_benchmarking(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    """
+    Executes benchmarking code for a CUDA Kernel and logs runtimes.
+
+    @param logger: A PopcornOutput object used for logging benchmark results.
+    @param pool: Process on which the benchmarks will be launched.
+    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
+    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
+    """
+
+    run_single_benchmark(pool, tests[0], False, 100, 10e7)
+
+    passed = True
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        result = run_single_benchmark(pool, test, False, 100, 10e9)
+        if isinstance(result, Stats):
+            for field in dataclasses.fields(Stats):
+                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
+        else:
+            passed = False
+            logger.log(f"benchmark.{idx}.status", "fail")
+            logger.log(f"benchmark.{idx}.error", result)
+
+    if passed:
+        logger.log("check", "pass")
+        return 0
+    else:
+        logger.log("check", "fail")
+        return 112
+
+
+def _run_single_profile_torch(test: TestCase) -> str:
+    """
+    Profiles a single benchmark using the torch profiler.
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, ProfilerActivity
+
+    with nvtx_range("generate input"):
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+
+    cloned = _clone_data(data)
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        with nvtx_range("custom_kernel"):
+            submission_output = custom_kernel(cloned)
+            torch.cuda.synchronize()
+
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def _run_single_profile_ncu(test: TestCase) -> str:
+    """
+    Profiles a single benchmark using ncu. Note: this does not
+    invoke NCU; instead, it is expected that eval is launched
+    under NCU, and this function will run the kernel exactly
+    once in the 'custom_kernel' nvtx range.
+    """
+    from submission import custom_kernel
+
+    with nvtx_range("generate input"):
+        data = generate_input(**test.args)
+        torch.cuda.synchronize()
+
+    cloned = _clone_data(data)
+    with nvtx_range("custom_kernel"):
+        submission_output = custom_kernel(cloned)
+        torch.cuda.synchronize()
+
+    return ""
+
+
+def _combine_traces(traces: list["EventList"]) -> "EventList":
+    """
+    Combine multiple event traces obtained from multiple (distributed) torch.profiler
+    activities. This function simply aggregates the data as like `prof.key_averages()`,
+    except over multiple traces. Most of this function is reimplemented
+    from `torch.autograd.profiler_util.EventList.key_averages()`.
+    """
+    from torch.autograd.profiler_util import FunctionEventAvg, EventList
+    from collections import defaultdict
+
+    def get_key(event) -> tuple[str, ...]:
+        return (
+            str(event.key),
+            str(event.node_id),
+            str(event.device_type),
+            str(event.is_legacy),
+            str(event.is_user_annotation),
+        )
+
+    stats: dict[tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)
+
+    for events in traces:
+        for event in events:
+            stats[get_key(event)].add(event)
+
+    avg_list = EventList(stats.values())
+    for event in avg_list:
+        event.stack = []
+        event.input_shapes = ""
+        event.overload_name = ""
+
+    return avg_list
+
+
+def run_single_profile(test: TestCase, pool: multiprocessing.Pool) -> str:
+    """
+    Runs a single profile in the pool: the NCU variant if POPCORN_NCU is set and not "0", else torch.
+    """
+    if os.getenv("POPCORN_NCU", "0") not in ("", "0"):  # bool("0") is True, so compare
+        return pool.apply(_run_single_profile_ncu, (test,))
+    else:
+        return pool.apply(_run_single_profile_torch, (test,))
+
+
+def run_profiling(
+    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
+):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test, pool)
+        logger.log(
+            f"benchmark.{idx}.report",
+            base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"),
+        )
+    logger.log("check", "pass")
+    return 0
+
+
+def main():
+    fd = os.getenv("POPCORN_FD")
+    if not fd:
+        return 111
+
+    if len(sys.argv) < 3:
+        return 2
+
+    mode = sys.argv[1]
+    seed = os.getenv("POPCORN_SEED")
+    os.unsetenv("POPCORN_SEED")
+    seed = int(seed) if seed else None
+    set_seed(seed or 42)
+
+    tests = get_test_cases(sys.argv[2], seed)
+
+    with PopcornOutput(int(fd)) as logger:
+        import multiprocessing
+
+        mp_context = multiprocessing.get_context("spawn")
+        with mp_context.Pool(1, initializer=_init_worker) as pool:
+            if mode == "test":
+                return run_testing(logger, pool, tests)
+            if mode == "benchmark":
+                return run_benchmarking(logger, pool, tests)
+
+            if mode == "leaderboard":
+                # Warmup all test shapes to ensure consistent benchmarking
+                for test in tests:
+                    run_single_benchmark(pool, test, False, 50, 5e8)
+                logger.log("benchmark-count", len(tests))
+                passed = True
+                for i in range(len(tests)):
+                    result = run_single_benchmark(pool, tests[i], True, 100, 30e9)
+                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
+                    if isinstance(result, Stats):
+                        for field in dataclasses.fields(Stats):
+                            logger.log(
+                                f"benchmark.{i}.{field.name}",
+                                getattr(result, field.name),
+                            )
+                    else:
+                        passed = False
+                        logger.log(f"benchmark.{i}.status", "fail")
+                        logger.log(
+                            f"benchmark.{i}.error", str(result)
+                        )  # TODO: Make sure result implements __str__?
+                        break
+
+                logger.log("check", "pass" if passed else "fail")
+            elif mode == "profile":
+                run_profiling(logger, pool, tests)
+            else:
+                # TODO: Implement script mode
+                return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/problems/nvidia/nvfp4_group_gemm/task.yml b/problems/nvidia/nvfp4_group_gemm/task.yml
index df25a7cb..92040c0a 100644
--- a/problems/nvidia/nvfp4_group_gemm/task.yml
+++ b/problems/nvidia/nvfp4_group_gemm/task.yml
@@ -5,7 +5,7 @@ files:
   - {"name": "task.py", "source": "task.py"}
   - {"name": "utils.py", "source": "utils.py"}
   - {"name": "reference.py", "source": "reference.py"}
-  - {"name": "eval.py", "source": "eval.py"}
+  - {"name": "eval.py", "source": "../eval_better_bench_grouped_gemm.py"}
 
 lang: "py"
 
@@ -63,3 +63,5 @@ benchmarks:
   - {"m": [128, 384], "n": [4096, 4096], "k": [1536, 1536], "g": 2, "seed": 1111}
 
 ranking_by: "geom"
+
+ranked_timeout: 300

From 9ca8ea5d9728fd272e07b437075439510b805a0f Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 25 Jan 2026 01:10:56 +0100
Subject: [PATCH 199/207] Fix: add better l2 cache clear

---
 problems/nvidia/nvfp4_group_gemm/utils.py | 58 ++++++++++++++++-------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/problems/nvidia/nvfp4_group_gemm/utils.py b/problems/nvidia/nvfp4_group_gemm/utils.py
index 18a1a420..f6c1a8b3 100644
--- a/problems/nvidia/nvfp4_group_gemm/utils.py
+++ b/problems/nvidia/nvfp4_group_gemm/utils.py
@@ -28,11 +28,7 @@ def get_device(use_cuda: bool = True) -> torch.device:
 # Adapted from https://github.com/linkedin/Liger-Kernel/blob/main/test/utils.py
 @torch.no_grad()
 def verbose_allclose(
-        received: torch.Tensor,
-        expected: torch.Tensor,
-        rtol=1e-05,
-        atol=1e-08,
-        max_print=5
+    received: torch.Tensor, expected: torch.Tensor, rtol=1e-05, atol=1e-08, max_print=5
 ) -> list[str]:
     """
     Assert that two tensors are element-wise equal within a tolerance, providing detailed information about mismatches.
@@ -64,9 +60,13 @@ def verbose_allclose(
     nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
 
     # Find +inf mismatched elements
-    posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+    posinf_mismatched = torch.logical_xor(
+        torch.isposinf(received), torch.isposinf(expected)
+    )
     # Find -inf mismatched elements
-    neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+    neginf_mismatched = torch.logical_xor(
+        torch.isneginf(received), torch.isneginf(expected)
+    )
 
     # Find all mismatched elements
     mismatched = torch.logical_or(
@@ -87,14 +87,18 @@ def verbose_allclose(
             i = tuple(index.tolist())
             mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+            mismatch_details.append(
+                f"... and {num_mismatched - max_print} more mismatched elements."
+            )
         return mismatch_details
 
     return []
 
 
 @torch.no_grad()
-def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print: int=5):
+def verbose_allequal(
+    received: torch.Tensor, expected: torch.Tensor, max_print: int = 5
+):
     """
     Assert that two tensors are element-wise perfectly equal, providing detailed information about mismatches.
 
@@ -120,32 +124,43 @@ def verbose_allequal(received: torch.Tensor, expected: torch.Tensor, max_print:
             i = tuple(index.tolist())
             mismatch_details.append(f"ERROR AT {i}: {received[i]} {expected[i]}")
         if num_mismatched > max_print:
-            mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+            mismatch_details.append(
+                f"... and {num_mismatched - max_print} more mismatched elements."
+            )
         return mismatch_details
 
     return []
 
 
-def match_reference(data, output, reference: callable, rtol=1e-05, atol=1e-08) -> tuple[bool, str]:
+def match_reference(
+    data, output, reference: callable, rtol=1e-05, atol=1e-08
+) -> tuple[bool, str]:
     """
     Convenient "default" implementation for tasks' `check_implementation` function.
     """
     expected = reference(data)
 
     if len(output) != len(expected):
-        return False, f"output length mismatch: got {len(output)}, expected {len(expected)}"
+        return (
+            False,
+            f"output length mismatch: got {len(output)}, expected {len(expected)}",
+        )
 
     for i, (output_i, expected_i) in enumerate(zip(output, expected)):
         reasons = verbose_allclose(output_i, expected_i, rtol=rtol, atol=atol)
         if len(reasons) > 0:
-            return False, f"mismatch found! custom implementation doesn't match reference: {i} {reasons}"
+            return (
+                False,
+                f"mismatch found! custom implementation doesn't match reference: {i} {reasons}",
+            )
 
-    return True, ''
+    return True, ""
 
 
 def make_match_reference(reference: callable, **kwargs):
     def wrapped(data, output):
         return match_reference(data, output, reference=reference, **kwargs)
+
     return wrapped
 
 
@@ -156,7 +171,7 @@ def __init__(self):
         self.cublas = None
 
     def __enter__(self):
-        self.cublas = os.environ.get('CUBLAS_WORKSPACE_CONFIG', '')
+        self.cublas = os.environ.get("CUBLAS_WORKSPACE_CONFIG", "")
         self.allow_tf32 = torch.backends.cudnn.allow_tf32
         self.deterministic = torch.backends.cudnn.deterministic
         torch.backends.cudnn.allow_tf32 = False
@@ -168,7 +183,8 @@ def __exit__(self, exc_type, exc_value, traceback):
         torch.backends.cudnn.allow_tf32 = self.allow_tf32
         torch.backends.cudnn.deterministic = self.deterministic
         torch.use_deterministic_algorithms(False)
-        os.environ['CUBLAS_WORKSPACE_CONFIG'] = self.cublas
+        os.environ["CUBLAS_WORKSPACE_CONFIG"] = self.cublas
+
 
 def clear_l2_cache():
     # import cupy as cp
@@ -177,4 +193,12 @@ def clear_l2_cache():
     dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device="cuda")
     # write stuff to
     dummy.fill_(42)
-    del dummy
\ No newline at end of file
+    del dummy
+
+
+def clear_l2_cache_large():
+    # import cupy as cp
+    # cp.cuda.runtime.deviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0)
+    # create a large dummy tensor
+    dummy = torch.randn((16000, 1024, 1024), device="cuda")
+    del dummy

From 07f03216bc5680eb3a6deedd390fb019fe2706f8 Mon Sep 17 00:00:00 2001
From: Thien Tran <gau.nernst@yahoo.com.sg>
Date: Mon, 26 Jan 2026 08:47:23 +0800
Subject: [PATCH 200/207] change fp4 init range (#96)

---
 problems/nvidia/nvfp4_group_gemm/reference.py | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/problems/nvidia/nvfp4_group_gemm/reference.py b/problems/nvidia/nvfp4_group_gemm/reference.py
index f12f504b..7ce6bc09 100644
--- a/problems/nvidia/nvfp4_group_gemm/reference.py
+++ b/problems/nvidia/nvfp4_group_gemm/reference.py
@@ -123,6 +123,17 @@ def create_reordered_scale_factor_tensor(l, mn, k, ref_f8_tensor):
     return reordered_f8_tensor
 
 
+def _create_fp4_tensors(l, mn, k):
+    # generate uint8 tensor, then convert to float4e2m1fn_x2 data type
+    # generate all bit patterns
+    ref_i8 = torch.randint(255, size=(l, mn, k // 2), dtype=torch.uint8, device="cuda")
+
+    # for each nibble, only keep the sign bit and 2 LSBs
+    # the possible values are [-1.5, -1, -0.5, 0, +0.5, +1, +1.5]
+    ref_i8 = ref_i8 & 0b1011_1011
+    return ref_i8.permute(1, 2, 0).view(torch.float4_e2m1fn_x2)
+
+
 def generate_input(
     m: tuple,
     n: tuple,
@@ -165,14 +176,8 @@ def generate_input(
         mi = m[group_idx]
         ni = n[group_idx]
         ki = k[group_idx]
-        a_ref = torch.randint(
-            -1, 2, (l, mi, ki // 2), dtype=torch.int8, device="cuda"
-        ).permute(1, 2, 0)
-        b_ref = torch.randint(
-            -1, 2, (l, ni, ki // 2), dtype=torch.int8, device="cuda"
-        ).permute(1, 2, 0)
-        a_ref = a_ref.view(torch.float4_e2m1fn_x2)
-        b_ref = b_ref.view(torch.float4_e2m1fn_x2)
+        a_ref = _create_fp4_tensors(l, mi, ki)
+        b_ref = _create_fp4_tensors(l, ni, ki)
 
         c_ref = torch.randn((l, mi, ni), dtype=torch.float16, device="cuda").permute(
             1, 2, 0

From db1c91e99b101d3857d1bacce948eb38519d6da8 Mon Sep 17 00:00:00 2001
From: vickiw973 <vickiw@nvidia.com>
Date: Mon, 26 Jan 2026 08:47:52 +0800
Subject: [PATCH 201/207] change k's value to a multiple of 256 (#98)

* Add new problem nvfp4_gemm to nvidia.yaml

* change k's value to a multiple of 256 for better perf (simplify some logic).

* revert unnecessary  change.
---
 problems/nvidia/nvfp4_group_gemm/task.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/problems/nvidia/nvfp4_group_gemm/task.yml b/problems/nvidia/nvfp4_group_gemm/task.yml
index 92040c0a..a41302c0 100644
--- a/problems/nvidia/nvfp4_group_gemm/task.yml
+++ b/problems/nvidia/nvfp4_group_gemm/task.yml
@@ -45,16 +45,16 @@ templates:
   Python: "template.py"
 
 tests:
-  - {"m": [96, 128], "n": [128, 256], "k": [128, 512], "g": 2, "seed": 1111}
+  - {"m": [96, 128], "n": [128, 256], "k": [256, 512], "g": 2, "seed": 1111}
   - {"m": [256, 72], "n": [512, 384], "k": [256, 256], "g": 2, "seed": 1111}
-  - {"m": [128, 128], "n": [128, 256], "k": [128, 512], "g": 2, "seed": 1111}
-  - {"m": [80, 128, 256], "n": [384, 256, 128], "k": [256, 512, 128], "g": 3, "seed": 1111}
-  - {"m": [64, 72, 96], "n": [128, 384, 512], "k": [384, 512, 128], "g": 3, "seed": 1111}
-  - {"m": [64, 256, 128], "n": [768, 128, 256], "k": [512, 512, 128], "g": 3, "seed": 1111}
-  - {"m": [128, 128, 64], "n": [256, 512, 512], "k": [768, 128, 768], "g": 3, "seed": 1111}
-  - {"m": [128, 128, 128, 128], "n": [128, 128, 128, 128], "k": [128, 128, 128, 128], "g": 4, "seed": 1111}
+  - {"m": [128, 128], "n": [128, 256], "k": [512, 256], "g": 2, "seed": 1111}
+  - {"m": [80, 128, 256], "n": [384, 256, 128], "k": [256, 512, 256], "g": 3, "seed": 1111}
+  - {"m": [64, 72, 96], "n": [128, 384, 512], "k": [512, 512, 256], "g": 3, "seed": 1111}
+  - {"m": [64, 256, 128], "n": [768, 128, 256], "k": [512, 256, 512], "g": 3, "seed": 1111}
+  - {"m": [128, 128, 64], "n": [256, 512, 512], "k": [768, 256, 768], "g": 3, "seed": 1111}
+  - {"m": [128, 128, 128, 128], "n": [128, 128, 128, 128], "k": [512, 256, 512, 256], "g": 4, "seed": 1111}
   - {"m": [40, 56, 384, 512], "n": [512, 384, 256, 128], "k": [256, 256, 256, 256], "g": 4, "seed": 1111}
-  - {"m": [512, 384, 256, 128], "n": [256, 256, 256, 256], "k": [512, 128, 512, 128], "g": 4, "seed": 1111}
+  - {"m": [512, 384, 256, 128], "n": [256, 256, 256, 256], "k": [512, 768, 512, 768], "g": 4, "seed": 1111}
 
 benchmarks:
   - {"m": [80, 176, 128, 72, 64, 248, 96, 160], "n": [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096], "k": [7168, 7168, 7168, 7168, 7168, 7168, 7168, 7168], "g": 8, "seed": 1111}

From 53801cc7ace94554f14867e0f8cc07aad9a12dfd Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 27 Jan 2026 21:07:47 -0800
Subject: [PATCH 202/207] update pmppv2 dates

---
 problems/pmpp_v2.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/problems/pmpp_v2.yaml b/problems/pmpp_v2.yaml
index e3c6915c..3bdf6777 100644
--- a/problems/pmpp_v2.yaml
+++ b/problems/pmpp_v2.yaml
@@ -7,7 +7,7 @@ description: ""
 problems:
   - directory: pmpp_v2/conv2d_py
     name: conv2d_v2
-    deadline: "2025-12-30"
+    deadline: "2100-12-31"
     gpus:
       - B200
       - H100
@@ -15,7 +15,7 @@ problems:
       - L4
   - directory: pmpp_v2/grayscale_py
     name: grayscale_v2
-    deadline: "2025-12-30"
+    deadline: "2100-12-31"
     gpus:
       - B200
       - H100
@@ -23,7 +23,7 @@ problems:
       - L4
   - directory: pmpp_v2/histogram_py
     name: histogram_v2
-    deadline: "2025-12-30"
+    deadline: "2100-12-31"
     gpus:
       - B200
       - H100
@@ -31,7 +31,7 @@ problems:
       - L4
   - directory: pmpp_v2/matmul_py
     name: matmul_v2
-    deadline: "2025-12-30"
+    deadline: "2100-12-31"
     gpus:
       - B200
       - H100
@@ -39,7 +39,7 @@ problems:
       - L4
   - directory: pmpp_v2/prefixsum_py
     name: prefixsum_v2
-    deadline: "2025-12-30"
+    deadline: "2100-12-31"
     gpus:
       - B200
       - H100
@@ -47,7 +47,7 @@ problems:
       - L4
   - directory: pmpp_v2/sort_py
     name: sort_v2
-    deadline: "2025-12-30"
+    deadline: "2100-12-31"
     gpus:
       - B200
       - H100
@@ -55,7 +55,7 @@ problems:
       - L4
   - directory: pmpp_v2/vectoradd_py
     name: vectoradd_v2
-    deadline: "2025-12-30"
+    deadline: "2100-12-31"
     gpus:
       - B200
       - H100
@@ -63,7 +63,7 @@ problems:
       - L4
   - directory: pmpp_v2/vectorsum_py
     name: vectorsum_v2
-    deadline: "2025-12-30"
+    deadline: "2100-12-31"
     gpus:
       - B200
       - H100

From 62e4b611932da32dc19f6c1d0e8647f06d56c8ba Mon Sep 17 00:00:00 2001
From: Dan Saunders <danjsaund@gmail.com>
Date: Thu, 29 Jan 2026 12:30:22 -0500
Subject: [PATCH 203/207] add MLIRError, UNSERIALIZABLE_EXCEPTIONS tuple

---
 problems/nvidia/eval_better_bench_grouped_gemm.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/problems/nvidia/eval_better_bench_grouped_gemm.py b/problems/nvidia/eval_better_bench_grouped_gemm.py
index d9cecc35..09b52790 100644
--- a/problems/nvidia/eval_better_bench_grouped_gemm.py
+++ b/problems/nvidia/eval_better_bench_grouped_gemm.py
@@ -21,6 +21,8 @@ def _init_worker():
 
 import torch.cuda
 from cutlass.cute.nvgpu.common import OpError
+from cutlass._mlir.ir import MLIRError
+
 from torch.cuda.nvtx import range as nvtx_range
 
 from utils import set_seed, clear_l2_cache_large as clear_l2_cache
@@ -33,6 +35,7 @@ def _init_worker():
 from reference import check_implementation, generate_input
 
 NUM_ITERATIONS_PER_BENCHMARK = 15
+UNSERIALIZABLE_EXCEPTIONS = (OpError, MLIRError)
 
 
 class PopcornOutput:
@@ -181,7 +184,7 @@ def _run_single_test(test: TestCase):
     try:
         submission_output = custom_kernel(_clone_data(data))
 
-    except OpError as E:
+    except UNSERIALIZABLE_EXCEPTIONS as E:
         print(f"Encountered {E}", file=sys.stderr)
         return False, str(E)
     torch.cuda.synchronize()
@@ -253,7 +256,7 @@ def _run_single_benchmark(
         for data in data_list:
             output = custom_kernel(_clone_data(data))
             outputs.append(output)
-    except OpError as E:
+    except UNSERIALIZABLE_EXCEPTIONS as E:
         return f"Encountered {E}"
     for reference_output, custom_output in zip(check_copy, outputs):
         good, message = check_implementation(reference_output, custom_output)

From efa52179f1803b5d94d9c5a5dd4027b76441fbdd Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Thu, 29 Jan 2026 14:25:56 -0800
Subject: [PATCH 204/207] Update competition list in README

Added NVIDIA Blackwell NVFP4 competition to the competition list.
---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f362097e..5d0890f1 100644
--- a/README.md
+++ b/README.md
@@ -5,10 +5,11 @@ This repo holds reference kernels for the KernelBot which hosts regular competit
 You can see what's going on [gpumode.com](https://www.gpumode.com/)
 
 ## Competition
-1. [PMPP practice problems](https://gpu-mode.github.io/discord-cluster-manager/docs/active#practice-round-leaderboard): Starting on Sunday Feb 21, 2025.
+1. [PMPP practice problems](https://gpu-mode.github.io/discord-cluster-manager/docs/active#practice-round-leaderboard)
 2. [AMD $100K kernel competition](problems/amd)
 3. [BioML kernels](problems/bioml)
 4. [AMD $100K distributed kernel competition](problems/amd_distributed)
+5. [NVIDIA Blackwell NVFP4 competition](problems/nvidia)
 
 ## Making a Leaderboard Submission
 

From 64e88da208075f39e51d3b8090f39ab6ae735ed0 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 30 Jan 2026 09:23:57 -0800
Subject: [PATCH 205/207] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5d0890f1..218787da 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,7 @@ To add a new problem, create a new folder in the `problems/glory` directory wher
 - `task.yml` - This is the problem specification that will be used to generate test cases for different shapes
 - `task.py` - Specifies the schema of the inputs and outputs for the problem
 
+You can evaluate problems with your own Modal account (they give you $30 for free) by borrowing this [neat script from @gau-nernst](https://github.com/gpu-mode/reference-kernels/pull/96#issue-3850136894).
 
 
 

From 2998db4f2ea0767f10d43bf36ef2c1288c4aabcd Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Thu, 5 Feb 2026 18:07:38 -0800
Subject: [PATCH 206/207] remove unused eval (#101)

---
 problems/nvidia/nvfp4_gemm/eval.py | 500 -----------------------------
 1 file changed, 500 deletions(-)
 delete mode 100644 problems/nvidia/nvfp4_gemm/eval.py

diff --git a/problems/nvidia/nvfp4_gemm/eval.py b/problems/nvidia/nvfp4_gemm/eval.py
deleted file mode 100644
index e8bb5b21..00000000
--- a/problems/nvidia/nvfp4_gemm/eval.py
+++ /dev/null
@@ -1,500 +0,0 @@
-import base64
-import dataclasses
-import multiprocessing
-import re
-import time
-import os
-import sys
-import math
-from pathlib import Path
-from typing import Any, Optional
-import tempfile
-
-import torch.cuda
-from cutlass.cute.nvgpu.common import OpError
-
-from utils import set_seed, clear_l2_cache
-
-try:
-    from task import TestSpec
-except ImportError:
-    TestSpec = dict
-
-from reference import check_implementation, generate_input
-
-
-class PopcornOutput:
-    def __init__(self, fd: int):
-        self.file = os.fdopen(fd, "w")
-        os.set_inheritable(fd, False)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.file.close()
-
-    def print(self, *args, **kwargs):
-        print(*args, **kwargs, file=self.file, flush=True)
-
-    def log(self, key, value):
-        self.print(f"{key}: {value}")
-
-
-@dataclasses.dataclass
-class TestCase:
-    args: dict
-    spec: str
-
-
-def _combine(a: int, b: int) -> int:
-    # combine two integers into one:
-    # we need this to generate a secret seed based on the test-level seed and
-    # the global secret seed.
-    # the test-level seeds are public knowledge, and typically relatively small numbers,
-    # so we need to make sure they don't provide any useful info for the full seed.
-    # This Cantor construction ensures that if the secret seed is a large number,
-    # then so is the overall seed.
-    return int(a + (a + b) * (a + b + 1) // 2)
-
-
-def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
-    try:
-        content = Path(file_name).read_text()
-    except Exception as E:
-        print(f"Could not open test file`{file_name}`: {E}", file=sys.stderr)
-        exit(113)
-
-    tests = []
-    lines = content.splitlines()
-    match = r"\s*([a-zA-Z]+):\s*([a-zA-Z]+|[+-]?[0-9]+)\s*"
-    for line in lines:
-        parts = line.split(";")
-        case = {}
-        for part in parts:
-            matched = re.match(match, part)
-            if not re.fullmatch(match, part):
-                print(f"invalid test case: '{line}': '{part}'", file=sys.stderr)
-                exit(113)
-            key = matched[1]
-            val = matched[2]
-            try:
-                val = int(val)
-            except ValueError:
-                pass
-
-            case[key] = val
-        tests.append(TestCase(spec=line, args=case))
-
-    if seed is not None:
-        for test in tests:
-            if "seed" in test.args:
-                test.args["seed"] = _combine(test.args["seed"], seed)
-
-    return tests
-
-
-@dataclasses.dataclass
-class Stats:
-    runs: int
-    mean: float
-    std: float
-    err: float
-    best: float
-    worst: float
-
-
-def calculate_stats(durations: list[int]):
-    """
-    Calculate statistical data from a list of durations.
-
-    @param durations: A list of durations in nanoseconds.
-    @return: A Stats object containing the number of runs, mean, standard deviation, error, best, and worst durations.
-    """
-    runs = len(durations)
-    total = sum(durations)
-    best = min(durations)
-    worst = max(durations)
-
-    avg = total / runs
-    variance = sum(map(lambda x: (x - avg) ** 2, durations))
-    std = math.sqrt(variance / (runs - 1))
-    err = std / math.sqrt(runs)
-
-    return Stats(
-        runs=runs, mean=avg, std=std, err=err, best=float(best), worst=float(worst)
-    )
-
-
-def _clone_data(data):
-    """
-    Recursively goes through data and clones all tensors.
-    """
-    if isinstance(data, tuple):
-        return tuple(_clone_data(x) for x in data)
-    elif isinstance(data, list):
-        return [_clone_data(x) for x in data]
-    elif isinstance(data, dict):
-        return {k: _clone_data(v) for k, v in data.items()}
-    elif isinstance(data, torch.Tensor):
-        return data.clone()
-    else:
-        return data
-
-
-def _run_single_test(test: TestCase):
-    """
-    Runs a single test case. Do not call directly
-    """
-    from submission import custom_kernel
-
-    data = generate_input(**test.args)
-    torch.cuda.synchronize()
-    try:
-        submission_output = custom_kernel(_clone_data(data))
-
-    except OpError as E:
-        print(f"Encountered {E}", file=sys.stderr)
-        return False, str(E)
-    torch.cuda.synchronize()
-    return check_implementation(data, submission_output)
-
-
-def run_single_test(pool: multiprocessing.Pool, test: TestCase):
-    """
-    Runs a single test in another process.
-    """
-    return pool.apply(_run_single_test, (test,))
-
-
-def run_testing(
-    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
-):
-    """
-    Executes the actual test case code and checks for correctness.
-
-    @param logger: A PopcornOutput object used for logging test results.
-    @param tests: A list of TestCase objects representing the test cases to be executed.
-    @return: An integer representing the exit status: 0 if all tests pass, otherwise 112.
-    """
-    # Step 1: Compile kernel once before running tests
-    logger.log("compile", "start")
-    compile_success, compile_error = pool.apply(_compile_kernel_once)
-    if not compile_success:
-        logger.log("compile", "fail")
-        logger.log("compile.error", compile_error)
-        return 112
-    logger.log("compile", "pass")
-    
-    # Step 2: Run all tests with compiled kernel
-    passed = True
-    logger.log("test-count", len(tests))
-    for idx, test in enumerate(tests):
-        logger.log(f"test.{idx}.spec", test.spec)
-        good, message = run_single_test(pool, test)
-        if not good:
-            logger.log(f"test.{idx}.status", "fail")
-            logger.log(f"test.{idx}.error", message)
-            passed = False
-        else:
-            logger.log(f"test.{idx}.status", "pass")
-            if message:
-                logger.log(f"test.{idx}.message", message)
-
-    if passed:
-        logger.log("check", "pass")
-        return 0
-    else:
-        logger.log("check", "fail")
-        return 112
-
-
-def _compile_kernel_once():
-    """
-    Compile the kernel once before any benchmarking.
-    This ensures compilation time is not included in benchmark results.
-    """
-    from submission import compile_kernel
-    
-    try:
-        # Trigger compilation (will be cached)
-        compile_kernel()
-        torch.cuda.synchronize()
-        return True, None
-    except OpError as E:
-        return False, f"Compilation failed: {E}"
-    except Exception as E:
-        return False, f"Compilation failed: {E}"
-
-
-def _run_single_benchmark(
-    test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float
-) -> Stats | Any:
-    """
-    Runs one benchmark. Do not call directly.
-    """
-    from submission import custom_kernel, compile_kernel
-
-    durations = []
-    # generate input data once
-    data = generate_input(**test.args)
-    check_copy = _clone_data(data)
-    
-    # Ensure kernel is compiled before any timing (compilation is cached)
-    try:
-        compile_kernel()
-        torch.cuda.synchronize()
-    except OpError as E:
-        return f"Compilation failed: {E}"
-    except Exception as E:
-        return f"Compilation failed: {E}"
-    
-    #  first, one obligatory correctness check
-    try:
-        output = custom_kernel(_clone_data(data))
-    except OpError as E:
-        return f"Encountered {E}"
-    good, message = check_implementation(check_copy, output)
-    if not good:
-        return message
-
-    # now, do multiple timing runs without further correctness testing
-    # there is an upper bound of 200 runs, and a lower bound of 3 runs;
-    # otherwise, we repeat until we either measure at least 10 full seconds,
-    # or the relative error of the mean is below 1%.
-
-    bm_start_time = time.perf_counter_ns()
-    for i in range(max_repeats):
-        if recheck:
-            # ensure we use a different seed for every benchmark
-            if "seed" in test.args:
-                test.args["seed"] += 13
-
-            data = generate_input(**test.args)
-            check_copy = _clone_data(data)
-        torch.cuda.synchronize()
-        start_event = torch.cuda.Event(enable_timing=True)
-        end_event = torch.cuda.Event(enable_timing=True)
-        clear_l2_cache()
-
-        start_event.record()
-        output = custom_kernel(data)
-        end_event.record()
-        torch.cuda.synchronize()
-        duration = start_event.elapsed_time(end_event) * 1e6  # Convert ms to ns
-
-        if recheck:
-            good, message = check_implementation(check_copy, output)
-            if not good:
-                return message
-
-        del output
-        durations.append(duration)
-
-        if i > 1:
-            total_bm_duration = time.perf_counter_ns() - bm_start_time
-            stats = calculate_stats(durations)
-            # stop if either
-            # a) relative error dips below 0.1%
-            # b) we exceed the total time limit for benchmarking the kernel
-            # c) we exceed 2 minutes of total wallclock time.
-            if (
-                stats.err / stats.mean < 0.001
-                or stats.mean * stats.runs > max_time_ns
-                or total_bm_duration > 120e9
-            ):
-                break
-
-    return calculate_stats(durations)
-
-
-def run_single_benchmark(
-    pool: multiprocessing.Pool,
-    test: TestCase,
-    recheck: bool,
-    max_repeats: int,
-    max_time_ns: float,
-):
-    """
-    For a particular test case, check correctness (if applicable) and grab runtime results.
-
-    @param pool: Process on which the benchmark will be launched.
-    @param test: TestCase object.
-    @param recheck: Flag for whether to explicitly check functional correctness.
-    @param max_repeats: Number of trials to repeat.
-    @param max_time_ns: Timeout time in nanoseconds.
-    @return: A Stats object for this particular benchmark case or an error if the test fails.
-    """
-    return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))
-
-
-def run_benchmarking(
-    logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]
-):
-    """
-    Executes benchmarking code for a CUDA Kernel and logs runtimes.
-
-    @param logger: A PopcornOutput object used for logging benchmark results.
-    @param pool: Process on which the benchmarks will be launched.
-    @param tests: A list of TestCase objects representing the test cases to be benchmarked.
-    @return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
-    """
-    # Step 1: Compile kernel once (outside of timing)
-    logger.log("compile", "start")
-    compile_success, compile_error = pool.apply(_compile_kernel_once)
-    if not compile_success:
-        logger.log("compile", "fail")
-        logger.log("compile.error", compile_error)
-        return 112
-    logger.log("compile", "pass")
-    
-    # Step 2: Warm up with compiled kernel
-    run_single_benchmark(pool, tests[0], False, 200, 10e7)
-
-    # Step 3: Run benchmarks (compilation time excluded)
-    passed = True
-    logger.log("benchmark-count", len(tests))
-    for idx, test in enumerate(tests):
-        logger.log(f"benchmark.{idx}.spec", test.spec)
-        result = run_single_benchmark(pool, test, False, 200, 10e9)
-        if isinstance(result, Stats):
-            for field in dataclasses.fields(Stats):
-                logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
-        else:
-            passed = False
-            logger.log(f"benchmark.{idx}.status", "fail")
-            logger.log(f"benchmark.{idx}.error", result)
-
-    if passed:
-        logger.log("check", "pass")
-        return 0
-    else:
-        logger.log("check", "fail")
-        return 112
-
-
-def run_single_profile(test: TestCase) -> str:
-    """
-    Runs a single test case. Do not call directly
-    """
-    from submission import custom_kernel
-    from torch.profiler import profile, record_function, ProfilerActivity
-
-    data = generate_input(**test.args)
-    torch.cuda.synchronize()
-
-    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
-        submission_output = custom_kernel(_clone_data(data))
-        torch.cuda.synchronize()
-    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
-
-
-def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
-    logger.log("benchmark-count", len(tests))
-    for idx, test in enumerate(tests):
-        logger.log(f"benchmark.{idx}.spec", test.spec)
-        report = run_single_profile(test)
-        logger.log(
-            f"benchmark.{idx}.report",
-            base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"),
-        )
-    logger.log("check", "pass")
-    return 0
-
-
-def main():
-    fd = os.getenv("POPCORN_FD")
-    if not fd:
-        return 111
-
-    if len(sys.argv) < 3:
-        return 2
-
-    mode = sys.argv[1]
-    seed = os.getenv("POPCORN_SEED")
-    os.unsetenv("POPCORN_SEED")
-    seed = int(seed) if seed else None
-    set_seed(seed or 42)
-
-    filename = None
-
-    with tempfile.NamedTemporaryFile(delete=False) as tmp:
-
-        def build_test_string(tests: list[dict]):
-            as_str = ""
-            for test in tests:
-                kvs = []
-                for k, v in test.items():
-                    kvs.append(f"{k}: {v}")
-                as_str += "; ".join(kvs) + "\n"
-            return as_str
-
-        import yaml
-
-        yaml_content = yaml.safe_load(open(sys.argv[2], "r"))
-        if mode == "test":
-            tests_str = build_test_string(yaml_content.get("tests", []))
-        elif mode in ("benchmark", "leaderboard", "profile"):
-            tests_str = build_test_string(yaml_content.get("benchmarks", []))
-
-        tmp.write(tests_str.encode("utf-8"))
-        tmp.flush()
-        filename = tmp.name
-
-    tests = get_test_cases(filename, seed)
-
-    os.unlink(filename)
-
-    with PopcornOutput(int(fd)) as logger:
-        import multiprocessing
-
-        mp_context = multiprocessing.get_context("spawn")
-        with mp_context.Pool(1) as pool:
-            if mode == "test":
-                return run_testing(logger, pool, tests)
-            if mode == "benchmark":
-                return run_benchmarking(logger, pool, tests)
-
-            if mode == "leaderboard":
-                # Step 1: Compile kernel once (outside of timing)
-                logger.log("compile", "start")
-                compile_success, compile_error = pool.apply(_compile_kernel_once)
-                if not compile_success:
-                    logger.log("compile", "fail")
-                    logger.log("compile.error", compile_error)
-                    return 112
-                logger.log("compile", "pass")
-                
-                # Step 2: Warmup with compiled kernel
-                run_single_benchmark(pool, tests[0], False, 200, 1e7)
-                
-                # Step 3: Run leaderboard benchmarks (compilation time excluded)
-                logger.log("benchmark-count", len(tests))
-                passed = True
-                for i in range(len(tests)):
-                    result = run_single_benchmark(pool, tests[i], True, 200, 30e9)
-                    logger.log(f"benchmark.{i}.spec", tests[i].spec)
-                    if isinstance(result, Stats):
-                        for field in dataclasses.fields(Stats):
-                            logger.log(
-                                f"benchmark.{i}.{field.name}",
-                                getattr(result, field.name),
-                            )
-                    else:
-                        passed = False
-                        logger.log(f"benchmark.{i}.status", "fail")
-                        logger.log(
-                            f"benchmark.{i}.error", str(result)
-                        )  # TODO: Make sure result implements __str__?
-                        break
-
-                logger.log("check", "pass" if passed else "fail")
-            elif mode == "profile":
-                run_profiling(logger, tests)
-            else:
-                # TODO: Implement script mode
-                return 2
-
-
-if __name__ == "__main__":
-    sys.exit(main())

From b98f12302d006124ace0426984a23a4a8e6fd5d5 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Sat, 7 Feb 2026 10:50:00 -0800
Subject: [PATCH 207/207] Update deadline for trimul problem

---
 problems/bioml.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/bioml.yaml b/problems/bioml.yaml
index 73e320ee..09af151b 100644
--- a/problems/bioml.yaml
+++ b/problems/bioml.yaml
@@ -8,7 +8,7 @@ description: "Popular and important kernels for BioML models like AlphaFold3"
 problems:
   - directory: bioml/trimul
     name: trimul
-    deadline: "2026-09-30"
+    deadline: "2026-02-06"
     gpus:
       - B200
       - H100