Profiling Code With Timeit#

# %pip install -q omniverse==0.0.63

Common Functions#

This module includes the GPT model definitions as well as some common configuration.

from __future__ import annotations

from typing import Literal, Tuple, cast

import torch
from pydantic import BaseModel
from torch import nn

from omnivault.modules.activation import GELU, SoftmaxStable
from omnivault.transformer.modules.layers.normalization import RMSNorm

__tagged__ = "This code is tagged to commit `30d963e` of cs336-stanford-spring2024-assignment1-gpt-from-scratch."
__reference__ = ["https://github.com/marcelroed/spring2024-assignment2-systems/blob/master/writeup.pdf"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class General(BaseModel):
    batch_size: int = 16
    seed: int = 20230310


class ProfilerConfig(BaseModel):
    computation: Literal["forward", "backward", "forward_backward"]
    warmup_steps: int | None = None
    profile_steps: int
    mixed_precision: bool = False


class GPTConfig(BaseModel):
    approximate: Literal["tanh"] | None = None
    activation_name: Literal["gelu"] = "gelu"
    d_model: int
    d_ff: int | None = None
    num_heads: int
    context_length: int
    attn_pdrop: float = 0.0
    resid_pdrop: float = 0.0
    bias: bool = False
    vocab_size: int
    num_blocks: int
    token_position_pdrop: float = 0.0
    weight_tie: bool = False


class PositionwiseFeedForward(nn.Module):
    def __init__(
        self,
        d_model: int,
        d_ff: int | None = None,
        bias: bool = False,
        activation_name: Literal["gelu"] = "gelu",
        dropout: float = 0.0,
    ) -> None:
        super().__init__()

        self.d_model = d_model
        self.d_ff = d_ff or 4 * d_model
        self.bias = bias  # bias False in this exercise
        self.activation_name = activation_name
        self.dropout = dropout

        self.ffn = nn.ModuleDict(
            {
                # incoming `B x T x D` and we are interested in `T x D` so weight is `D x d_ff`
                # so that `Z @ W1 -> (T x D) @ (D x d_ff)`
                "context_fc": nn.Linear(in_features=self.d_model, out_features=self.d_ff, bias=self.bias),
                "activation": self.activation,
                # apply dropout after activation for random lights out
                "dropout": nn.Dropout(p=self.dropout, inplace=False),
                # incoming is Z @ W1 -> T x d_ff -> (T x d_ff) @ (d_ff x D) project back to D
                "context_projection": nn.Linear(in_features=self.d_ff, out_features=self.d_model, bias=self.bias),
            }
        )

    @property
    def activation(self) -> nn.Module:
        if self.activation_name == "gelu":
            activation = GELU(approximate=None)  # exact GELU, no tanh approximation
        else:
            raise ValueError(f"Unsupported activation: {self.activation_name}")
        return activation

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        # fmt: off
        z = self.ffn["context_fc"](z)           # Z @ W1 = [B, T, D] @ [D, d_ff] = [B, T, d_ff]
        z = self.ffn["activation"](z)           # \sigma(Z @ W1) = [B, T, d_ff]
        z = self.ffn["dropout"](z)              # \dropout(\sigma(Z @ W1)) = [B, T, d_ff]
        z = self.ffn["context_projection"](z)   # \dropout(\sigma(Z @ W1)) @ W2 = [B, T, d_ff] @ [d_ff, D] = [B, T, D]
        # fmt: on
        return z


class ScaledDotProductAttention(nn.Module):
    def __init__(self, dropout: float = 0.0) -> None:
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.BoolTensor | None = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # fmt: off
        T, d_q = query.size(-2), query.size(-1)

        attention_scores  = torch.matmul(query, key.transpose(dim0=-2, dim1=-1)) / torch.sqrt(torch.tensor(d_q).float())        # Q @ K.T = [B, H, T, d_q] @ [B, H, d_q, T] = [B, H, T, T]

        if mask is not None:
            mask = mask[:, :, :T, :T] # type: ignore[assignment]
            attention_scores  = attention_scores.masked_fill(mask == 1, float("-inf"))   # [B, H, T, T]

        softmax           = SoftmaxStable(dim=-1)
        attention_weights = softmax(attention_scores)               # [B, H, T, T]
        attention_weights = self.dropout(attention_weights)         # [B, H, T, T]

        context_vector    = torch.matmul(attention_weights, value)  # [B, H, T, T] @ [B, H, T, d_v] = [B, H, T, d_v]
        # fmt: on
        return context_vector, attention_weights


class CausalMultiHeadSelfAttention(nn.Module):
    context_vector: torch.Tensor
    attention_weights: torch.Tensor

    def __init__(
        self,
        d_model: int,
        num_heads: int,
        context_length: int,
        attn_pdrop: float = 0.0,  # pdrop means prob of dropout
        resid_pdrop: float = 0.0,
        bias: bool = False,
    ) -> None:
        super().__init__()

        assert d_model % num_heads == 0

        self.d_model = d_model
        self.H = num_heads
        self.context_length = context_length
        self.attn_pdrop = attn_pdrop
        self.resid_pdrop = resid_pdrop
        self.bias = bias

        self.W_Q = nn.Linear(in_features=self.d_model, out_features=self.d_model, bias=self.bias)
        self.W_K = nn.Linear(in_features=self.d_model, out_features=self.d_model, bias=self.bias)
        self.W_V = nn.Linear(in_features=self.d_model, out_features=self.d_model, bias=self.bias)

        # alias of W_O
        self.context_projection = nn.Linear(in_features=self.d_model, out_features=self.d_model, bias=self.bias)

        # regularization
        self.resid_dropout = nn.Dropout(self.resid_pdrop)

        self.attention = ScaledDotProductAttention(dropout=self.attn_pdrop)

        # causal mask to ensure that attention is only applied to the left in the input sequence;
        # registered as a buffer since it is not a learnable parameter
        self.register_buffer(
            "causal_mask",
            torch.triu(
                torch.ones((self.context_length, self.context_length)).bool(),
                diagonal=1,
            ).view(1, 1, self.context_length, self.context_length),
        )

    def forward(self, *, z: torch.Tensor) -> torch.Tensor:
        B, T, D = z.size()

        # fmt: off
        Q: torch.Tensor = self.W_Q(z).contiguous() # Z @ W_Q = [B, T, D] @ [D, D] = [B, T, D]
        K: torch.Tensor = self.W_K(z).contiguous() # Z @ W_K = [B, T, D] @ [D, D] = [B, T, D]
        V: torch.Tensor = self.W_V(z).contiguous() # Z @ W_V = [B, T, D] @ [D, D] = [B, T, D]

        Q = Q.view(B, T, self.H, D // self.H).transpose(dim0=1, dim1=2) # [B, T, D] -> [B, T, H, D // H] -> [B, H, T, D//H]
        K = K.view(B, T, self.H, D // self.H).transpose(dim0=1, dim1=2)
        V = V.view(B, T, self.H, D // self.H).transpose(dim0=1, dim1=2)

        # Now pass them to self attention
        self.context_vector, self.attention_weights = self.attention(query=Q, key=K, value=V, mask=self.causal_mask) # ([B, H, T, D // H], [B, H, T, T])
        assert isinstance(self.context_vector, torch.Tensor) # do this for type hint in IDE

        # Now context vector is shape [B, H, T, D // H] but we want [B, T, D] to matmul with W_O/context_projection
        self.context_vector = self.context_vector.transpose(dim0=1, dim1=2).contiguous().view(B, T, D) # merge all heads together
        # fmt: on

        projected_context_vector: torch.Tensor = self.resid_dropout(
            self.context_projection(self.context_vector)  # [B, T, D] @ [D, D] = [B, T, D]
        )
        return projected_context_vector


class GPTBlock(nn.Module):
    def __init__(
        self,
        config: GPTConfig,
    ) -> None:
        super().__init__()

        self.rms_norm_1 = RMSNorm(d_model=config.d_model, eps=1e-5)
        self.attn = CausalMultiHeadSelfAttention(
            d_model=config.d_model,
            num_heads=config.num_heads,
            context_length=config.context_length,
            attn_pdrop=config.attn_pdrop,
            resid_pdrop=config.resid_pdrop,
            bias=config.bias,
        )
        self.rms_norm_2 = RMSNorm(d_model=config.d_model, eps=1e-5)
        self.ffn = PositionwiseFeedForward(
            d_model=config.d_model,
            d_ff=config.d_ff,
            bias=config.bias,
            activation_name=config.activation_name,
            dropout=config.resid_pdrop,
        )

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        z = z + self.attn(z=self.rms_norm_1(z))
        z = z + self.ffn(self.rms_norm_2(z))
        return z


class GPT(nn.Module):
    def __init__(self, config: GPTConfig) -> None:
        super().__init__()

        self.config = config
        self.d_model = config.d_model
        self.num_blocks = config.num_blocks
        self.vocab_size = config.vocab_size

        self.blocks = nn.ModuleList([GPTBlock(config=config) for _ in range(self.num_blocks)])

        self.backbone = nn.ModuleDict(
            dict(  # noqa: C408
                token_embeddings=nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.d_model),
                position_embeddings=nn.Embedding(num_embeddings=config.context_length, embedding_dim=self.d_model),
                dropout=nn.Dropout(p=config.token_position_pdrop),
                layers=self.blocks,
                ln_final=RMSNorm(d_model=self.d_model, eps=1e-5),
            )
        )
        self.head = nn.Linear(in_features=self.d_model, out_features=self.vocab_size, bias=config.bias)

        self.apply(self._init_weights)

        context_projections = "context_projection.weight"
        # apply special scaled init to the residual projections, per GPT-2 paper
        for parameter_name, parameter in self.named_parameters():
            # NOTE: W_O is also a projection, but I did not have the foresight to name it as such.
            if parameter_name.endswith(context_projections):
                mean = 0.0
                std_dev = 0.02 / torch.sqrt(torch.tensor(2 * config.num_blocks, dtype=torch.float))
                torch.nn.init.normal_(parameter, mean=mean, std=std_dev)

        if config.weight_tie:
            self.backbone.token_embeddings.weight = self.head.weight

    def crop_context_length(self, context_length: int) -> None:
        # NOTE: conveniently took Karpathy's implementation here for cropping
        assert context_length <= self.config.context_length
        self.config.context_length = context_length  # update config

        self.backbone.position_embeddings.weight = nn.Parameter(
            self.backbone.position_embeddings.weight[:context_length]
        )
        for block in self.backbone.layers:
            if hasattr(block.attn, "causal_mask"):
                block.attn.causal_mask = block.attn.causal_mask[:, :, :context_length, :context_length]

            # update context length attribute in MultiHeadSelfAttention
            block.attn.context_length = context_length

    def _init_weights(self, module: nn.Module) -> None:
        normal_init_modules = (nn.Linear, nn.Embedding)
        if isinstance(module, normal_init_modules):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if hasattr(module, "bias") and module.bias is not None:
                torch.nn.init.zeros_(module.bias)

    def forward(self, in_indices: torch.LongTensor) -> torch.FloatTensor:
        device = in_indices.device

        B, T = in_indices.size()

        positions = torch.arange(0, T, dtype=torch.long, device=device)  # [T]
        token_embeddings = self.backbone.token_embeddings(in_indices)  # [B, T, D]
        positional_embeddings = self.backbone.position_embeddings(positions)  # [T, D]
        # fmt: off
        positional_embeddings = positional_embeddings.unsqueeze(0)  # [1, T, D], broadcasts over batch on addition
        # fmt: on

        z = self.backbone.dropout(token_embeddings + positional_embeddings)  # [B, T, D]

        for block in self.backbone.layers:
            z = block(z)  # [B, T, D]

        z = self.backbone.ln_final(z)  # [B, T, D]

        logits = self.head(z)  # [B, T, V]
        return cast(torch.FloatTensor, logits)  # [B, T, V]


def initialize_model(
    config: GPTConfig,
    device: str = "cuda",
) -> GPT:
    if config.d_ff is None:
        config.d_ff = 4 * config.d_model

    model = GPT(config)
    return model.to(device)


def get_random_batch(
    batch_size: int,
    context_length: int,
    vocab_size: int,
    device: str = "cuda",
) -> Tuple[torch.Tensor, torch.Tensor]:
    inputs = torch.randint(  # [B, T]
        0,
        vocab_size,
        (batch_size, context_length),
        dtype=torch.long,
        device=device,
    )

    targets = torch.randint(  # [B, T]
        0,
        vocab_size,
        (batch_size, context_length),
        dtype=torch.long,
        device=device,
    )
    return inputs, targets

Timeit Profiler#

from __future__ import annotations

from contextlib import nullcontext
from timeit import default_timer
from typing import List, Literal, Tuple

import numpy as np
import torch
from pydantic import BaseModel, Field

from omnivault.modules.loss import CrossEntropyLoss


class ProfilingResult(BaseModel):
    computation: Literal["forward", "backward", "forward_backward"] = Field(..., description="Type of computation")
    times: List[float] = Field(..., description="Raw list of measured times")
    mean_time: float = Field(..., description="Mean execution time")
    median_time: float = Field(..., description="Median execution time")
    std_dev: float = Field(..., description="Standard deviation of execution times")
    min_time: float = Field(..., description="Minimum execution time")
    max_time: float = Field(..., description="Maximum execution time")
    total_time: float = Field(..., description="Total execution time")
    profile_steps: int = Field(..., description="Number of profiling runs")


def profile_model(
    model: GPT,
    batch: Tuple[torch.Tensor, torch.Tensor],
    profile_steps: int,
    computation: Literal["forward", "backward", "forward_backward"],
    warmup_steps: int | None = None,
    mixed_precision: bool = False,
) -> ProfilingResult:
    device = next(model.parameters()).device
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    mixed_context = torch.autocast(device.type, dtype=dtype) if mixed_precision else nullcontext()
    criterion = CrossEntropyLoss()
    inputs, targets = batch[0], batch[1]

    with mixed_context:  # type: ignore[attr-defined]
        if warmup_steps:
            for _ in range(warmup_steps):
                logits = model(inputs)
                loss = criterion(logits, targets)
                if computation in ["backward", "forward_backward"]:
                    loss.backward()
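                # drain all warmup work from the GPU so it cannot spill into the timed region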
                torch.cuda.synchronize()

        times = np.zeros(profile_steps)

        for step in range(profile_steps):
            if computation == "forward":
                start = default_timer()
                logits = model(inputs)
                loss = criterion(logits, targets)
            elif computation == "backward":
                logits = model(inputs)
                loss = criterion(logits, targets)
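                # make sure the forward pass has finished on the GPU so that
                # the timer measures only the backward pass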
                torch.cuda.synchronize()
                start = default_timer()
                loss.backward()
            elif computation == "forward_backward":
                start = default_timer()
                logits = model(inputs)
                loss = criterion(logits, targets)
                loss.backward()
            else:
                raise ValueError(f"Invalid computation: {computation}")

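            # block until every queued CUDA kernel has finished; otherwise the
            # timer would stop while the GPU is still busy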
            torch.cuda.synchronize()
            end = default_timer()

            time = end - start
            times[step] = time

    return ProfilingResult(
        computation=computation,
        times=times.tolist(),
        mean_time=float(np.mean(times)),
        median_time=float(np.median(times)),
        std_dev=float(np.std(times)),
        min_time=float(np.min(times)),
        max_time=float(np.max(times)),
        total_time=float(np.sum(times)),
        profile_steps=profile_steps,
    )

Main Profiling Code#

import gc
import itertools
import logging
import sys
from typing import Dict, Iterable, Literal, Tuple

import pandas as pd
import torch
from rich.pretty import pprint
from tqdm.auto import tqdm

from omnivault.utils.reproducibility.seed import seed_all
from omnivault.utils.torch_utils.cleanup import purge_global_scope

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True,
)
logger = logging.getLogger(__name__)

logger.info("Device=%s", device)
2024-08-11 10:57:45,858 - __main__ - INFO - Device=cuda
gpt_small_config = GPTConfig(
    context_length=128,
    vocab_size=10_000,
    d_model=768,
    num_blocks=12,
    num_heads=12,
)
general = General()

seed_all(general.seed, True, False)

batch = get_random_batch(
    batch_size=general.batch_size,
    context_length=gpt_small_config.context_length,
    vocab_size=gpt_small_config.vocab_size,
)

gpt_small = GPT(gpt_small_config).to(device)

results = profile_model(
    model=gpt_small,
    batch=batch,
    warmup_steps=0,
    profile_steps=5,
    computation="forward_backward",
    mixed_precision=False,
)

pprint(results)

results = profile_model(
    model=gpt_small,
    batch=batch,
    warmup_steps=1,
    profile_steps=5,
    computation="forward_backward",
    mixed_precision=False,
)

pprint(results)

purge_global_scope(variable_name_or_names=["gpt_small", "batch"])

try:
    del gpt_small
    del batch
except NameError as exc:
    logger.error("Error deleting variables: %s", exc)
ProfilingResult(
computation='forward_backward',
times=[0.4144910469995011, 0.2255011379993448, 0.22108890900017286, 0.22048294700016413, 0.22087210900008358],
mean_time=0.2604872299998533,
median_time=0.22108890900017286,
std_dev=0.0770235424421242,
min_time=0.22048294700016413,
max_time=0.4144910469995011,
total_time=1.3024361499992665,
profile_steps=5
)
ProfilingResult(
computation='forward_backward',
times=[
│   │   0.22120609100056754,
│   │   0.22078104100000928,
│   │   0.2206222049999269,
│   │   0.22060177200000908,
│   │   0.22016962399993645
],
mean_time=0.22067614660008986,
median_time=0.2206222049999269,
std_dev=0.0003337215379814711,
min_time=0.22016962399993645,
max_time=0.22120609100056754,
total_time=1.1033807330004493,
profile_steps=5
)
2024-08-11 10:58:05,131 - __main__ - ERROR - Error deleting variables: name 'gpt_small' is not defined
def create_profile_configs(context_length: int, vocab_size: int) -> Iterable[Tuple[str, GPTConfig, ProfilerConfig]]:
    gpt_configs: Dict[str, Dict[str, int]] = {
        "small": {"d_model": 768, "num_blocks": 12, "num_heads": 12},
        "medium": {"d_model": 1024, "num_blocks": 24, "num_heads": 16},
    }
    computations: Tuple[Literal["forward", "backward", "forward_backward"], ...] = (
        "forward",
        "backward",
        "forward_backward",
    )
    warmup_steps: Tuple[int, ...] = (0, 1)
    mixed_precision_options: Tuple[bool, ...] = (False, True)
    profile_steps: Tuple[int, ...] = (5,)

    for (config_name, config), computation, warmup, mixed, steps in itertools.product(
        gpt_configs.items(),
        computations,
        warmup_steps,
        mixed_precision_options,
        profile_steps,
    ):
        gpt_config = GPTConfig(**config, context_length=context_length, vocab_size=vocab_size)  # type: ignore[arg-type]
        profiler_config = ProfilerConfig(
            computation=computation,
            warmup_steps=warmup,
            profile_steps=steps,
            mixed_precision=mixed,
        )
        yield config_name, gpt_config, profiler_config


def run_profile(
    device: torch.device,
    gpt_config: GPTConfig,
    profiler_config: ProfilerConfig,
    general: General,
) -> ProfilingResult:
    logger.info("Running profile with GPT config: \n%s", gpt_config.model_dump_json(indent=4))
    logger.info("Profiler config: \n%s", profiler_config.model_dump_json(indent=4))

    seed_all(general.seed, True, False)
    batch = get_random_batch(
        batch_size=general.batch_size,
        context_length=gpt_config.context_length,
        vocab_size=gpt_config.vocab_size,
    )

    gpt = GPT(config=gpt_config).to(device)

    result = profile_model(
        model=gpt,
        batch=batch,
        warmup_steps=profiler_config.warmup_steps,
        profile_steps=profiler_config.profile_steps,
        mixed_precision=profiler_config.mixed_precision,
        computation=profiler_config.computation,
    )

    logger.warning("Purging global scope variables `gpt` and `batch` to free up memory.")

    del gpt
    del batch
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return result


def results_to_dataframe(results: Dict[str, ProfilingResult]) -> pd.DataFrame:
    data = []
    for name, result in results.items():
        row = result.model_dump()
        row["name"] = name
        data.append(row)

    df = pd.DataFrame(data)
    columns = ["name"] + [col for col in df.columns if col != "name"]
    df = df[columns]
    return df


def main() -> Dict[str, ProfilingResult]:
    context_length = 128
    vocab_size = 10_000
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    general = General()

    results: Dict[str, ProfilingResult] = {}

    all_configs = list(create_profile_configs(context_length, vocab_size))

    for config_name, gpt_config, profiler_config in tqdm(all_configs, desc="Profiling Configurations"):
        key = (
            f"{config_name}_{profiler_config.computation}_"
            f"warmup_{profiler_config.warmup_steps}_"
            f"mixed_{profiler_config.mixed_precision}"
        )
        logger.info("Running profile for: %s", key)
        results[key] = run_profile(device, gpt_config, profiler_config, general)
        logger.info("Profile result: \n%s\n\n\n", results[key].model_dump_json(indent=4))

    return results


results = main()
2024-08-11 10:58:52,908 - __main__ - INFO - Running profile for: small_forward_warmup_0_mixed_False
2024-08-11 10:58:52,909 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:58:52,910 - __main__ - INFO - Profiler config: 
{
    "computation": "forward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 10:58:56,410 - __main__ - INFO - Profile result: 
{
    "computation": "forward",
    "times": [
        0.07846403200073837,
        0.0737714920005601,
        0.07313682499989227,
        0.07290069700047752,
        0.07229203399947437
    ],
    "mean_time": 0.07411301600022853,
    "median_time": 0.07313682499989227,
    "std_dev": 0.0022265049091528,
    "min_time": 0.07229203399947437,
    "max_time": 0.07846403200073837,
    "total_time": 0.37056508000114263,
    "profile_steps": 5
}



2024-08-11 10:58:56,411 - __main__ - INFO - Running profile for: small_forward_warmup_0_mixed_True
2024-08-11 10:58:56,413 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:58:56,413 - __main__ - INFO - Profiler config: 
{
    "computation": "forward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 10:58:59,672 - __main__ - INFO - Profile result: 
{
    "computation": "forward",
    "times": [
        0.0935192509996341,
        0.07841499799997109,
        0.07517127899973275,
        0.07425991700074519,
        0.07425473500006774
    ],
    "mean_time": 0.07912403600003018,
    "median_time": 0.07517127899973275,
    "std_dev": 0.007358246850183679,
    "min_time": 0.07425473500006774,
    "max_time": 0.0935192509996341,
    "total_time": 0.39562018000015087,
    "profile_steps": 5
}



2024-08-11 10:58:59,674 - __main__ - INFO - Running profile for: small_forward_warmup_1_mixed_False
2024-08-11 10:58:59,675 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:58:59,676 - __main__ - INFO - Profiler config: 
{
    "computation": "forward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 10:59:03,039 - __main__ - INFO - Profile result: 
{
    "computation": "forward",
    "times": [
        0.0713498349996371,
        0.0709589389998655,
        0.07128997600011644,
        0.07113105599910341,
        0.07112360400060425
    ],
    "mean_time": 0.07117068199986534,
    "median_time": 0.07113105599910341,
    "std_dev": 0.00013780312900865102,
    "min_time": 0.0709589389998655,
    "max_time": 0.0713498349996371,
    "total_time": 0.3558534099993267,
    "profile_steps": 5
}



2024-08-11 10:59:03,041 - __main__ - INFO - Running profile for: small_forward_warmup_1_mixed_True
2024-08-11 10:59:03,042 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:59:03,043 - __main__ - INFO - Profiler config: 
{
    "computation": "forward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 10:59:06,362 - __main__ - INFO - Profile result: 
{
    "computation": "forward",
    "times": [
        0.07369880799979,
        0.07382382700052403,
        0.07385925299968221,
        0.07376415900034772,
        0.07382289199995284
    ],
    "mean_time": 0.07379378780005937,
    "median_time": 0.07382289199995284,
    "std_dev": 0.00005645197712961995,
    "min_time": 0.07369880799979,
    "max_time": 0.07385925299968221,
    "total_time": 0.3689689390002968,
    "profile_steps": 5
}



2024-08-11 10:59:06,364 - __main__ - INFO - Running profile for: small_backward_warmup_0_mixed_False
2024-08-11 10:59:06,365 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:59:06,365 - __main__ - INFO - Profiler config: 
{
    "computation": "backward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 10:59:10,363 - __main__ - INFO - Profile result: 
{
    "computation": "backward",
    "times": [
        0.14778646800004935,
        0.1506624739995459,
        0.15020622800057026,
        0.15019696000035765,
        0.15047101199979807
    ],
    "mean_time": 0.14986462840006426,
    "median_time": 0.15020622800057026,
    "std_dev": 0.0010535790334329838,
    "min_time": 0.14778646800004935,
    "max_time": 0.1506624739995459,
    "total_time": 0.7493231420003212,
    "profile_steps": 5
}



2024-08-11 10:59:10,365 - __main__ - INFO - Running profile for: small_backward_warmup_0_mixed_True
2024-08-11 10:59:10,366 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:59:10,367 - __main__ - INFO - Profiler config: 
{
    "computation": "backward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 10:59:14,375 - __main__ - INFO - Profile result: 
{
    "computation": "backward",
    "times": [
        0.14960491599958914,
        0.15169357899958413,
        0.15144059900012508,
        0.15148220399987622,
        0.15148427300027834
    ],
    "mean_time": 0.15114111419989057,
    "median_time": 0.15148220399987622,
    "std_dev": 0.000773164099660496,
    "min_time": 0.14960491599958914,
    "max_time": 0.15169357899958413,
    "total_time": 0.7557055709994529,
    "profile_steps": 5
}



2024-08-11 10:59:14,377 - __main__ - INFO - Running profile for: small_backward_warmup_1_mixed_False
2024-08-11 10:59:14,378 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:59:14,379 - __main__ - INFO - Profiler config: 
{
    "computation": "backward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 10:59:18,635 - __main__ - INFO - Profile result: 
{
    "computation": "backward",
    "times": [
        0.15043866700034414,
        0.15016293000007863,
        0.15051599399976112,
        0.14995777100011765,
        0.15019284299978608
    ],
    "mean_time": 0.15025364100001753,
    "median_time": 0.15019284299978608,
    "std_dev": 0.00020125986010833822,
    "min_time": 0.14995777100011765,
    "max_time": 0.15051599399976112,
    "total_time": 0.7512682050000876,
    "profile_steps": 5
}



2024-08-11 10:59:18,637 - __main__ - INFO - Running profile for: small_backward_warmup_1_mixed_True
2024-08-11 10:59:18,638 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:59:18,638 - __main__ - INFO - Profiler config: 
{
    "computation": "backward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 10:59:22,886 - __main__ - INFO - Profile result: 
{
    "computation": "backward",
    "times": [
        0.15156354600003397,
        0.15143018500020844,
        0.15140369100026874,
        0.15138537599978008,
        0.15155341100035002
    ],
    "mean_time": 0.15146724180012824,
    "median_time": 0.15143018500020844,
    "std_dev": 0.00007591251522123026,
    "min_time": 0.15138537599978008,
    "max_time": 0.15156354600003397,
    "total_time": 0.7573362090006412,
    "profile_steps": 5
}



2024-08-11 10:59:22,888 - __main__ - INFO - Running profile for: small_forward_backward_warmup_0_mixed_False
2024-08-11 10:59:22,890 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:59:22,890 - __main__ - INFO - Profiler config: 
{
    "computation": "forward_backward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 10:59:26,934 - __main__ - INFO - Profile result: 
{
    "computation": "forward_backward",
    "times": [
        0.21832664599969576,
        0.22057815300013317,
        0.22008085299967206,
        0.22023602300032508,
        0.22247255000002042
    ],
    "mean_time": 0.2203388449999693,
    "median_time": 0.22023602300032508,
    "std_dev": 0.0013218201389535288,
    "min_time": 0.21832664599969576,
    "max_time": 0.22247255000002042,
    "total_time": 1.1016942249998465,
    "profile_steps": 5
}



2024-08-11 10:59:26,936 - __main__ - INFO - Running profile for: small_forward_backward_warmup_0_mixed_True
2024-08-11 10:59:26,937 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:59:26,938 - __main__ - INFO - Profiler config: 
{
    "computation": "forward_backward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 10:59:30,955 - __main__ - INFO - Profile result: 
{
    "computation": "forward_backward",
    "times": [
        0.22407326100073988,
        0.22444049899968377,
        0.22447669499979384,
        0.22424623200004135,
        0.2243646639999497
    ],
    "mean_time": 0.2243202702000417,
    "median_time": 0.2243646639999497,
    "std_dev": 0.00014655353377531386,
    "min_time": 0.22407326100073988,
    "max_time": 0.22447669499979384,
    "total_time": 1.1216013510002085,
    "profile_steps": 5
}



2024-08-11 10:59:30,957 - __main__ - INFO - Running profile for: small_forward_backward_warmup_1_mixed_False
2024-08-11 10:59:30,958 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:59:30,958 - __main__ - INFO - Profiler config: 
{
    "computation": "forward_backward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 10:59:35,215 - __main__ - INFO - Profile result: 
{
    "computation": "forward_backward",
    "times": [
        0.22083036900039588,
        0.22020937799970852,
        0.22077888900003018,
        0.2204785499998252,
        0.22015970299980836
    ],
    "mean_time": 0.22049137779995362,
    "median_time": 0.2204785499998252,
    "std_dev": 0.0002783071457913188,
    "min_time": 0.22015970299980836,
    "max_time": 0.22083036900039588,
    "total_time": 1.1024568889997681,
    "profile_steps": 5
}



2024-08-11 10:59:35,217 - __main__ - INFO - Running profile for: small_forward_backward_warmup_1_mixed_True
2024-08-11 10:59:35,218 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 768,
    "d_ff": null,
    "num_heads": 12,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 12,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:59:35,219 - __main__ - INFO - Profiler config: 
{
    "computation": "forward_backward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 10:59:39,489 - __main__ - INFO - Profile result: 
{
    "computation": "forward_backward",
    "times": [
        0.22448583999994298,
        0.22421371400014323,
        0.22419899699980306,
        0.224439366000297,
        0.22438234299988835
    ],
    "mean_time": 0.22434405200001492,
    "median_time": 0.22438234299988835,
    "std_dev": 0.00011720387667794871,
    "min_time": 0.22419899699980306,
    "max_time": 0.22448583999994298,
    "total_time": 1.1217202600000746,
    "profile_steps": 5
}



2024-08-11 10:59:39,491 - __main__ - INFO - Running profile for: medium_forward_warmup_0_mixed_False
2024-08-11 10:59:39,492 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:59:39,493 - __main__ - INFO - Profiler config: 
{
    "computation": "forward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 10:59:50,458 - __main__ - INFO - Profile result: 
{
    "computation": "forward",
    "times": [
        0.24876118500014854,
        0.24528420900060155,
        0.2450883880001129,
        0.24495568000020285,
        0.24501344400050584
    ],
    "mean_time": 0.24582058120031433,
    "median_time": 0.2450883880001129,
    "std_dev": 0.0014744814187987805,
    "min_time": 0.24495568000020285,
    "max_time": 0.24876118500014854,
    "total_time": 1.2291029060015717,
    "profile_steps": 5
}



2024-08-11 10:59:50,459 - __main__ - INFO - Running profile for: medium_forward_warmup_0_mixed_True
2024-08-11 10:59:50,460 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 10:59:50,461 - __main__ - INFO - Profiler config: 
{
    "computation": "forward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 11:00:00,845 - __main__ - INFO - Profile result: 
{
    "computation": "forward",
    "times": [
        0.22800831199947424,
        0.22234672700051306,
        0.22269292600049084,
        0.22257933899982163,
        0.22283626599983108
    ],
    "mean_time": 0.22369271400002616,
    "median_time": 0.22269292600049084,
    "std_dev": 0.002163735205972857,
    "min_time": 0.22234672700051306,
    "max_time": 0.22800831199947424,
    "total_time": 1.1184635700001309,
    "profile_steps": 5
}



2024-08-11 11:00:00,847 - __main__ - INFO - Running profile for: medium_forward_warmup_1_mixed_False
2024-08-11 11:00:00,848 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 11:00:00,849 - __main__ - INFO - Profiler config: 
{
    "computation": "forward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 11:00:12,121 - __main__ - INFO - Profile result: 
{
    "computation": "forward",
    "times": [
        0.24505015699924115,
        0.2447970040002474,
        0.2449814139999944,
        0.24467239400019025,
        0.24507564099985757
    ],
    "mean_time": 0.24491532199990615,
    "median_time": 0.2449814139999944,
    "std_dev": 0.00015573308791388857,
    "min_time": 0.24467239400019025,
    "max_time": 0.24507564099985757,
    "total_time": 1.2245766099995308,
    "profile_steps": 5
}



2024-08-11 11:00:12,123 - __main__ - INFO - Running profile for: medium_forward_warmup_1_mixed_True
2024-08-11 11:00:12,124 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 11:00:12,125 - __main__ - INFO - Profiler config: 
{
    "computation": "forward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 11:00:22,989 - __main__ - INFO - Profile result: 
{
    "computation": "forward",
    "times": [
        0.22247573399999965,
        0.22273382500043226,
        0.22256395700060239,
        0.22253749899937247,
        0.2227441450004335
    ],
    "mean_time": 0.22261103200016805,
    "median_time": 0.22256395700060239,
    "std_dev": 0.00010837518017815013,
    "min_time": 0.22247573399999965,
    "max_time": 0.2227441450004335,
    "total_time": 1.1130551600008403,
    "profile_steps": 5
}



2024-08-11 11:00:22,991 - __main__ - INFO - Running profile for: medium_backward_warmup_0_mixed_False
2024-08-11 11:00:22,992 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 11:00:22,993 - __main__ - INFO - Profiler config: 
{
    "computation": "backward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 11:00:36,042 - __main__ - INFO - Profile result: 
{
    "computation": "backward",
    "times": [
        0.4803364019999208,
        0.4842157639996003,
        0.48418703799961804,
        0.4834751440002947,
        0.48341351999988547
    ],
    "mean_time": 0.48312557359986386,
    "median_time": 0.4834751440002947,
    "std_dev": 0.001435256951747305,
    "min_time": 0.4803364019999208,
    "max_time": 0.4842157639996003,
    "total_time": 2.4156278679993193,
    "profile_steps": 5
}



2024-08-11 11:00:36,044 - __main__ - INFO - Running profile for: medium_backward_warmup_0_mixed_True
2024-08-11 11:00:36,045 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 11:00:36,046 - __main__ - INFO - Profiler config: 
{
    "computation": "backward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 11:00:49,070 - __main__ - INFO - Profile result: 
{
    "computation": "backward",
    "times": [
        0.4554312190002747,
        0.4636755310002627,
        0.46323080499951175,
        0.46364944700053456,
        0.46372856899961334
    ],
    "mean_time": 0.4619431142000394,
    "median_time": 0.46364944700053456,
    "std_dev": 0.0032607856453568743,
    "min_time": 0.4554312190002747,
    "max_time": 0.46372856899961334,
    "total_time": 2.309715571000197,
    "profile_steps": 5
}



2024-08-11 11:00:49,072 - __main__ - INFO - Running profile for: medium_backward_warmup_1_mixed_False
2024-08-11 11:00:49,073 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 11:00:49,073 - __main__ - INFO - Profiler config: 
{
    "computation": "backward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 11:01:02,813 - __main__ - INFO - Profile result: 
{
    "computation": "backward",
    "times": [
        0.48379133800062846,
        0.4842440839993287,
        0.48351270700004534,
        0.48362178600018524,
        0.48389172999941366
    ],
    "mean_time": 0.48381232899992027,
    "median_time": 0.48379133800062846,
    "std_dev": 0.00025268062760848447,
    "min_time": 0.48351270700004534,
    "max_time": 0.4842440839993287,
    "total_time": 2.4190616449996014,
    "profile_steps": 5
}



2024-08-11 11:01:02,815 - __main__ - INFO - Running profile for: medium_backward_warmup_1_mixed_True
2024-08-11 11:01:02,816 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 11:01:02,816 - __main__ - INFO - Profiler config: 
{
    "computation": "backward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 11:01:16,508 - __main__ - INFO - Profile result: 
{
    "computation": "backward",
    "times": [
        0.46393922600054793,
        0.4636971440004345,
        0.4640383700007078,
        0.4659291400002985,
        0.4634621249997508
    ],
    "mean_time": 0.4642132010003479,
    "median_time": 0.46393922600054793,
    "std_dev": 0.0008809659262803036,
    "min_time": 0.4634621249997508,
    "max_time": 0.4659291400002985,
    "total_time": 2.3210660050017395,
    "profile_steps": 5
}



2024-08-11 11:01:16,510 - __main__ - INFO - Running profile for: medium_forward_backward_warmup_0_mixed_False
2024-08-11 11:01:16,511 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 11:01:16,512 - __main__ - INFO - Profiler config: 
{
    "computation": "forward_backward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 11:01:29,358 - __main__ - INFO - Profile result: 
{
    "computation": "forward_backward",
    "times": [
        0.7304752370000642,
        0.727164707999691,
        0.7266143320002811,
        0.7294365609996021,
        0.7265293540003768
    ],
    "mean_time": 0.7280440384000031,
    "median_time": 0.727164707999691,
    "std_dev": 0.0016100557130809725,
    "min_time": 0.7265293540003768,
    "max_time": 0.7304752370000642,
    "total_time": 3.6402201920000152,
    "profile_steps": 5
}



2024-08-11 11:01:29,359 - __main__ - INFO - Running profile for: medium_forward_backward_warmup_0_mixed_True
2024-08-11 11:01:29,360 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 11:01:29,361 - __main__ - INFO - Profiler config: 
{
    "computation": "forward_backward",
    "warmup_steps": 0,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 11:01:42,075 - __main__ - INFO - Profile result: 
{
    "computation": "forward_backward",
    "times": [
        0.6820840510008566,
        0.684006079000028,
        0.6853365530005249,
        0.6837199530000362,
        0.6839223050010332
    ],
    "mean_time": 0.6838137882004958,
    "median_time": 0.6839223050010332,
    "std_dev": 0.0010361814617577633,
    "min_time": 0.6820840510008566,
    "max_time": 0.6853365530005249,
    "total_time": 3.419068941002479,
    "profile_steps": 5
}



2024-08-11 11:01:42,077 - __main__ - INFO - Running profile for: medium_forward_backward_warmup_1_mixed_False
2024-08-11 11:01:42,078 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 11:01:42,079 - __main__ - INFO - Profiler config: 
{
    "computation": "forward_backward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": false
}
2024-08-11 11:01:56,159 - __main__ - INFO - Profile result: 
{
    "computation": "forward_backward",
    "times": [
        0.7272930130002351,
        0.7277811669991934,
        0.7261910039997019,
        0.7264165149990731,
        0.7264139199996862
    ],
    "mean_time": 0.7268191237995779,
    "median_time": 0.7264165149990731,
    "std_dev": 0.0006117052461950424,
    "min_time": 0.7261910039997019,
    "max_time": 0.7277811669991934,
    "total_time": 3.6340956189978897,
    "profile_steps": 5
}



2024-08-11 11:01:56,161 - __main__ - INFO - Running profile for: medium_forward_backward_warmup_1_mixed_True
2024-08-11 11:01:56,162 - __main__ - INFO - Running profile with GPT config: 
{
    "approximate": null,
    "activation_name": "gelu",
    "d_model": 1024,
    "d_ff": null,
    "num_heads": 16,
    "context_length": 128,
    "attn_pdrop": 0.0,
    "resid_pdrop": 0.0,
    "bias": false,
    "vocab_size": 10000,
    "num_blocks": 24,
    "token_position_pdrop": 0.0,
    "weight_tie": false
}
2024-08-11 11:01:56,163 - __main__ - INFO - Profiler config: 
{
    "computation": "forward_backward",
    "warmup_steps": 1,
    "profile_steps": 5,
    "mixed_precision": true
}
2024-08-11 11:02:09,545 - __main__ - INFO - Profile result: 
{
    "computation": "forward_backward",
    "times": [
        0.6852539360006631,
        0.68381571899954,
        0.6833910059995105,
        0.6835393290002685,
        0.6835425659992325
    ],
    "mean_time": 0.6839085111998429,
    "median_time": 0.6835425659992325,
    "std_dev": 0.000686556815682806,
    "min_time": 0.6833910059995105,
    "max_time": 0.6852539360006631,
    "total_time": 3.4195425559992145,
    "profile_steps": 5
}

We see that torch.cuda.synchronize() is scattered throughout the profiler to ensure the timings are accurate: CUDA operations are asynchronous and do not block the CPU, so without synchronization the timer would stop before the GPU has actually finished its work.
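
To see why this matters, here is a minimal sketch (assuming a CUDA device is available; the matrix size is arbitrary) contrasting a naive timer with a synchronized one:

import torch
from timeit import default_timer

x = torch.randn(4096, 4096, device="cuda")

# Naive: the matmul is merely queued on the GPU while the CPU races ahead,
# so this mostly measures kernel-launch overhead.
start = default_timer()
y = x @ x
naive = default_timer() - start

# Synchronized: block until the GPU has actually finished the work.
start = default_timer()
y = x @ x
torch.cuda.synchronize()
synced = default_timer() - start

print(f"naive={naive:.6f}s, synced={synced:.6f}s")  # naive is misleadingly small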

df = results_to_dataframe(results)

df_by_mean = df.sort_values(by='mean_time', ascending=True)
display(df_by_mean)
name computation times mean_time median_time std_dev min_time max_time total_time profile_steps
2 small_forward_warmup_1_mixed_False forward [0.0713498349996371, 0.0709589389998655, 0.071... 0.071171 0.071131 0.000138 0.070959 0.071350 0.355853 5
3 small_forward_warmup_1_mixed_True forward [0.07369880799979, 0.07382382700052403, 0.0738... 0.073794 0.073823 0.000056 0.073699 0.073859 0.368969 5
0 small_forward_warmup_0_mixed_False forward [0.07846403200073837, 0.0737714920005601, 0.07... 0.074113 0.073137 0.002227 0.072292 0.078464 0.370565 5
1 small_forward_warmup_0_mixed_True forward [0.0935192509996341, 0.07841499799997109, 0.07... 0.079124 0.075171 0.007358 0.074255 0.093519 0.395620 5
4 small_backward_warmup_0_mixed_False backward [0.14778646800004935, 0.1506624739995459, 0.15... 0.149865 0.150206 0.001054 0.147786 0.150662 0.749323 5
6 small_backward_warmup_1_mixed_False backward [0.15043866700034414, 0.15016293000007863, 0.1... 0.150254 0.150193 0.000201 0.149958 0.150516 0.751268 5
5 small_backward_warmup_0_mixed_True backward [0.14960491599958914, 0.15169357899958413, 0.1... 0.151141 0.151482 0.000773 0.149605 0.151694 0.755706 5
7 small_backward_warmup_1_mixed_True backward [0.15156354600003397, 0.15143018500020844, 0.1... 0.151467 0.151430 0.000076 0.151385 0.151564 0.757336 5
8 small_forward_backward_warmup_0_mixed_False forward_backward [0.21832664599969576, 0.22057815300013317, 0.2... 0.220339 0.220236 0.001322 0.218327 0.222473 1.101694 5
10 small_forward_backward_warmup_1_mixed_False forward_backward [0.22083036900039588, 0.22020937799970852, 0.2... 0.220491 0.220479 0.000278 0.220160 0.220830 1.102457 5
15 medium_forward_warmup_1_mixed_True forward [0.22247573399999965, 0.22273382500043226, 0.2... 0.222611 0.222564 0.000108 0.222476 0.222744 1.113055 5
13 medium_forward_warmup_0_mixed_True forward [0.22800831199947424, 0.22234672700051306, 0.2... 0.223693 0.222693 0.002164 0.222347 0.228008 1.118464 5
9 small_forward_backward_warmup_0_mixed_True forward_backward [0.22407326100073988, 0.22444049899968377, 0.2... 0.224320 0.224365 0.000147 0.224073 0.224477 1.121601 5
11 small_forward_backward_warmup_1_mixed_True forward_backward [0.22448583999994298, 0.22421371400014323, 0.2... 0.224344 0.224382 0.000117 0.224199 0.224486 1.121720 5
14 medium_forward_warmup_1_mixed_False forward [0.24505015699924115, 0.2447970040002474, 0.24... 0.244915 0.244981 0.000156 0.244672 0.245076 1.224577 5
12 medium_forward_warmup_0_mixed_False forward [0.24876118500014854, 0.24528420900060155, 0.2... 0.245821 0.245088 0.001474 0.244956 0.248761 1.229103 5
17 medium_backward_warmup_0_mixed_True backward [0.4554312190002747, 0.4636755310002627, 0.463... 0.461943 0.463649 0.003261 0.455431 0.463729 2.309716 5
19 medium_backward_warmup_1_mixed_True backward [0.46393922600054793, 0.4636971440004345, 0.46... 0.464213 0.463939 0.000881 0.463462 0.465929 2.321066 5
16 medium_backward_warmup_0_mixed_False backward [0.4803364019999208, 0.4842157639996003, 0.484... 0.483126 0.483475 0.001435 0.480336 0.484216 2.415628 5
18 medium_backward_warmup_1_mixed_False backward [0.48379133800062846, 0.4842440839993287, 0.48... 0.483812 0.483791 0.000253 0.483513 0.484244 2.419062 5
21 medium_forward_backward_warmup_0_mixed_True forward_backward [0.6820840510008566, 0.684006079000028, 0.6853... 0.683814 0.683922 0.001036 0.682084 0.685337 3.419069 5
23 medium_forward_backward_warmup_1_mixed_True forward_backward [0.6852539360006631, 0.68381571899954, 0.68339... 0.683909 0.683543 0.000687 0.683391 0.685254 3.419543 5
22 medium_forward_backward_warmup_1_mixed_False forward_backward [0.7272930130002351, 0.7277811669991934, 0.726... 0.726819 0.726417 0.000612 0.726191 0.727781 3.634096 5
20 medium_forward_backward_warmup_0_mixed_False forward_backward [0.7304752370000642, 0.727164707999691, 0.7266... 0.728044 0.727165 0.001610 0.726529 0.730475 3.640220 5

Why is warmup recommended by CS336’s lecturers before timing? Across the board you can see that, keeping all other variables constant, runs with 1 warmup step have a much lower standard deviation than runs without any. The intuition is that the first iterations pay one-off costs, such as CUDA context initialization, memory allocation, and cold caches, along with other compilation and optimization that happens under the hood; once warmup has absorbed these costs, the measurements stabilise. The sketch after the table below makes this pattern explicit.

display(df[df["name"].isin(["medium_forward_backward_warmup_0_mixed_False", "medium_forward_backward_warmup_1_mixed_False"])])
name computation times mean_time median_time std_dev min_time max_time total_time profile_steps
20 medium_forward_backward_warmup_0_mixed_False forward_backward [0.7304752370000642, 0.727164707999691, 0.7266... 0.728044 0.727165 0.001610 0.726529 0.730475 3.640220 5
22 medium_forward_backward_warmup_1_mixed_False forward_backward [0.7272930130002351, 0.7277811669991934, 0.726... 0.726819 0.726417 0.000612 0.726191 0.727781 3.634096 5
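
The warmup-then-measure pattern that profile_model implements boils down to the following minimal sketch, where model, inputs, targets, and criterion are assumed to be in scope as in profile_model above, and warmup_steps and profile_steps are illustrative values:

from timeit import default_timer

import torch

warmup_steps, profile_steps = 1, 5
times = []

# Untimed warmup: absorbs one-off costs (CUDA context, allocator, cold caches).
for _ in range(warmup_steps):
    loss = criterion(model(inputs), targets)
    loss.backward()
torch.cuda.synchronize()  # drain all warmup work before timing begins

# Timed region: each step is bracketed by the timer and a synchronize.
for _ in range(profile_steps):
    start = default_timer()
    loss = criterion(model(inputs), targets)
    loss.backward()
    torch.cuda.synchronize()
    times.append(default_timer() - start)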

References And Further Readings#