Profiling Code With Timeit#
# %pip install -q omniverse==0.0.63
Common Functions#
This module includes the GPT model definitions as well as some common configuration.
from __future__ import annotations
from typing import Literal, Tuple, cast
import torch
from pydantic import BaseModel
from torch import nn
from omnivault.modules.activation import GELU, SoftmaxStable
from omnivault.transformer.modules.layers.normalization import RMSNorm
__tagged__ = "This code is tagged to commit `30d963e` of cs336-stanford-spring2024-assignment1-gpt-from-scratch."
__reference__ = ["https://github.com/marcelroed/spring2024-assignment2-systems/blob/master/writeup.pdf"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class General(BaseModel):
batch_size: int = 16
seed: int = 20230310
class ProfilerConfig(BaseModel):
computation: Literal["forward", "backward", "forward_backward"]
warmup_steps: int | None = None
profile_steps: int
mixed_precision: bool = False
class GPTConfig(BaseModel):
approximate: Literal["tanh"] | None = None
activation_name: Literal["gelu"] = "gelu"
d_model: int
d_ff: int | None = None
num_heads: int
context_length: int
attn_pdrop: float = 0.0
resid_pdrop: float = 0.0
bias: bool = False
vocab_size: int
num_blocks: int
token_position_pdrop: float = 0.0
weight_tie: bool = False
class PositionwiseFeedForward(nn.Module):
def __init__(
self,
d_model: int,
d_ff: int | None = None,
bias: bool = False,
activation_name: Literal["gelu"] = "gelu",
dropout: float = 0.0,
) -> None:
super().__init__()
self.d_model = d_model
self.d_ff = d_ff or 4 * d_model
self.bias = bias # bias False in this exercise
self.activation_name = activation_name
self.dropout = dropout
self.ffn = nn.ModuleDict(
{
# incoming `B x T x D` and we are interested in `T x D` so weight is `D x d_ff`
# so that `Z @ W1 -> (T x D) @ (D x d_ff)`
"context_fc": nn.Linear(in_features=self.d_model, out_features=self.d_ff, bias=self.bias),
"activation": self.activation,
                # apply dropout after the activation to randomly zero activations
"dropout": nn.Dropout(p=self.dropout, inplace=False),
# incoming is Z @ W1 -> T x d_ff -> (T x d_ff) @ (d_ff x D) project back to D
"context_projection": nn.Linear(in_features=self.d_ff, out_features=self.d_model, bias=self.bias),
}
)
@property
def activation(self) -> nn.Module:
if self.activation_name == "gelu":
activation = GELU(approximate=None) # no approx using tanh
else:
            raise ValueError(f"Unsupported activation: {self.activation_name}")
return activation
def forward(self, z: torch.Tensor) -> torch.Tensor:
# fmt: off
z = self.ffn["context_fc"](z) # Z @ W1 = [B, T, D] @ [D, d_ff] = [B, T, d_ff]
z = self.ffn["activation"](z) # \sigma(Z @ W1) = [B, T, d_ff]
z = self.ffn["dropout"](z) # \dropout(\sigma(Z @ W1)) = [B, T, d_ff]
z = self.ffn["context_projection"](z) # \dropout(\sigma(Z @ W1)) @ W2 = [B, T, d_ff] @ [d_ff, D] = [B, T, D]
# fmt: on
return z
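As a quick sanity check of the shape annotations above, here is a minimal sketch (assuming the class definition and imports in this module) that pushes a random tensor through the FFN and confirms the output is projected back to `[B, T, D]`:
# Sketch: shape check for PositionwiseFeedForward, assuming the class above.
ffn = PositionwiseFeedForward(d_model=64, d_ff=256, bias=False, activation_name="gelu", dropout=0.0)
z = torch.randn(2, 8, 64)  # [B=2, T=8, D=64]
out = ffn(z)
assert out.shape == (2, 8, 64)  # expanded to d_ff=256 internally, projected back to D=64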
class ScaledDotProductAttention(nn.Module):
def __init__(self, dropout: float = 0.0) -> None:
super().__init__()
self.dropout = nn.Dropout(p=dropout)
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.BoolTensor | None = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
# fmt: off
T, d_q = query.size(-2), query.size(-1)
attention_scores = torch.matmul(query, key.transpose(dim0=-2, dim1=-1)) / torch.sqrt(torch.tensor(d_q).float()) # Q @ K.T = [B, H, T, d_q] @ [B, H, d_q, T] = [B, H, T, T]
        if mask is not None:
            mask = mask[:, :, :T, :T]  # type: ignore[assignment]
            attention_scores = attention_scores.masked_fill(mask == 1, float("-inf"))  # [B, H, T, T]
softmax = SoftmaxStable(dim=-1)
attention_weights = softmax(attention_scores) # [B, H, T, T]
attention_weights = self.dropout(attention_weights) # [B, H, T, T]
context_vector = torch.matmul(attention_weights, value) # [B, H, T, T] @ [B, H, T, d_v] = [B, H, T, d_v]
# fmt: on
return context_vector, attention_weights
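Since the attention module takes the causal mask as an argument, a small sketch (with a hand-built mask mirroring the buffer registered in the next class) can verify that no query attends to a future key, i.e. the attention-weight matrix is lower triangular:
# Sketch: verify causal masking, assuming ScaledDotProductAttention above.
B, H, T, d = 1, 1, 4, 8
q, k, v = torch.randn(B, H, T, d), torch.randn(B, H, T, d), torch.randn(B, H, T, d)
causal_mask = torch.triu(torch.ones(T, T).bool(), diagonal=1).view(1, 1, T, T)
attn = ScaledDotProductAttention(dropout=0.0)
_, weights = attn(query=q, key=k, value=v, mask=causal_mask)
assert torch.allclose(weights, weights.tril())  # zero weight on future positions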
class CausalMultiHeadSelfAttention(nn.Module):
context_vector: torch.Tensor
attention_weights: torch.Tensor
def __init__(
self,
d_model: int,
num_heads: int,
context_length: int,
attn_pdrop: float = 0.0, # pdrop means prob of dropout
resid_pdrop: float = 0.0,
bias: bool = False,
) -> None:
super().__init__()
assert d_model % num_heads == 0
self.d_model = d_model
self.H = num_heads
self.context_length = context_length
self.attn_pdrop = attn_pdrop
self.resid_pdrop = resid_pdrop
self.bias = bias
self.W_Q = nn.Linear(in_features=self.d_model, out_features=self.d_model, bias=self.bias)
self.W_K = nn.Linear(in_features=self.d_model, out_features=self.d_model, bias=self.bias)
self.W_V = nn.Linear(in_features=self.d_model, out_features=self.d_model, bias=self.bias)
# alias of W_O
self.context_projection = nn.Linear(in_features=self.d_model, out_features=self.d_model, bias=self.bias)
# regularization
self.resid_dropout = nn.Dropout(self.resid_pdrop)
self.attention = ScaledDotProductAttention(dropout=self.attn_pdrop)
# causal mask to ensure that attention is only applied to the left in the input sequence
        # registered as a buffer because the mask is not a learnable parameter
self.register_buffer(
"causal_mask",
torch.triu(
torch.ones((self.context_length, self.context_length)).bool(),
diagonal=1,
).view(1, 1, self.context_length, self.context_length),
)
def forward(self, *, z: torch.Tensor) -> torch.Tensor:
B, T, D = z.size()
# fmt: off
Q: torch.Tensor = self.W_Q(z).contiguous() # Z @ W_Q = [B, T, D] @ [D, D] = [B, T, D]
K: torch.Tensor = self.W_K(z).contiguous() # Z @ W_K = [B, T, D] @ [D, D] = [B, T, D]
V: torch.Tensor = self.W_V(z).contiguous() # Z @ W_V = [B, T, D] @ [D, D] = [B, T, D]
Q = Q.view(B, T, self.H, D // self.H).transpose(dim0=1, dim1=2) # [B, T, D] -> [B, T, H, D // H] -> [B, H, T, D//H]
K = K.view(B, T, self.H, D // self.H).transpose(dim0=1, dim1=2)
V = V.view(B, T, self.H, D // self.H).transpose(dim0=1, dim1=2)
# Now pass them to self attention
self.context_vector, self.attention_weights = self.attention(query=Q, key=K, value=V, mask=self.causal_mask) # ([B, H, T, D // H], [B, H, T, T])
assert isinstance(self.context_vector, torch.Tensor) # do this for type hint in IDE
# Now context vector is shape [B, H, T, D // H] but we want [B, T, D] to matmul with W_O/context_projection
self.context_vector = self.context_vector.transpose(dim0=1, dim1=2).contiguous().view(B, T, D) # merge all heads together
# fmt: on
projected_context_vector: torch.Tensor = self.resid_dropout(
self.context_projection(self.context_vector) # [B, T, D] @ [D, D] = [B, T, D]
)
return projected_context_vector
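A quick smoke test for the multi-head wrapper (a sketch, assuming the class above): the output keeps the `[B, T, D]` shape, while the stored attention weights carry one `[T, T]` map per head:
# Sketch: shape smoke test for CausalMultiHeadSelfAttention.
mha = CausalMultiHeadSelfAttention(d_model=64, num_heads=4, context_length=16)
out = mha(z=torch.randn(2, 16, 64))  # [B=2, T=16, D=64]
assert out.shape == (2, 16, 64)
assert mha.attention_weights.shape == (2, 4, 16, 16)  # [B, H, T, T]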
class GPTBlock(nn.Module):
def __init__(
self,
config: GPTConfig,
) -> None:
super().__init__()
self.rmns_1 = RMSNorm(d_model=config.d_model, eps=1e-5)
self.attn = CausalMultiHeadSelfAttention(
d_model=config.d_model,
num_heads=config.num_heads,
context_length=config.context_length,
attn_pdrop=config.attn_pdrop,
resid_pdrop=config.resid_pdrop,
bias=config.bias,
)
self.rmns_2 = RMSNorm(d_model=config.d_model, eps=1e-5)
self.ffn = PositionwiseFeedForward(
d_model=config.d_model,
d_ff=config.d_ff,
bias=config.bias,
activation_name=config.activation_name,
dropout=config.resid_pdrop,
)
def forward(self, z: torch.Tensor) -> torch.Tensor:
z = z + self.attn(z=self.rmns_1(z))
z = z + self.ffn(self.rmns_2(z))
return z
class GPT(nn.Module):
def __init__(self, config: GPTConfig) -> None:
super().__init__()
self.config = config
self.d_model = config.d_model
self.num_blocks = config.num_blocks
self.vocab_size = config.vocab_size
self.blocks = nn.ModuleList([GPTBlock(config=config) for _ in range(self.num_blocks)])
self.backbone = nn.ModuleDict(
dict( # noqa: C408
token_embeddings=nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.d_model),
position_embeddings=nn.Embedding(num_embeddings=config.context_length, embedding_dim=self.d_model),
dropout=nn.Dropout(p=config.token_position_pdrop),
layers=self.blocks,
ln_final=RMSNorm(d_model=self.d_model, eps=1e-5),
)
)
self.head = nn.Linear(in_features=self.d_model, out_features=self.vocab_size, bias=config.bias)
self.apply(self._init_weights)
context_projections = "context_projection.weight"
# apply special scaled init to the residual projections, per GPT-2 paper
for parameter_name, parameter in self.named_parameters():
# NOTE: W_O is also projection but I did not have foresight to name it as such.
if parameter_name.endswith(context_projections):
mean = 0.0
std_dev = 0.02 / torch.sqrt(torch.tensor(2 * config.num_blocks, dtype=torch.float))
torch.nn.init.normal_(parameter, mean=mean, std=std_dev)
if config.weight_tie:
self.backbone.token_embeddings.weight = self.head.weight
def crop_context_length(self, context_length: int) -> None:
# NOTE: conveniently took Karpathy's implementation here for cropping
assert context_length <= self.config.context_length
self.config.context_length = context_length # update config
self.backbone.position_embeddings.weight = nn.Parameter(
self.backbone.position_embeddings.weight[:context_length]
)
for block in self.backbone.layers:
if hasattr(block.attn, "causal_mask"):
block.attn.causal_mask = block.attn.causal_mask[:, :, :context_length, :context_length]
# update context length attribute in MultiHeadSelfAttention
block.attn.context_length = context_length
def _init_weights(self, module: nn.Module) -> None:
normal_init_modules = (nn.Linear, nn.Embedding)
if isinstance(module, normal_init_modules):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
if hasattr(module, "bias") and module.bias is not None:
torch.nn.init.zeros_(module.bias)
def forward(self, in_indices: torch.LongTensor) -> torch.FloatTensor:
device = in_indices.device
B, T = in_indices.size()
positions = torch.arange(0, T, dtype=torch.long, device=device) # [T]
token_embeddings = self.backbone.token_embeddings(in_indices) # [B, T, D]
positional_embeddings = self.backbone.position_embeddings(positions) # [T, D]
# fmt: off
        positional_embeddings = positional_embeddings.unsqueeze(0)  # [1, T, D]; broadcasts against [B, T, D], no need to .expand(B, -1, -1)
# fmt: on
z = self.backbone.dropout(token_embeddings + positional_embeddings) # [B, T, D]
for block in self.backbone.layers:
z = block(z) # [B, T, D]
z = self.backbone.ln_final(z) # [B, T, D]
logits = self.head(z) # [B, T, V]
return cast(torch.FloatTensor, logits) # [B, T, V]
def initialize_model(
config: GPTConfig,
device: str = "cuda",
) -> GPT:
if config.d_ff is None:
config.d_ff = 4 * config.d_model
model = GPT(config)
return model.to(device)
def get_random_batch(
batch_size: int,
context_length: int,
vocab_size: int,
device: str = "cuda",
) -> Tuple[torch.Tensor, torch.Tensor]:
inputs = torch.randint( # [B, T]
0,
vocab_size,
(batch_size, context_length),
dtype=torch.long,
device=device,
)
targets = torch.randint( # [B, T]
0,
vocab_size,
(batch_size, context_length),
dtype=torch.long,
device=device,
)
return inputs, targets
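Before moving on to profiling, a tiny end-to-end sketch (assuming the classes and helpers defined above, with hypothetical toy hyperparameters) shows how the pieces compose into a `[B, T, V]` logits tensor:
# Sketch: tiny GPT forward pass on CPU with toy hyperparameters.
tiny_config = GPTConfig(d_model=32, num_heads=4, context_length=16, vocab_size=100, num_blocks=2)
tiny_gpt = initialize_model(tiny_config, device="cpu")
inputs, targets = get_random_batch(batch_size=2, context_length=16, vocab_size=100, device="cpu")
logits = tiny_gpt(inputs)
assert logits.shape == (2, 16, 100)  # [B, T, V]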
Timeit Profiler#
from __future__ import annotations
from contextlib import nullcontext
from timeit import default_timer
from typing import List, Literal, Tuple
import numpy as np
import torch
from pydantic import BaseModel, Field
from omnivault.modules.loss import CrossEntropyLoss
class ProfilingResult(BaseModel):
computation: Literal["forward", "backward", "forward_backward"] = Field(..., description="Type of computation")
times: List[float] = Field(..., description="Raw list of measured times")
mean_time: float = Field(..., description="Mean execution time")
median_time: float = Field(..., description="Median execution time")
std_dev: float = Field(..., description="Standard deviation of execution times")
min_time: float = Field(..., description="Minimum execution time")
max_time: float = Field(..., description="Maximum execution time")
total_time: float = Field(..., description="Total execution time")
profile_steps: int = Field(..., description="Number of profiling runs")
def profile_model(
model: GPT,
batch: Tuple[torch.Tensor, torch.Tensor],
profile_steps: int,
computation: Literal["forward", "backward", "forward_backward"],
warmup_steps: int | None = None,
mixed_precision: bool = False,
) -> ProfilingResult:
device = next(model.parameters()).device
dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
mixed_context = torch.autocast(device.type, dtype=dtype) if mixed_precision else nullcontext()
criterion = CrossEntropyLoss()
inputs, targets = batch[0], batch[1]
with mixed_context: # type: ignore[attr-defined]
if warmup_steps:
for _ in range(warmup_steps):
logits = model(inputs)
loss = criterion(logits, targets)
if computation in ["backward", "forward_backward"]:
loss.backward()
torch.cuda.synchronize()
times = np.zeros(profile_steps)
for step in range(profile_steps):
if computation == "forward":
start = default_timer()
logits = model(inputs)
loss = criterion(logits, targets)
elif computation == "backward":
logits = model(inputs)
loss = criterion(logits, targets)
torch.cuda.synchronize()
start = default_timer()
loss.backward()
elif computation == "forward_backward":
start = default_timer()
logits = model(inputs)
loss = criterion(logits, targets)
loss.backward()
else:
raise ValueError(f"Invalid computation: {computation}")
torch.cuda.synchronize()
end = default_timer()
time = end - start
times[step] = time
return ProfilingResult(
computation=computation,
times=times.tolist(),
mean_time=float(np.mean(times)),
median_time=float(np.median(times)),
std_dev=float(np.std(times)),
min_time=float(np.min(times)),
max_time=float(np.max(times)),
total_time=float(np.sum(times)),
profile_steps=profile_steps,
)
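Note that profile_model uses timeit.default_timer directly rather than timeit.timeit, because it needs to place torch.cuda.synchronize() calls at precise points. For comparison, a minimal sketch of the same forward-pass measurement via timeit.repeat (assuming model, inputs, targets, and criterion exist as in profile_model), with the synchronization folded into the timed callable:
# Sketch: equivalent forward timing with timeit.repeat; `model`, `inputs`,
# `targets`, and `criterion` are assumed to be defined as in profile_model.
import timeit

def timed_forward() -> None:
    logits = model(inputs)
    loss = criterion(logits, targets)  # noqa: F841
    torch.cuda.synchronize()  # ensure queued kernels finish before the timer stops

times = timeit.repeat(timed_forward, repeat=5, number=1)  # 5 runs of 1 call each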
Main Profiling Code#
import gc
import itertools
import logging
import sys
from typing import Dict, Iterable, Literal, Tuple
import pandas as pd
import torch
from rich.pretty import pprint
from tqdm.auto import tqdm
from omnivault.utils.reproducibility.seed import seed_all
from omnivault.utils.torch_utils.cleanup import purge_global_scope
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
handlers=[logging.StreamHandler(sys.stdout)],
force=True,
)
logger = logging.getLogger(__name__)
logger.info("Device=%s", device)
2024-08-11 10:57:45,858 - __main__ - INFO - Device=cuda
gpt_small_config = GPTConfig(
context_length=128,
vocab_size=10_000,
d_model=768,
num_blocks=12,
num_heads=12,
)
general = General()
seed_all(general.seed, True, False)
batch = get_random_batch(
batch_size=general.batch_size,
context_length=gpt_small_config.context_length,
vocab_size=gpt_small_config.vocab_size,
)
gpt_small = GPT(gpt_small_config).to(device)
results = profile_model(
model=gpt_small,
batch=batch,
warmup_steps=0,
profile_steps=5,
computation="forward_backward",
mixed_precision=False,
)
pprint(results)
results = profile_model(
model=gpt_small,
batch=batch,
warmup_steps=1,
profile_steps=5,
computation="forward_backward",
mixed_precision=False,
)
pprint(results)
purge_global_scope(variable_name_or_names=["gpt_small", "batch"])
try:
del gpt_small
del batch
except NameError as exc:
logger.error("Error deleting variables: %s", exc)
ProfilingResult(
    computation='forward_backward',
    times=[0.4144910469995011, 0.2255011379993448, 0.22108890900017286, 0.22048294700016413, 0.22087210900008358],
    mean_time=0.2604872299998533,
    median_time=0.22108890900017286,
    std_dev=0.0770235424421242,
    min_time=0.22048294700016413,
    max_time=0.4144910469995011,
    total_time=1.3024361499992665,
    profile_steps=5
)
ProfilingResult(
    computation='forward_backward',
    times=[
        0.22120609100056754,
        0.22078104100000928,
        0.2206222049999269,
        0.22060177200000908,
        0.22016962399993645
    ],
    mean_time=0.22067614660008986,
    median_time=0.2206222049999269,
    std_dev=0.0003337215379814711,
    min_time=0.22016962399993645,
    max_time=0.22120609100056754,
    total_time=1.1033807330004493,
    profile_steps=5
)
2024-08-11 10:58:05,131 - __main__ - ERROR - Error deleting variables: name 'gpt_small' is not defined
def create_profile_configs(context_length: int, vocab_size: int) -> Iterable[Tuple[str, GPTConfig, ProfilerConfig]]:
gpt_configs: Dict[str, Dict[str, int]] = {
"small": {"d_model": 768, "num_blocks": 12, "num_heads": 12},
"medium": {"d_model": 1024, "num_blocks": 24, "num_heads": 16},
}
computations: Tuple[Literal["forward", "backward", "forward_backward"], ...] = (
"forward",
"backward",
"forward_backward",
)
warmup_steps: Tuple[int, ...] = (0, 1)
mixed_precision_options: Tuple[bool, ...] = (False, True)
profile_steps: Tuple[int, ...] = (5,)
for (config_name, config), computation, warmup, mixed, steps in itertools.product(
gpt_configs.items(),
computations,
warmup_steps,
mixed_precision_options,
profile_steps,
):
gpt_config = GPTConfig(**config, context_length=context_length, vocab_size=vocab_size) # type: ignore[arg-type]
profiler_config = ProfilerConfig(
computation=computation,
warmup_steps=warmup,
profile_steps=steps,
mixed_precision=mixed,
)
yield config_name, gpt_config, profiler_config
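The Cartesian product above expands to 2 model sizes × 3 computations × 2 warmup settings × 2 precision options × 1 step count = 24 runs, which a one-liner can confirm:
# Sketch: count the generated configurations (2 * 3 * 2 * 2 * 1 = 24).
assert len(list(create_profile_configs(context_length=128, vocab_size=10_000))) == 24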
def run_profile(
device: torch.device,
gpt_config: GPTConfig,
profiler_config: ProfilerConfig,
general: General,
) -> ProfilingResult:
logger.info("Running profile with GPT config: \n%s", gpt_config.model_dump_json(indent=4))
logger.info("Profiler config: \n%s", profiler_config.model_dump_json(indent=4))
seed_all(general.seed, True, False)
batch = get_random_batch(
batch_size=general.batch_size,
context_length=gpt_config.context_length,
vocab_size=gpt_config.vocab_size,
)
gpt = GPT(config=gpt_config).to(device)
result = profile_model(
model=gpt,
batch=batch,
warmup_steps=profiler_config.warmup_steps,
profile_steps=profiler_config.profile_steps,
mixed_precision=profiler_config.mixed_precision,
computation=profiler_config.computation,
)
logger.warning("Purging global scope variables `gpt` and `batch` to free up memory.")
del gpt
del batch
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
return result
def results_to_dataframe(results: Dict[str, ProfilingResult]) -> pd.DataFrame:
data = []
for name, result in results.items():
row = result.model_dump()
row["name"] = name
data.append(row)
df = pd.DataFrame(data)
columns = ["name"] + [col for col in df.columns if col != "name"]
df = df[columns]
return df
def main() -> Dict[str, ProfilingResult]:
context_length = 128
vocab_size = 10_000
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
general = General()
results: Dict[str, ProfilingResult] = {}
all_configs = list(create_profile_configs(context_length, vocab_size))
for config_name, gpt_config, profiler_config in tqdm(all_configs, desc="Profiling Configurations"):
key = (
f"{config_name}_{profiler_config.computation}_"
f"warmup_{profiler_config.warmup_steps}_"
f"mixed_{profiler_config.mixed_precision}"
)
logger.info("Running profile for: %s", key)
results[key] = run_profile(device, gpt_config, profiler_config, general)
logger.info("Profile result: \n%s\n\n\n", results[key].model_dump_json(indent=4))
return results
results = main()
2024-08-11 10:58:52,908 - __main__ - INFO - Running profile for: small_forward_warmup_0_mixed_False
2024-08-11 10:58:52,909 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:58:52,910 - __main__ - INFO - Profiler config:
{
"computation": "forward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 10:58:56,410 - __main__ - INFO - Profile result:
{
"computation": "forward",
"times": [
0.07846403200073837,
0.0737714920005601,
0.07313682499989227,
0.07290069700047752,
0.07229203399947437
],
"mean_time": 0.07411301600022853,
"median_time": 0.07313682499989227,
"std_dev": 0.0022265049091528,
"min_time": 0.07229203399947437,
"max_time": 0.07846403200073837,
"total_time": 0.37056508000114263,
"profile_steps": 5
}
2024-08-11 10:58:56,411 - __main__ - INFO - Running profile for: small_forward_warmup_0_mixed_True
2024-08-11 10:58:56,413 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:58:56,413 - __main__ - INFO - Profiler config:
{
"computation": "forward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 10:58:59,672 - __main__ - INFO - Profile result:
{
"computation": "forward",
"times": [
0.0935192509996341,
0.07841499799997109,
0.07517127899973275,
0.07425991700074519,
0.07425473500006774
],
"mean_time": 0.07912403600003018,
"median_time": 0.07517127899973275,
"std_dev": 0.007358246850183679,
"min_time": 0.07425473500006774,
"max_time": 0.0935192509996341,
"total_time": 0.39562018000015087,
"profile_steps": 5
}
2024-08-11 10:58:59,674 - __main__ - INFO - Running profile for: small_forward_warmup_1_mixed_False
2024-08-11 10:58:59,675 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:58:59,676 - __main__ - INFO - Profiler config:
{
"computation": "forward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 10:59:03,039 - __main__ - INFO - Profile result:
{
"computation": "forward",
"times": [
0.0713498349996371,
0.0709589389998655,
0.07128997600011644,
0.07113105599910341,
0.07112360400060425
],
"mean_time": 0.07117068199986534,
"median_time": 0.07113105599910341,
"std_dev": 0.00013780312900865102,
"min_time": 0.0709589389998655,
"max_time": 0.0713498349996371,
"total_time": 0.3558534099993267,
"profile_steps": 5
}
2024-08-11 10:59:03,041 - __main__ - INFO - Running profile for: small_forward_warmup_1_mixed_True
2024-08-11 10:59:03,042 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:59:03,043 - __main__ - INFO - Profiler config:
{
"computation": "forward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 10:59:06,362 - __main__ - INFO - Profile result:
{
"computation": "forward",
"times": [
0.07369880799979,
0.07382382700052403,
0.07385925299968221,
0.07376415900034772,
0.07382289199995284
],
"mean_time": 0.07379378780005937,
"median_time": 0.07382289199995284,
"std_dev": 0.00005645197712961995,
"min_time": 0.07369880799979,
"max_time": 0.07385925299968221,
"total_time": 0.3689689390002968,
"profile_steps": 5
}
2024-08-11 10:59:06,364 - __main__ - INFO - Running profile for: small_backward_warmup_0_mixed_False
2024-08-11 10:59:06,365 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:59:06,365 - __main__ - INFO - Profiler config:
{
"computation": "backward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 10:59:10,363 - __main__ - INFO - Profile result:
{
"computation": "backward",
"times": [
0.14778646800004935,
0.1506624739995459,
0.15020622800057026,
0.15019696000035765,
0.15047101199979807
],
"mean_time": 0.14986462840006426,
"median_time": 0.15020622800057026,
"std_dev": 0.0010535790334329838,
"min_time": 0.14778646800004935,
"max_time": 0.1506624739995459,
"total_time": 0.7493231420003212,
"profile_steps": 5
}
2024-08-11 10:59:10,365 - __main__ - INFO - Running profile for: small_backward_warmup_0_mixed_True
2024-08-11 10:59:10,366 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:59:10,367 - __main__ - INFO - Profiler config:
{
"computation": "backward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 10:59:14,375 - __main__ - INFO - Profile result:
{
"computation": "backward",
"times": [
0.14960491599958914,
0.15169357899958413,
0.15144059900012508,
0.15148220399987622,
0.15148427300027834
],
"mean_time": 0.15114111419989057,
"median_time": 0.15148220399987622,
"std_dev": 0.000773164099660496,
"min_time": 0.14960491599958914,
"max_time": 0.15169357899958413,
"total_time": 0.7557055709994529,
"profile_steps": 5
}
2024-08-11 10:59:14,377 - __main__ - INFO - Running profile for: small_backward_warmup_1_mixed_False
2024-08-11 10:59:14,378 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:59:14,379 - __main__ - INFO - Profiler config:
{
"computation": "backward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 10:59:18,635 - __main__ - INFO - Profile result:
{
"computation": "backward",
"times": [
0.15043866700034414,
0.15016293000007863,
0.15051599399976112,
0.14995777100011765,
0.15019284299978608
],
"mean_time": 0.15025364100001753,
"median_time": 0.15019284299978608,
"std_dev": 0.00020125986010833822,
"min_time": 0.14995777100011765,
"max_time": 0.15051599399976112,
"total_time": 0.7512682050000876,
"profile_steps": 5
}
2024-08-11 10:59:18,637 - __main__ - INFO - Running profile for: small_backward_warmup_1_mixed_True
2024-08-11 10:59:18,638 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:59:18,638 - __main__ - INFO - Profiler config:
{
"computation": "backward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 10:59:22,886 - __main__ - INFO - Profile result:
{
"computation": "backward",
"times": [
0.15156354600003397,
0.15143018500020844,
0.15140369100026874,
0.15138537599978008,
0.15155341100035002
],
"mean_time": 0.15146724180012824,
"median_time": 0.15143018500020844,
"std_dev": 0.00007591251522123026,
"min_time": 0.15138537599978008,
"max_time": 0.15156354600003397,
"total_time": 0.7573362090006412,
"profile_steps": 5
}
2024-08-11 10:59:22,888 - __main__ - INFO - Running profile for: small_forward_backward_warmup_0_mixed_False
2024-08-11 10:59:22,890 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:59:22,890 - __main__ - INFO - Profiler config:
{
"computation": "forward_backward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 10:59:26,934 - __main__ - INFO - Profile result:
{
"computation": "forward_backward",
"times": [
0.21832664599969576,
0.22057815300013317,
0.22008085299967206,
0.22023602300032508,
0.22247255000002042
],
"mean_time": 0.2203388449999693,
"median_time": 0.22023602300032508,
"std_dev": 0.0013218201389535288,
"min_time": 0.21832664599969576,
"max_time": 0.22247255000002042,
"total_time": 1.1016942249998465,
"profile_steps": 5
}
2024-08-11 10:59:26,936 - __main__ - INFO - Running profile for: small_forward_backward_warmup_0_mixed_True
2024-08-11 10:59:26,937 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:59:26,938 - __main__ - INFO - Profiler config:
{
"computation": "forward_backward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 10:59:30,955 - __main__ - INFO - Profile result:
{
"computation": "forward_backward",
"times": [
0.22407326100073988,
0.22444049899968377,
0.22447669499979384,
0.22424623200004135,
0.2243646639999497
],
"mean_time": 0.2243202702000417,
"median_time": 0.2243646639999497,
"std_dev": 0.00014655353377531386,
"min_time": 0.22407326100073988,
"max_time": 0.22447669499979384,
"total_time": 1.1216013510002085,
"profile_steps": 5
}
2024-08-11 10:59:30,957 - __main__ - INFO - Running profile for: small_forward_backward_warmup_1_mixed_False
2024-08-11 10:59:30,958 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:59:30,958 - __main__ - INFO - Profiler config:
{
"computation": "forward_backward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 10:59:35,215 - __main__ - INFO - Profile result:
{
"computation": "forward_backward",
"times": [
0.22083036900039588,
0.22020937799970852,
0.22077888900003018,
0.2204785499998252,
0.22015970299980836
],
"mean_time": 0.22049137779995362,
"median_time": 0.2204785499998252,
"std_dev": 0.0002783071457913188,
"min_time": 0.22015970299980836,
"max_time": 0.22083036900039588,
"total_time": 1.1024568889997681,
"profile_steps": 5
}
2024-08-11 10:59:35,217 - __main__ - INFO - Running profile for: small_forward_backward_warmup_1_mixed_True
2024-08-11 10:59:35,218 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 768,
"d_ff": null,
"num_heads": 12,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 12,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:59:35,219 - __main__ - INFO - Profiler config:
{
"computation": "forward_backward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 10:59:39,489 - __main__ - INFO - Profile result:
{
"computation": "forward_backward",
"times": [
0.22448583999994298,
0.22421371400014323,
0.22419899699980306,
0.224439366000297,
0.22438234299988835
],
"mean_time": 0.22434405200001492,
"median_time": 0.22438234299988835,
"std_dev": 0.00011720387667794871,
"min_time": 0.22419899699980306,
"max_time": 0.22448583999994298,
"total_time": 1.1217202600000746,
"profile_steps": 5
}
2024-08-11 10:59:39,491 - __main__ - INFO - Running profile for: medium_forward_warmup_0_mixed_False
2024-08-11 10:59:39,492 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:59:39,493 - __main__ - INFO - Profiler config:
{
"computation": "forward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 10:59:50,458 - __main__ - INFO - Profile result:
{
"computation": "forward",
"times": [
0.24876118500014854,
0.24528420900060155,
0.2450883880001129,
0.24495568000020285,
0.24501344400050584
],
"mean_time": 0.24582058120031433,
"median_time": 0.2450883880001129,
"std_dev": 0.0014744814187987805,
"min_time": 0.24495568000020285,
"max_time": 0.24876118500014854,
"total_time": 1.2291029060015717,
"profile_steps": 5
}
2024-08-11 10:59:50,459 - __main__ - INFO - Running profile for: medium_forward_warmup_0_mixed_True
2024-08-11 10:59:50,460 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 10:59:50,461 - __main__ - INFO - Profiler config:
{
"computation": "forward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 11:00:00,845 - __main__ - INFO - Profile result:
{
"computation": "forward",
"times": [
0.22800831199947424,
0.22234672700051306,
0.22269292600049084,
0.22257933899982163,
0.22283626599983108
],
"mean_time": 0.22369271400002616,
"median_time": 0.22269292600049084,
"std_dev": 0.002163735205972857,
"min_time": 0.22234672700051306,
"max_time": 0.22800831199947424,
"total_time": 1.1184635700001309,
"profile_steps": 5
}
2024-08-11 11:00:00,847 - __main__ - INFO - Running profile for: medium_forward_warmup_1_mixed_False
2024-08-11 11:00:00,848 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 11:00:00,849 - __main__ - INFO - Profiler config:
{
"computation": "forward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 11:00:12,121 - __main__ - INFO - Profile result:
{
"computation": "forward",
"times": [
0.24505015699924115,
0.2447970040002474,
0.2449814139999944,
0.24467239400019025,
0.24507564099985757
],
"mean_time": 0.24491532199990615,
"median_time": 0.2449814139999944,
"std_dev": 0.00015573308791388857,
"min_time": 0.24467239400019025,
"max_time": 0.24507564099985757,
"total_time": 1.2245766099995308,
"profile_steps": 5
}
2024-08-11 11:00:12,123 - __main__ - INFO - Running profile for: medium_forward_warmup_1_mixed_True
2024-08-11 11:00:12,124 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 11:00:12,125 - __main__ - INFO - Profiler config:
{
"computation": "forward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 11:00:22,989 - __main__ - INFO - Profile result:
{
"computation": "forward",
"times": [
0.22247573399999965,
0.22273382500043226,
0.22256395700060239,
0.22253749899937247,
0.2227441450004335
],
"mean_time": 0.22261103200016805,
"median_time": 0.22256395700060239,
"std_dev": 0.00010837518017815013,
"min_time": 0.22247573399999965,
"max_time": 0.2227441450004335,
"total_time": 1.1130551600008403,
"profile_steps": 5
}
2024-08-11 11:00:22,991 - __main__ - INFO - Running profile for: medium_backward_warmup_0_mixed_False
2024-08-11 11:00:22,992 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 11:00:22,993 - __main__ - INFO - Profiler config:
{
"computation": "backward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 11:00:36,042 - __main__ - INFO - Profile result:
{
"computation": "backward",
"times": [
0.4803364019999208,
0.4842157639996003,
0.48418703799961804,
0.4834751440002947,
0.48341351999988547
],
"mean_time": 0.48312557359986386,
"median_time": 0.4834751440002947,
"std_dev": 0.001435256951747305,
"min_time": 0.4803364019999208,
"max_time": 0.4842157639996003,
"total_time": 2.4156278679993193,
"profile_steps": 5
}
2024-08-11 11:00:36,044 - __main__ - INFO - Running profile for: medium_backward_warmup_0_mixed_True
2024-08-11 11:00:36,045 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 11:00:36,046 - __main__ - INFO - Profiler config:
{
"computation": "backward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 11:00:49,070 - __main__ - INFO - Profile result:
{
"computation": "backward",
"times": [
0.4554312190002747,
0.4636755310002627,
0.46323080499951175,
0.46364944700053456,
0.46372856899961334
],
"mean_time": 0.4619431142000394,
"median_time": 0.46364944700053456,
"std_dev": 0.0032607856453568743,
"min_time": 0.4554312190002747,
"max_time": 0.46372856899961334,
"total_time": 2.309715571000197,
"profile_steps": 5
}
2024-08-11 11:00:49,072 - __main__ - INFO - Running profile for: medium_backward_warmup_1_mixed_False
2024-08-11 11:00:49,073 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 11:00:49,073 - __main__ - INFO - Profiler config:
{
"computation": "backward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 11:01:02,813 - __main__ - INFO - Profile result:
{
"computation": "backward",
"times": [
0.48379133800062846,
0.4842440839993287,
0.48351270700004534,
0.48362178600018524,
0.48389172999941366
],
"mean_time": 0.48381232899992027,
"median_time": 0.48379133800062846,
"std_dev": 0.00025268062760848447,
"min_time": 0.48351270700004534,
"max_time": 0.4842440839993287,
"total_time": 2.4190616449996014,
"profile_steps": 5
}
2024-08-11 11:01:02,815 - __main__ - INFO - Running profile for: medium_backward_warmup_1_mixed_True
2024-08-11 11:01:02,816 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 11:01:02,816 - __main__ - INFO - Profiler config:
{
"computation": "backward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 11:01:16,508 - __main__ - INFO - Profile result:
{
"computation": "backward",
"times": [
0.46393922600054793,
0.4636971440004345,
0.4640383700007078,
0.4659291400002985,
0.4634621249997508
],
"mean_time": 0.4642132010003479,
"median_time": 0.46393922600054793,
"std_dev": 0.0008809659262803036,
"min_time": 0.4634621249997508,
"max_time": 0.4659291400002985,
"total_time": 2.3210660050017395,
"profile_steps": 5
}
2024-08-11 11:01:16,510 - __main__ - INFO - Running profile for: medium_forward_backward_warmup_0_mixed_False
2024-08-11 11:01:16,511 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 11:01:16,512 - __main__ - INFO - Profiler config:
{
"computation": "forward_backward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 11:01:29,358 - __main__ - INFO - Profile result:
{
"computation": "forward_backward",
"times": [
0.7304752370000642,
0.727164707999691,
0.7266143320002811,
0.7294365609996021,
0.7265293540003768
],
"mean_time": 0.7280440384000031,
"median_time": 0.727164707999691,
"std_dev": 0.0016100557130809725,
"min_time": 0.7265293540003768,
"max_time": 0.7304752370000642,
"total_time": 3.6402201920000152,
"profile_steps": 5
}
2024-08-11 11:01:29,359 - __main__ - INFO - Running profile for: medium_forward_backward_warmup_0_mixed_True
2024-08-11 11:01:29,360 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 11:01:29,361 - __main__ - INFO - Profiler config:
{
"computation": "forward_backward",
"warmup_steps": 0,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 11:01:42,075 - __main__ - INFO - Profile result:
{
"computation": "forward_backward",
"times": [
0.6820840510008566,
0.684006079000028,
0.6853365530005249,
0.6837199530000362,
0.6839223050010332
],
"mean_time": 0.6838137882004958,
"median_time": 0.6839223050010332,
"std_dev": 0.0010361814617577633,
"min_time": 0.6820840510008566,
"max_time": 0.6853365530005249,
"total_time": 3.419068941002479,
"profile_steps": 5
}
2024-08-11 11:01:42,077 - __main__ - INFO - Running profile for: medium_forward_backward_warmup_1_mixed_False
2024-08-11 11:01:42,078 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 11:01:42,079 - __main__ - INFO - Profiler config:
{
"computation": "forward_backward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": false
}
2024-08-11 11:01:56,159 - __main__ - INFO - Profile result:
{
"computation": "forward_backward",
"times": [
0.7272930130002351,
0.7277811669991934,
0.7261910039997019,
0.7264165149990731,
0.7264139199996862
],
"mean_time": 0.7268191237995779,
"median_time": 0.7264165149990731,
"std_dev": 0.0006117052461950424,
"min_time": 0.7261910039997019,
"max_time": 0.7277811669991934,
"total_time": 3.6340956189978897,
"profile_steps": 5
}
2024-08-11 11:01:56,161 - __main__ - INFO - Running profile for: medium_forward_backward_warmup_1_mixed_True
2024-08-11 11:01:56,162 - __main__ - INFO - Running profile with GPT config:
{
"approximate": null,
"activation_name": "gelu",
"d_model": 1024,
"d_ff": null,
"num_heads": 16,
"context_length": 128,
"attn_pdrop": 0.0,
"resid_pdrop": 0.0,
"bias": false,
"vocab_size": 10000,
"num_blocks": 24,
"token_position_pdrop": 0.0,
"weight_tie": false
}
2024-08-11 11:01:56,163 - __main__ - INFO - Profiler config:
{
"computation": "forward_backward",
"warmup_steps": 1,
"profile_steps": 5,
"mixed_precision": true
}
2024-08-11 11:02:09,545 - __main__ - INFO - Profile result:
{
"computation": "forward_backward",
"times": [
0.6852539360006631,
0.68381571899954,
0.6833910059995105,
0.6835393290002685,
0.6835425659992325
],
"mean_time": 0.6839085111998429,
"median_time": 0.6835425659992325,
"std_dev": 0.000686556815682806,
"min_time": 0.6833910059995105,
"max_time": 0.6852539360006631,
"total_time": 3.4195425559992145,
"profile_steps": 5
}
We see that torch.cuda.synchronize() is scattered throughout the timing code to keep the measurements accurate: CUDA kernels are launched asynchronously and return control to the CPU immediately, so without an explicit synchronization the timer would stop before the GPU has actually finished its work.
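To see the asynchrony directly, a small sketch (assuming a CUDA device and the default_timer import from earlier): stopping the timer right after launching a kernel measures almost nothing, because control returns to the CPU before the GPU finishes.
# Sketch: CUDA kernels are launched asynchronously; assumes a CUDA device.
x = torch.randn(4096, 4096, device="cuda")
start = default_timer()
y = x @ x                          # kernel is queued; the CPU moves on immediately
elapsed_no_sync = default_timer() - start
torch.cuda.synchronize()           # block until the matmul actually completes
elapsed_with_sync = default_timer() - start
# elapsed_no_sync is typically far smaller than elapsed_with_sync.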
df = results_to_dataframe(results)
df_by_mean = df.sort_values(by='mean_time', ascending=True)
display(df_by_mean)
| | name | computation | times | mean_time | median_time | std_dev | min_time | max_time | total_time | profile_steps |
|---|---|---|---|---|---|---|---|---|---|---|
| 2 | small_forward_warmup_1_mixed_False | forward | [0.0713498349996371, 0.0709589389998655, 0.071... | 0.071171 | 0.071131 | 0.000138 | 0.070959 | 0.071350 | 0.355853 | 5 |
| 3 | small_forward_warmup_1_mixed_True | forward | [0.07369880799979, 0.07382382700052403, 0.0738... | 0.073794 | 0.073823 | 0.000056 | 0.073699 | 0.073859 | 0.368969 | 5 |
| 0 | small_forward_warmup_0_mixed_False | forward | [0.07846403200073837, 0.0737714920005601, 0.07... | 0.074113 | 0.073137 | 0.002227 | 0.072292 | 0.078464 | 0.370565 | 5 |
| 1 | small_forward_warmup_0_mixed_True | forward | [0.0935192509996341, 0.07841499799997109, 0.07... | 0.079124 | 0.075171 | 0.007358 | 0.074255 | 0.093519 | 0.395620 | 5 |
| 4 | small_backward_warmup_0_mixed_False | backward | [0.14778646800004935, 0.1506624739995459, 0.15... | 0.149865 | 0.150206 | 0.001054 | 0.147786 | 0.150662 | 0.749323 | 5 |
| 6 | small_backward_warmup_1_mixed_False | backward | [0.15043866700034414, 0.15016293000007863, 0.1... | 0.150254 | 0.150193 | 0.000201 | 0.149958 | 0.150516 | 0.751268 | 5 |
| 5 | small_backward_warmup_0_mixed_True | backward | [0.14960491599958914, 0.15169357899958413, 0.1... | 0.151141 | 0.151482 | 0.000773 | 0.149605 | 0.151694 | 0.755706 | 5 |
| 7 | small_backward_warmup_1_mixed_True | backward | [0.15156354600003397, 0.15143018500020844, 0.1... | 0.151467 | 0.151430 | 0.000076 | 0.151385 | 0.151564 | 0.757336 | 5 |
| 8 | small_forward_backward_warmup_0_mixed_False | forward_backward | [0.21832664599969576, 0.22057815300013317, 0.2... | 0.220339 | 0.220236 | 0.001322 | 0.218327 | 0.222473 | 1.101694 | 5 |
| 10 | small_forward_backward_warmup_1_mixed_False | forward_backward | [0.22083036900039588, 0.22020937799970852, 0.2... | 0.220491 | 0.220479 | 0.000278 | 0.220160 | 0.220830 | 1.102457 | 5 |
| 15 | medium_forward_warmup_1_mixed_True | forward | [0.22247573399999965, 0.22273382500043226, 0.2... | 0.222611 | 0.222564 | 0.000108 | 0.222476 | 0.222744 | 1.113055 | 5 |
| 13 | medium_forward_warmup_0_mixed_True | forward | [0.22800831199947424, 0.22234672700051306, 0.2... | 0.223693 | 0.222693 | 0.002164 | 0.222347 | 0.228008 | 1.118464 | 5 |
| 9 | small_forward_backward_warmup_0_mixed_True | forward_backward | [0.22407326100073988, 0.22444049899968377, 0.2... | 0.224320 | 0.224365 | 0.000147 | 0.224073 | 0.224477 | 1.121601 | 5 |
| 11 | small_forward_backward_warmup_1_mixed_True | forward_backward | [0.22448583999994298, 0.22421371400014323, 0.2... | 0.224344 | 0.224382 | 0.000117 | 0.224199 | 0.224486 | 1.121720 | 5 |
| 14 | medium_forward_warmup_1_mixed_False | forward | [0.24505015699924115, 0.2447970040002474, 0.24... | 0.244915 | 0.244981 | 0.000156 | 0.244672 | 0.245076 | 1.224577 | 5 |
| 12 | medium_forward_warmup_0_mixed_False | forward | [0.24876118500014854, 0.24528420900060155, 0.2... | 0.245821 | 0.245088 | 0.001474 | 0.244956 | 0.248761 | 1.229103 | 5 |
| 17 | medium_backward_warmup_0_mixed_True | backward | [0.4554312190002747, 0.4636755310002627, 0.463... | 0.461943 | 0.463649 | 0.003261 | 0.455431 | 0.463729 | 2.309716 | 5 |
| 19 | medium_backward_warmup_1_mixed_True | backward | [0.46393922600054793, 0.4636971440004345, 0.46... | 0.464213 | 0.463939 | 0.000881 | 0.463462 | 0.465929 | 2.321066 | 5 |
| 16 | medium_backward_warmup_0_mixed_False | backward | [0.4803364019999208, 0.4842157639996003, 0.484... | 0.483126 | 0.483475 | 0.001435 | 0.480336 | 0.484216 | 2.415628 | 5 |
| 18 | medium_backward_warmup_1_mixed_False | backward | [0.48379133800062846, 0.4842440839993287, 0.48... | 0.483812 | 0.483791 | 0.000253 | 0.483513 | 0.484244 | 2.419062 | 5 |
| 21 | medium_forward_backward_warmup_0_mixed_True | forward_backward | [0.6820840510008566, 0.684006079000028, 0.6853... | 0.683814 | 0.683922 | 0.001036 | 0.682084 | 0.685337 | 3.419069 | 5 |
| 23 | medium_forward_backward_warmup_1_mixed_True | forward_backward | [0.6852539360006631, 0.68381571899954, 0.68339... | 0.683909 | 0.683543 | 0.000687 | 0.683391 | 0.685254 | 3.419543 | 5 |
| 22 | medium_forward_backward_warmup_1_mixed_False | forward_backward | [0.7272930130002351, 0.7277811669991934, 0.726... | 0.726819 | 0.726417 | 0.000612 | 0.726191 | 0.727781 | 3.634096 | 5 |
| 20 | medium_forward_backward_warmup_0_mixed_False | forward_backward | [0.7304752370000642, 0.727164707999691, 0.7266... | 0.728044 | 0.727165 | 0.001610 | 0.726529 | 0.730475 | 3.640220 | 5 |
Why do CS336’s lecturers recommend a warmup before timing? Across the board you can see that, keeping all other variables constant, runs with 1 warmup step have a much lower standard deviation than runs without. The intuition is that the first iterations pay one-off costs: cold caches, CUDA context initialization, kernel compilation, and other optimizations that happen under the hood. A warmup step absorbs these costs, so the subsequent measurements are stabilised.
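The effect is easy to quantify from the dataframe itself; a quick sketch (using the df built above, with a hypothetical helper column parsed from the run names):
# Sketch: median std_dev with vs. without warmup, parsed from the run names.
print(df.assign(has_warmup=df["name"].str.contains("warmup_1")).groupby("has_warmup")["std_dev"].median())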
display(df[df["name"].isin(["medium_forward_backward_warmup_0_mixed_False", "medium_forward_backward_warmup_1_mixed_False"])])
| | name | computation | times | mean_time | median_time | std_dev | min_time | max_time | total_time | profile_steps |
|---|---|---|---|---|---|---|---|---|---|---|
| 20 | medium_forward_backward_warmup_0_mixed_False | forward_backward | [0.7304752370000642, 0.727164707999691, 0.7266... | 0.728044 | 0.727165 | 0.001610 | 0.726529 | 0.730475 | 3.640220 | 5 |
| 22 | medium_forward_backward_warmup_1_mixed_False | forward_backward | [0.7272930130002351, 0.7277811669991934, 0.726... | 0.726819 | 0.726417 | 0.000612 | 0.726191 | 0.727781 | 3.634096 | 5 |