PyTorch Profiler can help you detect performance bottlenecks when training or deploying a model.
Here is a minimal example that profiles a single ResNet-18 forward pass on the CPU:
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

# Build the model and a dummy input batch to profile
model = models.resnet18()
inputs = torch.randn(1, 3, 512, 512)

# Record CPU activity (and operator input shapes) during one forward pass;
# record_function labels this region in the profiler output
with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    with record_function("model_inference"):
        model(inputs)

# Print aggregated per-operator statistics, sorted by total CPU time
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
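
If a GPU is available, the same pattern can also capture CUDA kernel times, and the recorded trace can be exported for viewing in chrome://tracing or Perfetto. Here is a sketch of that variant (the output file name trace.json is just an illustrative choice, assuming a CUDA device is present):

import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

# Move the model and inputs to the GPU so CUDA kernels are actually launched
model = models.resnet18().cuda()
inputs = torch.randn(1, 3, 512, 512).cuda()

# Capture both CPU-side operator time and GPU kernel time
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
) as prof:
    with record_function("model_inference"):
        model(inputs)

# Sort by time spent on the GPU instead of the CPU
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))

# Export a timeline trace that can be opened in chrome://tracing or Perfetto
prof.export_chrome_trace("trace.json")

The printed table shows how much time each operator contributed overall, which is usually enough to spot the layers or ops dominating the run; the exported trace adds a timeline view when you need to see how those ops overlap in time.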