PyTorch Model Creation
import torch
import numpy as np
torch.cuda.is_available()
True
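Since CUDA is available here, tensors (and modules, via .to()) can be placed on the GPU. A minimal sketch, with the device choice written defensively:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
t = torch.ones(3, device=device)  # lives on the GPU when one is available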
Autograd
x = torch.tensor([1, 2, 3], dtype=torch.float, requires_grad=True)
x
tensor([1., 2., 3.], requires_grad=True)
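With requires_grad=True, operations on x are recorded so gradients can be computed. A small illustrative sketch (the scalar y below is not from the original notebook):
y = (x ** 2).sum()  # y = 1 + 4 + 9 = 14
y.backward()        # fills x.grad with dy/dx = 2*x
x.grad              # tensor([2., 4., 6.])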
Containers
Module
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))
@torch.no_grad()
def init_weights(m):
    print(m)
    if type(m) == nn.Linear:
        m.weight.fill_(1.0)
        print(m.weight)

net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
new = net.apply(init_weights)
Linear(in_features=2, out_features=2, bias=True)
Parameter containing:
tensor([[1., 1.],
[1., 1.]], requires_grad=True)
Linear(in_features=2, out_features=2, bias=True)
Parameter containing:
tensor([[1., 1.],
[1., 1.]], requires_grad=True)
Sequential(
(0): Linear(in_features=2, out_features=2, bias=True)
(1): Linear(in_features=2, out_features=2, bias=True)
)
model = Model()
model
Model(
(conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
(conv2): Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))
)
model.__dict__
{'training': True,
'_parameters': OrderedDict(),
'_buffers': OrderedDict(),
'_non_persistent_buffers_set': set(),
'_backward_pre_hooks': OrderedDict(),
'_backward_hooks': OrderedDict(),
'_is_full_backward_hook': None,
'_forward_hooks': OrderedDict(),
'_forward_hooks_with_kwargs': OrderedDict(),
'_forward_hooks_always_called': OrderedDict(),
'_forward_pre_hooks': OrderedDict(),
'_forward_pre_hooks_with_kwargs': OrderedDict(),
'_state_dict_hooks': OrderedDict(),
'_state_dict_pre_hooks': OrderedDict(),
'_load_state_dict_pre_hooks': OrderedDict(),
'_load_state_dict_post_hooks': OrderedDict(),
'_modules': OrderedDict([('conv1',
Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))),
('conv2', Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1)))])}
for para in model.parameters():
    print(para.shape)
torch.Size([20, 1, 5, 5])
torch.Size([20])
torch.Size([20, 20, 5, 5])
torch.Size([20])
Sequential
model = nn.Sequential(
    nn.Conv2d(1, 20, 5),
    nn.ReLU(),
    nn.Conv2d(20, 64, 5),
    nn.ReLU()
)
model
Sequential(
(0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
(1): ReLU()
(2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
(3): ReLU()
)
ModuleList
class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.linears = nn.ModuleList([nn.Linear(10, 10) for i in range(10)])

    def forward(self, x):
        # ModuleList can act as an iterable, or be indexed using ints
        for i, l in enumerate(self.linears):
            x = self.linears[i // 2](x) + l(x)
        return x
model = MyModule()
model
MyModule(
(linears): ModuleList(
(0-9): 10 x Linear(in_features=10, out_features=10, bias=True)
)
)
ModuleDict
class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.choices = nn.ModuleDict({
            'conv': nn.Conv2d(10, 10, 3),
            'pool': nn.MaxPool2d(3)
        })
        self.activations = nn.ModuleDict([
            ['lrelu', nn.LeakyReLU()],
            ['prelu', nn.PReLU()]
        ])

    def forward(self, x, choice, act):
        x = self.choices[choice](x)
        x = self.activations[act](x)
        return x

model = MyModule()
model
MyModule(
(choices): ModuleDict(
(conv): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1))
(pool): MaxPool2d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
)
(activations): ModuleDict(
(lrelu): LeakyReLU(negative_slope=0.01)
(prelu): PReLU(num_parameters=1)
)
)
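The forward above selects layers by key at run time. A usage sketch (the input shape is illustrative; 'conv' expects 10 input channels):
x = torch.randn(1, 10, 8, 8)
model(x, 'conv', 'lrelu').shape  # torch.Size([1, 10, 6, 6])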
ParameterList
class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.params = nn.ParameterList([nn.Parameter(torch.randn(10, 10)) for i in range(10)])

    def forward(self, x):
        # ParameterList can act as an iterable, or be indexed using ints
        for i, p in enumerate(self.params):
            x = self.params[i // 2].mm(x) + p.mm(x)
        return x

model = MyModule()
model
MyModule(
(params): ParameterList(
(0): Parameter containing: [torch.float32 of size 10x10]
(1): Parameter containing: [torch.float32 of size 10x10]
(2): Parameter containing: [torch.float32 of size 10x10]
(3): Parameter containing: [torch.float32 of size 10x10]
(4): Parameter containing: [torch.float32 of size 10x10]
(5): Parameter containing: [torch.float32 of size 10x10]
(6): Parameter containing: [torch.float32 of size 10x10]
(7): Parameter containing: [torch.float32 of size 10x10]
(8): Parameter containing: [torch.float32 of size 10x10]
(9): Parameter containing: [torch.float32 of size 10x10]
)
)
ParameterDict
class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.params = nn.ParameterDict({
            'left': nn.Parameter(torch.randn(5, 10)),
            'right': nn.Parameter(torch.randn(5, 10))
        })

    def forward(self, x, choice):
        x = self.params[choice].mm(x)
        return x

model = MyModule()
model
MyModule(
(params): ParameterDict(
(left): Parameter containing: [torch.FloatTensor of size 5x10]
(right): Parameter containing: [torch.FloatTensor of size 5x10]
)
)
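A usage sketch (the input shape is illustrative; 'left' is 5x10, so x needs 10 rows for the matrix multiply):
x = torch.randn(10, 4)
model(x, 'left').shape  # torch.Size([5, 4])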
Convolution Layers
nn.Conv1d
input1 = torch.tensor([[[[ 1.,  2.,  3.,  4.,  5.],
                         [ 6.,  7.,  8.,  9., 10.],
                         [11., 12., 13., 14., 15.],
                         [16., 17., 18., 19., 20.]]]])
new = input1.reshape(4, 5)
new
tensor([[ 1., 2., 3., 4., 5.],
[ 6., 7., 8., 9., 10.],
[11., 12., 13., 14., 15.],
[16., 17., 18., 19., 20.]])
m = nn.Conv1d(4, 2, 3, stride=2)
type(m)
torch.nn.modules.conv.Conv1d
for para in m.parameters():
    print(para.shape)
torch.Size([2, 4, 3])
torch.Size([2])
# input = torch.randn(20, 16, 50)
output = m(new)
output.shape
torch.Size([2, 2])
output
tensor([[-1.3600, -1.1102],
[ 6.1355, 6.0188]], grad_fn=<SqueezeBackward1>)
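Note that new has shape (4, 5), an unbatched (channels, length) input, which Conv1d accepts; adding a batch dimension restores the usual 3-D output. A quick check with the same m and new:
m(new.unsqueeze(0)).shape  # torch.Size([1, 2, 2])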
nn.Conv2d
new = input1.reshape(4, 5, 1)
new
tensor([[[ 1.],
[ 2.],
[ 3.],
[ 4.],
[ 5.]],
[[ 6.],
[ 7.],
[ 8.],
[ 9.],
[10.]],
[[11.],
[12.],
[13.],
[14.],
[15.]],
[[16.],
[17.],
[18.],
[19.],
[20.]]])
# With square kernels and equal stride
m = nn.Conv2d(4, 2, 3, stride=2)
# non-square kernels and unequal stride and with padding
m = nn.Conv2d(4, 2, (3, 5), stride=(2, 1), padding=(4, 2))
# non-square kernels and unequal stride and with padding and dilation
m = nn.Conv2d(4, 2, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
output = m(new)
output.shape
torch.Size([2, 4, 1])
output
tensor([[[-1.3839],
[-1.8465],
[-0.8563],
[-0.0644]],
[[ 1.2715],
[ 1.5135],
[-2.8098],
[-3.2210]]], grad_fn=<SqueezeBackward1>)
nn.Conv3d
# With square kernels and equal stride
m = nn.Conv3d(16, 33, 3, stride=2)
# non-square kernels and unequal stride and with padding
m = nn.Conv3d(16, 33, (3, 5, 2), stride=(2, 1, 1), padding=(4, 2, 0))
input = torch.randn(20, 16, 10, 50, 100)
output = m(input)
output.shape
torch.Size([20, 33, 8, 50, 99])
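The shape follows the usual convolution formula, D_out = floor((D_in + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1); for the depth axis here, floor((10 + 2*4 - (3 - 1) - 1) / 2 + 1) = 8, and the same formula gives 50 and 99 for the remaining axes.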
nn.ConvTranspose2d
This module can be seen as the gradient of Conv2d with respect to its input. It is also known as a fractionally-strided convolution or a deconvolution (although it is not an actual deconvolution operation as it does not compute a true inverse of convolution).
# With square kernels and equal stride
m = nn.ConvTranspose2d(16, 33, 3, stride=2)
# non-square kernels and unequal stride and with padding
m = nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
input = torch.randn(20, 16, 50, 100)
output = m(input)
output.shape
torch.Size([20, 33, 93, 100])
# the exact output size can also be specified as an argument
input = torch.randn(1, 16, 12, 12)
downsample = nn.Conv2d(16, 16, 3, stride=2, padding=1)
upsample = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
h = downsample(input)
h.size()
torch.Size([1, 16, 6, 6])
output = upsample(h, output_size=input.size())
output.size()
torch.Size([1, 16, 12, 12])
Pooling Layers
nn.MaxPool2d
Applies a 2D max pooling over an input signal composed of several input planes.
new = input1.reshape(1, 4, 5)
new
tensor([[[ 1., 2., 3., 4., 5.],
[ 6., 7., 8., 9., 10.],
[11., 12., 13., 14., 15.],
[16., 17., 18., 19., 20.]]])
# pool of square window of size=3, stride=2
m = nn.MaxPool2d(3, stride=2)
# pool of 2x2 window with unequal stride
m = nn.MaxPool2d((2, 2), stride=(2, 1))
input = torch.randn(20, 16, 50, 32)
output = m(new)
output
tensor([[[ 7., 8., 9., 10.],
[17., 18., 19., 20.]]])
nn.MaxUnpool2d
Computes a partial inverse of MaxPool2d.
input = torch.tensor([[[[ 1.,  2.,  3.,  4.],
                        [ 5.,  6.,  7.,  8.],
                        [ 9., 10., 11., 12.],
                        [13., 14., 15., 16.]]]])
pool = nn.MaxPool2d(2, stride=2, return_indices=True)
unpool = nn.MaxUnpool2d(2, stride=2)
output, indices = pool(input)
unpool(output, indices)
tensor([[[[ 0., 0., 0., 0.],
[ 0., 6., 0., 8.],
[ 0., 0., 0., 0.],
[ 0., 14., 0., 16.]]]])
input1
tensor([[[[ 1., 2., 3., 4., 5.],
[ 6., 7., 8., 9., 10.],
[11., 12., 13., 14., 15.],
[16., 17., 18., 19., 20.]]]])
output, indices = pool(input1)
# This call will not work without specifying output_size
unpool(output, indices, output_size=input1.size())
tensor([[[[ 0., 0., 0., 0., 0.],
[ 0., 7., 0., 9., 0.],
[ 0., 0., 0., 0., 0.],
[ 0., 17., 0., 19., 0.]]]])
nn.AvgPool2d
Applies a 2D average pooling over an input signal composed of several input planes.
input1
tensor([[[[ 1., 2., 3., 4., 5.],
[ 6., 7., 8., 9., 10.],
[11., 12., 13., 14., 15.],
[16., 17., 18., 19., 20.]]]])
# pool of square window of size=3, stride=2
m = nn.AvgPool2d(3, stride=2)
# pool of 2x2 window with stride 1
m = nn.AvgPool2d((2, 2), stride=(1, 1))
# input = torch.randn(20, 16, 50, 32)
output = m(input1)
output
tensor([[[[ 4., 5., 6., 7.],
[ 9., 10., 11., 12.],
[14., 15., 16., 17.]]]])
nn.FractionalMaxPool2d
Applies a 2D fractional max pooling over an input signal composed of several input planes.
# pool of square window of size=3, and target output size 13x12
m = nn.FractionalMaxPool2d(3, output_size=(13, 12))
# pool of square window with the target output size given as a ratio of the input size
m = nn.FractionalMaxPool2d(2, output_ratio=(0.7, 0.7))
output = m(input1)
output
tensor([[[[ 7., 8., 10.],
[17., 18., 20.]]]])
nn.LPPool2d
Applies a 2D power-average pooling over an input signal composed of several input planes.
# power-2 pool of square window of size=3, stride=2
m = nn.LPPool2d(2, 3, stride=2)
# pool of non-square window of power 1.2
m = nn.LPPool2d(1.2, (3, 2), stride=(2, 1))
output = m(input)
output
tensor([[[[25.4396, 29.7206, 34.0561]]]])
nn.AdaptiveMaxPool2d
Applies a 2D adaptive max pooling over an input signal composed of several input planes.
# target output size of 5x7
m = nn.AdaptiveMaxPool2d((5, 7))
output = m(input1)
output
tensor([[[[ 1., 2., 3., 3., 4., 5., 5.],
[ 6., 7., 8., 8., 9., 10., 10.],
[11., 12., 13., 13., 14., 15., 15.],
[16., 17., 18., 18., 19., 20., 20.],
[16., 17., 18., 18., 19., 20., 20.]]]])
# target output size of 7x7 (square)
m = nn.AdaptiveMaxPool2d(7)
output = m(input1)
output
tensor([[[[ 1., 2., 3., 3., 4., 5., 5.],
[ 6., 7., 8., 8., 9., 10., 10.],
[ 6., 7., 8., 8., 9., 10., 10.],
[11., 12., 13., 13., 14., 15., 15.],
[11., 12., 13., 13., 14., 15., 15.],
[16., 17., 18., 18., 19., 20., 20.],
[16., 17., 18., 18., 19., 20., 20.]]]])
# None keeps the input size along that dimension; target width of 7
m = nn.AdaptiveMaxPool2d((None, 7))
output = m(input1)
output
tensor([[[[ 1., 2., 3., 3., 4., 5., 5.],
[ 6., 7., 8., 8., 9., 10., 10.],
[11., 12., 13., 13., 14., 15., 15.],
[16., 17., 18., 18., 19., 20., 20.]]]])
nn.AdaptiveAvgPool2d
Applies a 2D adaptive average pooling over an input signal composed of several input planes.
# target output size of 5x7
m = nn.AdaptiveAvgPool2d((5, 7))
m(input1)
tensor([[[[ 1.0000, 1.5000, 2.5000, 3.0000, 3.5000, 4.5000, 5.0000],
[ 3.5000, 4.0000, 5.0000, 5.5000, 6.0000, 7.0000, 7.5000],
[ 8.5000, 9.0000, 10.0000, 10.5000, 11.0000, 12.0000, 12.5000],
[13.5000, 14.0000, 15.0000, 15.5000, 16.0000, 17.0000, 17.5000],
[16.0000, 16.5000, 17.5000, 18.0000, 18.5000, 19.5000, 20.0000]]]])
# target output size of 7x7 (square)
m = nn.AdaptiveAvgPool2d(7)
m(input1)
tensor([[[[ 1.0000, 1.5000, 2.5000, 3.0000, 3.5000, 4.5000, 5.0000],
[ 3.5000, 4.0000, 5.0000, 5.5000, 6.0000, 7.0000, 7.5000],
[ 6.0000, 6.5000, 7.5000, 8.0000, 8.5000, 9.5000, 10.0000],
[ 8.5000, 9.0000, 10.0000, 10.5000, 11.0000, 12.0000, 12.5000],
[11.0000, 11.5000, 12.5000, 13.0000, 13.5000, 14.5000, 15.0000],
[13.5000, 14.0000, 15.0000, 15.5000, 16.0000, 17.0000, 17.5000],
[16.0000, 16.5000, 17.5000, 18.0000, 18.5000, 19.5000, 20.0000]]]])
# None keeps the input size along that dimension; target width of 7
m = nn.AdaptiveAvgPool2d((None, 7))
m(input1)
tensor([[[[ 1.0000, 1.5000, 2.5000, 3.0000, 3.5000, 4.5000, 5.0000],
[ 6.0000, 6.5000, 7.5000, 8.0000, 8.5000, 9.5000, 10.0000],
[11.0000, 11.5000, 12.5000, 13.0000, 13.5000, 14.5000, 15.0000],
[16.0000, 16.5000, 17.5000, 18.0000, 18.5000, 19.5000, 20.0000]]]])
Padding Layers
nn.ReflectionPad2d
Pads the input tensor using the reflection of the input boundary.
m = nn.ReflectionPad2d(2)
input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
input
tensor([[[[0., 1., 2.],
[3., 4., 5.],
[6., 7., 8.]]]])
m(input)
tensor([[[[8., 7., 6., 7., 8., 7., 6.],
[5., 4., 3., 4., 5., 4., 3.],
[2., 1., 0., 1., 2., 1., 0.],
[5., 4., 3., 4., 5., 4., 3.],
[8., 7., 6., 7., 8., 7., 6.],
[5., 4., 3., 4., 5., 4., 3.],
[2., 1., 0., 1., 2., 1., 0.]]]])
# using different paddings for different sides
m = nn.ReflectionPad2d((1, 1, 2, 0))
m(input)
tensor([[[[7., 6., 7., 8., 7.],
[4., 3., 4., 5., 4.],
[1., 0., 1., 2., 1.],
[4., 3., 4., 5., 4.],
[7., 6., 7., 8., 7.]]]])
nn.ReplicationPad2d
Pads the input tensor using replication of the input boundary.
m = nn.ReplicationPad2d(2)
m(input)
tensor([[[[0., 0., 0., 1., 2., 2., 2.],
[0., 0., 0., 1., 2., 2., 2.],
[0., 0., 0., 1., 2., 2., 2.],
[3., 3., 3., 4., 5., 5., 5.],
[6., 6., 6., 7., 8., 8., 8.],
[6., 6., 6., 7., 8., 8., 8.],
[6., 6., 6., 7., 8., 8., 8.]]]])
# using different paddings for different sides
m = nn.ReplicationPad2d((1, 1, 2, 0))
m(input)
tensor([[[[0., 0., 1., 2., 2.],
[0., 0., 1., 2., 2.],
[0., 0., 1., 2., 2.],
[3., 3., 4., 5., 5.],
[6., 6., 7., 8., 8.]]]])
nn.ZeroPad2d
Pads the input tensor boundaries with zero.
m = nn.ZeroPad2d(2)
m(input)
tensor([[[[0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 1., 2., 0., 0.],
[0., 0., 3., 4., 5., 0., 0.],
[0., 0., 6., 7., 8., 0., 0.],
[0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0.]]]])
# using different paddings for different sides
m = nn.ZeroPad2d((1, 1, 2, 0))
m(input)
tensor([[[[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 1., 2., 0.],
[0., 3., 4., 5., 0.],
[0., 6., 7., 8., 0.]]]])
nn.ConstantPad2d
Pads the input tensor boundaries with a constant value.
m = nn.ConstantPad2d(2, 11)
m(input)
tensor([[[[11., 11., 11., 11., 11., 11., 11.],
[11., 11., 11., 11., 11., 11., 11.],
[11., 11., 0., 1., 2., 11., 11.],
[11., 11., 3., 4., 5., 11., 11.],
[11., 11., 6., 7., 8., 11., 11.],
[11., 11., 11., 11., 11., 11., 11.],
[11., 11., 11., 11., 11., 11., 11.]]]])
# using different paddings for different sides
m = nn.ConstantPad2d((3, 0, 2, 1), 11)
m(input)
tensor([[[[11., 11., 11., 11., 11., 11.],
[11., 11., 11., 11., 11., 11.],
[11., 11., 11., 0., 1., 2.],
[11., 11., 11., 3., 4., 5.],
[11., 11., 11., 6., 7., 8.],
[11., 11., 11., 11., 11., 11.]]]])
Non-linear Activations (weighted sum, nonlinearity)
input = torch.linspace(-5, 5, 100)
import matplotlib.pyplot as plt

def plot_show(input, output):
    plt.plot(input, input, color='green', linestyle='dashed',
             linewidth=1, label='input')
    plt.plot(input, output, color='red',
             linewidth=1, label='output')
    plt.legend()
    plt.show()
nn.LogSigmoid()
m = nn.LogSigmoid()
output = m(input)
plot_show(input, output)
nn.ReLU()
m = nn.ReLU()
output = m(input)
plot_show(input, output)
nn.LeakyReLU(0.5)
m = nn.LeakyReLU(0.5)
output = m(input)
plot_show(input, output)
nn.SELU()
m = nn.SELU()
output = m(input)
plot_show(input, output)
nn.Sigmoid()
m = nn.Sigmoid()
output = m(input)
plot_show(input, output)
nn.Softplus()
m = nn.Softplus()
output = m(input)
plot_show(input, output)
nn.Tanh()
m = nn.Tanh()
output = m(input)
plot_show(input, output)
nn.Threshold
m = nn.Threshold(0, -5)
output = m(input)
plot_show(input, output)
Non-linear Activations (other)
input = torch.linspace(-1,1,10)
input = input.reshape(2,5)
input
tensor([[-1.0000, -0.7778, -0.5556, -0.3333, -0.1111],
[ 0.1111, 0.3333, 0.5556, 0.7778, 1.0000]])
nn.Softmin
m = nn.Softmin(dim=1)
output = m(input)
output
tensor([[0.2970, 0.2379, 0.1905, 0.1525, 0.1221],
[0.2970, 0.2379, 0.1905, 0.1525, 0.1221]])
output.sum()
tensor(2.0000)
Each row sums to 1 (Softmin normalizes along dim=1), so the total over the two rows is 2.
plot_show(input.flatten(), output.flatten())
Normalization Layers
nn.BatchNorm2d
input = torch.arange(27, dtype=torch.float).reshape(1,3, 3, 3)
input
tensor([[[[ 0., 1., 2.],
[ 3., 4., 5.],
[ 6., 7., 8.]],
[[ 9., 10., 11.],
[12., 13., 14.],
[15., 16., 17.]],
[[18., 19., 20.],
[21., 22., 23.],
[24., 25., 26.]]]])
# With Learnable Parameters
m = nn.BatchNorm2d(3)
# Without Learnable Parameters
m = nn.BatchNorm2d(3, affine=False)
output = m(input)
output
tensor([[[[-1.5492e+00, -1.1619e+00, -7.7460e-01],
[-3.8730e-01, 0.0000e+00, 3.8730e-01],
[ 7.7460e-01, 1.1619e+00, 1.5492e+00]],
[[-1.5492e+00, -1.1619e+00, -7.7460e-01],
[-3.8730e-01, 1.7881e-07, 3.8730e-01],
[ 7.7460e-01, 1.1619e+00, 1.5492e+00]],
[[-1.5492e+00, -1.1619e+00, -7.7460e-01],
[-3.8730e-01, -3.5763e-07, 3.8730e-01],
[ 7.7460e-01, 1.1619e+00, 1.5492e+00]]]])
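The affine=False output can be reproduced by hand: BatchNorm2d normalizes each channel with the biased batch statistics and eps=1e-5. A sketch:
mean = input.mean(dim=(0, 2, 3), keepdim=True)
var = input.var(dim=(0, 2, 3), unbiased=False, keepdim=True)
manual = (input - mean) / (var + 1e-5).sqrt()  # matches the output above up to float error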
nn.GroupNorm
# Put all 3 channels into a single group
m = nn.GroupNorm(1, 3)
output = m(input)
output
tensor([[[[-1.6690e+00, -1.5407e+00, -1.4123e+00],
[-1.2839e+00, -1.1555e+00, -1.0271e+00],
[-8.9872e-01, -7.7033e-01, -6.4194e-01]],
[[-5.1355e-01, -3.8516e-01, -2.5678e-01],
[-1.2839e-01, -2.9802e-08, 1.2839e-01],
[ 2.5678e-01, 3.8516e-01, 5.1355e-01]],
[[ 6.4194e-01, 7.7033e-01, 8.9872e-01],
[ 1.0271e+00, 1.1555e+00, 1.2839e+00],
[ 1.4123e+00, 1.5407e+00, 1.6690e+00]]]],
grad_fn=<NativeGroupNormBackward0>)
input_t = torch.randn(20, 6, 10, 10)
# Separate 6 channels into 6 groups (equivalent to InstanceNorm)
m = nn.GroupNorm(6, 6)
output = m(input_t)
# Put all 6 channels into a single group (equivalent to LayerNorm)
m = nn.GroupNorm(1, 6)
# Activating the module
output = m(input_t)
nn.LayerNorm
# NLP Example
batch, sentence_length, embedding_dim = 20, 5, 10
embedding = torch.randn(batch, sentence_length, embedding_dim)
layer_norm = nn.LayerNorm(embedding_dim)
# Activate module
output = layer_norm(embedding)
embedding[7, :, :].std()
tensor(1.0733)
output[7, :, :].std()
tensor(1.0101, grad_fn=<StdBackward0>)
Recurrent Layers
RNNBase - Base class for RNN modules (RNN, LSTM, GRU).
nn.RNN
rnn = nn.RNN(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
output, hn = rnn(input, h0)
output.shape
torch.Size([5, 3, 20])
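By default nn.RNN takes input as (seq_len, batch, input_size); passing batch_first=True switches to (batch, seq_len, input_size). A sketch:
rnn_bf = nn.RNN(10, 20, 2, batch_first=True)
output, hn = rnn_bf(torch.randn(3, 5, 10))  # output: (3, 5, 20); h0 defaults to zeros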
nn.LSTM
lstm = nn.LSTM(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
c0 = torch.randn(2, 3, 20)
output, (hn, cn) = lstm(input, (h0, c0))
output.shape
torch.Size([5, 3, 20])
nn.GRU
gru = nn.GRU(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
output, hn = gru(input, h0)
output.shape
torch.Size([5, 3, 20])
nn.RNNCell
rnn = nn.RNNCell(10, 20)
input = torch.randn(6, 3, 10)
hx = torch.randn(3, 20)
output = []
for i in range(6):
    hx = rnn(input[i], hx)
    output.append(hx)
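The per-step outputs can be stacked back into one sequence tensor, e.g.:
torch.stack(output).shape  # torch.Size([6, 3, 20])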
Transformer Layers
nn.Transformer
transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12)
src = torch.rand((10, 32, 512))
tgt = torch.rand((20, 32, 512))
out = transformer_model(src, tgt)
nn.TransformerEncoderLayer
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(10, 32, 512)
out = transformer_encoder(src)
nn.TransformerDecoderLayer
decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
memory = torch.rand(10, 32, 512)
tgt = torch.rand(20, 32, 512)
out = transformer_decoder(tgt, memory)
nn.TransformerEncoderLayer (batch_first=True)
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
src = torch.rand(32, 10, 512)
out = encoder_layer(src)
nn.TransformerDecoderLayer (batch_first=True)
decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=True)
memory = torch.rand(32, 10, 512)
tgt = torch.rand(32, 20, 512)
out = decoder_layer(tgt, memory)
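For autoregressive decoding, the decoder is usually given a causal mask so each target position can only attend to earlier positions. A sketch using the built-in helper:
tgt_mask = nn.Transformer.generate_square_subsequent_mask(20)
out = decoder_layer(tgt, memory, tgt_mask=tgt_mask)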
Linear Layers
nn.Identity
A placeholder identity operator that is argument-insensitive.
m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False)
input = torch.randn(128, 20)
output = m(input)
print(output.size())
torch.Size([128, 20])
nn.Linear
m = nn.Linear(20, 30)
input = torch.randn(128, 20)
output = m(input)
print(output.size())
torch.Size([128, 30])
for para in m.parameters():
    print(para.shape)
torch.Size([30, 20])
torch.Size([30])
Dropout Layers
m = nn.Dropout(p=0.2)
input = torch.randn(20, 16)
output = m(input)
output.shape
torch.Size([20, 16])
m = nn.Dropout2d(p=0.2)
input = torch.randn(20, 16, 32, 32)
output = m(input)
output.shape
torch.Size([20, 16, 32, 32])
m = nn.AlphaDropout(p=0.2)
input = torch.randn(20, 16)
output = m(input)
output.shape
torch.Size([20, 16])
m = nn.FeatureAlphaDropout(p=0.2)
input = torch.randn(20, 16, 4, 32, 32)
output = m(input)
output.shape
torch.Size([20, 16, 4, 32, 32])
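Dropout layers are only active in training mode; after calling .eval() they act as the identity. A quick check (new tensors, for illustration):
m = nn.Dropout(p=0.2)
m.eval()
input = torch.randn(20, 16)
torch.equal(m(input), input)  # True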
Loss Functions
nn.L1Loss
input = torch.linspace(1, 10, 10, requires_grad=True)
target = torch.linspace(1.1, 10.1, 10, requires_grad=True)
input, target
(tensor([ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.], requires_grad=True),
tensor([ 1.1000, 2.1000, 3.1000, 4.1000, 5.1000, 6.1000, 7.1000, 8.1000,
9.1000, 10.1000], requires_grad=True))
loss = nn.L1Loss()
output = loss(input, target)
output
tensor(0.1000, grad_fn=<MeanBackward0>)
nn.MSELoss
loss = nn.MSELoss()
output = loss(input, target)
output
tensor(0.0100, grad_fn=<MseLossBackward0>)
nn.CrossEntropyLoss
# reusing input and target from above: a float target is treated as class probabilities
loss = nn.CrossEntropyLoss()
output = loss(input, target)
output
tensor(195.1833, grad_fn=<DivBackward1>)
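The more common form takes integer class indices as the target; a sketch with illustrative shapes:
logits = torch.randn(3, 5, requires_grad=True)  # batch of 3 samples, 5 classes
labels = torch.tensor([1, 0, 4])                # one class index per sample
nn.CrossEntropyLoss()(logits, labels)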
loss = nn.GaussianNLLLoss()
var = torch.ones(10, requires_grad=True)  # heteroscedastic
output = loss(input, target, var)
output
tensor(0.0050, grad_fn=<MeanBackward0>)
Vision Layers
nn.PixelShuffle
Rearranges elements in a tensor according to an upscaling factor.
pixel_shuffle = nn.PixelShuffle(3)
input = torch.randn(1, 9, 4, 4)
output = pixel_shuffle(input)
print(output.size())
torch.Size([1, 1, 12, 12])
nn.PixelUnshuffle
pixel_unshuffle = nn.PixelUnshuffle(3)
input = torch.randn(1, 1, 12, 12)
output = pixel_unshuffle(input)
print(output.size())
torch.Size([1, 9, 4, 4])