#export
from exp.nb_02 import *
def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))
def normalize(x, m, s): return (x-m)/s
0022_fastai_pt2_2019_why_sqrt5
Does nn.Conv2d init work well?
00:00 - one of the purposes of part 2 is to demonstrate how Jeremy does research; he shows how he investigates how well a mysterious line of code in PyTorch actually works
01:28 -
02:28 - how to resize a 3-color-channel image into a single-channel 28x28 image
03:06 - when Jeremy would create a function during research; an experiment showing that the line is not performing well
08:55 - Jeremy writes his own version of kaiming init
15:59 - Jeremy reimplements what PyTorch has for kaiming init, and uses an example to test how useful (or useless) the line in PyTorch is
17:30 - using kaiming_uniform_ to test the line; the result is better but still problematic
18:58 - look at 02b for why we need a good init: why neural nets used to be so hard to train, and why weight initialization is so crucial to training (a small standalone demo follows these notes)
21:04 - Sylvain adds a further explanation of something interesting
21:30 - how the PyTorch team responded
23:52 - many init papers and approaches
27:11 - building from the ground up so that we can ask questions about PyTorch's strange, historical edges
28:56 - let's train a model with our fully connected architecture and cross-entropy
30:21 - how to understand log cross-entropy from scratch
34:26 - how to write negative log likelihood in PyTorch with an indexing trick (a sketch of both follows these notes)
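To make the 18:58 point concrete, here is a minimal standalone sketch of the 02b demo (mine, not from this notebook; the layer width of 512 and the depth of 100 are arbitrary choices): unscaled repeated matrix multiplies overflow, a too-small scale collapses to zero, and scaling by sqrt(1/fan_in) keeps the activations usable.

import torch, math

z = torch.randn(512)
w = torch.randn(512, 512)
for i in range(100): z = w @ z
z.mean(), z.std()                          # overflows to inf/nan

z = torch.randn(512)
w = torch.randn(512, 512) * 0.01
for i in range(100): z = w @ z
z.mean(), z.std()                          # collapses to zero

z = torch.randn(512)
w = torch.randn(512, 512) * math.sqrt(1/512)
for i in range(100): z = w @ z
z.mean(), z.std()                          # stays in a usable range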
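For the 30:21 and 34:26 items, the from-scratch cross-entropy pieces look roughly like the sketch below; the function names and shapes are my reconstruction of the course code, not copied from this notebook.

import torch

def log_softmax(x):
    # numerically stable: subtract logsumexp over the class dimension
    return x - x.logsumexp(-1, keepdim=True)

def nll(inp, targ):
    # the indexing trick: pick the log-probability of the correct class per row
    return -inp[range(targ.shape[0]), targ].mean()

# usage (pred: (n, n_classes) logits, y: (n,) integer labels):
# loss = nll(log_softmax(pred), y)        # matches F.cross_entropy(pred, y)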
torch.nn.modules.conv._ConvNd.reset_parameters??
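The ?? above pulls up the source; in the PyTorch version used in the course, the method looks roughly like this (paraphrased from memory, so check your own version's output):

def reset_parameters(self):
    # the mysterious line: kaiming_uniform_ with a=math.sqrt(5)
    init.kaiming_uniform_(self.weight, a=math.sqrt(5))
    if self.bias is not None:
        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_in)
        init.uniform_(self.bias, -bound, bound)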
x_train,y_train,x_valid,y_valid = get_data()
train_mean,train_std = x_train.mean(),x_train.std()
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

x_train = x_train.view(-1,1,28,28)
x_valid = x_valid.view(-1,1,28,28)
x_train.shape,x_valid.shape
n,*_ = x_train.shape
c = y_train.max()+1
nh = 32
n,c
l1 = nn.Conv2d(1, nh, 5)
x = x_valid[:100]
x.shape
def stats(x): return x.mean(),x.std()
l1.weight.shape
stats(l1.weight),stats(l1.bias)
t = l1(x)
stats(t)
init.kaiming_normal_(l1.weight, a=1.)
stats(l1(x))
import torch.nn.functional as F
def f1(x,a=0): return F.leaky_relu(l1(x),a)
init.kaiming_normal_(l1.weight, a=0)
stats(f1(x))

l1 = nn.Conv2d(1, nh, 5)
stats(f1(x))
l1.weight.shape
# receptive field size
rec_fs = l1.weight[0,0].numel()
rec_fs
nf,ni,*_ = l1.weight.shape
nf,ni

fan_in  = ni*rec_fs
fan_out = nf*rec_fs
fan_in,fan_out
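As a cross-check (not in the notebook), PyTorch's internal helper reports the same numbers; _calculate_fan_in_and_fan_out is a private function, so treat this as version-dependent:

init._calculate_fan_in_and_fan_out(l1.weight)   # expect (25, 800) for a 1->32, 5x5 conv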
def gain(a): return math.sqrt(2.0 / (1 + a**2))
gain(1),gain(0),gain(0.01),gain(0.1),gain(math.sqrt(5.))
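This gain is the same formula PyTorch uses for a leaky ReLU; a quick comparison (my addition, not in the notebook):

init.calculate_gain('leaky_relu', math.sqrt(5.)), gain(math.sqrt(5.))   # should agree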
torch.zeros(10000).uniform_(-1,1).std()
1/math.sqrt(3.)
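The 1/sqrt(3) above is just the standard deviation of a uniform distribution: for U(-b, b) the std is b/sqrt(3), which is why kaiming2 below multiplies the target std by sqrt(3) to get the bound. A quick sanity check (my own numbers):

b = 2.0   # arbitrary bound
torch.zeros(100000).uniform_(-b, b).std(), b/math.sqrt(3.)   # approximately equal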
def kaiming2(x,a, use_fan_out=False):
    nf,ni,*_ = x.shape
    rec_fs = x[0,0].shape.numel()
    fan = nf*rec_fs if use_fan_out else ni*rec_fs
    std = gain(a) / math.sqrt(fan)
    bound = math.sqrt(3.) * std
    x.data.uniform_(-bound,bound)
kaiming2(l1.weight, a=0);
stats(f1(x))

kaiming2(l1.weight, a=math.sqrt(5.))
stats(f1(x))
class Flatten(nn.Module):
def forward(self,x): return x.view(-1)
m = nn.Sequential(
    nn.Conv2d(1,8, 5,stride=2,padding=2), nn.ReLU(),
    nn.Conv2d(8,16,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(16,32,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(32,1,3,stride=2,padding=1),
    nn.AdaptiveAvgPool2d(1),
    Flatten(),
)
y = y_valid[:100].float()

t = m(x)
stats(t)

l = mse(t,y)
l.backward()

stats(m[0].weight.grad)
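mse here comes from exp.nb_02, exported from the earlier notebook; if you run this section standalone, a definition along these lines should behave the same (a reconstruction, not verbatim):

def mse(output, targ): return (output.squeeze(-1) - targ).pow(2).mean()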
init.kaiming_uniform_??
for l in m:
    if isinstance(l,nn.Conv2d):
        init.kaiming_uniform_(l.weight)
        l.bias.data.zero_()

t = m(x)
stats(t)

l = mse(t,y)
l.backward()
stats(m[0].weight.grad)
Export
!./notebook2script.py 02a_why_sqrt5.ipynb
from fastdebug.utils import *
"The forward and backward passes") fastnbs(
The forward and backward passes
This section contains only the current heading 2 and its subheadings.

### get_data
1:23:03 - how to download and prepare the MNIST dataset and wrap the process into a function called get_data;
#export
from exp.nb_01 import *
def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))
x_train,y_train,x_valid,y_valid = get_data()
normalize(x, m, s)
test_near_zero and assert
1:24:52 - how to check that the mean and std values are close to 0 and 1 using test_near_zero, which is built on assert;
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std
#export
def test_near_zero(a,tol=1e-3): assert a.abs()<tol, f"Near zero: {a}"
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())
getting dimensions of weights of different layers
1:25:16 - how to get the dimensions used to size each layer: n (rows of input), m (columns of input), and c (number of target classes), from the shapes of x_train and y_train;
n,m = x_train.shape
c = y_train.max()+1
n,m,c
start of another heading 2 ## Foundations version
1:55:22 - how to put the forward pass and backward pass into one function, forward_and_backward; the backward pass is just the chain rule (anyone who says otherwise is lying), plus saving the gradients along the way;
This section contains only the current heading 3 and its subheadings
def forward_and_backward(inp, targ):
    # forward pass:
    l1 = inp @ w1 + b1
    l2 = relu(l1)
    out = l2 @ w2 + b2
    # we don't actually need the loss in backward!
    loss = mse(out, targ)

    # backward pass:
    mse_grad(out, targ)
    lin_grad(l2, out, w2, b2)
    relu_grad(l1, l2)
    lin_grad(inp, l1, w1, b1)
forward_and_backward(x_train, y_train)
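mse_grad, relu_grad and lin_grad are defined earlier in the lesson; roughly, each one stores the gradient of the loss with respect to its input on a .g attribute, along these lines (a reconstruction, not copied from this excerpt):

def mse_grad(inp, targ):
    # gradient of the mse loss w.r.t. the model output
    inp.g = 2. * (inp.squeeze() - targ).unsqueeze(-1) / inp.shape[0]

def relu_grad(inp, out):
    # gradient of relu: pass out.g through where the input was positive
    inp.g = (inp > 0).float() * out.g

def lin_grad(inp, out, w, b):
    # gradients of a linear layer w.r.t. its input, weight and bias
    inp.g = out.g @ w.t()
    w.g = inp.t() @ out.g
    b.g = out.g.sum(0)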
start of another heading 3 ### 1:56:41 - how to use PyTorch's own gradient calculation to test whether our gradients are computed correctly;
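A sketch of that check, assuming w1, b1, w2, b2, relu and mse from the earlier section (the variable names are mine; the course uses its own test_near helper, torch.allclose is used here instead):

# re-run the forward pass on autograd-enabled clones and compare gradients
w1a = w1.clone().requires_grad_(True)
b1a = b1.clone().requires_grad_(True)
w2a = w2.clone().requires_grad_(True)
b2a = b2.clone().requires_grad_(True)

out = relu(x_train @ w1a + b1a) @ w2a + b2a
loss = mse(out, y_train)
loss.backward()

torch.allclose(w1a.grad, w1.g, atol=1e-3), torch.allclose(b2a.grad, b2.g, atol=1e-3)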