I am training a Variational Autoencoder. Suddenly my Loss Explodes and then becomes NaN

and I dont know why.

When evaluating the trained Vae on an Image the output data has Inf value, so I guess its happening in the sampling Method of the VAE, but why does it suddenly explode and how can i prevent it?

```
class VAE(nn.Module):
def __init__(self, input_shape, z_dim):
super().__init__()
self.z_dim = z_dim
self.input_shape = input_shape
# encoder
self.encoder_conv = nn.Sequential(
nn.Conv2d(1, 32, 3, stride=2, padding=1),
nn.BatchNorm2d(32),
nn.LeakyReLU(),
nn.Conv2d(32, 64, 3, stride=2, padding=1),
nn.BatchNorm2d(64),
nn.LeakyReLU(),
nn.Conv2d(64, 64, 3, stride=2, padding=1),
nn.BatchNorm2d(64),
nn.LeakyReLU(),
nn.Conv2d(64, 64, 3, stride=2, padding=1),
nn.BatchNorm2d(64),
nn.LeakyReLU()
)
self.conv_out_size = self._get_conv_out_size(input_shape)
self.mu = nn.Sequential(
nn.Linear(self.conv_out_size, z_dim),
nn.LeakyReLU(),
nn.Dropout(0.2)
)
self.log_var = nn.Sequential(
nn.Linear(self.conv_out_size, z_dim),
nn.LeakyReLU(),
nn.Dropout(0.2)
)
# decoder
self.decoder_linear = nn.Sequential(
nn.Linear(z_dim, self.conv_out_size),
nn.LeakyReLU(),
nn.Dropout(0.2)
)
self.decoder_conv = nn.Sequential(
nn.UpsamplingNearest2d(scale_factor=2),
nn.ConvTranspose2d(64, 64, 3, stride=1, padding=1),
nn.BatchNorm2d(64),
nn.LeakyReLU(),
nn.UpsamplingNearest2d(scale_factor=2),
nn.ConvTranspose2d(64, 64, 3, stride=1, padding=1),
nn.BatchNorm2d(64),
nn.LeakyReLU(),
nn.UpsamplingNearest2d(scale_factor=2),
nn.ConvTranspose2d(64, 32, 3, stride=1, padding=1),
nn.BatchNorm2d(32),
nn.LeakyReLU(),
nn.UpsamplingNearest2d(scale_factor=2),
nn.ConvTranspose2d(32, 1, 3, stride=1, padding=(5,3)),
nn.Sigmoid()
)
def sampling(self, mu, log_var):
## TODO: epsilon should be at the model's device (not CUDA)
epsilon = torch.Tensor(np.random.normal(size=(self.z_dim), scale=1.0)).cuda()
return mu + epsilon * torch.exp(log_var / 2)
def forward_encoder(self, x):
x = self.encoder_conv(x)
x = x.view(x.size()(0), -1)
mu_p = self.mu(x)
log_var_p = self.log_var(x)
return (mu_p, log_var_p)
def forward_decoder(self, x):
x = self.decoder_linear(x)
x = x.view(x.size()(0), *self.conv_out_shape(1:))
x = self.decoder_conv(x)
return x
def forward(self, x):
mu_p, log_var_p = self.forward_encoder(x)
x = self.sampling(mu_p, log_var_p)
images_p = self.forward_decoder(x)
return (mu_p, log_var_p, images_p)
def _get_conv_out_size(self, shape):
out = self.encoder_conv(torch.zeros(1, *shape))
self.conv_out_shape = out.size()
return int(np.prod(self.conv_out_shape))
def forward_no_epsilon(self, x):
mu_p, log_var_p = self.forward_encoder(x)
x = mu_p
images_p = self.forward_decoder(x)
return images_p
```

Loss:

```
def kl_loss(mu, log_var):
# TODO: dividir entre el numero de batches?
return -0.5 * torch.mean(1 + log_var - mu.pow(2) - torch.exp(log_var))
def r_loss(y_train, y_pred):
r_loss = torch.mean((y_train - y_pred) ** 2)
return r_loss
```

train:

```
mu_v, log_var_v, images_out_v = vae(images_v)
r_loss_v = r_loss(images_out_v, labels_v)
kl_loss_v = kl_loss(mu_v, log_var_v)
loss = kl_loss_v + r_loss_v * 10000.0
loss.backward()
optimizer.step()
```

Losses: