I have written the following code to practice parallelizing PyTorch code across multiple GPUs:

```
import math
import torch
import pickle
import time
import numpy as np
import torch.optim as optim
from torch import nn
# Report the CUDA devices visible to this process.
gpu_total = torch.cuda.device_count()
print('device_count()', gpu_total)
for gpu_idx in range(gpu_total):
    print('get_device_name', torch.cuda.get_device_name(gpu_idx))
def _data(dimension, num_examples):
num_mislabeled_examples = 20
ground_truth_weights = np.random.normal(size=dimension) / math.sqrt(dimension)
ground_truth_threshold = 0
features = np.random.normal(size=(num_examples, dimension)).astype(
np.float32) / math.sqrt(dimension)
labels = (np.matmul(features, ground_truth_weights) >
ground_truth_threshold).astype(np.float32)
mislabeled_indices = np.random.choice(
num_examples, num_mislabeled_examples, replace=False)
labels(mislabeled_indices) = 1 - labels(mislabeled_indices)
return torch.tensor(labels), torch.tensor(features)
class tools:
    """Small debugging helper used to inspect the model during training."""

    def __init__(self):
        self.name = 'x_2'

    def SomeFunc(self, model, input_):
        """Print the first row of the model's linear (first) term.

        Transparently unwraps ``nn.DataParallel`` (whose wrapped methods
        live on ``model.module``), so no manual edit is needed when the
        DataParallel flag is turned on.
        """
        # Bug fix: `first_term(input_)(0)` tried to *call* the returned
        # tensor; `[0]` indexes its first row as intended.
        core = getattr(model, 'module', model)
        print(core.first_term(input_)[0])
class predictor(nn.Module):
    """Linear binary predictor: score(x) = x @ weights - threshold."""

    def __init__(self, dim):
        super(predictor, self).__init__()
        # Both parameters start at zero, so the initial score is 0 everywhere.
        self.weights = torch.nn.Parameter(torch.zeros(dim, 1, requires_grad=True))
        self.threshold = torch.nn.Parameter(torch.zeros(1, 1, requires_grad=True))

    def first_term(self, features):
        """Return the linear term ``features @ weights`` (shape [N, 1])."""
        return torch.matmul(features, self.weights)

    def forward(self, features):
        """Score each row of ``features``; the scalar threshold broadcasts."""
        linear_part = self.first_term(features)
        return linear_part - self.threshold
class HingeLoss(nn.Module):
    """Hinge loss on {0, 1} targets, aggregated with an L2 norm.

    Targets are mapped to {-1, +1}; the per-example loss is
    max(0, 1 - y * score) and the batch loss is the Euclidean norm of
    those per-example values.
    """

    def __init__(self):
        super(HingeLoss, self).__init__()
        self.relu = nn.ReLU()

    def forward(self, output, target):
        ones = torch.ones_like(target)
        # Map {0, 1} targets onto {-1, +1}.
        signed = 2 * target - ones
        margins = ones - output.squeeze(1) * signed
        return torch.norm(self.relu(margins))
class function(object):
    """Bundles data, model, optimizer and loss for a small training run."""

    def __init__(self, epochs):
        dim = 10
        N = 100
        # Robustness fix: fall back to CPU when CUDA is unavailable instead
        # of crashing on `.to('cuda')`.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.target, self.features = _data(dim, N)
        self.epochs = epochs
        self.model = predictor(dim).to(self.device)
        self.optimizer = optim.SGD(self.model.parameters(), lr=1e-3)
        self.target = self.target.to(self.device)
        self.features = self.features.to(self.device)
        self.loss_function = HingeLoss().to(self.device)
        self.tools = tools()

    def train(self):
        """Run full-batch SGD for ``self.epochs`` epochs, printing the loss."""
        self.model.train()
        for epoch in range(self.epochs):
            self.optimizer.zero_grad()
            output = self.model(self.features)
            # self.tools.SomeFunc(self.model, self.features)
            loss = self.loss_function(output, self.target)
            loss.backward()
            # Performance fix: removed the per-epoch `print(output.is_cuda)`
            # debug line; per-iteration host-side reads force GPU syncs and
            # can dominate the runtime of a model this tiny.
            print('For epoch {}, loss is: {}.'.format(epoch, loss.item()))
            self.optimizer.step()
def main(use_data_parallel=False):
    """Train the toy model, optionally wrapped in ``nn.DataParallel``.

    Args:
        use_data_parallel: replaces the old unreachable ``if False:``
            hard-coded flag; defaults to the original behavior (off).
            NOTE(review): for a model this small, DataParallel's per-step
            scatter/gather and inter-GPU synchronization cost far more
            than the compute it saves, which is the likely cause of the
            observed 1 s -> 15 s slowdown.
    """
    model = function(1000)
    print(torch.cuda.device_count())
    if use_data_parallel and torch.cuda.device_count() > 1:
        model.model = nn.DataParallel(model.model)
    t = time.time()
    model.train()
    print('elapsed: {}'.format(time.time() - t))


if __name__ == '__main__':
    main()
```

As far as I understand, setting *the flag* to `True` (and switching `SomeFunc` to use `model.module.first_term`) should enable multi-GPU training with `nn.DataParallel`, but my run time increases from 1 second to 15 seconds. I was wondering how to improve the performance.