diff --git a/beginner_source/audio_classifier_tutorial.py b/beginner_source/audio_classifier_tutorial.py
deleted file mode 100644
index 5a9b29a3108..00000000000
--- a/beginner_source/audio_classifier_tutorial.py
+++ /dev/null
@@ -1,310 +0,0 @@
-"""
-Audio Classifier Tutorial
-=========================
-**Author**: `Winston Herring `_
-
-This tutorial will show you how to correctly format an audio dataset and
-then train/test an audio classifier network on the dataset. First, let’s
-import the common torch packages as well as ``torchaudio``, ``pandas``,
-and ``numpy``. ``torchaudio`` is available `here `_
-and can be installed by following the
-instructions on the website.
-
-"""
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-from torch.utils.data import Dataset
-import torchaudio
-import pandas as pd
-import numpy as np
-
-
-######################################################################
-# Let’s check if a CUDA GPU is available and select our device. Running
-# the network on a GPU will greatly decrease the training/testing runtime.
-#
-
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(device)
-
-
-######################################################################
-# Importing the Dataset
-# ---------------------
-#
-# We will use the UrbanSound8K dataset to train our network. It is
-# available for free `here `_ and contains
-# 10 audio classes with over 8000 labeled sound clips! Once you have
-# downloaded the compressed dataset, extract it into a ``./data`` folder
-# in your current working directory. First, we will look at the csv file
-# that provides information about the individual sound files. ``pandas``
-# allows us to open the csv file and use ``.iloc`` to access the data
-# within it.
-#
-
-csvData = pd.read_csv('./data/UrbanSound8K/metadata/UrbanSound8K.csv')
-print(csvData.iloc[0, :])
-
-
-######################################################################
-# The 10 audio classes in the UrbanSound8K dataset are air_conditioner,
-# car_horn, children_playing, dog_bark, drilling, engine_idling,
-# gun_shot, jackhammer, siren, and street_music. Let’s play a couple files
-# and see what they sound like. The first file is street music and the
-# second is an air conditioner.
-#
-
-import IPython.display as ipd
-ipd.Audio('./data/UrbanSound8K/audio/fold1/108041-9-0-5.wav')
-
-ipd.Audio('./data/UrbanSound8K/audio/fold5/100852-0-0-19.wav')
-
-
-######################################################################
-# Formatting the Data
-# -------------------
-#
-# Now that we know the format of the csv file entries, we can construct
-# our dataset. We will create a wrapper class for our dataset using
-# ``torch.utils.data.Dataset`` that will handle loading the files and
-# performing some formatting steps. The UrbanSound8K dataset is separated
-# into 10 folders. We will use the data from 9 of these folders to train
-# our network and then use the 10th folder to test the network. When
-# initialized, the wrapper class will store the file names, labels, and
-# folder numbers of the audio files from the folders in the given folder
-# list. The actual loading and formatting steps will happen in the access
-# function ``__getitem__``.
-#
-# In ``__getitem__``, we use ``torchaudio.load()`` to convert the wav
-# files to tensors. ``torchaudio.load()`` returns a tuple containing the
-# newly created tensor along with the sampling frequency of the audio file
-# (44.1kHz for UrbanSound8K). The dataset uses two channels for audio so
-# we will use ``torchaudio.transforms.DownmixMono()`` to convert the audio
-# data to one channel. Next, we need to format the audio data. The network
-# we will make takes an input size of 32,000, while most of the audio
-# files have well over 100,000 samples. The UrbanSound8K audio is sampled
-# at 44.1kHz, so 32,000 samples only covers around 700 milliseconds. By
-# downsampling the audio to approximately 8kHz, we can represent 4 seconds
-# with the 32,000 samples. This downsampling is achieved by taking every
-# fifth sample of the original audio tensor. Not every audio tensor is
-# long enough to handle the downsampling so these tensors will need to be
-# padded with zeros. The minimum length that won’t require padding is
-# 160,000 samples.
-#
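-
-
-######################################################################
-# Before wrapping these steps in a class, we can sanity-check them on a
-# single clip. The snippet below is an illustrative sketch: it assumes the
-# dataset has been extracted to ``./data`` as described above, it reuses
-# one of the files we played earlier, and the shapes shown in the comments
-# reflect the torchaudio version this tutorial was written against. The
-# dataset class below performs exactly these steps for every file.
-#
-
-sound = torchaudio.load('./data/UrbanSound8K/audio/fold1/108041-9-0-5.wav',
-                        out = None, normalization = True)
-print(sound[0].shape, sound[1]) #a samples x channels tensor and the 44100Hz sampling frequency
-
-soundData = torchaudio.transforms.DownmixMono()(sound[0]) #stereo to mono
-
-tempData = torch.zeros([160000, 1]) #pad (or truncate) to 160,000 samples
-if soundData.numel() < 160000:
-    tempData[:soundData.numel()] = soundData[:]
-else:
-    tempData[:] = soundData[:160000]
-
-soundFormatted = tempData[::5] #every fifth sample: ~8kHz, 32,000 samples
-print(soundFormatted.shape)    #torch.Size([32000, 1])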
-
-
-class UrbanSoundDataset(Dataset):
-#wrapper for the UrbanSound8K dataset
-    # Argument List
-    #  path to the UrbanSound8K csv file
-    #  path to the UrbanSound8K audio files
-    #  list of folders to use in the dataset
-
-    def __init__(self, csv_path, file_path, folderList):
-        csvData = pd.read_csv(csv_path)
-        #initialize lists to hold file names, labels, and folder numbers
-        self.file_names = []
-        self.labels = []
-        self.folders = []
-        #loop through the csv entries and only add entries from folders in the folder list
-        for i in range(0, len(csvData)):
-            if csvData.iloc[i, 5] in folderList:
-                self.file_names.append(csvData.iloc[i, 0])
-                self.labels.append(csvData.iloc[i, 6])
-                self.folders.append(csvData.iloc[i, 5])
-
-        self.file_path = file_path
-        self.mixer = torchaudio.transforms.DownmixMono() #UrbanSound8K uses two channels, this will convert them to one
-        self.folderList = folderList
-
-    def __getitem__(self, index):
-        #format the file path and load the file
-        path = self.file_path + "fold" + str(self.folders[index]) + "/" + self.file_names[index]
-        sound = torchaudio.load(path, out = None, normalization = True)
-        #load returns a tensor with the sound data and the sampling frequency (44.1kHz for UrbanSound8K)
-        soundData = self.mixer(sound[0])
-        #downsample the audio to ~8kHz
-        tempData = torch.zeros([160000, 1]) #tempData accounts for audio clips that are too short
-        if soundData.numel() < 160000:
-            tempData[:soundData.numel()] = soundData[:]
-        else:
-            tempData[:] = soundData[:160000]
-
-        soundData = tempData
-        soundFormatted = torch.zeros([32000, 1])
-        soundFormatted[:32000] = soundData[::5] #take every fifth sample of soundData
-        soundFormatted = soundFormatted.permute(1, 0)
-        return soundFormatted, self.labels[index]
-
-    def __len__(self):
-        return len(self.file_names)
-
-
-csv_path = './data/UrbanSound8K/metadata/UrbanSound8K.csv'
-file_path = './data/UrbanSound8K/audio/'
-
-train_set = UrbanSoundDataset(csv_path, file_path, range(1, 10))
-test_set = UrbanSoundDataset(csv_path, file_path, [10])
-print("Train set size: " + str(len(train_set)))
-print("Test set size: " + str(len(test_set)))
-
-kwargs = {'num_workers': 1, 'pin_memory': True} if device.type == 'cuda' else {} #needed for using datasets on gpu
-
-train_loader = torch.utils.data.DataLoader(train_set, batch_size = 128, shuffle = True, **kwargs)
-test_loader = torch.utils.data.DataLoader(test_set, batch_size = 128, shuffle = True, **kwargs)
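-
-
-######################################################################
-# As a quick check (an optional sketch, assuming the loaders above were
-# created successfully), we can pull one batch from the training loader
-# and confirm that each example has the 1 x 32000 shape the network
-# defined in the next section expects.
-#
-
-data, target = next(iter(train_loader))
-print(data.shape)   #torch.Size([128, 1, 32000])
-print(target.shape) #torch.Size([128]), one class label per clip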
-
-
-######################################################################
-# Define the Network
-# ------------------
-#
-# For this tutorial we will use a convolutional neural network to process
-# the raw audio data. Usually more advanced transforms are applied to the
-# audio data, however a CNN can also process the raw data accurately.
-# The specific architecture is modeled after the M5 network architecture
-# described in https://arxiv.org/pdf/1610.00087.pdf. An important aspect
-# of models processing raw audio data is the receptive field of their
-# first layer’s filters. Our model’s first filter is length 80 so when
-# processing audio sampled at 8kHz the receptive field is around 10ms.
-# This size is similar to speech processing applications that often use
-# receptive fields ranging from 20ms to 40ms.
-#
-
-class Net(nn.Module):
-    def __init__(self):
-        super(Net, self).__init__()
-        self.conv1 = nn.Conv1d(1, 128, 80, 4)
-        self.bn1 = nn.BatchNorm1d(128)
-        self.pool1 = nn.MaxPool1d(4)
-        self.conv2 = nn.Conv1d(128, 128, 3)
-        self.bn2 = nn.BatchNorm1d(128)
-        self.pool2 = nn.MaxPool1d(4)
-        self.conv3 = nn.Conv1d(128, 256, 3)
-        self.bn3 = nn.BatchNorm1d(256)
-        self.pool3 = nn.MaxPool1d(4)
-        self.conv4 = nn.Conv1d(256, 512, 3)
-        self.bn4 = nn.BatchNorm1d(512)
-        self.pool4 = nn.MaxPool1d(4)
-        self.avgPool = nn.AvgPool1d(30) #input should be 512x30 so this outputs a 512x1
-        self.fc1 = nn.Linear(512, 10)
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = F.relu(self.bn1(x))
-        x = self.pool1(x)
-        x = self.conv2(x)
-        x = F.relu(self.bn2(x))
-        x = self.pool2(x)
-        x = self.conv3(x)
-        x = F.relu(self.bn3(x))
-        x = self.pool3(x)
-        x = self.conv4(x)
-        x = F.relu(self.bn4(x))
-        x = self.pool4(x)
-        x = self.avgPool(x)
-        x = x.permute(0, 2, 1) #change the 512x1 to 1x512
-        x = self.fc1(x)
-        return F.log_softmax(x, dim = 2)
-
-model = Net()
-model.to(device)
-print(model)
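-
-
-######################################################################
-# To see how the layer dimensions noted above line up (an optional
-# sketch; the random tensor simply stands in for a real batch), we can
-# pass a dummy input through the untrained network and check the output
-# shape.
-#
-
-dummy = torch.rand(1, 1, 32000, device = device) #batch x channel x samples
-print(model(dummy).shape) #torch.Size([1, 1, 10]): one log-probability per class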
-
-
-######################################################################
-# We will use the same optimization technique used in the paper, an Adam
-# optimizer with weight decay set to 0.0001. At first, we will train with
-# a learning rate of 0.01, but we will use a ``scheduler`` to decrease it
-# to 0.001 during training.
-#
-
-optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 0.0001)
-scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)
-
-
-######################################################################
-# Training and Testing the Network
-# --------------------------------
-#
-# Now let’s define a training function that will feed our training data
-# into the model and perform the backward pass and optimization steps.
-#
-
-def train(model, epoch):
-    model.train()
-    for batch_idx, (data, target) in enumerate(train_loader):
-        optimizer.zero_grad()
-        data = data.to(device)
-        target = target.to(device)
-        data = data.requires_grad_() #set requires_grad to True for training
-        output = model(data)
-        output = output.permute(1, 0, 2) #original output dimensions are batchSizex1x10
-        loss = F.nll_loss(output[0], target) #the loss function expects a batchSizex10 input
-        loss.backward()
-        optimizer.step()
-        if batch_idx % log_interval == 0: #print training stats
-            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
-                epoch, batch_idx * len(data), len(train_loader.dataset),
-                100. * batch_idx / len(train_loader), loss.item()))
-
-
-######################################################################
-# Now that we have a training function, we need to make one for testing
-# the network’s accuracy. We will set the model to ``eval()`` mode and then
-# run inference on the test dataset. Calling ``eval()`` sets the training
-# variable in all modules in the network to false. Certain layers like
-# batch normalization and dropout layers behave differently during
-# training so this step is crucial for getting correct results.
-#
-
-def test(model, epoch):
-    model.eval()
-    correct = 0
-    with torch.no_grad(): #no gradients are needed for inference
-        for data, target in test_loader:
-            data = data.to(device)
-            target = target.to(device)
-            output = model(data)
-            output = output.permute(1, 0, 2)
-            pred = output.max(2)[1] #get the index of the max log-probability
-            correct += pred.eq(target).cpu().sum().item()
-    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
-        correct, len(test_loader.dataset),
-        100. * correct / len(test_loader.dataset)))
-
-
-######################################################################
-# Finally, we can train and test the network. We will train the network
-# for 20 epochs, then the scheduler will reduce the learning rate to
-# 0.001 and we will train for 20 more epochs. The network will be tested
-# after each epoch to see how the accuracy varies during the training.
-#
-# .. note:: Due to a build issue, we've reduced the number of epochs to 10.
-#           Run this sample with 40 locally to get the proper values.
-#
-
-log_interval = 20
-for epoch in range(1, 11):
-    if epoch == 21:
-        print("First round of training complete. Setting learning rate to 0.001.")
-    train(model, epoch)
-    test(model, epoch)
-    scheduler.step() #step once per epoch; the learning rate drops after epoch 20
-
-
-######################################################################
-# Conclusion
-# ----------
-#
-# If trained on 9 folders, the network should be more than 50% accurate by
-# the end of the training process. Training on fewer folders will result in
-# a lower overall accuracy but may be necessary if long runtimes are a
-# problem. Greater accuracies can be achieved using deeper CNNs at the
-# expense of a larger memory footprint.
-#
-# For more advanced audio applications, such as speech recognition,
-# recurrent neural networks (RNNs) are commonly used. There are also other
-# data preprocessing methods, such as computing the mel frequency cepstral
-# coefficients (MFCC), that can reduce the size of the input.
-#
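-
-
-######################################################################
-# As a pointer for that kind of preprocessing (an optional sketch: it
-# assumes a torchaudio version that provides ``torchaudio.transforms.MFCC``
-# and reuses the ``soundFormatted`` tensor from the formatting example
-# above), MFCC features can be computed directly with a transform:
-#
-
-mfcc = torchaudio.transforms.MFCC(sample_rate = 8000, n_mfcc = 40)
-features = mfcc(soundFormatted.permute(1, 0)) #MFCC expects a channels x samples tensor
-print(features.shape) #channels x n_mfcc x time, far smaller than the raw waveform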