Skip to content
This repository was archived by the owner on Feb 13, 2025. It is now read-only.

Commit c38b970

Browse files
sgugger and BradLarson
authored May 7, 2020
Convert MNIST to Epochs (#497)
* Convert MNIST to Epochs Co-authored-by: Brad Larson <bradlarson@google.com>
1 parent e275188 commit c38b970

File tree

19 files changed

+558
-186
lines changed

19 files changed

+558
-186
lines changed
 

‎Autoencoder/Autoencoder1D/main.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ let imageHeight = 28
2424
let imageWidth = 28
2525

2626
let outputFolder = "./output/"
27-
let dataset = FashionMNIST(batchSize: batchSize, flattening: true)
27+
let dataset = OldFashionMNIST(batchSize: batchSize, flattening: true)
2828
// An autoencoder.
2929
var autoencoder = Sequential {
3030
// The encoder.

‎Autoencoder/Autoencoder2D/main.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ let imageHeight = 28
2626
let imageWidth = 28
2727

2828
let outputFolder = "./output/"
29-
let dataset = KuzushijiMNIST(batchSize: batchSize, flattening: true)
29+
let dataset = OldKuzushijiMNIST(batchSize: batchSize, flattening: true)
3030

3131
// An autoencoder.
3232
struct Autoencoder2D: Layer {

‎Autoencoder/VAE1D/main.swift

+2-2
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ let imageHeight = 28
2626
let imageWidth = 28
2727

2828
let outputFolder = "./output/"
29-
let dataset = MNIST(batchSize: 128, flattening: true)
29+
let dataset = OldMNIST(batchSize: 128, flattening: true)
3030

3131
let inputDim = 784 // 28*28 for any MNIST
3232
let hiddenDim = 400
@@ -84,7 +84,7 @@ func vaeLossFunction(
8484
}
8585

8686
// TODO: Find a cleaner way of extracting individual images that doesn't require a second dataset.
87-
let singleImageDataset = MNIST(batchSize: 1, flattening: true)
87+
let singleImageDataset = OldMNIST(batchSize: 1, flattening: true)
8888
let individualTestImages = singleImageDataset.test
8989
var testImageIterator = individualTestImages.sequenced()
9090

‎Benchmarks/Models/LeNetMnist.swift

+2-2
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,11 @@ enum LeNetMNIST: BenchmarkModel {
3737
}
3838

3939
static func makeInferenceBenchmark(settings: BenchmarkSettings) -> Benchmark {
40-
return ImageClassificationInference<LeNet, MNIST>(settings: settings)
40+
return ImageClassificationInference<LeNet, OldMNIST>(settings: settings)
4141
}
4242

4343
static func makeTrainingBenchmark(settings: BenchmarkSettings) -> Benchmark {
44-
return ImageClassificationTraining<LeNet, MNIST>(settings: settings)
44+
return ImageClassificationTraining<LeNet, OldMNIST>(settings: settings)
4545
}
4646
}
4747

‎DCGAN/main.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import ModelSupport
1818
import TensorFlow
1919

2020
let batchSize = 512
21-
let mnist = MNIST(batchSize: batchSize, flattening: false, normalizing: true)
21+
let mnist = OldMNIST(batchSize: batchSize, flattening: false, normalizing: true)
2222

2323
let outputFolder = "./output/"
2424

‎Datasets/CIFAR10/CIFAR10.swift

+12-9
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,13 @@ public struct CIFAR10<Entropy: RandomNumberGenerator> {
4848
self.init(
4949
batchSize: batchSize,
5050
entropy: entropy,
51+
device: Device.default,
5152
remoteBinaryArchiveLocation: URL(
5253
string: "https://storage.googleapis.com/s4tf-hosted-binaries/datasets/CIFAR10/cifar-10-binary.tar.gz")!,
5354
normalizing: true)
5455
}
5556

56-
/// Creates an instance with `batchSize` using `remoteBinaryArchiveLocation`.
57+
/// Creates an instance with `batchSize` on `device` using `remoteBinaryArchiveLocation`.
5758
///
5859
/// - Parameters:
5960
/// - entropy: a source of randomness used to shuffle sample ordering. It
@@ -65,6 +66,7 @@ public struct CIFAR10<Entropy: RandomNumberGenerator> {
6566
public init(
6667
batchSize: Int,
6768
entropy: Entropy,
69+
device: Device,
6870
remoteBinaryArchiveLocation: URL,
6971
localStorageDirectory: URL = DatasetUtilities.defaultDirectory
7072
.appendingPathComponent("CIFAR10", isDirectory: true),
@@ -76,13 +78,13 @@ public struct CIFAR10<Entropy: RandomNumberGenerator> {
7678
let trainingSamples = loadCIFARTrainingFiles(in: localStorageDirectory)
7779
training = TrainingEpochs(samples: trainingSamples, batchSize: batchSize, entropy: entropy)
7880
.lazy.map { (batches: Batches) -> LazyMapSequence<Batches, LabeledImage> in
79-
return batches.lazy.map{ makeBatch(samples: $0, normalizing: normalizing) }
81+
return batches.lazy.map{ makeBatch(samples: $0, normalizing: normalizing, device: device) }
8082
}
8183

8284
// Validation data
8385
let validationSamples = loadCIFARTestFile(in: localStorageDirectory)
8486
validation = validationSamples.inBatches(of: batchSize).lazy.map {
85-
makeBatch(samples: $0, normalizing: normalizing)
87+
makeBatch(samples: $0, normalizing: normalizing, device: device)
8688
}
8789
}
8890
}
@@ -145,19 +147,20 @@ func loadCIFARTestFile(in localStorageDirectory: URL) -> [(data: [UInt8], label:
145147
return loadCIFARFile(named: "test_batch.bin", in: localStorageDirectory)
146148
}
147149

148-
func makeBatch<BatchSamples: Collection>(samples: BatchSamples, normalizing: Bool) -> LabeledImage
149-
where BatchSamples.Element == (data: [UInt8], label: Int32) {
150+
fileprivate func makeBatch<BatchSamples: Collection>(
151+
samples: BatchSamples, normalizing: Bool, device: Device
152+
) -> LabeledImage where BatchSamples.Element == (data: [UInt8], label: Int32) {
150153
let bytes = samples.lazy.map(\.data).reduce(into: [], +=)
151-
let images = Tensor<UInt8>(shape: [samples.count, 3, 32, 32], scalars: bytes)
154+
let images = Tensor<UInt8>(shape: [samples.count, 3, 32, 32], scalars: bytes, on: device)
152155

153156
var imageTensor = Tensor<Float>(images.transposed(permutation: [0, 2, 3, 1]))
154157
imageTensor /= 255.0
155158
if normalizing {
156-
let mean = Tensor<Float>([0.4913996898, 0.4821584196, 0.4465309242])
157-
let std = Tensor<Float>([0.2470322324, 0.2434851280, 0.2615878417])
159+
let mean = Tensor<Float>([0.4913996898, 0.4821584196, 0.4465309242], on: device)
160+
let std = Tensor<Float>([0.2470322324, 0.2434851280, 0.2615878417], on: device)
158161
imageTensor = (imageTensor - mean) / std
159162
}
160163

161164
let labels = Tensor<Int32>(samples.map(\.label))
162165
return LabeledImage(data: imageTensor, label: labels)
163-
}
166+
}

‎Datasets/CMakeLists.txt

+4
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ add_library(Datasets
1515
MNIST/MNIST.swift
1616
MNIST/FashionMNIST.swift
1717
MNIST/KuzushijiMNIST.swift
18+
MNIST/OldMNISTDatasetHandler.swift
19+
MNIST/OldMNIST.swift
20+
MNIST/OldFashionMNIST.swift
21+
MNIST/OldKuzushijiMNIST.swift
1822
ObjectDetectionDataset.swift
1923
BostonHousing/BostonHousing.swift
2024
TextUnsupervised/TextUnsupervised.swift

‎Datasets/MNIST/FashionMNIST.swift

+72-34
Original file line numberDiff line numberDiff line change
@@ -21,41 +21,79 @@ import Foundation
2121
import TensorFlow
2222
import Batcher
2323

24-
public struct FashionMNIST: ImageClassificationDataset {
25-
public typealias SourceDataSet = [TensorPair<Float, Int32>]
26-
public let training: Batcher<SourceDataSet>
27-
public let test: Batcher<SourceDataSet>
24+
public struct FashionMNIST<Entropy: RandomNumberGenerator> {
25+
/// Type of the collection of non-collated batches.
26+
public typealias Batches = Slices<Sampling<[(data: [UInt8], label: Int32)], ArraySlice<Int>>>
27+
/// The type of the training data, represented as a sequence of epochs, which
28+
/// are collections of batches.
29+
public typealias Training = LazyMapSequence<
30+
TrainingEpochs<[(data: [UInt8], label: Int32)], Entropy>,
31+
LazyMapSequence<Batches, LabeledImage>
32+
>
33+
/// The type of the validation data, represented as a collection of batches.
34+
public typealias Validation = LazyMapSequence<Slices<[(data: [UInt8], label: Int32)]>, LabeledImage>
35+
/// The training epochs.
36+
public let training: Training
37+
/// The validation batches.
38+
public let validation: Validation
2839

29-
public init(batchSize: Int) {
30-
self.init(batchSize: batchSize, flattening: false, normalizing: false)
31-
}
32-
33-
public init(
34-
batchSize: Int, flattening: Bool = false, normalizing: Bool = false,
35-
localStorageDirectory: URL = DatasetUtilities.defaultDirectory
36-
.appendingPathComponent("FashionMNIST", isDirectory: true)
37-
) {
38-
training = Batcher<SourceDataSet>(
39-
on: fetchMNISTDataset(
40-
localStorageDirectory: localStorageDirectory,
41-
remoteBaseDirectory: "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
42-
imagesFilename: "train-images-idx3-ubyte",
43-
labelsFilename: "train-labels-idx1-ubyte",
44-
flattening: flattening,
45-
normalizing: normalizing),
46-
batchSize: batchSize,
47-
numWorkers: 1, //No need to use parallelism since everything is loaded in memory
48-
shuffle: true)
40+
/// Creates an instance with `batchSize`.
41+
///
42+
/// - Parameter entropy: a source of randomness used to shuffle sample
43+
/// ordering. It will be stored in `self`, so if it is only pseudorandom
44+
/// and has value semantics, the sequence of epochs is deterministic and not
45+
/// dependent on other operations.
46+
public init(batchSize: Int, entropy: Entropy) {
47+
self.init(batchSize: batchSize, device: Device.default, entropy: entropy,
48+
flattening: false, normalizing: false)
49+
}
4950

50-
test = Batcher<SourceDataSet>(
51-
on: fetchMNISTDataset(
52-
localStorageDirectory: localStorageDirectory,
53-
remoteBaseDirectory: "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
54-
imagesFilename: "t10k-images-idx3-ubyte",
55-
labelsFilename: "t10k-labels-idx1-ubyte",
56-
flattening: flattening,
57-
normalizing: normalizing),
58-
batchSize: batchSize,
59-
numWorkers: 1) //No need to use parallelism since everything is loaded in memory
51+
/// Creates an instance with `batchSize` on `device`.
52+
///
53+
/// - Parameters:
54+
/// - entropy: a source of randomness used to shuffle sample ordering. It
55+
/// will be stored in `self`, so if it is only pseudorandom and has value
56+
/// semantics, the sequence of epochs is deterministic and not dependent
57+
/// on other operations.
58+
/// - flattening: flattens the data to be a 2d-tensor iff `true`. The default value
59+
/// is `false`.
60+
/// - normalizing: normalizes the batches to have values from -1.0 to 1.0 iff `true`.
61+
/// The default value is `false`.
62+
/// - localStorageDirectory: the directory in which the dataset is stored.
63+
public init(
64+
batchSize: Int, device: Device, entropy: Entropy, flattening: Bool = false,
65+
normalizing: Bool = false,
66+
localStorageDirectory: URL = DatasetUtilities.defaultDirectory
67+
.appendingPathComponent("FashionMNIST", isDirectory: true)
68+
) {
69+
training = TrainingEpochs(
70+
samples: fetchMNISTDataset(
71+
localStorageDirectory: localStorageDirectory,
72+
remoteBaseDirectory: "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
73+
imagesFilename: "train-images-idx3-ubyte",
74+
labelsFilename: "train-labels-idx1-ubyte"),
75+
batchSize: batchSize, entropy: entropy
76+
).lazy.map { (batches: Batches) -> LazyMapSequence<Batches, LabeledImage> in
77+
return batches.lazy.map{ makeMNISTBatch(
78+
samples: $0, flattening: flattening, normalizing: normalizing, device: device
79+
)}
6080
}
81+
82+
validation = fetchMNISTDataset(
83+
localStorageDirectory: localStorageDirectory,
84+
remoteBaseDirectory: "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
85+
imagesFilename: "t10k-images-idx3-ubyte",
86+
labelsFilename: "t10k-labels-idx1-ubyte"
87+
).inBatches(of: batchSize).lazy.map {
88+
makeMNISTBatch(samples: $0, flattening: flattening, normalizing: normalizing,
89+
device: device)
90+
}
91+
}
6192
}
93+
94+
extension FashionMNIST: ImageClassificationData where Entropy == SystemRandomNumberGenerator {
95+
/// Creates an instance with `batchSize`.
96+
public init(batchSize: Int) {
97+
self.init(batchSize: batchSize, entropy: SystemRandomNumberGenerator())
98+
}
99+
}

‎Datasets/MNIST/KuzushijiMNIST.swift

+72-34
Original file line numberDiff line numberDiff line change
@@ -20,41 +20,79 @@ import Foundation
2020
import TensorFlow
2121
import Batcher
2222

23-
public struct KuzushijiMNIST: ImageClassificationDataset {
24-
public typealias SourceDataSet = [TensorPair<Float, Int32>]
25-
public let training: Batcher<SourceDataSet>
26-
public let test: Batcher<SourceDataSet>
23+
public struct KuzushijiMNIST<Entropy: RandomNumberGenerator> {
24+
/// Type of the collection of non-collated batches.
25+
public typealias Batches = Slices<Sampling<[(data: [UInt8], label: Int32)], ArraySlice<Int>>>
26+
/// The type of the training data, represented as a sequence of epochs, which
27+
/// are collections of batches.
28+
public typealias Training = LazyMapSequence<
29+
TrainingEpochs<[(data: [UInt8], label: Int32)], Entropy>,
30+
LazyMapSequence<Batches, LabeledImage>
31+
>
32+
/// The type of the validation data, represented as a collection of batches.
33+
public typealias Validation = LazyMapSequence<Slices<[(data: [UInt8], label: Int32)]>, LabeledImage>
34+
/// The training epochs.
35+
public let training: Training
36+
/// The validation batches.
37+
public let validation: Validation
2738

28-
public init(batchSize: Int) {
29-
self.init(batchSize: batchSize, flattening: false, normalizing: false)
30-
}
31-
32-
public init(
33-
batchSize: Int, flattening: Bool = false, normalizing: Bool = false,
34-
localStorageDirectory: URL = DatasetUtilities.defaultDirectory
35-
.appendingPathComponent("KuzushijiMNIST", isDirectory: true)
36-
) {
37-
training = Batcher<SourceDataSet>(
38-
on: fetchMNISTDataset(
39-
localStorageDirectory: localStorageDirectory,
40-
remoteBaseDirectory: "https://storage.googleapis.com/s4tf-hosted-binaries/datasets/KMNIST",
41-
imagesFilename: "train-images-idx3-ubyte",
42-
labelsFilename: "train-labels-idx1-ubyte",
43-
flattening: flattening,
44-
normalizing: normalizing),
45-
batchSize: batchSize,
46-
numWorkers: 1, //No need to use parallelism since everything is loaded in memory
47-
shuffle: true)
39+
/// Creates an instance with `batchSize`.
40+
///
41+
/// - Parameter entropy: a source of randomness used to shuffle sample
42+
/// ordering. It will be stored in `self`, so if it is only pseudorandom
43+
/// and has value semantics, the sequence of epochs is deterministic and not
44+
/// dependent on other operations.
45+
public init(batchSize: Int, entropy: Entropy) {
46+
self.init(batchSize: batchSize, device: Device.default, entropy: entropy,
47+
flattening: false, normalizing: false)
48+
}
4849

49-
test = Batcher<SourceDataSet>(
50-
on: fetchMNISTDataset(
51-
localStorageDirectory: localStorageDirectory,
52-
remoteBaseDirectory: "https://storage.googleapis.com/s4tf-hosted-binaries/datasets/KMNIST",
53-
imagesFilename: "t10k-images-idx3-ubyte",
54-
labelsFilename: "t10k-labels-idx1-ubyte",
55-
flattening: flattening,
56-
normalizing: normalizing),
57-
batchSize: batchSize,
58-
numWorkers: 1) //No need to use parallelism since everything is loaded in memory
50+
/// Creates an instance with `batchSize` on `device`.
51+
///
52+
/// - Parameters:
53+
/// - entropy: a source of randomness used to shuffle sample ordering. It
54+
/// will be stored in `self`, so if it is only pseudorandom and has value
55+
/// semantics, the sequence of epochs is deterministic and not dependent
56+
/// on other operations.
57+
/// - flattening: flattens the data to be a 2d-tensor iff `true`. The default value
58+
/// is `false`.
59+
/// - normalizing: normalizes the batches to have values from -1.0 to 1.0 iff `true`.
60+
/// The default value is `false`.
61+
/// - localStorageDirectory: the directory in which the dataset is stored.
62+
public init(
63+
batchSize: Int, device: Device, entropy: Entropy, flattening: Bool = false,
64+
normalizing: Bool = false,
65+
localStorageDirectory: URL = DatasetUtilities.defaultDirectory
66+
.appendingPathComponent("KuzushijiMNIST", isDirectory: true)
67+
) {
68+
training = TrainingEpochs(
69+
samples: fetchMNISTDataset(
70+
localStorageDirectory: localStorageDirectory,
71+
remoteBaseDirectory: "https://storage.googleapis.com/s4tf-hosted-binaries/datasets/KMNIST",
72+
imagesFilename: "train-images-idx3-ubyte",
73+
labelsFilename: "train-labels-idx1-ubyte"),
74+
batchSize: batchSize, entropy: entropy
75+
).lazy.map { (batches: Batches) -> LazyMapSequence<Batches, LabeledImage> in
76+
return batches.lazy.map{ makeMNISTBatch(
77+
samples: $0, flattening: flattening, normalizing: normalizing, device: device
78+
)}
5979
}
80+
81+
validation = fetchMNISTDataset(
82+
localStorageDirectory: localStorageDirectory,
83+
remoteBaseDirectory: "https://storage.googleapis.com/s4tf-hosted-binaries/datasets/KMNIST",
84+
imagesFilename: "t10k-images-idx3-ubyte",
85+
labelsFilename: "t10k-labels-idx1-ubyte"
86+
).inBatches(of: batchSize).lazy.map {
87+
makeMNISTBatch(samples: $0, flattening: flattening, normalizing: normalizing,
88+
device: device)
89+
}
90+
}
6091
}
92+
93+
extension KuzushijiMNIST: ImageClassificationData where Entropy == SystemRandomNumberGenerator {
94+
/// Creates an instance with `batchSize`.
95+
public init(batchSize: Int) {
96+
self.init(batchSize: batchSize, entropy: SystemRandomNumberGenerator())
97+
}
98+
}

0 commit comments

Comments
 (0)
This repository has been archived.