Tensorflow Learning: part1

Shashank Shekhar
Sep 1, 2024
8 min read

Updated: Sep 24, 2024

'''

What is the difference between these three optimizers - SGD, Adam and RMSprop - and when should each be used?

SGD (Stochastic Gradient Descent), Adam, and RMSprop are popular optimization algorithms used in deep learning. Each has strengths and weaknesses:

1. SGD (Stochastic Gradient Descent)

Updates weights based on the gradient of the loss function

Learning rate (lr) is crucial; too high can cause oscillations, too low can lead to slow convergence

No momentum or adaptive learning rate

Simple to implement, computationally efficient

Pros: Simple, efficient, easy to implement

Cons: Requires manual lr tuning, can get stuck in local minima

2. RMSprop (Root Mean Square Propagation)

An extension of SGD with adaptive learning rate

Divides lr by the square root of an exponentially decaying average of squared gradients (a running RMS of recent gradient magnitudes), per parameter

Helps escape local minima and handles large gradients

Introduced in Geoff Hinton's Coursera course (2012)

Pros: Adaptive lr, handles large gradients, escapes local minima

Cons: Hyperparameters (decay rate, epsilon) require tuning

3. Adam (Adaptive Moment Estimation)

Combines benefits of RMSprop and momentum

Computes adaptive lr based on running estimates of the first and second moments (mean and uncentered variance) of gradients

Handles sparse gradients and non-stationary objectives

Introduced by Kingma and Ba (2014)

Pros: Adaptive lr, handles sparse gradients, non-stationary objectives

Cons: More computationally expensive, hyperparameters (beta1, beta2, epsilon) require tuning

When to use each:

SGD: Simple problems, small datasets, when computational efficiency is crucial. Use when you want fine-grained control over lr.

RMSprop: Medium-sized problems, when gradients are large or noisy. Use when you want adaptive lr without momentum.

Adam: Complex problems, large datasets, or non-stationary objectives. Use when you want adaptive lr with momentum and robustness to sparse gradients.

General guidelines:

Start with Adam, as it's a popular, robust choice.

If Adam converges slowly, try RMSprop or SGD with momentum.

If gradients are very large, try RMSprop.

If computational efficiency is critical, use SGD.

Hyperparameter tuning:

SGD: lr (learning rate)

RMSprop: lr, decay rate, epsilon

Adam: lr, beta1, beta2, epsilon

Keep in mind that these are general guidelines, and the best optimizer choice depends on the specific problem, dataset, and model architecture.

Additional resources:

Original sources: SGD (Robbins and Monro, 1951), RMSprop (Hinton's Coursera lecture slides, 2012 - never published as a paper), Adam (Kingma and Ba, 2014)

Optimization algorithms overview: (link unavailable)

Comparative study: (link unavailable)

'''

#Importing TensorFlow and Printing Version

import tensorflow as tf

print("TensorFlow version:", tf.__version__)

#Importing Keras Layers and Model

from tensorflow.keras.layers import Dense, Flatten, Conv2D

from tensorflow.keras import Model

# Load the MNIST digits; load_data() returns them already split into
# training and test sets.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide pixel values by 255, append a trailing channel axis with
# tf.newaxis (one channel per image here, not three as with RGB), and
# store the result as float32.
x_train = (x_train / 255.0)[..., tf.newaxis].astype("float32")
x_test = (x_test / 255.0)[..., tf.newaxis].astype("float32")

# Training pipeline: shuffle with a buffer of 10,000 examples, then group
# into batches of 32.
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_ds = train_ds.shuffle(10000).batch(32)

# Test pipeline: no shuffling, batches of 32.
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)

# Defines a custom Keras model by subclassing Model. Layers are created in
# __init__ (Conv2D, Flatten, two Dense layers) and wired together in call.
class MyModel(Model):
    """Small convolutional classifier: Conv2D -> Flatten -> Dense -> Dense.

    The final Dense layer has 10 units and no activation, so the model
    returns raw logits rather than probabilities.
    """

    def __init__(self):
        """Create the layers used by the forward pass."""
        # This must be the real constructor name. A method called ``init``
        # is never invoked by ``MyModel()``, which would leave the layer
        # attributes below undefined when ``call`` runs.
        super().__init__()
        self.conv1 = Conv2D(32, 3, activation='relu')
        self.flatten = Flatten()
        self.d1 = Dense(128, activation='relu')
        # No activation here: the loss in this script is built with
        # from_logits=True.
        self.d2 = Dense(10)

    def call(self, x):
        """Run the forward pass.

        Args:
            x: Batch of input images.

        Returns:
            The output of the final Dense layer (one logit per class).
        """
        x = self.conv1(x)
        x = self.flatten(x)
        x = self.d1(x)
        return self.d2(x)

# Instantiate the network.
model = MyModel()

# The final Dense layer has no activation, so the loss is told to expect
# logits.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Optimizer under test. To compare, replace RMSprop with one of:
#   tf.keras.optimizers.Adam()
#   tf.keras.optimizers.SGD()
optimizer = tf.keras.optimizers.RMSprop()

# Running metrics for the training set.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy'
)

# Running metrics for the test set.
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='test_accuracy'
)

@tf.function
def train_step(images, labels):
    """Run one optimization step on a batch and update the training metrics.

    Args:
        images: Batch of input images passed to ``model``.
        labels: Class labels for ``images``, in the form expected by
            ``loss_object``.
    """
    with tf.GradientTape() as tape:
        # training=True is only needed if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        predictions = model(images, training=True)
        loss = loss_object(labels, predictions)

    # Only the forward pass and the loss need to be recorded; the gradient
    # is computed after leaving the tape context.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Accumulate this batch into the running training metrics.
    train_loss(loss)
    train_accuracy(labels, predictions)

@tf.function
def test_step(images, labels):
    """Evaluate one batch and update the test metrics (no weight update).

    Args:
        images: Batch of input images passed to ``model``.
        labels: Class labels for ``images``, in the form expected by
            ``loss_object``.
    """
    # training=False is only needed if there are layers with different
    # behavior during training versus inference (e.g. Dropout).
    predictions = model(images, training=False)
    t_loss = loss_object(labels, predictions)

    # Accumulate this batch into the running test metrics.
    test_loss(t_loss)
    test_accuracy(labels, predictions)

# Model training loop: each epoch makes one pass over the training batches,
# then one pass over the test batches, then prints the epoch's metrics.
EPOCHS = 5

for epoch in range(EPOCHS):
    # Reset the metrics at the start of every epoch so the printed values
    # describe this epoch only.
    train_loss.reset_state()
    train_accuracy.reset_state()
    test_loss.reset_state()
    test_accuracy.reset_state()

    for images, labels in train_ds:
        train_step(images, labels)

    for test_images, test_labels in test_ds:
        test_step(test_images, test_labels)

    print(
        f'Epoch {epoch + 1}, '
        f'Loss: {train_loss.result():0.2f}, '
        f'Accuracy: {train_accuracy.result() * 100:0.2f}, '
        f'Test Loss: {test_loss.result():0.2f}, '
        f'Test Accuracy: {test_accuracy.result() * 100:0.2f}'
    )

'''

I've identified several factors that might contribute to RMSProp performing better than Adam and SGD:

1. Conv2D layer: The model starts with a Conv2D layer, which can produce large gradients due to the convolutional operation. RMSProp's adaptive learning rate helps mitigate these large gradients.

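2. ReLU activation: ReLU activation is being used in Conv2D and Dense layers. A ReLU unit can "die" (output 0 with zero gradient) when its inputs stay negative. RMSProp's per-parameter step scaling may make it less likely that a large update pushes units into that state, although no gradient-based optimizer can revive a unit whose gradient is exactly zero.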

3. Small model size: The model has relatively few parameters (~150k). RMSProp's simpler adaptation mechanism might be more effective for smaller models.

4. Simple dataset: MNIST is a relatively simple dataset. RMSProp's robustness to large gradients and adaptive learning rate might be sufficient for this dataset.

5. Limited training epochs: Model is training for only 5 epochs. RMSProp's faster convergence rate might be beneficial in this scenario.

6. No dropout or regularization: Without dropout or regularization, RMSProp's adaptation helps prevent overfitting.

7. The default hyperparameters of RMSProp work well for the MNIST dataset: RMSProp (learning rate=0.001, rho=0.9, epsilon=1e-07).

In contrast:

- Adam's momentum term might cause oscillations or slow convergence in this specific case.

- SGD's fixed learning rate might require manual tuning, which can be challenging.

To further improve performance:

1. Hyperparameter tuning: Experiment with different learning rates, rho values, and epsilon values for RMSProp.

2. Batch normalization: Consider adding batch normalization layers to stabilize the training process.

3. Data augmentation: Apply random transformations to the MNIST images to increase diversity and prevent overfitting.

'''

#Previous implementation got 98.3% accuracy with RMSProp. Let's enhance it.

# Importing TensorFlow and Printing Version

import tensorflow as tf

print("TensorFlow version:", tf.__version__)

# Importing Keras Layers and Model

from tensorflow.keras.layers import Dense, Flatten, Conv2D, BatchNormalization

from tensorflow.keras import Model

# Load the MNIST digits; load_data() returns them already split into
# training and test sets.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Divide pixel values by 255, append a trailing channel axis with
# tf.newaxis (one channel per image here, not three as with RGB), and
# store the result as float32.
x_train = (x_train / 255.0)[..., tf.newaxis].astype("float32")
x_test = (x_test / 255.0)[..., tf.newaxis].astype("float32")

# Data augmentation: small random rotations and shifts. Horizontal flips
# are explicitly disabled (presumably because a mirrored digit is no longer
# a valid example of its class).
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=False,
)

# NOTE(review): fit() computes dataset statistics for the featurewise
# normalization / ZCA options, none of which are enabled above, so this
# call looks unnecessary - confirm against the Keras docs before removing.
datagen.fit(x_train)

# Training batches of 32 come from the augmenting generator (not from a
# tf.data shuffle buffer as in the first script); flow() is left at its
# default shuffling behavior.
train_ds = datagen.flow(x_train, y_train, batch_size=32)

# Test batches of 32, unaugmented and in fixed order.
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)

# Defines a custom Keras model by subclassing Model. Layers are created in
# __init__ (two Conv2D + BatchNormalization pairs, Flatten, two Dense
# layers) and wired together in call.
class MyModel(Model):
    """Convolutional classifier with batch normalization after each conv.

    Layer order: Conv2D(32) -> BatchNorm -> Conv2D(64) -> BatchNorm ->
    Flatten -> Dense(128) -> Dense(10). The final Dense layer has no
    activation, so the model returns raw logits.
    """

    def __init__(self):
        """Create the layers used by the forward pass."""
        # This must be the real constructor name. A method called ``init``
        # is never invoked by ``MyModel()``, which would leave the layer
        # attributes below undefined when ``call`` runs.
        super().__init__()
        self.conv1 = Conv2D(32, 3, activation='relu')
        self.bn1 = BatchNormalization()
        self.conv2 = Conv2D(64, 3, activation='relu')
        self.bn2 = BatchNormalization()
        self.flatten = Flatten()
        self.d1 = Dense(128, activation='relu')
        # No activation here: the loss in this script is built with
        # from_logits=True.
        self.d2 = Dense(10)

    def call(self, x):
        """Run the forward pass.

        Args:
            x: Batch of input images.

        Returns:
            The output of the final Dense layer (one logit per class).
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.flatten(x)
        x = self.d1(x)
        return self.d2(x)

# Instantiate the network.
model = MyModel()

# Configure training: logits-based sparse cross-entropy, RMSprop with a
# learning rate of 0.0005, and accuracy as the reported metric.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0005)
model.compile(
    optimizer=optimizer,
    loss=loss_object,
    metrics=['accuracy'],
)

# Stop early once val_loss has failed to improve for 3 consecutive epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Train for at most EPOCHS epochs.
# NOTE(review): the test set doubles as validation data here, so early
# stopping is driven by test-set loss - consider a separate validation
# split if the final test accuracy should be unbiased.
EPOCHS = 10
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=test_ds,
    callbacks=[callback],
)

# Per-epoch losses and metrics recorded by fit().
print(history.history)

# With these enhancements (data augmentation, batch normalization, lr = 0.0005, early stopping, and 10 epochs), the accuracy climbed to 99.15%.

Tensorflow Learning: part1

Recent Posts

Comments