# GRADED FUNCTION: update_parameters_with_gd

def update_parameters_with_gd(parameters, grads, learning_rate):
    """
    Apply one step of plain (batch) gradient descent to every parameter.

    Arguments:
    parameters -- python dictionary containing your parameters to be updated:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients to update each parameter:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    learning_rate -- the learning rate, scalar.

    Returns:
    parameters -- python dictionary containing your updated parameters
    """
    # Each layer contributes a "W" and a "b" entry, so the layer count is half the dict size.
    num_layers = len(parameters) // 2

    # Update rule: theta = theta - learning_rate * d_theta, layer by layer.
    for layer in range(1, num_layers + 1):
        parameters["W" + str(layer)] = parameters["W" + str(layer)] - learning_rate * grads["dW" + str(layer)]
        parameters["b" + str(layer)] = parameters["b" + str(layer)] - learning_rate * grads["db" + str(layer)]

    return parameters
# GRADED FUNCTION: random_mini_batches

def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    """
    Creates a list of random minibatches from (X, Y).

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (1, number of examples)
    mini_batch_size -- size of the mini-batches, integer
    seed -- rng seed so the "random" minibatches are reproducible

    Returns:
    mini_batches -- list of synchronous (mini_batch_X, mini_batch_Y) tuples
    """
    np.random.seed(seed)  # To make your "random" minibatches reproducible
    m = X.shape[1]        # number of training examples
    mini_batches = []

    # Step 1: Shuffle (X, Y) with ONE permutation so each column of X stays
    # aligned with its label column in Y.
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))

    # Step 2: Partition (shuffled_X, shuffled_Y), minus the end case.
    num_complete_minibatches = math.floor(m / mini_batch_size)
    for k in range(0, num_complete_minibatches):
        # BUG FIX: slice the *shuffled* arrays. The original sliced X and Y
        # directly, silently discarding the permutation computed above.
        mini_batch_X = shuffled_X[:, k * mini_batch_size:(k + 1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size:(k + 1) * mini_batch_size]
        mini_batches.append((mini_batch_X, mini_batch_Y))

    # Handle the end case (last mini-batch < mini_batch_size).
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches * mini_batch_size:]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches * mini_batch_size:]
        mini_batches.append((mini_batch_X, mini_batch_Y))

    return mini_batches
3. 動量
帶動量的梯度下降可以降低 mini-batch 梯度下降時的震蕩
原因:Momentum 考慮過去的梯度對當前的梯度進行平滑,梯度不會劇烈變化
初始化梯度的初速度為 0
# GRADED FUNCTION: initialize_velocity

def initialize_velocity(parameters):
    """
    Initializes the momentum velocity as a python dictionary with:
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: numpy arrays of zeros of the same shape as the
                  corresponding gradients/parameters.

    Arguments:
    parameters -- python dictionary containing your parameters.
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl

    Returns:
    v -- python dictionary containing the current velocity.
                    v['dW' + str(l)] = velocity of dWl
                    v['db' + str(l)] = velocity of dbl
    """
    num_layers = len(parameters) // 2  # each layer owns one W and one b entry
    v = {}

    # Velocities start at rest: zero arrays shaped like their parameters.
    for layer in range(1, num_layers + 1):
        v["dW" + str(layer)] = np.zeros(parameters["W" + str(layer)].shape)
        v["db" + str(layer)] = np.zeros(parameters["b" + str(layer)].shape)

    return v
# GRADED FUNCTION: initialize_adam

def initialize_adam(parameters):
    """
    Initializes v and s as two python dictionaries with:
                - keys: "dW1", "db1", ..., "dWL", "dbL"
                - values: numpy arrays of zeros of the same shape as the
                  corresponding gradients/parameters.

    Arguments:
    parameters -- python dictionary containing your parameters.
                    parameters["W" + str(l)] = Wl
                    parameters["b" + str(l)] = bl

    Returns:
    v -- python dictionary that will contain the exponentially weighted
         average of the gradient (first moment).
    s -- python dictionary that will contain the exponentially weighted
         average of the squared gradient (second moment).
    """
    num_layers = len(parameters) // 2  # W/b pairs -> half the dict size
    v = {}
    s = {}

    # Both moment estimates start at zero, mirroring each parameter's shape.
    for layer in range(1, num_layers + 1):
        for grad_key, param_key in (("dW", "W"), ("db", "b")):
            shape = parameters[param_key + str(layer)].shape
            v[grad_key + str(layer)] = np.zeros(shape)
            s[grad_key + str(layer)] = np.zeros(shape)

    return v, s
迭代更新
# GRADED FUNCTION: update_parameters_with_adam

def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01,
                                beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    Update parameters using Adam.

    Arguments:
    parameters -- python dictionary containing your parameters:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameter:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    t -- current Adam step number (used for bias correction), integer >= 1
    learning_rate -- the learning rate, scalar.
    beta1 -- Exponential decay hyperparameter for the first moment estimates
    beta2 -- Exponential decay hyperparameter for the second moment estimates
    epsilon -- hyperparameter preventing division by zero in Adam updates

    Returns:
    parameters -- python dictionary containing your updated parameters
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    """
    num_layers = len(parameters) // 2
    v_corrected = {}  # bias-corrected first moment estimates
    s_corrected = {}  # bias-corrected second moment estimates

    # Identical Adam math as the reference, with the duplicated W/b code paths
    # collapsed into one inner loop over the two key prefixes.
    for layer in range(1, num_layers + 1):
        for param_prefix, grad_prefix in (("W", "dW"), ("b", "db")):
            pkey = param_prefix + str(layer)
            gkey = grad_prefix + str(layer)
            grad = grads[gkey]

            # Moving average of the gradients (first moment).
            v[gkey] = beta1 * v[gkey] + (1 - beta1) * grad
            # Bias-corrected first moment estimate.
            v_corrected[gkey] = v[gkey] / (1 - beta1 ** t)

            # Moving average of the squared gradients (second moment).
            s[gkey] = beta2 * s[gkey] + (1 - beta2) * grad ** 2
            # Bias-corrected second raw moment estimate.
            s_corrected[gkey] = s[gkey] / (1 - beta2 ** t)

            # Parameter step: scale the corrected gradient by the corrected RMS.
            parameters[pkey] = parameters[pkey] - learning_rate * v_corrected[gkey] / (
                np.sqrt(s_corrected[gkey]) + epsilon)

    return parameters, v, s
def model(X, Y, layers_dims, optimizer, learning_rate=0.0007, mini_batch_size=64, beta=0.9,
          beta1=0.9, beta2=0.999, epsilon=1e-8, num_epochs=10000, print_cost=True):
    """
    3-layer neural network model which can be run in different optimizer modes.

    Arguments:
    X -- input data, of shape (2, number of examples)
    Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (1, number of examples)
    layers_dims -- python list, containing the size of each layer
    optimizer -- one of "gd", "momentum", "adam"
    learning_rate -- the learning rate, scalar.
    mini_batch_size -- the size of a mini batch
    beta -- Momentum hyperparameter
    beta1 -- Exponential decay hyperparameter for the past gradients estimates
    beta2 -- Exponential decay hyperparameter for the past squared gradients estimates
    epsilon -- hyperparameter preventing division by zero in Adam updates
    num_epochs -- number of epochs
    print_cost -- True to print the cost every 1000 epochs

    Returns:
    parameters -- python dictionary containing your updated parameters
    """
    L = len(layers_dims)  # number of layers in the neural network (kept for reference)
    costs = []            # cost history, sampled every 100 epochs
    t = 0                 # Adam step counter (drives bias correction)
    seed = 10             # For grading purposes, so the "random" minibatches are reproducible

    parameters = initialize_parameters(layers_dims)

    # Optimizer-specific state: plain GD keeps none; momentum keeps v; Adam keeps v and s.
    if optimizer == "gd":
        pass
    elif optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)

    # Optimization loop
    for epoch in range(num_epochs):
        # Increment the seed so each epoch reshuffles the dataset differently.
        seed = seed + 1
        for minibatch_X, minibatch_Y in random_mini_batches(X, Y, mini_batch_size, seed):
            # Forward propagation
            a3, caches = forward_propagation(minibatch_X, parameters)
            # Compute cost
            cost = compute_cost(a3, minibatch_Y)
            # Backward propagation
            grads = backward_propagation(minibatch_X, minibatch_Y, caches)

            # Parameter update, dispatched on the requested optimizer.
            if optimizer == "gd":
                parameters = update_parameters_with_gd(parameters, grads, learning_rate)
            elif optimizer == "momentum":
                parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            elif optimizer == "adam":
                t = t + 1  # Adam counter
                parameters, v, s = update_parameters_with_adam(parameters, grads, v, s,
                                                               t, learning_rate, beta1, beta2, epsilon)

        # NOTE(review): 'cost' here is the cost of the LAST minibatch of the
        # epoch, not an epoch average — this mirrors the original behavior.
        if print_cost and epoch % 1000 == 0:
            print("Cost after epoch %i: %f" % (epoch, cost))
        if print_cost and epoch % 100 == 0:
            costs.append(cost)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()

    return parameters