import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('mode.chained_assignment', None)
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from tensorflow.keras.optimizers.legacy import RMSprop
from keras.callbacks import Callback
humidity = pd.read_csv("../weather-pogodynka/humidity.csv")
temp = pd.read_csv("../weather-pogodynka/temperature.csv")
pressure = pd.read_csv("../weather-pogodynka/pressure.csv")
humidity_ALB = humidity[['datetime','Albuquerque']]
temp_ALB = temp[['datetime','Albuquerque']]
pressure_ALB = pressure[['datetime','Albuquerque']]
#humidity_ALB.head(10)
pressure_ALB.head(10)
datetime | Albuquerque | |
---|---|---|
0 | 2012-10-01 12:00:00 | NaN |
1 | 2012-10-01 13:00:00 | 1024.0 |
2 | 2012-10-01 14:00:00 | 1024.0 |
3 | 2012-10-01 15:00:00 | 1024.0 |
4 | 2012-10-01 16:00:00 | 1024.0 |
5 | 2012-10-01 17:00:00 | 1024.0 |
6 | 2012-10-01 18:00:00 | 1024.0 |
7 | 2012-10-01 19:00:00 | 1024.0 |
8 | 2012-10-01 20:00:00 | 1024.0 |
9 | 2012-10-01 21:00:00 | 1024.0 |
pressure_ALB.tail(10)
datetime | Albuquerque | |
---|---|---|
45243 | 2017-11-29 15:00:00 | 1028.0 |
45244 | 2017-11-29 16:00:00 | 1028.0 |
45245 | 2017-11-29 17:00:00 | 1028.0 |
45246 | 2017-11-29 18:00:00 | 1028.0 |
45247 | 2017-11-29 19:00:00 | 1026.0 |
45248 | 2017-11-29 20:00:00 | 1025.0 |
45249 | 2017-11-29 21:00:00 | 1024.0 |
45250 | 2017-11-29 22:00:00 | 1024.0 |
45251 | 2017-11-29 23:00:00 | 1024.0 |
45252 | 2017-11-30 00:00:00 | 1024.0 |
print(humidity_ALB.shape)
print(temp_ALB.shape)
print(pressure_ALB.shape)
(45253, 2) (45253, 2) (45253, 2)
NaN
values (Not a Numbers) in the dataset¶print("How many NaN are there in the humidity dataset?",humidity_ALB.isna().sum()['Albuquerque'])
print("How many NaN are there in the temperature dataset?",temp_ALB.isna().sum()['Albuquerque'])
print("How many NaN are there in the pressure dataset?",pressure_ALB.isna().sum()['Albuquerque'])
How many NaN are there in the humidity dataset? 710 How many NaN are there in the temperature dataset? 1 How many NaN are there in the pressure dataset? 456
I choose Tp=7000 here which means I will train the RNN with only first 7000 data points and then let it predict the long-term trend (for the next > 35000 data points or so). That is not a lot of training data compared to the number of test points, but it still get its job done. Due to abrupt data samples in the pressure data values i decided to select a data frame where the values are in a accaptable range.
Tp = 7000
def plot_train_points(quantity='humidity',Tp=7000):
plt.figure(figsize=(15,4))
if quantity=='humidity':
plt.title("Humidity of first {} data points".format(Tp),fontsize=16)
plt.plot(humidity_ALB['Albuquerque'][:Tp],c='k',lw=1)
if quantity=='temperature':
plt.title("Temperature of first {} data points".format(Tp),fontsize=16)
plt.plot(temp_ALB['Albuquerque'][:Tp],c='k',lw=1)
if quantity=='pressure':
plt.title("Pressure of selected {} data points".format(Tp),fontsize=16)
plt.plot(pressure_ALB['Albuquerque'][16300:23300],c='k',lw=1)
plt.grid(True)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
plot_train_points('humidity')
plot_train_points('temperature')
plot_train_points('pressure')
NaN
values¶I observed some NaN
values in the dataset. I could just eliminate these points. But assuming that the changes in the parameters are not extremely abrupt, I could try to fill them using simple linear interpolation.
humidity_ALB.interpolate(inplace=True)
humidity_ALB.dropna(inplace=True)
temp_ALB.interpolate(inplace=True)
temp_ALB.dropna(inplace=True)
pressure_ALB.interpolate(inplace=True)
pressure_ALB.dropna(inplace=True)
print(humidity_ALB.shape)
print(temp_ALB.shape)
print(pressure_ALB.shape)
(45252, 2) (45252, 2) (45252, 2)
Tp=7000
¶train = np.array(humidity_ALB['Albuquerque'][:Tp])
test = np.array(humidity_ALB['Albuquerque'][Tp:])
C:\Users\kompiuter\AppData\Local\Temp\ipykernel_2340\3039798577.py:1: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`. train = np.array(humidity_ALB['Albuquerque'][:Tp]) C:\Users\kompiuter\AppData\Local\Temp\ipykernel_2340\3039798577.py:2: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`. test = np.array(humidity_ALB['Albuquerque'][Tp:])
print("Train data length:", train.shape)
print("Test data length:", test.shape)
Train data length: (7000,) Test data length: (38252,)
train=train.reshape(-1,1)
test=test.reshape(-1,1)
plt.figure(figsize=(15,4))
plt.title("Train and test data plotted together",fontsize=16)
plt.plot(np.arange(Tp),train,c='blue')
plt.plot(np.arange(Tp,45252),test,c='orange',alpha=0.7)
plt.legend(['Train','Test'])
plt.grid(True)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
RNN model requires a step value that contains n number of elements as an input sequence.
Suppose x = {1,2,3,4,5,6,7,8,9,10}
for step=1, x input and its y prediction become:
x | y |
---|---|
1 | 2 |
2 | 3 |
3 | 4 |
... | ... |
9 | 10 |
for step=3, x and y contain:
x | y |
---|---|
1,2,3 | 4 |
2,3,4 | 5 |
3,4,5 | 6 |
... | ... |
7,8,9 | 10 |
Here, i choosed step=8
. In more complex RNN and in particular for text processing, this is also called embedding size. The idea here is that I'm assuming that 8 hours of Iather data can effectively predict the 9th hour data, and so on.
step = 8
# add step elements into train and test
test = np.append(test,np.repeat(test[-1,],step))
train = np.append(train,np.repeat(train[-1,],step))
print("Train data length:", train.shape)
print("Test data length:", test.shape)
Train data length: (7008,) Test data length: (38260,)
Next, i'll convert test and train data into the matrix with step value as it has shown above example.
def convertToMatrix(data, step):
X, Y =[], []
for i in range(len(data)-step):
d=i+step
X.append(data[i:d,])
Y.append(data[d,])
return np.array(X), np.array(Y)
trainX,trainY =convertToMatrix(train,step)
testX,testY =convertToMatrix(test,step)
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
print("Training data shape:", trainX.shape,', ',trainY.shape)
print("Test data shape:", testX.shape,', ',testY.shape)
Training data shape: (7000, 1, 8) , (7000,) Test data shape: (38252, 1, 8) , (38252,)
SimpleRNN
layer¶I build a simple function to define the RNN model. It uses a single neuron for the output layer because i'm predicting a real-valued number here. As activation, it uses the ReLU function. Following arguments are supported.
def build_simple_rnn(num_units=128, embedding=4,num_dense=32,lr=0.001):
"""
Builds and compiles a simple RNN model
Arguments:
num_units: Number of units of a the simple RNN layer
embedding: Embedding length
num_dense: Number of neurons in the dense layer folloId by the RNN layer
lr: Learning rate (uses RMSprop optimizer)
Returns:
A compiled Keras model.
"""
model = Sequential()
model.add(SimpleRNN(units=num_units, input_shape=(1,embedding), activation="relu"))
model.add(Dense(num_dense, activation="relu"))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer=RMSprop(lr=lr),metrics=['mse'])
return model
model_humidity = build_simple_rnn(num_units=128,num_dense=32,embedding=8,lr=0.0005)
c:\Users\kompiuter\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\optimizers\optimizer_v2\rmsprop.py:143: UserWarning: The `lr` argument is deprecated, use `learning_rate` instead. super().__init__(name, **kwargs)
model_humidity.summary()
Model: "sequential_3" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= simple_rnn_3 (SimpleRNN) (None, 128) 17536 dense_6 (Dense) (None, 32) 4128 dense_7 (Dense) (None, 1) 33 ================================================================= Total params: 21,697 Trainable params: 21,697 Non-trainable params: 0 _________________________________________________________________
Callback
class to print progress of the training at regular epoch interval¶Since the RNN training is usually long, I want to see regular updates about epochs finishing. HoIver, I may not want to see this update every epoch as that may flood the output stream. Therefore, I write a simple custom Callback
function to print the finishing update every 50th epoch. You can think of adding other bells and whistles to this function to print error and other metrics dynamically.
class MyCallback(Callback):
def on_epoch_end(self, epoch, logs=None):
if (epoch+1) % 50 == 0 and epoch>0:
print("Epoch number {} done".format(epoch+1))
batch_size=8
num_epochs = 1000
model_humidity.fit(trainX,trainY,
epochs=num_epochs,
batch_size=batch_size,
callbacks=[MyCallback()],verbose=0)
Epoch number 50 done Epoch number 100 done Epoch number 150 done Epoch number 200 done Epoch number 250 done Epoch number 300 done Epoch number 350 done Epoch number 400 done Epoch number 450 done Epoch number 500 done Epoch number 550 done Epoch number 600 done Epoch number 650 done Epoch number 700 done Epoch number 750 done Epoch number 800 done Epoch number 850 done Epoch number 900 done Epoch number 950 done Epoch number 1000 done
<keras.callbacks.History at 0x230b59bba30>
Note that the loss
metric available in the history
attribute of the model is the MSE loss and you have to take a square-root to compute the RMSE loss.
plt.figure(figsize=(7,5))
plt.title("RMSE loss over epochs",fontsize=16)
plt.plot(np.sqrt(model_humidity.history.history['loss']),c='k',lw=2)
plt.grid(True)
plt.xlabel("Epochs",fontsize=14)
plt.ylabel("Root-mean-squared error",fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
I'm emphasizing and showing again what exactly the model see during training. If you look above, the model fitting code is,
model_humidity.fit(trainX,trainY,
epochs=num_epochs,
batch_size=batch_size,
callbacks=[MyCallback()],verbose=0)
So, the model was fitted with trainX
which is plotted below, and trainY
which is just the 8 step shifted and shaped vector.
plt.figure(figsize=(15,4))
plt.title("This is what the model saw",fontsize=18)
plt.plot(trainX[:,0][:,0],c='blue')
plt.grid(True)
plt.show()
Now, I can generate predictions for the future by passing testX
to the trained model.
trainPredict = model_humidity.predict(trainX)
testPredict= model_humidity.predict(testX)
predicted=np.concatenate((trainPredict,testPredict),axis=0)
219/219 [==============================] - 0s 696us/step 1196/1196 [==============================] - 1s 670us/step
plt.figure(figsize=(10,4))
plt.title("This is what the model predicted",fontsize=18)
plt.plot(testPredict,c='orange')
plt.grid(True)
plt.show()
I plotted the ground truth and the model predictions together to show that it follows the general trends in the ground truth data pretty Ill. Considering less than 25% data was used for training, this is sort of amazing. The boundary betIen train and test splits is denoted by the vertical red line.
There are, of course, some obvious mistakes in the model predictions, such as humidity values going above 100 and some very low values. These can be pruned with post-processing or a better model can be built with proper hyperparameter tuning.
index = humidity_ALB.index.values
plt.figure(figsize=(15,5))
plt.title("Humidity: Ground truth and prediction together",fontsize=18)
plt.plot(index,humidity_ALB['Albuquerque'],c='blue')
plt.plot(index,predicted,c='orange',alpha=0.75)
plt.legend(['True data','Predicted'],fontsize=15)
plt.axvline(x=Tp, c="r")
plt.grid(True)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylim(-20,120)
plt.show()
Since I have covered modeling the humidity data step-by-step in detail, I will show the modeling with other two parameters - temperature and pressure - in a simpler manner.
train = np.array(temp_ALB['Albuquerque'][:Tp])
test = np.array(temp_ALB['Albuquerque'][Tp:])
train=train.reshape(-1,1)
test=test.reshape(-1,1)
step = 8
# add step elements into train and test
test = np.append(test,np.repeat(test[-1,],step))
train = np.append(train,np.repeat(train[-1,],step))
trainX,trainY =convertToMatrix(train,step)
testX,testY =convertToMatrix(test,step)
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
C:\Users\kompiuter\AppData\Local\Temp\ipykernel_2340\2750091313.py:1: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`. train = np.array(temp_ALB['Albuquerque'][:Tp]) C:\Users\kompiuter\AppData\Local\Temp\ipykernel_2340\2750091313.py:2: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`. test = np.array(temp_ALB['Albuquerque'][Tp:])
model_temp = build_simple_rnn(num_units=128,num_dense=32,embedding=8,lr=0.0005)
batch_size=8
num_epochs = 2000
model_temp.fit(trainX,trainY,
epochs=num_epochs,
batch_size=batch_size,
callbacks=[MyCallback()],verbose=0)
Epoch number 50 done Epoch number 100 done Epoch number 150 done Epoch number 200 done Epoch number 250 done Epoch number 300 done Epoch number 350 done Epoch number 400 done Epoch number 450 done Epoch number 500 done Epoch number 550 done Epoch number 600 done Epoch number 650 done Epoch number 700 done Epoch number 750 done Epoch number 800 done Epoch number 850 done Epoch number 900 done Epoch number 950 done Epoch number 1000 done Epoch number 1050 done Epoch number 1100 done Epoch number 1150 done Epoch number 1200 done Epoch number 1250 done Epoch number 1300 done Epoch number 1350 done Epoch number 1400 done Epoch number 1450 done Epoch number 1500 done Epoch number 1550 done Epoch number 1600 done Epoch number 1650 done Epoch number 1700 done Epoch number 1750 done Epoch number 1800 done Epoch number 1850 done Epoch number 1900 done Epoch number 1950 done Epoch number 2000 done
<keras.callbacks.History at 0x230b5cc2e00>
plt.figure(figsize=(7,5))
plt.title("RMSE loss over epochs",fontsize=16)
plt.plot(np.sqrt(model_temp.history.history['loss']),c='k',lw=2)
plt.grid(True)
plt.xlabel("Epochs",fontsize=14)
plt.ylabel("Root-mean-squared error",fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
trainPredict = model_temp.predict(trainX)
testPredict= model_temp.predict(testX)
predicted=np.concatenate((trainPredict,testPredict),axis=0)
219/219 [==============================] - 0s 693us/step 1196/1196 [==============================] - 1s 698us/step
index = temp_ALB.index.values
plt.figure(figsize=(15,5))
plt.title("Temperature: Ground truth and prediction together",fontsize=18)
plt.plot(index,temp_ALB['Albuquerque'],c='blue')
plt.plot(index,predicted,c='orange',alpha=0.75)
plt.legend(['True data','Predicted'],fontsize=15)
plt.axvline(x=Tp, c="r")
plt.grid(True)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
train = np.array(pressure_ALB['Albuquerque'][:Tp])
test = np.array(pressure_ALB['Albuquerque'][Tp:])
train=train.reshape(-1,1)
test=test.reshape(-1,1)
step = 8
# add step elements into train and test
test = np.append(test,np.repeat(test[-1,],step))
train = np.append(train,np.repeat(train[-1,],step))
trainX,trainY =convertToMatrix(train,step)
testX,testY =convertToMatrix(test,step)
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
C:\Users\kompiuter\AppData\Local\Temp\ipykernel_2340\2873042680.py:1: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`. train = np.array(pressure_ALB['Albuquerque'][:Tp]) C:\Users\kompiuter\AppData\Local\Temp\ipykernel_2340\2873042680.py:2: FutureWarning: The behavior of `series[i:j]` with an integer-dtype index is deprecated. In a future version, this will be treated as *label-based* indexing, consistent with e.g. `series[i]` lookups. To retain the old behavior, use `series.iloc[i:j]`. To get the future behavior, use `series.loc[i:j]`. test = np.array(pressure_ALB['Albuquerque'][Tp:])
model_pressure= build_simple_rnn(num_units=128,num_dense=32,embedding=8,lr=0.0005)
batch_size=8
num_epochs = 500
model_pressure.fit(trainX,trainY,
epochs=num_epochs,
batch_size=batch_size,
callbacks=[MyCallback()],verbose=0)
Epoch number 50 done Epoch number 100 done Epoch number 150 done Epoch number 200 done Epoch number 250 done Epoch number 300 done Epoch number 350 done Epoch number 400 done Epoch number 450 done Epoch number 500 done
<keras.callbacks.History at 0x230bc8b7940>
plt.figure(figsize=(7,5))
plt.title("RMSE loss over epochs",fontsize=16)
plt.plot(np.sqrt(model_pressure.history.history['loss']),c='k',lw=2)
plt.grid(True)
plt.xlabel("Epochs",fontsize=14)
plt.ylabel("Root-mean-squared error",fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
trainPredict = model_pressure.predict(trainX)
testPredict= model_pressure.predict(testX)
predicted=np.concatenate((trainPredict,testPredict),axis=0)
219/219 [==============================] - 0s 651us/step 1196/1196 [==============================] - 1s 665us/step
index = pressure_ALB.index.values
plt.figure(figsize=(15,5))
plt.title("Pressure: Ground truth and prediction together",fontsize=18)
plt.plot(index,pressure_ALB['Albuquerque'],c='blue')
plt.plot(index,predicted,c='orange',alpha=0.75)
plt.legend(['True data','Predicted'],fontsize=15)
plt.axvline(x=Tp, c="r")
plt.grid(True)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
**The prediction is semi accurate due to taking just 7000 data points but it still manages to predict future values with acceptable deviation. Even with the spikes of the pressure prediction manages to have close values to the real one.