I have trained a neural network with an LSTM model using a time series dataset. I am using this dataset which records data from 1970-2016 of the daily rainfall of 35 locations : https://www.kaggle.com/redikod/historical-rainfall-data-in-bangladesh
It looks like this :
StationIndex Station Year Month Day Rainfall dayofyear
1970-01-01 1 Dhaka 1970 1 1 0 1
1970-01-02 1 Dhaka 1970 1 2 0 2
1970-01-03 1 Dhaka 1970 1 3 0 3
1970-01-04 1 Dhaka 1970 1 4 0 4
1970-01-05 1 Dhaka 1970 1 5 0 5
I have completed the training using the train and test data. Then I checked the predicted value against true value. Here is the complete code, sorry if it is messy. I have put comments for each section:
import numpy as np
from pandas.plotting import register_matplotlib_converters
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
from pylab import rcParams
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras.layers import (
Input,
Dense,
LSTM,
AveragePooling1D,
TimeDistributed,
Flatten,
Bidirectional,
Dropout
)
from keras.models import Model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
tf.keras.backend.clear_session()
register_matplotlib_converters()
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 22, 10
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
#reading from CSV
df = pd.read_csv("\customized_daily_rainfall_data_Copy.csv")
#droping bad data
df = df[df.Rainfall != -999]
#droping dates (leapyear, wrong day numbers of month)
df.drop(df[(df['Day']>28) & (df['Month']==2) & (df['Year']%4!=0)].index,inplace=True)
df.drop(df[(df['Day']>29) & (df['Month']==2) & (df['Year']%4==0)].index,inplace=True)
df.drop(df[(df['Day']>30) & ((df['Month']==4)|(df['Month']==6)|(df['Month']==9)|(df['Month']==11))].index,inplace=True)
#date parcing (Index)
date = [str(y)+'-'+str(m)+'-'+str(d) for y, m, d in zip(df.Year, df.Month, df.Day)]
df.index = pd.to_datetime(date)
df['Date'] = df.index
df['Dayofyear']=df['Date'].dt.dayofyear
df.drop('Date',axis=1,inplace=True)
df.drop(['Station'],axis=1,inplace=True)
df.head()
#limiting the dataframe to just rows where StationIndex is 11
datarange = df.loc[df['StationIndex'] == 11]
#splitting train and test set
train_size = int(len(datarange) * 0.9)
test_size = len(datarange) - train_size
train, test = df.iloc[0:train_size], df.iloc[train_size:len(datarange)]
#Scaling the feature and label columns of the dataset
from sklearn.preprocessing import RobustScaler
f_columns = ['Year', 'Month','Day','Dayofyear']
f_transformer = RobustScaler()
l_transformer = RobustScaler()
f_transformer = f_transformer.fit(train[f_columns].to_numpy())
l_transformer = l_transformer.fit(train[['Rainfall']])
train.loc[:, f_columns] = f_transformer.transform(train[f_columns].to_numpy())
train['Rainfall'] = l_transformer.transform(train[['Rainfall']])
test.loc[:, f_columns] = f_transformer.transform(test[f_columns].to_numpy())
test['Rainfall'] = l_transformer.transform(test[['Rainfall']])
#making smaller train and test sections withing the dataset
def create_dataset(X, y, time_steps=1):
Xs, ys = [], []
for i in range(len(X) - time_steps):
v = X.iloc[i:(i + time_steps)].to_numpy()
Xs.append(v)
ys.append(y.iloc[i + time_steps])
return np.array(Xs), np.array(ys)
time_steps = 7
# reshape to [samples, time_steps, n_features]
X_train, y_train = create_dataset(train, train.Rainfall, time_steps)
X_test, y_test = create_dataset(test, test.Rainfall, time_steps)
#testing
X_test[0][0]
#model code
model = keras.Sequential()
#3 biderectional LSTM layers
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences = True)))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=128, return_sequences = True)))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=128 )))
model.add(keras.layers.Dropout(rate=0.1))
model.add(keras.layers.Dense(units=1))
model.compile(loss="mean_squared_error", optimizer="RMSprop")
#training the model
history = model.fit(
X_train, y_train,
epochs=500,
batch_size=1052,
validation_split=0.2,
shuffle=False,
)
#saving the model
from tensorflow.keras.models import load_model
model.save("\Timeseries-timestep7-batchsize1052.h5")
#Using text dataset to do a prediction
y_pred = model.predict(X_test)
#inverst transformation
y_train_inv = l_transformer.inverse_transform(y_train.reshape(1, -1))
y_test_inv = l_transformer.inverse_transform(y_test.reshape(1, -1))
y_pred_inv = l_transformer.inverse_transform(y_pred)
#score
from sklearn import metrics
score = np.sqrt(metrics.mean_squared_error(y_pred,y_test))
print(score)
What I want to do next, is for the predictions to go beyond the data in the dataset.
For example, using the model that I have trained to predict data of the future. Or possibly a random/custom range. Suppose I want to predict the day by day rainfall data of 2017. Or get the predicted data of 25-02-2017. Or maybe the data of X number of days after the end of the dataset.
Is there a good intuitive way to do that? Thank you in advanced to whoever can answer this question. It's been bugging me for a few days now.