# Walk-forward optimization for Machine Learning in Python

 

 

 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import quandl
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import linear_model


def prepare_data(TICKER):
    """Download daily adjusted OHLCV data for *TICKER* from Quandl and
    attach a one-day-ahead regression target.

    Returns a DataFrame with columns ['date', 'adj_open', 'adj_high',
    'adj_low', 'adj_close', 'adj_volume', 'label'], where 'label' is the
    next day's adjusted close.  The final row (whose shifted label would
    be NaN) is dropped.
    """
    # NOTE(review): hard-coded API key committed to source — move it to an
    # environment variable or config file before sharing this code.
    quandl.ApiConfig.api_key = 'Z9xLyThUssR4n-KHqAr5'
    data = pd.DataFrame(quandl.get_table('WIKI/PRICES', ticker=TICKER))
    data = data[['date', 'adj_open', 'adj_high', 'adj_low', 'adj_close', 'adj_volume']]
    # Tomorrow's close becomes today's prediction target.
    data['label'] = data['adj_close'].shift(-1)
    # The last row has no next-day close, so its label is NaN — drop it.
    data = data.iloc[:-1]
    return data


# Walk-forward configuration (consumed by the driver call at the bottom).
Ticker = 'GS'  # Quandl WIKI ticker symbol to backtest
n_samples = 1000  # rows per walk-forward window (train + test)
test_percentage = 10  # percent of each window held out as the test set
anchored = False  # False: sliding window; True: anchored (expanding) window


def wf(TICKER, n_samples, test_percentage, anchored=False):
    """Run a walk-forward backtest over *TICKER*'s price history.

    Parameters
    ----------
    TICKER : str
        Quandl WIKI ticker symbol.
    n_samples : int
        Rows per walk-forward window (train + test).
    test_percentage : float
        Percent (0-100) of each window held out as the test set.
    anchored : bool
        If False the window slides forward; if True it stays anchored at
        row 0 and grows by one test span per step.

    Returns
    -------
    numpy.ndarray
        One row per out-of-sample prediction:
        [original index, real label, predicted label].
    """
    data = prepare_data(TICKER)
    length = len(data)
    test_length = round(n_samples * (test_percentage / 100))
    steps = round((length - n_samples) / test_length)
    print(steps)

    # Accumulator for the [index, real, predicted] rows of every step.
    y_test_total = np.zeros(shape=(0, 3))

    for i in range(steps):
        print(i)
        if not anchored:
            if i == steps - 1:
                # Final sliding step: take everything that remains so no
                # rows are dropped; test size is an absolute row count.
                data_wf = data[(i * test_length):length]
                y_test = build_model(
                    data_wf,
                    len(data_wf) - round(n_samples * (1 - (test_percentage / 100))),
                )
            else:
                # Sliding window of exactly n_samples rows; test size is a
                # fraction of the window.
                data_wf = data.iloc[(i * test_length):((i * test_length) + n_samples)]
                y_test = build_model(data_wf, test_percentage / 100)
        else:
            if i == steps - 1:
                # Final anchored step: train on all earlier data and test
                # on the remaining tail (absolute row count).
                data_wf = data[0:length]
                y_test = build_model(
                    data_wf,
                    len(data) - n_samples - (test_length * (i - 1)),
                )
            else:
                # Anchored window: starts at row 0 and grows each step.
                data_wf = data.iloc[0:((i * test_length) + n_samples)]
                y_test = build_model(data_wf, test_length)
        y_test_total = np.concatenate([y_test_total, y_test])

    # y_test_total is already 2-D; vstack kept as a cheap shape guard.
    return np.vstack(y_test_total)


def build_model(data_wf, test_percentage):
    """Fit a linear regression on the head of *data_wf* and predict its tail.

    Parameters
    ----------
    data_wf : pandas.DataFrame
        Must contain 'date' and 'label' columns; every other column is
        used as a feature.
    test_percentage : float or int
        Forwarded to ``train_test_split``'s ``test_size``: a float in
        (0, 1) is a fraction of rows, an int is an absolute row count.

    Returns
    -------
    pandas.DataFrame
        Three columns: the original row index, the real 'label', and the
        model's 'prediction'.
    """
    y = data_wf['label']
    X = data_wf.drop(['date', 'label'], axis=1)

    # shuffle=False keeps chronological order: train on the past, test on
    # the future — essential for a walk-forward evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_percentage, shuffle=False)
    print('X_train shape ' + str(X_train.shape))
    print('X_test shape ' + str(X_test.shape))
    print('y_train shape ' + str(y_train.shape))
    print('y_test shape ' + str(y_test.shape))

    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)

    y_test = pd.DataFrame(y_test).reset_index()
    # Assign the raw prediction array positionally, avoiding the fragile
    # index-alignment step of wrapping it in an intermediate DataFrame.
    y_test['prediction'] = regr.predict(X_test)

    return y_test


# Run the walk-forward backtest; returns an ndarray with one row per
# out-of-sample prediction: [original index, real label, predicted label].
alldf = wf(Ticker,n_samples,test_percentage, anchored)
print(alldf)


# Wrap the results in a labelled DataFrame for inspection and plotting.
alldf = pd.DataFrame(alldf)
alldf.columns = ['Index','Real','Predicted']
# Plot the last 100 predictions against reality, using column 0 ('Index')
# as the x-axis.
alldf.tail(100).plot(0, figsize=(14,7), grid=True, title = 'Real vs. Predicted')