import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import quandl
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import linear_model
def prepare_data(TICKER):
    """Download adjusted price rows for one ticker and attach a next-row label.

    The rows are fetched from the Quandl ``WIKI/PRICES`` table, reduced to
    the date and adjusted open/high/low/close/volume columns, and extended
    with a ``label`` column holding the ``adj_close`` of the following row.

    Args:
        TICKER: Ticker symbol passed to ``quandl.get_table``.

    Returns:
        A DataFrame with columns ``date``, ``adj_open``, ``adj_high``,
        ``adj_low``, ``adj_close``, ``adj_volume`` and ``label``. The final
        downloaded row is dropped because it has no following row to label it.
    """
    # NOTE(review): this API key is committed in source; consider loading it
    # from configuration or the environment before sharing the file.
    quandl.ApiConfig.api_key = 'Z9xLyThUssR4n-KHqAr5'
    raw = pd.DataFrame(quandl.get_table('WIKI/PRICES', ticker=TICKER))

    wanted = ['date', 'adj_open', 'adj_high', 'adj_low', 'adj_close', 'adj_volume']
    # Take an explicit copy so that adding 'label' writes to an independent
    # frame instead of a column selection (avoids SettingWithCopyWarning).
    data = raw[wanted].copy()

    # Label = adj_close of the next row; this is "tomorrow's close" only if
    # the rows arrive in ascending date order -- TODO confirm against the API.
    data['label'] = data['adj_close'].shift(-1)

    # The last row's label is NaN after the shift, so leave it out.
    return data.iloc[:-1]
# Settings for the walk-forward run launched at the bottom of the file.

# Symbol whose price history is downloaded.
Ticker: str = 'GS'

# Number of rows in each walk-forward window (train + test).
n_samples: int = 1000

# Share of each window, in percent, held out as the test slice.
test_percentage: int = 10

# False: the window slides forward; True: the window always starts at row 0.
anchored: bool = False
def wf(TICKER, n_samples, test_percentage, anchored=False):
    """Run a walk-forward evaluation and collect the out-of-sample predictions.

    The price history returned by ``prepare_data`` is cut into successive
    windows. Each window is handed to ``build_model``, which fits on the
    leading rows and predicts the trailing test rows. The window advances by
    one test slice per step; the final step extends to the end of the data so
    that the remaining rows are all tested.

    Args:
        TICKER: Ticker symbol forwarded to ``prepare_data``.
        n_samples: Number of rows in a regular window (train + test).
        test_percentage: Percentage of ``n_samples`` used as the test slice.
        anchored: When true, every window starts at row 0 and grows with each
            step; otherwise the window start slides forward by one test
            slice per step.

    Returns:
        A 2-D NumPy array with three columns, stacking the frames returned
        by ``build_model`` in step order. The array has shape ``(0, 3)`` when
        the data yields no steps.

    Raises:
        ValueError: If the test slice rounds to zero rows or leaves no rows
            for training.
    """
    data = prepare_data(TICKER)
    length = len(data)

    test_length = round(n_samples * (test_percentage / 100))
    if test_length <= 0:
        raise ValueError(
            'test_percentage of n_samples must round to at least one test row'
        )
    if test_length >= n_samples:
        raise ValueError('the test slice must leave at least one training row')

    steps = round((length - n_samples) / test_length)
    print(steps)

    predictions = []
    for i in range(steps):
        print(i)
        window_start = 0 if anchored else i * test_length
        if i == steps - 1:
            # Final step: run the window to the end of the data and test every
            # row after the point where the previous step's test slice ended.
            data_wf = data.iloc[window_start:length]
            test_size = length - n_samples - test_length * (i - 1)
        else:
            data_wf = data.iloc[window_start:i * test_length + n_samples]
            test_size = test_length
        # Always pass an integer row count: a float fraction is rounded up by
        # train_test_split and can disagree with the step size used here,
        # which would make consecutive test slices overlap.
        predictions.append(build_model(data_wf, test_size))

    # Seeding with an empty (0, 3) array keeps the result well-formed even
    # when no step was executed.
    return np.concatenate([np.zeros(shape=(0, 3)), *predictions])
def build_model(data_wf, test_percentage):
    """Fit a linear regression on the head of a window and predict its tail.

    Args:
        data_wf: DataFrame holding a ``date`` column, a ``label`` column and
            the feature columns; every column other than ``date`` and
            ``label`` is used as a feature.
        test_percentage: Value forwarded unchanged as ``test_size`` to
            ``train_test_split`` (the callers in this file pass either a
            fraction or a row count).

    Returns:
        A DataFrame built from the test labels with their former index moved
        into a column, plus a ``prediction`` column with the model output.
    """
    target = data_wf['label']
    features = data_wf.drop(['date', 'label'], axis=1)

    # shuffle=False keeps the rows in their original order for the split.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_percentage, shuffle=False
    )

    named_parts = (
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test),
    )
    for part_name, part in named_parts:
        print(part_name + ' shape ' + str(part.shape))

    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)

    # reset_index gives the labels a fresh 0..n-1 index, which lines up with
    # the default index of the prediction frame assigned below.
    result = pd.DataFrame(y_test).reset_index()
    result['prediction'] = pd.DataFrame(model.predict(X_test))
    return result
# Run the walk-forward evaluation with the settings defined above, show the
# raw result, then plot the last 100 real vs. predicted values.
alldf = wf(Ticker, n_samples, test_percentage, anchored)
print(alldf)

alldf = pd.DataFrame(alldf, columns=['Index', 'Real', 'Predicted'])

# x=0 selects the first column ('Index') as the x axis.
recent = alldf.tail(100)
recent.plot(x=0, figsize=(14, 7), grid=True, title='Real vs. Predicted')