import os
# NOTE(review): changing the working directory at import time makes every
# relative path below ('../data/...') depend on where the script is launched
# from — consider anchoring paths to __file__ with pathlib instead.
os.chdir('../src')
import numpy as np
import pandas as pd
from sklearn import svm, model_selection, linear_model, preprocessing, metrics
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
# Load the engineered technical features and standardize every column except
# the label ('target') and SQZMI, which is re-attached on its original scale.
df = pd.read_hdf('../data/features/technical.h5')
scaler = preprocessing.StandardScaler()
features = df.drop(columns=['target', 'SQZMI'])
x = scaler.fit_transform(features)
# Append SQZMI unscaled as the final column of the feature matrix.
x = np.column_stack([x, df.SQZMI])
# keynames mirrors the column order of x: all features except
# target/SQZMI first, then SQZMI last.
keynames = [col for col in df.columns if col not in ('target', 'SQZMI')] + ['SQZMI']
def score_logistic(n):
    """Train a logistic-regression model on a fixed window and score the
    n rows immediately preceding it.

    Fits on rows [40000, 45000) of the standardized feature matrix ``x``
    (labels from ``df.target``) and evaluates mean accuracy on rows
    [40000 - n, 40000). Prints ``n`` and the score, and also returns the
    score so the sweep results can be collected programmatically.

    Parameters
    ----------
    n : int
        Size of the evaluation window ending at row 40000.
        Assumed 1 <= n <= 40000 — TODO confirm against callers.

    Returns
    -------
    float
        Mean accuracy on the evaluation window.
    """
    # NOTE(review): the training window is constant across calls, so the fit
    # could be hoisted out and done once; refitting here keeps the function
    # self-contained and usable standalone.
    model = linear_model.LogisticRegression(solver='liblinear')
    model.fit(x[40000:45000], df.target[40000:45000])
    score = model.score(x[40000 - n:40000], df.target[40000 - n:40000])
    print(n, score)
    return score
# Sweep evaluation-window sizes to see how accuracy varies with how far back
# we score relative to the fixed [40000, 45000) training window.
# (A stray no-op `linear_model.LogisticRegression(...)` expression that
# followed this loop — its result was discarded — has been removed.)
for n in [40000, 20000, 10000, 5000, 2500, 2000, 1500, 1250, 500, 700, 250, 125, 70, 50, 25, 10, 5, 1]:
    score_logistic(n)
# Observation: accuracy is highly sensitive to the size of the evaluation
# window, which suggests the optimal training window is itself strongly
# dependent on the evaluation period.