In [1]:
# Work from the project's src/ directory so local modules (extramlutils, data)
# are importable below.
# NOTE(review): non-idempotent — re-running this cell moves the cwd again
# relative to the new location; a fresh Restart & Run All works, but mid-session
# re-execution will break the relative '../data' paths used later.
import os
os.chdir('../src')
In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics, model_selection
from sklearn import linear_model

import os
import math

import lightgbm as lgb
import luigi
import extramlutils
from data import DownloadPoloniex

Download the data:

In [3]:
# Run the Luigi pipeline to fetch the raw Poloniex data. local_scheduler=True
# avoids needing a separate luigid daemon. Returns True when scheduling
# succeeded (here the task was already complete — see the summary output).
luigi.build([DownloadPoloniex()], local_scheduler=True)
DEBUG: Checking if DownloadPoloniex() is complete
INFO: Informed scheduler that task   DownloadPoloniex__99914b932b   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
INFO: Worker Worker(salt=447319705, workers=1, host=adrian-pc, username=adrian, pid=9557) was stopped. Shutting down Keep-Alive thread
INFO: 
===== Luigi Execution Summary =====

Scheduled 1 tasks of which:
* 1 complete ones were encountered:
    - 1 DownloadPoloniex()

Did not run any tasks
This progress looks :) because there were no failed tasks or missing dependencies

===== Luigi Execution Summary =====

Out[3]:
True
In [40]:
# Hyperparameters for the LightGBM classifier.
# Note: placing 'early_stopping_round' in params (rather than passing it to
# fit/train) triggers the "Found `early_stopping_round` in params" UserWarning
# seen in the training log below, but LightGBM still honors it.
param = {'n_estimators': 10000,        # upper bound on trees; early stopping picks the effective count
         'learning_rate': 0.005,       # small step size to pair with many estimators
         'num_leaves': 27,             # tree complexity; default is 31
         'boosting': 'dart',           # alias of boosting_type — dropout-regularized boosting
         'early_stopping_round': 500}  # stop after 500 rounds without validation improvement

Load the data and build the binary target: is the next closing price higher than the current one?

In [11]:
# Load the raw BTC/ETH candles produced by the DownloadPoloniex task.
df = pd.read_json('../data/raw/BTC_ETH.json')
df = df.drop('date', axis=1)
# Binary target: True when the NEXT close is higher than the current one
# (shift(-1) looks one step ahead).
# NOTE(review): the final row compares close < NaN, which yields False — a
# spurious label that later dropna() calls do not remove; consider dropping
# the last row explicitly.
df['target'] = df.close < df.close.shift(-1)

Baseline: a single feature — the log-difference (log return) of the closing price.

In [12]:
def add_log_diff(df, key):
    """Append a '<key>_log_diff' column holding the first difference of log(df[key]).

    Mutates ``df`` in place. The first row of the new column is NaN because
    there is no previous value to difference against.
    """
    df[key + '_log_diff'] = np.log(df[key]).diff()

add_log_diff(df, 'close')

# Keep only the label and the engineered feature, then discard the NaN row
# the diff produced at the start of the series.
df = df[['target', 'close_log_diff']].dropna()
In [22]:
# Walk-forward evaluation: TimeSeriesSplit keeps training folds strictly
# before validation folds, avoiding look-ahead leakage.
cv = model_selection.TimeSeriesSplit(n_splits=5)
model = linear_model.LogisticRegression(solver='lbfgs')

# sklearn expects a 2-D feature matrix, so the single column becomes (n, 1).
feature_matrix = np.array(df.close_log_diff).reshape(-1, 1)
result = model_selection.cross_validate(model, feature_matrix, np.array(df.target), cv=cv, return_estimator=True, scoring=['roc_auc', 'accuracy'])
print(result)
print('\ntest roc auc:', np.mean(result['test_roc_auc']))
print('\ntest accuracy:', np.mean(result['test_accuracy']))
{'fit_time': array([0.01970935, 0.03931522, 0.05308819, 0.08288264, 0.09672332]), 'score_time': array([0.00586486, 0.00674129, 0.0065155 , 0.00664258, 0.00660372]), 'estimator': (LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)), 'test_roc_auc': array([0.54915167, 0.54678582, 0.52785332, 0.51630062, 0.55079409]), 'train_roc_auc': array([0.57531891, 0.56387012, 0.55870951, 0.55142416, 0.54488341]), 'test_accuracy': array([0.51996352, 0.53698824, 0.52391569, 0.49533847, 0.53161735]), 'train_accuracy': array([0.54026132, 0.54561572, 0.53710021, 0.53380449, 0.5306718 ])}

test roc auc: 0.5381771036311085

test accuracy: 0.5215646534252129

Shifted features: add lagged log-differences (lags 1–10) as extra predictors.

In [55]:
# Reload the raw data so the lagged-feature experiment starts from a clean
# frame (the previous section narrowed df to two columns).
# NOTE(review): duplicates the loading cell above — a load_raw() helper would
# avoid the copy-paste; the spurious last-row label (close < NaN -> False)
# applies here as well.
df = pd.read_json('../data/raw/BTC_ETH.json')
df = df.drop('date', axis=1)
df['target'] = df.close < df.close.shift(-1)
In [60]:
# Lags (in periods) of the log-return to use as additional features.
shifts = range(1, 11)

# .copy() makes an explicit new frame: the original code mutated a slice of
# df, which risks pandas' SettingWithCopyWarning / silent no-op writes.
df = df[['target', 'close']].copy()

add_log_diff(df, 'close')

for i in shifts:
    df["close_log_diff_{}".format(i)] = df.close_log_diff.shift(i)

# The first max(shifts) rows are NaN after shifting; drop them and renumber
# the index in one step instead of reset_index() + dropping the 'index' column.
df = df.dropna().reset_index(drop=True)
df = df.drop('close', axis=1)
In [61]:
# Same walk-forward protocol as the single-feature baseline, now with the
# current log-return plus its ten lags as predictors.
cv = model_selection.TimeSeriesSplit(n_splits=5)
model = linear_model.LogisticRegression(solver='lbfgs')

feature_matrix = np.array(df.drop('target', axis=1))
result = model_selection.cross_validate(model, feature_matrix, np.array(df.target), cv=cv, return_estimator=True, scoring=['roc_auc', 'accuracy'])
print(result)
print('\ntest roc auc:', np.mean(result['test_roc_auc']))
print('test accuracy:', np.mean(result['test_accuracy']))
{'fit_time': array([0.04023314, 0.06795859, 0.08682799, 0.12252474, 0.21098614]), 'score_time': array([0.00971889, 0.00709534, 0.00711632, 0.00721145, 0.00724864]), 'estimator': (LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)), 'test_roc_auc': array([0.55229625, 0.55112674, 0.53431591, 0.51608348, 0.55261639]), 'train_roc_auc': array([0.5822808 , 0.57096546, 0.56515006, 0.558088  , 0.55023809]), 'test_accuracy': array([0.52468823, 0.54293825, 0.52651323, 0.49822569, 0.53229241]), 'train_accuracy': array([0.54429353, 0.54716407, 0.54085564, 0.53772652, 0.5332333 ])}

test roc auc: 0.5412877539923825
test accuracy: 0.5249315624049478
In [64]:
# Cross-validate LightGBM on the same features via the project helper.
# feature_name preserves column names in the booster's reports.
# NOTE(review): the "Found `early_stopping_round` in params" warnings in the
# log come from the param dict defined above; LightGBM still applies it.
model = lgb.LGBMClassifier(**param)
key_names = list(df.drop('target', axis=1).keys())
models = extramlutils.cross_validate_lgb(model, np.array(df.drop('target', axis=1)), np.array(df.target), cv, feature_name=key_names)
/home/adrian/.conda/envs/ml/lib/python3.7/site-packages/lightgbm/engine.py:121: UserWarning: Found `early_stopping_round` in params. Will use it instead of argument
  warnings.warn("Found `{}` in params. Will use it instead of argument".format(alias))
Training until validation scores don't improve for 500 rounds.
[100]	train's binary_logloss: 0.684405	train's auc: 0.636937	valid's binary_logloss: 0.691027	valid's auc: 0.543814
[200]	train's binary_logloss: 0.680197	train's auc: 0.651896	valid's binary_logloss: 0.690313	valid's auc: 0.546723
[300]	train's binary_logloss: 0.675644	train's auc: 0.663278	valid's binary_logloss: 0.689924	valid's auc: 0.548543
[400]	train's binary_logloss: 0.671529	train's auc: 0.671656	valid's binary_logloss: 0.689732	valid's auc: 0.549044
[500]	train's binary_logloss: 0.667318	train's auc: 0.680476	valid's binary_logloss: 0.689708	valid's auc: 0.549422
[600]	train's binary_logloss: 0.665428	train's auc: 0.687624	valid's binary_logloss: 0.689665	valid's auc: 0.549763
[700]	train's binary_logloss: 0.662309	train's auc: 0.693875	valid's binary_logloss: 0.689682	valid's auc: 0.549768
[800]	train's binary_logloss: 0.66001	train's auc: 0.700418	valid's binary_logloss: 0.689672	valid's auc: 0.550025
[900]	train's binary_logloss: 0.656903	train's auc: 0.70692	valid's binary_logloss: 0.689801	valid's auc: 0.550234
[1000]	train's binary_logloss: 0.654727	train's auc: 0.71272	valid's binary_logloss: 0.68989	valid's auc: 0.550164
[1100]	train's binary_logloss: 0.651902	train's auc: 0.720579	valid's binary_logloss: 0.68982	valid's auc: 0.550803
Early stopping, best iteration is:
[607]	train's binary_logloss: 0.665498	train's auc: 0.68786	valid's binary_logloss: 0.68964	valid's auc: 0.549889
/home/adrian/.conda/envs/ml/lib/python3.7/site-packages/lightgbm/engine.py:121: UserWarning: Found `early_stopping_round` in params. Will use it instead of argument
  warnings.warn("Found `{}` in params. Will use it instead of argument".format(alias))
Training until validation scores don't improve for 500 rounds.
[100]	train's binary_logloss: 0.688199	train's auc: 0.605539	valid's binary_logloss: 0.691227	valid's auc: 0.543592
[200]	train's binary_logloss: 0.685917	train's auc: 0.613639	valid's binary_logloss: 0.690652	valid's auc: 0.546411
[300]	train's binary_logloss: 0.683419	train's auc: 0.622903	valid's binary_logloss: 0.690017	valid's auc: 0.548914
[400]	train's binary_logloss: 0.681137	train's auc: 0.628702	valid's binary_logloss: 0.689534	valid's auc: 0.550008
[500]	train's binary_logloss: 0.678663	train's auc: 0.635788	valid's binary_logloss: 0.689142	valid's auc: 0.551942
[600]	train's binary_logloss: 0.677521	train's auc: 0.641777	valid's binary_logloss: 0.689087	valid's auc: 0.551254
[700]	train's binary_logloss: 0.675636	train's auc: 0.647202	valid's binary_logloss: 0.688914	valid's auc: 0.551062
[800]	train's binary_logloss: 0.67431	train's auc: 0.651995	valid's binary_logloss: 0.688796	valid's auc: 0.551262
[900]	train's binary_logloss: 0.672502	train's auc: 0.656938	valid's binary_logloss: 0.688645	valid's auc: 0.551733
[1000]	train's binary_logloss: 0.671227	train's auc: 0.661333	valid's binary_logloss: 0.688576	valid's auc: 0.55196
[1100]	train's binary_logloss: 0.66971	train's auc: 0.66584	valid's binary_logloss: 0.688448	valid's auc: 0.552534
[1200]	train's binary_logloss: 0.668759	train's auc: 0.669415	valid's binary_logloss: 0.688368	valid's auc: 0.552863
[1300]	train's binary_logloss: 0.6675	train's auc: 0.673285	valid's binary_logloss: 0.68831	valid's auc: 0.552932
[1400]	train's binary_logloss: 0.665721	train's auc: 0.677886	valid's binary_logloss: 0.688304	valid's auc: 0.552758
[1500]	train's binary_logloss: 0.664854	train's auc: 0.681482	valid's binary_logloss: 0.688298	valid's auc: 0.552697
[1600]	train's binary_logloss: 0.6633	train's auc: 0.685361	valid's binary_logloss: 0.688261	valid's auc: 0.552738
[1700]	train's binary_logloss: 0.66162	train's auc: 0.690131	valid's binary_logloss: 0.688233	valid's auc: 0.552727
[1800]	train's binary_logloss: 0.660214	train's auc: 0.694579	valid's binary_logloss: 0.688249	valid's auc: 0.552611
Early stopping, best iteration is:
[1363]	train's binary_logloss: 0.666428	train's auc: 0.676007	valid's binary_logloss: 0.68827	valid's auc: 0.553029
Training until validation scores don't improve for 500 rounds.
/home/adrian/.conda/envs/ml/lib/python3.7/site-packages/lightgbm/engine.py:121: UserWarning: Found `early_stopping_round` in params. Will use it instead of argument
  warnings.warn("Found `{}` in params. Will use it instead of argument".format(alias))
[100]	train's binary_logloss: 0.689124	train's auc: 0.594472	valid's binary_logloss: 0.691797	valid's auc: 0.529215
[200]	train's binary_logloss: 0.687318	train's auc: 0.600243	valid's binary_logloss: 0.691485	valid's auc: 0.530877
[300]	train's binary_logloss: 0.685347	train's auc: 0.605945	valid's binary_logloss: 0.691188	valid's auc: 0.532665
[400]	train's binary_logloss: 0.68358	train's auc: 0.610468	valid's binary_logloss: 0.690978	valid's auc: 0.534482
[500]	train's binary_logloss: 0.681799	train's auc: 0.614957	valid's binary_logloss: 0.690818	valid's auc: 0.535201
[600]	train's binary_logloss: 0.681064	train's auc: 0.617949	valid's binary_logloss: 0.690789	valid's auc: 0.535377
[700]	train's binary_logloss: 0.679672	train's auc: 0.622449	valid's binary_logloss: 0.690793	valid's auc: 0.535369
[800]	train's binary_logloss: 0.67872	train's auc: 0.626064	valid's binary_logloss: 0.6908	valid's auc: 0.535266
[900]	train's binary_logloss: 0.677423	train's auc: 0.6302	valid's binary_logloss: 0.690847	valid's auc: 0.534997
[1000]	train's binary_logloss: 0.676511	train's auc: 0.633329	valid's binary_logloss: 0.69086	valid's auc: 0.53498
[1100]	train's binary_logloss: 0.675412	train's auc: 0.63694	valid's binary_logloss: 0.690933	valid's auc: 0.534954
Early stopping, best iteration is:
[645]	train's binary_logloss: 0.680339	train's auc: 0.620307	valid's binary_logloss: 0.690768	valid's auc: 0.535565
Training until validation scores don't improve for 500 rounds.
/home/adrian/.conda/envs/ml/lib/python3.7/site-packages/lightgbm/engine.py:121: UserWarning: Found `early_stopping_round` in params. Will use it instead of argument
  warnings.warn("Found `{}` in params. Will use it instead of argument".format(alias))
[100]	train's binary_logloss: 0.689942	train's auc: 0.583075	valid's binary_logloss: 0.693024	valid's auc: 0.520558
[200]	train's binary_logloss: 0.688565	train's auc: 0.587367	valid's binary_logloss: 0.692832	valid's auc: 0.521119
[300]	train's binary_logloss: 0.687068	train's auc: 0.591549	valid's binary_logloss: 0.692841	valid's auc: 0.521393
[400]	train's binary_logloss: 0.685704	train's auc: 0.595492	valid's binary_logloss: 0.69287	valid's auc: 0.522684
[500]	train's binary_logloss: 0.68422	train's auc: 0.600824	valid's binary_logloss: 0.692985	valid's auc: 0.523034
[600]	train's binary_logloss: 0.68359	train's auc: 0.603574	valid's binary_logloss: 0.693081	valid's auc: 0.522759
[700]	train's binary_logloss: 0.682518	train's auc: 0.606567	valid's binary_logloss: 0.693275	valid's auc: 0.522005
Early stopping, best iteration is:
[225]	train's binary_logloss: 0.688315	train's auc: 0.588044	valid's binary_logloss: 0.692813	valid's auc: 0.520854
Training until validation scores don't improve for 500 rounds.
/home/adrian/.conda/envs/ml/lib/python3.7/site-packages/lightgbm/engine.py:121: UserWarning: Found `early_stopping_round` in params. Will use it instead of argument
  warnings.warn("Found `{}` in params. Will use it instead of argument".format(alias))
[100]	train's binary_logloss: 0.69065	train's auc: 0.572531	valid's binary_logloss: 0.691695	valid's auc: 0.544407
[200]	train's binary_logloss: 0.689529	train's auc: 0.578183	valid's binary_logloss: 0.69127	valid's auc: 0.545982
[300]	train's binary_logloss: 0.688304	train's auc: 0.582411	valid's binary_logloss: 0.690812	valid's auc: 0.546735
[400]	train's binary_logloss: 0.687196	train's auc: 0.586191	valid's binary_logloss: 0.690436	valid's auc: 0.546705
[500]	train's binary_logloss: 0.686102	train's auc: 0.589336	valid's binary_logloss: 0.690124	valid's auc: 0.546698
[600]	train's binary_logloss: 0.685635	train's auc: 0.592122	valid's binary_logloss: 0.690003	valid's auc: 0.547262
[700]	train's binary_logloss: 0.684772	train's auc: 0.595369	valid's binary_logloss: 0.689829	valid's auc: 0.546842
[800]	train's binary_logloss: 0.684158	train's auc: 0.598535	valid's binary_logloss: 0.689726	valid's auc: 0.547139
[900]	train's binary_logloss: 0.683354	train's auc: 0.60175	valid's binary_logloss: 0.689618	valid's auc: 0.54685
[1000]	train's binary_logloss: 0.682782	train's auc: 0.604528	valid's binary_logloss: 0.689554	valid's auc: 0.547327
[1100]	train's binary_logloss: 0.682103	train's auc: 0.607422	valid's binary_logloss: 0.689484	valid's auc: 0.54766
[1200]	train's binary_logloss: 0.681685	train's auc: 0.609823	valid's binary_logloss: 0.68942	valid's auc: 0.548252
[1300]	train's binary_logloss: 0.68112	train's auc: 0.612364	valid's binary_logloss: 0.689321	valid's auc: 0.548937
[1400]	train's binary_logloss: 0.680313	train's auc: 0.615358	valid's binary_logloss: 0.689171	valid's auc: 0.549785
[1500]	train's binary_logloss: 0.679952	train's auc: 0.61742	valid's binary_logloss: 0.689136	valid's auc: 0.550029
[1600]	train's binary_logloss: 0.679268	train's auc: 0.619912	valid's binary_logloss: 0.689034	valid's auc: 0.550431
[1700]	train's binary_logloss: 0.678575	train's auc: 0.622471	valid's binary_logloss: 0.688967	valid's auc: 0.550377
[1800]	train's binary_logloss: 0.678006	train's auc: 0.624635	valid's binary_logloss: 0.688939	valid's auc: 0.550395
[1900]	train's binary_logloss: 0.677376	train's auc: 0.626954	valid's binary_logloss: 0.688931	valid's auc: 0.550199
[2000]	train's binary_logloss: 0.676847	train's auc: 0.629418	valid's binary_logloss: 0.68892	valid's auc: 0.550203
[2100]	train's binary_logloss: 0.676064	train's auc: 0.632153	valid's binary_logloss: 0.68891	valid's auc: 0.549886
Early stopping, best iteration is:
[1647]	train's binary_logloss: 0.679064	train's auc: 0.620972	valid's binary_logloss: 0.689015	valid's auc: 0.550543
In [67]:
# Summarize the early-stopped models — presumably mean/std of each metric at
# each fold's best iteration (see extramlutils for the exact aggregation).
extramlutils.get_best_iterations(models)
Out[67]:
{'train_binary_logloss_mean': 0.6759286019337181,
 'train_auc_mean': 0.6386381902217113,
 'valid_binary_logloss_mean': 0.6901011245634779,
 'valid_auc_mean': 0.5419760135755023,
 'train_binary_logloss_std': 0.008738008390101788,
 'train_auc_std': 0.03748894811310406,
 'valid_binary_logloss_std': 0.0015840541370217105,
 'valid_auc_std': 0.012209817862622336}