Browse Source

First successful GridSearchCV run completed

Achieving a balanced accuracy (BACC) score of 0.9995999768006462

# Best estimator from the grid search: vectorize raw traces into
# fixed-size features, project with PCA, standardize, then score
# samples with an isolation forest.
pipeline_steps = [
	('vectorize', TraceTransformer(N=10)),
	('pca', PCA()),
	('scaler', StandardScaler()),
	('forest', IsolationForest(max_samples=0.8, n_estimators=1000)),
]
clf = Pipeline(steps=pipeline_steps)

# Winning hyper-parameter combination found by GridSearchCV
# (keys follow sklearn's 'step__param' convention).
best_params = dict(
	forest__contamination='auto',
	forest__max_features=1.0,
	forest__max_samples=0.8,
	forest__n_estimators=1000,
	pca__n_components=None,
	vectorize__N=10,
)
master
Markus Becker 2 years ago
parent
commit
d51a4e6a3c
  1. 33
      validate/ml.py

33
validate/ml.py

@ -41,7 +41,7 @@ def ngrams_to_features(ngrams):
num_features = 6
features = np.zeros((ngrams.shape[0], num_features * int(scipy.special.binom(ngrams.shape[1], 2))))
for n in range(ngrams.shape[0]):
offset = 0;
offset = 0
for i in range(ngrams.shape[1]):
for j in range(i+1, ngrams.shape[1]):
#print(f"comparing {n}: {j}-{i}")
@ -65,7 +65,7 @@ def traces_to_features(traces, N=3):
def _walk_files(rootdir):
files = set()
for (dirpath, dirnames, filenames) in os.walk(rootdir):
for (dirpath, _, filenames) in os.walk(rootdir):
for filename in filenames:
files.add(os.path.join(dirpath, filename))
return files
@ -130,17 +130,15 @@ def traces(geolife_path, maxtraces=-1):
if trace_count == maxtraces:
break
def make_clf_scorer(geolife):
	"""Build a scoring callable suitable for GridSearchCV.

	The returned scorer rates *clf* by how well it separates the
	genuine traces in ``X`` (expected prediction +1) from randomly
	generated fake traces (expected prediction -1), using balanced
	accuracy so the two classes weigh equally.

	NOTE(review): *geolife* is currently unused by the scorer —
	confirm whether it was meant to parameterize the fake traces.
	"""
	def clf_score(clf, X, *args, **kwargs) -> float:
		# One fake trace per real trace: 2000 uniformly random 3-D
		# points, scaled/shifted to (-180..180, -180..180, 0..1000).
		fakes = np.array(
			[np.random.uniform(size=(2000, 3)) * np.array((360, 360, 1000))
			 - np.array((180, 180, 0))
			 for _ in range(X.shape[0])],
			dtype=object,
		)
		res_fake = clf.predict(fakes)
		res_valid = clf.predict(X)
		# Labels: -1 for every fake trace, +1 for every valid one
		# (matches IsolationForest's outlier/inlier convention).
		return balanced_accuracy_score(
			np.concatenate([-np.ones(res_fake.shape), np.ones(res_valid.shape)]),
			np.concatenate([res_fake, res_valid]),
		)
	return clf_score
if __name__ == '__main__':
@ -161,19 +159,18 @@ if __name__ == '__main__':
('scaler', StandardScaler()),
('forest', IsolationForest())
],
verbose=3,
)
parameters = {
'vectorize__N':(3,4,5,10,15),
'pca__n_components':(None, 'mle'),
'scaler__with_mean':(True, False),
'scaler__with_std':(True, False),
'vectorize__N':(3,5,7,10),
'pca__n_components':('mle', None),
'forest__n_estimators':(100,1000),
'forest__max_samples':(0.3, 0.8, 'auto', 10),
'forest__max_samples':(0.1, 0.3, 0.5, 0.8, 'auto'),
'forest__contamination':(1e-2, 1e-1, 'auto'),
'forest__max_features':(0.5, 1.0)
'forest__max_features':(0.3, 0.5, 1.0)
}
clf = GridSearchCV(pipeline, parameters, n_jobs=args.jobs, verbose=2, scoring=make_clf_scorer(args.geolife), cv=3)
clf = GridSearchCV(pipeline, parameters, n_jobs=args.jobs, verbose=3, scoring=clf_score, cv=3)
train_traces = np.array(list(traces(args.geolife, args.traces)), dtype=object)

Loading…
Cancel
Save